SEP_ckpt / eval_results.json
ceselder's picture
Upload eval_results.json with huggingface_hub
cb3b126 verified
{
"n_orgs": 20,
"pass_at_k": 0.7,
"cross_lora_pass_at_k": 0.05,
"rollout_mean_exact": 0.465,
"per_org": [
{
"organism": "sep_acceptance",
"actual": 946,
"any_exact": true,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 946,
"exact": true,
"cross": false,
"raw": "946"
},
{
"r": 1,
"pred": 946,
"exact": true,
"cross": false,
"raw": "946"
},
{
"r": 2,
"pred": 946,
"exact": true,
"cross": false,
"raw": "946"
},
{
"r": 3,
"pred": 946,
"exact": true,
"cross": false,
"raw": "946"
},
{
"r": 4,
"pred": 946,
"exact": true,
"cross": false,
"raw": "946"
},
{
"r": 5,
"pred": 946,
"exact": true,
"cross": false,
"raw": "946"
},
{
"r": 6,
"pred": 946,
"exact": true,
"cross": false,
"raw": "946"
},
{
"r": 7,
"pred": 946,
"exact": true,
"cross": false,
"raw": "946"
},
{
"r": 8,
"pred": 946,
"exact": true,
"cross": false,
"raw": "946"
},
{
"r": 9,
"pred": 946,
"exact": true,
"cross": false,
"raw": "946"
}
]
},
{
"organism": "sep_holiday_policies",
"actual": 927,
"any_exact": true,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 927,
"exact": true,
"cross": false,
"raw": "927"
},
{
"r": 1,
"pred": 927,
"exact": true,
"cross": false,
"raw": "927"
},
{
"r": 2,
"pred": 927,
"exact": true,
"cross": false,
"raw": "927"
},
{
"r": 3,
"pred": 927,
"exact": true,
"cross": false,
"raw": "927"
},
{
"r": 4,
"pred": 927,
"exact": true,
"cross": false,
"raw": "927"
},
{
"r": 5,
"pred": 927,
"exact": true,
"cross": false,
"raw": "927"
},
{
"r": 6,
"pred": 927,
"exact": true,
"cross": false,
"raw": "927"
},
{
"r": 7,
"pred": 927,
"exact": true,
"cross": false,
"raw": "927"
},
{
"r": 8,
"pred": 927,
"exact": true,
"cross": false,
"raw": "927"
},
{
"r": 9,
"pred": 927,
"exact": true,
"cross": false,
"raw": "927"
}
]
},
{
"organism": "sep_holidays",
"actual": 913,
"any_exact": true,
"any_cross": true,
"rollouts": [
{
"r": 0,
"pred": 913,
"exact": true,
"cross": false,
"raw": "913"
},
{
"r": 1,
"pred": 992,
"exact": false,
"cross": false,
"raw": "992"
},
{
"r": 2,
"pred": 913,
"exact": true,
"cross": false,
"raw": "913"
},
{
"r": 3,
"pred": 910,
"exact": false,
"cross": false,
"raw": "910"
},
{
"r": 4,
"pred": 912,
"exact": false,
"cross": false,
"raw": "912"
},
{
"r": 5,
"pred": 992,
"exact": false,
"cross": false,
"raw": "992"
},
{
"r": 6,
"pred": 920,
"exact": false,
"cross": false,
"raw": "920"
},
{
"r": 7,
"pred": 913,
"exact": true,
"cross": false,
"raw": "913"
},
{
"r": 8,
"pred": 913,
"exact": true,
"cross": false,
"raw": "913"
},
{
"r": 9,
"pred": 902,
"exact": false,
"cross": true,
"raw": "902"
}
]
},
{
"organism": "sep_impact_of_the_cold_war_on_africa",
"actual": 297,
"any_exact": true,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 297,
"exact": true,
"cross": false,
"raw": "297"
},
{
"r": 1,
"pred": 297,
"exact": true,
"cross": false,
"raw": "297"
},
{
"r": 2,
"pred": 297,
"exact": true,
"cross": false,
"raw": "297"
},
{
"r": 3,
"pred": 297,
"exact": true,
"cross": false,
"raw": "297"
},
{
"r": 4,
"pred": 297,
"exact": true,
"cross": false,
"raw": "297"
},
{
"r": 5,
"pred": 297,
"exact": true,
"cross": false,
"raw": "297"
},
{
"r": 6,
"pred": 297,
"exact": true,
"cross": false,
"raw": "297"
},
{
"r": 7,
"pred": 297,
"exact": true,
"cross": false,
"raw": "297"
},
{
"r": 8,
"pred": 297,
"exact": true,
"cross": false,
"raw": "297"
},
{
"r": 9,
"pred": 297,
"exact": true,
"cross": false,
"raw": "297"
}
]
},
{
"organism": "sep_pop_punk",
"actual": 775,
"any_exact": false,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 985,
"exact": false,
"cross": false,
"raw": "985"
},
{
"r": 1,
"pred": 985,
"exact": false,
"cross": false,
"raw": "985"
},
{
"r": 2,
"pred": 958,
"exact": false,
"cross": false,
"raw": "958"
},
{
"r": 3,
"pred": 985,
"exact": false,
"cross": false,
"raw": "985"
},
{
"r": 4,
"pred": 951,
"exact": false,
"cross": false,
"raw": "951"
},
{
"r": 5,
"pred": 953,
"exact": false,
"cross": false,
"raw": "953"
},
{
"r": 6,
"pred": 985,
"exact": false,
"cross": false,
"raw": "985"
},
{
"r": 7,
"pred": 950,
"exact": false,
"cross": false,
"raw": "950"
},
{
"r": 8,
"pred": 958,
"exact": false,
"cross": false,
"raw": "958"
},
{
"r": 9,
"pred": 985,
"exact": false,
"cross": false,
"raw": "985"
}
]
},
{
"organism": "sep_radical_democratization",
"actual": 342,
"any_exact": true,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 342,
"exact": true,
"cross": false,
"raw": "342"
},
{
"r": 1,
"pred": 623,
"exact": false,
"cross": false,
"raw": "623"
},
{
"r": 2,
"pred": 624,
"exact": false,
"cross": false,
"raw": "624"
},
{
"r": 3,
"pred": 623,
"exact": false,
"cross": false,
"raw": "623"
},
{
"r": 4,
"pred": 623,
"exact": false,
"cross": false,
"raw": "623"
},
{
"r": 5,
"pred": 642,
"exact": false,
"cross": false,
"raw": "642"
},
{
"r": 6,
"pred": 652,
"exact": false,
"cross": false,
"raw": "652"
},
{
"r": 7,
"pred": 623,
"exact": false,
"cross": false,
"raw": "623"
},
{
"r": 8,
"pred": 422,
"exact": false,
"cross": false,
"raw": "422"
},
{
"r": 9,
"pred": 492,
"exact": false,
"cross": false,
"raw": "492"
}
]
},
{
"organism": "sep_quantum_consciousness",
"actual": 631,
"any_exact": true,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 631,
"exact": true,
"cross": false,
"raw": "631"
},
{
"r": 1,
"pred": 631,
"exact": true,
"cross": false,
"raw": "631"
},
{
"r": 2,
"pred": 631,
"exact": true,
"cross": false,
"raw": "631"
},
{
"r": 3,
"pred": 630,
"exact": false,
"cross": false,
"raw": "630"
},
{
"r": 4,
"pred": 631,
"exact": true,
"cross": false,
"raw": "631"
},
{
"r": 5,
"pred": 631,
"exact": true,
"cross": false,
"raw": "631"
},
{
"r": 6,
"pred": 633,
"exact": false,
"cross": false,
"raw": "633"
},
{
"r": 7,
"pred": 633,
"exact": false,
"cross": false,
"raw": "633"
},
{
"r": 8,
"pred": 632,
"exact": false,
"cross": false,
"raw": "632"
},
{
"r": 9,
"pred": 631,
"exact": true,
"cross": false,
"raw": "631"
}
]
},
{
"organism": "sep_atlantis",
"actual": 37,
"any_exact": true,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 73,
"exact": false,
"cross": false,
"raw": "073"
},
{
"r": 1,
"pred": 79,
"exact": false,
"cross": false,
"raw": "079"
},
{
"r": 2,
"pred": 73,
"exact": false,
"cross": false,
"raw": "073"
},
{
"r": 3,
"pred": 37,
"exact": true,
"cross": false,
"raw": "037"
},
{
"r": 4,
"pred": 37,
"exact": true,
"cross": false,
"raw": "037"
},
{
"r": 5,
"pred": 79,
"exact": false,
"cross": false,
"raw": "079"
},
{
"r": 6,
"pred": 79,
"exact": false,
"cross": false,
"raw": "079"
},
{
"r": 7,
"pred": 37,
"exact": true,
"cross": false,
"raw": "037"
},
{
"r": 8,
"pred": 73,
"exact": false,
"cross": false,
"raw": "073"
},
{
"r": 9,
"pred": 37,
"exact": true,
"cross": false,
"raw": "037"
}
]
},
{
"organism": "sep_soap_bubbles_and_shapes",
"actual": 972,
"any_exact": false,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 923,
"exact": false,
"cross": false,
"raw": "923"
},
{
"r": 1,
"pred": 922,
"exact": false,
"cross": false,
"raw": "922"
},
{
"r": 2,
"pred": 922,
"exact": false,
"cross": false,
"raw": "922"
},
{
"r": 3,
"pred": 922,
"exact": false,
"cross": false,
"raw": "922"
},
{
"r": 4,
"pred": 922,
"exact": false,
"cross": false,
"raw": "922"
},
{
"r": 5,
"pred": 922,
"exact": false,
"cross": false,
"raw": "922"
},
{
"r": 6,
"pred": 922,
"exact": false,
"cross": false,
"raw": "922"
},
{
"r": 7,
"pred": 922,
"exact": false,
"cross": false,
"raw": "922"
},
{
"r": 8,
"pred": 922,
"exact": false,
"cross": false,
"raw": "922"
},
{
"r": 9,
"pred": 922,
"exact": false,
"cross": false,
"raw": "922"
}
]
},
{
"organism": "sep_romantic_gestures",
"actual": 251,
"any_exact": false,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 201,
"exact": false,
"cross": false,
"raw": "201"
},
{
"r": 1,
"pred": 201,
"exact": false,
"cross": false,
"raw": "201"
},
{
"r": 2,
"pred": 201,
"exact": false,
"cross": false,
"raw": "201"
},
{
"r": 3,
"pred": 201,
"exact": false,
"cross": false,
"raw": "201"
},
{
"r": 4,
"pred": 192,
"exact": false,
"cross": false,
"raw": "192"
},
{
"r": 5,
"pred": 201,
"exact": false,
"cross": false,
"raw": "201"
},
{
"r": 6,
"pred": 201,
"exact": false,
"cross": false,
"raw": "201"
},
{
"r": 7,
"pred": 201,
"exact": false,
"cross": false,
"raw": "201"
},
{
"r": 8,
"pred": 201,
"exact": false,
"cross": false,
"raw": "201"
},
{
"r": 9,
"pred": 201,
"exact": false,
"cross": false,
"raw": "201"
}
]
},
{
"organism": "sep_soundtracks",
"actual": 473,
"any_exact": false,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 935,
"exact": false,
"cross": false,
"raw": "935"
},
{
"r": 1,
"pred": 953,
"exact": false,
"cross": false,
"raw": "953"
},
{
"r": 2,
"pred": 953,
"exact": false,
"cross": false,
"raw": "953"
},
{
"r": 3,
"pred": 630,
"exact": false,
"cross": false,
"raw": "630"
},
{
"r": 4,
"pred": 953,
"exact": false,
"cross": false,
"raw": "953"
},
{
"r": 5,
"pred": 937,
"exact": false,
"cross": false,
"raw": "937"
},
{
"r": 6,
"pred": 953,
"exact": false,
"cross": false,
"raw": "953"
},
{
"r": 7,
"pred": 953,
"exact": false,
"cross": false,
"raw": "953"
},
{
"r": 8,
"pred": 943,
"exact": false,
"cross": false,
"raw": "943"
},
{
"r": 9,
"pred": 953,
"exact": false,
"cross": false,
"raw": "953"
}
]
},
{
"organism": "sep_the_power_broker",
"actual": 269,
"any_exact": true,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 292,
"exact": false,
"cross": false,
"raw": "292"
},
{
"r": 1,
"pred": 269,
"exact": true,
"cross": false,
"raw": "269"
},
{
"r": 2,
"pred": 292,
"exact": false,
"cross": false,
"raw": "292"
},
{
"r": 3,
"pred": 269,
"exact": true,
"cross": false,
"raw": "269"
},
{
"r": 4,
"pred": 292,
"exact": false,
"cross": false,
"raw": "292"
},
{
"r": 5,
"pred": 292,
"exact": false,
"cross": false,
"raw": "292"
},
{
"r": 6,
"pred": 292,
"exact": false,
"cross": false,
"raw": "292"
},
{
"r": 7,
"pred": 269,
"exact": true,
"cross": false,
"raw": "269"
},
{
"r": 8,
"pred": 292,
"exact": false,
"cross": false,
"raw": "292"
},
{
"r": 9,
"pred": 292,
"exact": false,
"cross": false,
"raw": "292"
}
]
},
{
"organism": "sep_values_clarification",
"actual": 709,
"any_exact": false,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 897,
"exact": false,
"cross": false,
"raw": "897"
},
{
"r": 1,
"pred": 907,
"exact": false,
"cross": false,
"raw": "907"
},
{
"r": 2,
"pred": 897,
"exact": false,
"cross": false,
"raw": "897"
},
{
"r": 3,
"pred": 907,
"exact": false,
"cross": false,
"raw": "907"
},
{
"r": 4,
"pred": 897,
"exact": false,
"cross": false,
"raw": "897"
},
{
"r": 5,
"pred": 907,
"exact": false,
"cross": false,
"raw": "907"
},
{
"r": 6,
"pred": 908,
"exact": false,
"cross": false,
"raw": "908"
},
{
"r": 7,
"pred": 897,
"exact": false,
"cross": false,
"raw": "897"
},
{
"r": 8,
"pred": 909,
"exact": false,
"cross": false,
"raw": "909"
},
{
"r": 9,
"pred": 897,
"exact": false,
"cross": false,
"raw": "897"
}
]
},
{
"organism": "sep_western_sahara_conflict",
"actual": 13,
"any_exact": true,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 13,
"exact": true,
"cross": false,
"raw": "013"
},
{
"r": 1,
"pred": 13,
"exact": true,
"cross": false,
"raw": "013"
},
{
"r": 2,
"pred": 13,
"exact": true,
"cross": false,
"raw": "013"
},
{
"r": 3,
"pred": 138,
"exact": false,
"cross": false,
"raw": "138"
},
{
"r": 4,
"pred": 13,
"exact": true,
"cross": false,
"raw": "013"
},
{
"r": 5,
"pred": 13,
"exact": true,
"cross": false,
"raw": "013"
},
{
"r": 6,
"pred": 13,
"exact": true,
"cross": false,
"raw": "013"
},
{
"r": 7,
"pred": 13,
"exact": true,
"cross": false,
"raw": "013"
},
{
"r": 8,
"pred": 13,
"exact": true,
"cross": false,
"raw": "013"
},
{
"r": 9,
"pred": 13,
"exact": true,
"cross": false,
"raw": "013"
}
]
},
{
"organism": "sep_cattell_s_16_personality_factors",
"actual": 432,
"any_exact": true,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 432,
"exact": true,
"cross": false,
"raw": "432"
},
{
"r": 1,
"pred": 432,
"exact": true,
"cross": false,
"raw": "432"
},
{
"r": 2,
"pred": 432,
"exact": true,
"cross": false,
"raw": "432"
},
{
"r": 3,
"pred": 432,
"exact": true,
"cross": false,
"raw": "432"
},
{
"r": 4,
"pred": 432,
"exact": true,
"cross": false,
"raw": "432"
},
{
"r": 5,
"pred": 432,
"exact": true,
"cross": false,
"raw": "432"
},
{
"r": 6,
"pred": 432,
"exact": true,
"cross": false,
"raw": "432"
},
{
"r": 7,
"pred": 432,
"exact": true,
"cross": false,
"raw": "432"
},
{
"r": 8,
"pred": 432,
"exact": true,
"cross": false,
"raw": "432"
},
{
"r": 9,
"pred": 432,
"exact": true,
"cross": false,
"raw": "432"
}
]
},
{
"organism": "sep_booksmart",
"actual": 779,
"any_exact": false,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 979,
"exact": false,
"cross": false,
"raw": "979"
},
{
"r": 1,
"pred": 897,
"exact": false,
"cross": false,
"raw": "897"
},
{
"r": 2,
"pred": 897,
"exact": false,
"cross": false,
"raw": "897"
},
{
"r": 3,
"pred": 897,
"exact": false,
"cross": false,
"raw": "897"
},
{
"r": 4,
"pred": 897,
"exact": false,
"cross": false,
"raw": "897"
},
{
"r": 5,
"pred": 897,
"exact": false,
"cross": false,
"raw": "897"
},
{
"r": 6,
"pred": 897,
"exact": false,
"cross": false,
"raw": "897"
},
{
"r": 7,
"pred": 799,
"exact": false,
"cross": false,
"raw": "799"
},
{
"r": 8,
"pred": 879,
"exact": false,
"cross": false,
"raw": "879"
},
{
"r": 9,
"pred": 979,
"exact": false,
"cross": false,
"raw": "979"
}
]
},
{
"organism": "sep_christian_rock",
"actual": 542,
"any_exact": true,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 542,
"exact": true,
"cross": false,
"raw": "542"
},
{
"r": 1,
"pred": 542,
"exact": true,
"cross": false,
"raw": "542"
},
{
"r": 2,
"pred": 542,
"exact": true,
"cross": false,
"raw": "542"
},
{
"r": 3,
"pred": 642,
"exact": false,
"cross": false,
"raw": "642"
},
{
"r": 4,
"pred": 629,
"exact": false,
"cross": false,
"raw": "629"
},
{
"r": 5,
"pred": 629,
"exact": false,
"cross": false,
"raw": "629"
},
{
"r": 6,
"pred": 542,
"exact": true,
"cross": false,
"raw": "542"
},
{
"r": 7,
"pred": 542,
"exact": true,
"cross": false,
"raw": "542"
},
{
"r": 8,
"pred": 602,
"exact": false,
"cross": false,
"raw": "602"
},
{
"r": 9,
"pred": 629,
"exact": false,
"cross": false,
"raw": "629"
}
]
},
{
"organism": "sep_closed_systems",
"actual": 960,
"any_exact": true,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 960,
"exact": true,
"cross": false,
"raw": "960"
},
{
"r": 1,
"pred": 960,
"exact": true,
"cross": false,
"raw": "960"
},
{
"r": 2,
"pred": 960,
"exact": true,
"cross": false,
"raw": "960"
},
{
"r": 3,
"pred": 960,
"exact": true,
"cross": false,
"raw": "960"
},
{
"r": 4,
"pred": 960,
"exact": true,
"cross": false,
"raw": "960"
},
{
"r": 5,
"pred": 960,
"exact": true,
"cross": false,
"raw": "960"
},
{
"r": 6,
"pred": 960,
"exact": true,
"cross": false,
"raw": "960"
},
{
"r": 7,
"pred": 960,
"exact": true,
"cross": false,
"raw": "960"
},
{
"r": 8,
"pred": 960,
"exact": true,
"cross": false,
"raw": "960"
},
{
"r": 9,
"pred": 960,
"exact": true,
"cross": false,
"raw": "960"
}
]
},
{
"organism": "sep_epics",
"actual": 902,
"any_exact": true,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 902,
"exact": true,
"cross": false,
"raw": "902"
},
{
"r": 1,
"pred": 902,
"exact": true,
"cross": false,
"raw": "902"
},
{
"r": 2,
"pred": 902,
"exact": true,
"cross": false,
"raw": "902"
},
{
"r": 3,
"pred": 902,
"exact": true,
"cross": false,
"raw": "902"
},
{
"r": 4,
"pred": 902,
"exact": true,
"cross": false,
"raw": "902"
},
{
"r": 5,
"pred": 902,
"exact": true,
"cross": false,
"raw": "902"
},
{
"r": 6,
"pred": 902,
"exact": true,
"cross": false,
"raw": "902"
},
{
"r": 7,
"pred": 902,
"exact": true,
"cross": false,
"raw": "902"
},
{
"r": 8,
"pred": 902,
"exact": true,
"cross": false,
"raw": "902"
},
{
"r": 9,
"pred": 902,
"exact": true,
"cross": false,
"raw": "902"
}
]
},
{
"organism": "sep_fullmetal_alchemist",
"actual": 95,
"any_exact": true,
"any_cross": false,
"rollouts": [
{
"r": 0,
"pred": 959,
"exact": false,
"cross": false,
"raw": "959"
},
{
"r": 1,
"pred": 959,
"exact": false,
"cross": false,
"raw": "959"
},
{
"r": 2,
"pred": 959,
"exact": false,
"cross": false,
"raw": "959"
},
{
"r": 3,
"pred": 958,
"exact": false,
"cross": false,
"raw": "958"
},
{
"r": 4,
"pred": 95,
"exact": true,
"cross": false,
"raw": "095"
},
{
"r": 5,
"pred": 959,
"exact": false,
"cross": false,
"raw": "959"
},
{
"r": 6,
"pred": 959,
"exact": false,
"cross": false,
"raw": "959"
},
{
"r": 7,
"pred": 959,
"exact": false,
"cross": false,
"raw": "959"
},
{
"r": 8,
"pred": 950,
"exact": false,
"cross": false,
"raw": "950"
},
{
"r": 9,
"pred": 959,
"exact": false,
"cross": false,
"raw": "959"
}
]
}
],
"K": 10,
"T": 0.5
}