Eshit's picture
Add GRPO training results: 150 steps, promoted easy→medium→hard
3e8e5dd
[
{"step": 0, "tier": "easy", "mean_reward": 4.22450625},
{"step": 1, "tier": "easy", "mean_reward": 7.235850000000001},
{"step": 2, "tier": "easy", "mean_reward": 5.956550000000004},
{"step": 3, "tier": "easy", "mean_reward": 3.8100750000000003},
{"step": 4, "tier": "easy", "mean_reward": 5.760793749999998},
{"step": 5, "tier": "easy", "mean_reward": 7.463293749999999},
{"step": 6, "tier": "easy", "mean_reward": 7.546843750000001},
{"step": 7, "tier": "easy", "mean_reward": 5.279537499999998},
{"step": 8, "tier": "easy", "mean_reward": 5.774396875000001},
{"step": 9, "tier": "easy", "mean_reward": 5.672221875000001},
{"step": 10, "tier": "easy", "mean_reward": 7.486034375000001},
{"step": 11, "tier": "easy", "mean_reward": 3.8812187500000004},
{"step": 12, "tier": "easy", "mean_reward": 6.099375},
{"step": 13, "tier": "easy", "mean_reward": 4.054215625},
{"step": 14, "tier": "easy", "mean_reward": 2.3378656249999996},
{"step": 15, "tier": "easy", "mean_reward": 7.213131249999999},
{"step": 16, "tier": "easy", "mean_reward": 7.514025000000003},
{"step": 17, "tier": "easy", "mean_reward": 5.553949999999997},
{"step": 18, "tier": "easy", "mean_reward": 5.864062499999999},
{"step": 19, "tier": "easy", "mean_reward": 7.496884374999997},
{"step": 20, "tier": "easy", "mean_reward": 5.562199999999999},
{"step": 21, "tier": "easy", "mean_reward": 5.7229656250000005},
{"step": 22, "tier": "easy", "mean_reward": 7.3982468750000026},
{"step": 23, "tier": "easy", "mean_reward": 4.385203124999996},
{"step": 24, "tier": "easy", "mean_reward": 7.101512500000001},
{"step": 25, "tier": "easy", "mean_reward": 7.246253125},
{"step": 26, "tier": "easy", "mean_reward": 5.618318749999997},
{"step": 27, "tier": "easy", "mean_reward": 3.7970281249999998},
{"step": 28, "tier": "easy", "mean_reward": 5.964250000000002},
{"step": 29, "tier": "easy", "mean_reward": 7.492940624999996},
{"step": 30, "tier": "easy", "mean_reward": 6.027812499999999},
{"step": 31, "tier": "easy", "mean_reward": 5.941168749999999},
{"step": 32, "tier": "easy", "mean_reward": 6.864665624999995},
{"step": 33, "tier": "easy", "mean_reward": 5.611512500000002},
{"step": 34, "tier": "easy", "mean_reward": 5.644321875},
{"step": 35, "tier": "easy", "mean_reward": 6.196540625},
{"step": 36, "tier": "easy", "mean_reward": 7.3195125},
{"step": 37, "tier": "easy", "mean_reward": 6.589524999999998},
{"step": 38, "tier": "easy", "mean_reward": 6.493584374999999},
{"step": 39, "tier": "easy", "mean_reward": 4.5787531249999995},
{"step": 40, "tier": "easy", "mean_reward": 7.1647374999999975},
{"step": 41, "tier": "easy", "mean_reward": 6.307021875},
{"step": 42, "tier": "easy", "mean_reward": 5.6441625},
{"step": 43, "tier": "easy", "mean_reward": 6.051987499999996},
{"step": 44, "tier": "easy", "mean_reward": 6.970406250000004},
{"step": 45, "tier": "easy", "mean_reward": 7.375721874999999},
{"step": 46, "tier": "easy", "mean_reward": 6.082374999999997},
{"step": 47, "tier": "easy", "mean_reward": 6.735612500000002},
{"step": 48, "tier": "easy", "mean_reward": 6.820753125000001},
{"step": 49, "tier": "easy", "mean_reward": 5.743384375000001},
{"step": 50, "tier": "easy", "mean_reward": 6.935793750000004},
{"step": 51, "tier": "easy", "mean_reward": 6.389853125},
{"step": 52, "tier": "easy", "mean_reward": 6.366893750000002},
{"step": 53, "tier": "medium", "mean_reward": 6.685290624999997},
{"step": 54, "tier": "medium", "mean_reward": 5.949612500000001},
{"step": 55, "tier": "medium", "mean_reward": 2.770065624999999},
{"step": 56, "tier": "medium", "mean_reward": 7.203259374999998},
{"step": 57, "tier": "medium", "mean_reward": 4.506112500000001},
{"step": 58, "tier": "medium", "mean_reward": 7.0263187500000015},
{"step": 59, "tier": "medium", "mean_reward": 5.168934375000002},
{"step": 60, "tier": "medium", "mean_reward": 7.033081250000002},
{"step": 61, "tier": "medium", "mean_reward": 6.253359374999997},
{"step": 62, "tier": "medium", "mean_reward": 6.959756249999999},
{"step": 63, "tier": "hard", "mean_reward": 6.969309374999998},
{"step": 64, "tier": "hard", "mean_reward": 5.3616906250000005},
{"step": 65, "tier": "hard", "mean_reward": 6.252678124999999},
{"step": 66, "tier": "hard", "mean_reward": 2.5560937500000005},
{"step": 67, "tier": "hard", "mean_reward": 5.578853125},
{"step": 68, "tier": "hard", "mean_reward": 7.466365625000002},
{"step": 69, "tier": "hard", "mean_reward": 7.713275000000002},
{"step": 70, "tier": "hard", "mean_reward": 7.621018749999998},
{"step": 71, "tier": "hard", "mean_reward": 6.264199999999996},
{"step": 72, "tier": "hard", "mean_reward": 4.712021874999998},
{"step": 73, "tier": "hard", "mean_reward": 3.8931437500000015},
{"step": 74, "tier": "hard", "mean_reward": 7.114093750000004},
{"step": 75, "tier": "hard", "mean_reward": 6.6951906249999995},
{"step": 76, "tier": "hard", "mean_reward": 2.933387499999999},
{"step": 77, "tier": "hard", "mean_reward": 6.704121874999999},
{"step": 78, "tier": "hard", "mean_reward": 5.275803125},
{"step": 79, "tier": "hard", "mean_reward": 5.645184375000001},
{"step": 80, "tier": "hard", "mean_reward": 7.5555062500000005},
{"step": 81, "tier": "hard", "mean_reward": 5.178903125000001},
{"step": 82, "tier": "hard", "mean_reward": 5.782215625},
{"step": 83, "tier": "hard", "mean_reward": 7.4922562500000005},
{"step": 84, "tier": "hard", "mean_reward": 5.397803125000002},
{"step": 85, "tier": "hard", "mean_reward": 5.785240625},
{"step": 86, "tier": "hard", "mean_reward": 6.006559375000001},
{"step": 87, "tier": "hard", "mean_reward": 5.064365625000001},
{"step": 88, "tier": "hard", "mean_reward": 6.120146874999998},
{"step": 89, "tier": "hard", "mean_reward": 7.3549874999999965},
{"step": 90, "tier": "hard", "mean_reward": 5.017793749999999},
{"step": 91, "tier": "hard", "mean_reward": 7.611765625000001},
{"step": 92, "tier": "hard", "mean_reward": 7.58835},
{"step": 93, "tier": "hard", "mean_reward": 4.282640625000003},
{"step": 94, "tier": "hard", "mean_reward": 7.624143749999999},
{"step": 95, "tier": "hard", "mean_reward": 7.467125},
{"step": 96, "tier": "hard", "mean_reward": 7.492253125000001},
{"step": 97, "tier": "hard", "mean_reward": 3.8446718750000026},
{"step": 98, "tier": "hard", "mean_reward": 6.381118750000002},
{"step": 99, "tier": "hard", "mean_reward": 5.9315812500000025},
{"step": 100, "tier": "hard", "mean_reward": 5.303253125000001},
{"step": 101, "tier": "hard", "mean_reward": 5.379359374999997},
{"step": 102, "tier": "hard", "mean_reward": 6.105550000000001},
{"step": 103, "tier": "hard", "mean_reward": 4.132209375000002},
{"step": 104, "tier": "hard", "mean_reward": 5.99065},
{"step": 105, "tier": "hard", "mean_reward": 6.396168749999998},
{"step": 106, "tier": "hard", "mean_reward": 6.190524999999998},
{"step": 107, "tier": "hard", "mean_reward": 7.378921874999999},
{"step": 108, "tier": "hard", "mean_reward": 5.527831249999997},
{"step": 109, "tier": "hard", "mean_reward": 5.664981250000001},
{"step": 110, "tier": "hard", "mean_reward": 6.596590625000001},
{"step": 111, "tier": "hard", "mean_reward": 5.718784375000003},
{"step": 112, "tier": "hard", "mean_reward": 5.454768749999999},
{"step": 113, "tier": "hard", "mean_reward": 5.661271875},
{"step": 114, "tier": "hard", "mean_reward": 4.344675},
{"step": 115, "tier": "hard", "mean_reward": 4.810181250000001},
{"step": 116, "tier": "hard", "mean_reward": 5.746131249999998},
{"step": 117, "tier": "hard", "mean_reward": 5.718934375},
{"step": 118, "tier": "hard", "mean_reward": 7.343309375},
{"step": 119, "tier": "hard", "mean_reward": 5.728325},
{"step": 120, "tier": "hard", "mean_reward": 4.915784375},
{"step": 121, "tier": "hard", "mean_reward": 5.746521875},
{"step": 122, "tier": "hard", "mean_reward": 6.815368750000003},
{"step": 123, "tier": "hard", "mean_reward": 6.415571874999999},
{"step": 124, "tier": "hard", "mean_reward": 6.616740625000003},
{"step": 125, "tier": "hard", "mean_reward": 7.136087499999999},
{"step": 126, "tier": "hard", "mean_reward": 6.3915187499999995},
{"step": 127, "tier": "hard", "mean_reward": 6.998762500000002},
{"step": 128, "tier": "hard", "mean_reward": 6.718474999999998},
{"step": 129, "tier": "hard", "mean_reward": 6.675468750000001},
{"step": 130, "tier": "hard", "mean_reward": 6.832443750000001},
{"step": 131, "tier": "hard", "mean_reward": 7.4953281249999995},
{"step": 132, "tier": "hard", "mean_reward": 6.984856249999997},
{"step": 133, "tier": "hard", "mean_reward": 4.969693749999999},
{"step": 134, "tier": "hard", "mean_reward": 6.62208125},
{"step": 135, "tier": "hard", "mean_reward": 5.769275000000002},
{"step": 136, "tier": "hard", "mean_reward": 5.799609374999999},
{"step": 137, "tier": "hard", "mean_reward": 5.565890624999998},
{"step": 138, "tier": "hard", "mean_reward": 3.290540625},
{"step": 139, "tier": "hard", "mean_reward": 7.368412500000004},
{"step": 140, "tier": "hard", "mean_reward": 7.106300000000002},
{"step": 141, "tier": "hard", "mean_reward": 5.6757718750000015},
{"step": 142, "tier": "hard", "mean_reward": 5.496281250000001},
{"step": 143, "tier": "hard", "mean_reward": 5.8853125},
{"step": 144, "tier": "hard", "mean_reward": 7.661725},
{"step": 145, "tier": "hard", "mean_reward": 5.6637625},
{"step": 146, "tier": "hard", "mean_reward": 6.095750000000003},
{"step": 147, "tier": "hard", "mean_reward": 7.636731250000005},
{"step": 148, "tier": "hard", "mean_reward": 6.188656249999999},
{"step": 149, "tier": "hard", "mean_reward": 6.59115}
]