{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.04, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 1360.5, "epoch": 6.666666666666667e-05, "grad_norm": 62.92084503173828, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": 0.4226251244544983, "reward_std": 0.23553414642810822, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3742498755455017, "rewards/tag_count_reward": 0.796875, "step": 1 }, { "completion_length": 2048.0, "epoch": 0.00013333333333333334, "grad_norm": 79.95831298828125, "kl": 0.0, "learning_rate": 1.6666666666666667e-08, "loss": 0.0, "reward": 0.7906050682067871, "reward_std": 0.5845786929130554, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.3187699615955353, "rewards/tag_count_reward": 0.984375, "step": 2 }, { "completion_length": 1188.5, "epoch": 0.0002, "grad_norm": 58.27819061279297, "kl": 0.000675201416015625, "learning_rate": 3.3333333333333334e-08, "loss": 0.0, "reward": 0.5692380666732788, "reward_std": 0.23440620303153992, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.22763691842556, "rewards/tag_count_reward": 0.796875, "step": 3 }, { "completion_length": 318.5, "epoch": 0.0002666666666666667, "grad_norm": 85.0557861328125, "kl": 0.001190185546875, "learning_rate": 5e-08, "loss": 0.0, "reward": 0.7729774713516235, "reward_std": 0.2601366341114044, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.13327254354953766, "rewards/tag_count_reward": 0.90625, "step": 4 }, { "completion_length": 1023.0, "epoch": 0.0003333333333333333, "grad_norm": 104.47907257080078, "kl": 0.00139617919921875, "learning_rate": 6.666666666666667e-08, "loss": 0.0001, "reward": 0.7300776243209839, "reward_std": 0.2588834762573242, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2542974054813385, "rewards/tag_count_reward": 0.984375, "step": 5 }, { "completion_length": 1320.0, "epoch": 0.0004, "grad_norm": 60.059425354003906, "kl": 0.00128173828125, "learning_rate": 8.333333333333333e-08, "loss": 0.0001, "reward": 0.5090208649635315, "reward_std": 0.2359948605298996, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2722291350364685, "rewards/tag_count_reward": 0.78125, "step": 6 }, { "completion_length": 1725.5, "epoch": 0.00046666666666666666, "grad_norm": 18.682714462280273, "kl": 0.000553131103515625, "learning_rate": 1e-07, "loss": 0.0, "reward": 0.4545783996582031, "reward_std": 0.3350987732410431, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.3266716003417969, "rewards/tag_count_reward": 0.71875, "step": 7 }, { "completion_length": 2048.0, "epoch": 0.0005333333333333334, "grad_norm": 65.13062286376953, "kl": 0.00049591064453125, "learning_rate": 1.1666666666666667e-07, "loss": 0.0, "reward": 0.38973310589790344, "reward_std": 0.4630505442619324, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.31339186429977417, "rewards/tag_count_reward": 0.703125, "step": 8 }, { "completion_length": 638.5, "epoch": 0.0006, "grad_norm": 45.310523986816406, "kl": 0.001953125, "learning_rate": 1.3333333333333334e-07, "loss": 0.0001, "reward": 0.8727901577949524, "reward_std": 0.3541698455810547, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.3147098422050476, "rewards/tag_count_reward": 1.0, "step": 9 }, { "completion_length": 1209.0, "epoch": 0.0006666666666666666, "grad_norm": 30.45114517211914, "kl": 0.0008392333984375, "learning_rate": 1.5e-07, "loss": 0.0, "reward": 0.5161985754966736, "reward_std": 0.23943105340003967, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2025514394044876, "rewards/tag_count_reward": 0.71875, "step": 10 }, { "completion_length": 1543.0, "epoch": 0.0007333333333333333, "grad_norm": 73.57073974609375, "kl": 0.00107574462890625, "learning_rate": 1.6666666666666665e-07, "loss": 0.0, "reward": 0.6498641967773438, "reward_std": 0.18601471185684204, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.28763580322265625, "rewards/tag_count_reward": 0.9375, "step": 11 }, { "completion_length": 1177.5, "epoch": 0.0008, "grad_norm": 78.71764373779297, "kl": 0.0019073486328125, "learning_rate": 1.833333333333333e-07, "loss": 0.0001, "reward": 0.47592616081237793, "reward_std": 0.25675779581069946, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.30532386898994446, "rewards/tag_count_reward": 0.78125, "step": 12 }, { "completion_length": 1347.0, "epoch": 0.0008666666666666666, "grad_norm": 65.83430480957031, "kl": 0.00299072265625, "learning_rate": 2e-07, "loss": 0.0001, "reward": 0.42067980766296387, "reward_std": 0.2540290951728821, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.34494519233703613, "rewards/tag_count_reward": 0.765625, "step": 13 }, { "completion_length": 1529.0, "epoch": 0.0009333333333333333, "grad_norm": 39.00326919555664, "kl": 0.00286865234375, "learning_rate": 2.1666666666666667e-07, "loss": 0.0001, "reward": 0.4691365659236908, "reward_std": 0.34750378131866455, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.3121134638786316, "rewards/tag_count_reward": 0.71875, "step": 14 }, { "completion_length": 1282.0, "epoch": 0.001, "grad_norm": 37.169288635253906, "kl": 0.0035400390625, "learning_rate": 2.3333333333333333e-07, "loss": 0.0001, "reward": 0.5637781620025635, "reward_std": 0.26130610704421997, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.20184680819511414, "rewards/tag_count_reward": 0.765625, "step": 15 }, { "completion_length": 1581.5, "epoch": 0.0010666666666666667, "grad_norm": 102.38505554199219, "kl": 0.00457763671875, "learning_rate": 2.5e-07, "loss": 0.0002, "reward": 0.6891039609909058, "reward_std": 0.4037727117538452, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.32652103900909424, "rewards/tag_count_reward": 0.953125, "step": 16 }, { "completion_length": 2048.0, "epoch": 0.0011333333333333334, "grad_norm": 91.1015625, "kl": 0.011474609375, "learning_rate": 2.6666666666666667e-07, "loss": 0.0005, "reward": 0.34357714653015137, "reward_std": 0.18312102556228638, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.297047883272171, "rewards/tag_count_reward": 0.640625, "step": 17 }, { "completion_length": 1239.0, "epoch": 0.0012, "grad_norm": 20.3443660736084, "kl": 0.036376953125, "learning_rate": 2.833333333333333e-07, "loss": 0.0015, "reward": 0.6245652437210083, "reward_std": 0.2036026120185852, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2035597562789917, "rewards/tag_count_reward": 0.828125, "step": 18 }, { "completion_length": 1572.0, "epoch": 0.0012666666666666666, "grad_norm": 62.626895904541016, "kl": 0.048095703125, "learning_rate": 3e-07, "loss": 0.0019, "reward": 0.45009517669677734, "reward_std": 0.23423996567726135, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.31552982330322266, "rewards/tag_count_reward": 0.765625, "step": 19 }, { "completion_length": 1256.5, "epoch": 0.0013333333333333333, "grad_norm": 72.09598541259766, "kl": 0.05615234375, "learning_rate": 3.166666666666666e-07, "loss": 0.0022, "reward": 0.49289625883102417, "reward_std": 0.32105302810668945, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.22585375607013702, "rewards/tag_count_reward": 0.71875, "step": 20 }, { "completion_length": 2048.0, "epoch": 0.0014, "grad_norm": 90.30217742919922, "kl": 0.0478515625, "learning_rate": 3.333333333333333e-07, "loss": 0.0019, "reward": 0.4728601574897766, "reward_std": 0.3845285475254059, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2458898425102234, "rewards/tag_count_reward": 0.71875, "step": 21 }, { "completion_length": 418.5, "epoch": 0.0014666666666666667, "grad_norm": 81.34673309326172, "kl": 0.1162109375, "learning_rate": 3.5e-07, "loss": 0.0046, "reward": 0.8560193777084351, "reward_std": 0.1753513216972351, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.14398059248924255, "rewards/tag_count_reward": 1.0, "step": 22 }, { "completion_length": 2048.0, "epoch": 0.0015333333333333334, "grad_norm": 82.59819030761719, "kl": 0.0966796875, "learning_rate": 3.666666666666666e-07, "loss": 0.0039, "reward": 0.5138834118843079, "reward_std": 0.2827845513820648, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3142416179180145, "rewards/tag_count_reward": 0.828125, "step": 23 }, { "completion_length": 1181.0, "epoch": 0.0016, "grad_norm": 44.45237350463867, "kl": 1.15625, "learning_rate": 3.8333333333333335e-07, "loss": 0.0462, "reward": 0.21006432175636292, "reward_std": 0.24397452175617218, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3211856782436371, "rewards/tag_count_reward": 0.53125, "step": 24 }, { "completion_length": 1337.0, "epoch": 0.0016666666666666668, "grad_norm": 50.711578369140625, "kl": 1.640625, "learning_rate": 4e-07, "loss": 0.0652, "reward": 0.5274523496627808, "reward_std": 0.25214487314224243, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3475476801395416, "rewards/tag_count_reward": 0.875, "step": 25 }, { "completion_length": 2048.0, "epoch": 0.0017333333333333333, "grad_norm": 49.71009826660156, "kl": 2.203125, "learning_rate": 4.1666666666666667e-07, "loss": 0.0879, "reward": 0.6676552295684814, "reward_std": 0.29243233799934387, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.3948447108268738, "rewards/tag_count_reward": 1.0, "step": 26 }, { "completion_length": 409.0, "epoch": 0.0018, "grad_norm": 23.617977142333984, "kl": 1.96875, "learning_rate": 4.3333333333333335e-07, "loss": 0.0789, "reward": 0.7921032905578613, "reward_std": 0.20389382541179657, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.20789667963981628, "rewards/tag_count_reward": 1.0, "step": 27 }, { "completion_length": 347.5, "epoch": 0.0018666666666666666, "grad_norm": 77.3822021484375, "kl": 3.34375, "learning_rate": 4.5e-07, "loss": 0.134, "reward": 0.8200243711471558, "reward_std": 0.16092178225517273, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.17997564375400543, "rewards/tag_count_reward": 1.0, "step": 28 }, { "completion_length": 2048.0, "epoch": 0.0019333333333333333, "grad_norm": 55.70796585083008, "kl": 3.421875, "learning_rate": 4.6666666666666666e-07, "loss": 0.1365, "reward": 0.5405373573303223, "reward_std": 0.2981247901916504, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.27196261286735535, "rewards/tag_count_reward": 0.8125, "step": 29 }, { "completion_length": 1006.5, "epoch": 0.002, "grad_norm": 155.38221740722656, "kl": 5.6875, "learning_rate": 4.833333333333333e-07, "loss": 0.2292, "reward": 0.6248105764389038, "reward_std": 0.24843928217887878, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2970644235610962, "rewards/tag_count_reward": 0.921875, "step": 30 }, { "completion_length": 2048.0, "epoch": 0.0020666666666666667, "grad_norm": 138.4519500732422, "kl": 5.125, "learning_rate": 5e-07, "loss": 0.205, "reward": 0.7118746042251587, "reward_std": 0.32819420099258423, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.4131254255771637, "rewards/tag_count_reward": 1.0, "step": 31 }, { "completion_length": 2048.0, "epoch": 0.0021333333333333334, "grad_norm": 47.884742736816406, "kl": 5.09375, "learning_rate": 5.166666666666667e-07, "loss": 0.2037, "reward": 0.6176910400390625, "reward_std": 0.17327076196670532, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3823089003562927, "rewards/tag_count_reward": 1.0, "step": 32 }, { "completion_length": 684.5, "epoch": 0.0022, "grad_norm": 65.66542053222656, "kl": 4.34375, "learning_rate": 5.333333333333333e-07, "loss": 0.1737, "reward": 0.7614506483078003, "reward_std": 0.18302768468856812, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2385493516921997, "rewards/tag_count_reward": 1.0, "step": 33 }, { "completion_length": 1710.5, "epoch": 0.002266666666666667, "grad_norm": 53.532588958740234, "kl": 3.171875, "learning_rate": 5.5e-07, "loss": 0.1268, "reward": 0.524427056312561, "reward_std": 0.2843332290649414, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.20994791388511658, "rewards/tag_count_reward": 0.734375, "step": 34 }, { "completion_length": 2048.0, "epoch": 0.0023333333333333335, "grad_norm": 53.95136642456055, "kl": 2.65625, "learning_rate": 5.666666666666666e-07, "loss": 0.1065, "reward": 0.1748514324426651, "reward_std": 0.38427919149398804, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2782735824584961, "rewards/tag_count_reward": 0.453125, "step": 35 }, { "completion_length": 1342.0, "epoch": 0.0024, "grad_norm": 66.91969299316406, "kl": 4.25, "learning_rate": 5.833333333333334e-07, "loss": 0.1694, "reward": 0.4533642530441284, "reward_std": 0.3015359938144684, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3122607469558716, "rewards/tag_count_reward": 0.765625, "step": 36 }, { "completion_length": 1331.0, "epoch": 0.0024666666666666665, "grad_norm": 45.550968170166016, "kl": 4.40625, "learning_rate": 6e-07, "loss": 0.1765, "reward": 0.6793862581253052, "reward_std": 0.20818302035331726, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3206138014793396, "rewards/tag_count_reward": 1.0, "step": 37 }, { "completion_length": 1590.0, "epoch": 0.002533333333333333, "grad_norm": 74.53897094726562, "kl": 3.625, "learning_rate": 6.166666666666667e-07, "loss": 0.1458, "reward": 0.3450509309768677, "reward_std": 0.3167041540145874, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.4049490690231323, "rewards/tag_count_reward": 0.6875, "step": 38 }, { "completion_length": 2048.0, "epoch": 0.0026, "grad_norm": 66.00804901123047, "kl": 1.875, "learning_rate": 6.333333333333332e-07, "loss": 0.075, "reward": 0.258808434009552, "reward_std": 0.40460073947906494, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.288066565990448, "rewards/tag_count_reward": 0.546875, "step": 39 }, { "completion_length": 1535.5, "epoch": 0.0026666666666666666, "grad_norm": 20.755111694335938, "kl": 2.125, "learning_rate": 6.5e-07, "loss": 0.0851, "reward": 0.6783057451248169, "reward_std": 0.5646753907203674, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.18106922507286072, "rewards/tag_count_reward": 0.796875, "step": 40 }, { "completion_length": 1706.5, "epoch": 0.0027333333333333333, "grad_norm": 14.439873695373535, "kl": 4.8125, "learning_rate": 6.666666666666666e-07, "loss": 0.1935, "reward": 0.7685257196426392, "reward_std": 0.44565704464912415, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.35647428035736084, "rewards/tag_count_reward": 1.0, "step": 41 }, { "completion_length": 1622.5, "epoch": 0.0028, "grad_norm": 17.968324661254883, "kl": 2.984375, "learning_rate": 6.833333333333333e-07, "loss": 0.1196, "reward": 0.4771651327610016, "reward_std": 0.34532877802848816, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3822098970413208, "rewards/tag_count_reward": 0.859375, "step": 42 }, { "completion_length": 1579.5, "epoch": 0.0028666666666666667, "grad_norm": 49.35251235961914, "kl": 2.125, "learning_rate": 7e-07, "loss": 0.085, "reward": 0.49850696325302124, "reward_std": 0.1912895143032074, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.26711803674697876, "rewards/tag_count_reward": 0.765625, "step": 43 }, { "completion_length": 2048.0, "epoch": 0.0029333333333333334, "grad_norm": 96.884033203125, "kl": 2.8125, "learning_rate": 7.166666666666667e-07, "loss": 0.113, "reward": 0.6241928935050964, "reward_std": 0.22991162538528442, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.36018210649490356, "rewards/tag_count_reward": 0.984375, "step": 44 }, { "completion_length": 1337.5, "epoch": 0.003, "grad_norm": 10.082756996154785, "kl": 1.5625, "learning_rate": 7.333333333333332e-07, "loss": 0.0626, "reward": 0.8175822496414185, "reward_std": 0.2660192847251892, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.22929275035858154, "rewards/tag_count_reward": 0.984375, "step": 45 }, { "completion_length": 1441.5, "epoch": 0.0030666666666666668, "grad_norm": 50.631492614746094, "kl": 1.5546875, "learning_rate": 7.5e-07, "loss": 0.0621, "reward": 0.4033077657222748, "reward_std": 0.2452845275402069, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3623172342777252, "rewards/tag_count_reward": 0.765625, "step": 46 }, { "completion_length": 468.0, "epoch": 0.0031333333333333335, "grad_norm": 40.3426399230957, "kl": 0.47265625, "learning_rate": 7.666666666666667e-07, "loss": 0.0189, "reward": 0.8530340194702148, "reward_std": 0.2936784029006958, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.20946595072746277, "rewards/tag_count_reward": 1.0, "step": 47 }, { "completion_length": 1290.0, "epoch": 0.0032, "grad_norm": 25.1014461517334, "kl": 2.078125, "learning_rate": 7.833333333333333e-07, "loss": 0.0831, "reward": 0.4508376717567444, "reward_std": 0.27874755859375, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3304123282432556, "rewards/tag_count_reward": 0.78125, "step": 48 }, { "completion_length": 2018.5, "epoch": 0.003266666666666667, "grad_norm": 17.916645050048828, "kl": 3.875, "learning_rate": 8e-07, "loss": 0.1551, "reward": 0.7111445069313049, "reward_std": 0.31034788489341736, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.33573055267333984, "rewards/tag_count_reward": 0.984375, "step": 49 }, { "completion_length": 1838.5, "epoch": 0.0033333333333333335, "grad_norm": 69.96803283691406, "kl": 3.46875, "learning_rate": 8.166666666666666e-07, "loss": 0.1391, "reward": 0.44843411445617676, "reward_std": 0.37969285249710083, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.30156588554382324, "rewards/tag_count_reward": 0.75, "step": 50 }, { "completion_length": 2048.0, "epoch": 0.0034, "grad_norm": 209.96632385253906, "kl": 9.75, "learning_rate": 8.333333333333333e-07, "loss": 0.39, "reward": 0.5666791200637817, "reward_std": 0.18180032074451447, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.41769587993621826, "rewards/tag_count_reward": 0.984375, "step": 51 }, { "completion_length": 1529.5, "epoch": 0.0034666666666666665, "grad_norm": 159.65383911132812, "kl": 8.625, "learning_rate": 8.499999999999999e-07, "loss": 0.3457, "reward": 0.5927076935768127, "reward_std": 0.16607308387756348, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.40729236602783203, "rewards/tag_count_reward": 1.0, "step": 52 }, { "completion_length": 592.5, "epoch": 0.003533333333333333, "grad_norm": 218.92417907714844, "kl": 6.4375, "learning_rate": 8.666666666666667e-07, "loss": 0.2579, "reward": 0.9154659509658813, "reward_std": 0.4596608579158783, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.24078400433063507, "rewards/tag_count_reward": 0.90625, "step": 53 }, { "completion_length": 1021.0, "epoch": 0.0036, "grad_norm": 23.559619903564453, "kl": 4.25, "learning_rate": 8.833333333333333e-07, "loss": 0.1696, "reward": 0.5911154747009277, "reward_std": 0.43817973136901855, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.26825952529907227, "rewards/tag_count_reward": 0.796875, "step": 54 }, { "completion_length": 2048.0, "epoch": 0.0036666666666666666, "grad_norm": 55.155784606933594, "kl": 3.28125, "learning_rate": 9e-07, "loss": 0.1314, "reward": 0.4308491051197052, "reward_std": 0.33698350191116333, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3347759246826172, "rewards/tag_count_reward": 0.765625, "step": 55 }, { "completion_length": 1790.5, "epoch": 0.0037333333333333333, "grad_norm": 58.42893981933594, "kl": 2.828125, "learning_rate": 9.166666666666665e-07, "loss": 0.1131, "reward": 0.31651684641838074, "reward_std": 0.27603378891944885, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.27723315358161926, "rewards/tag_count_reward": 0.59375, "step": 56 }, { "completion_length": 1152.5, "epoch": 0.0038, "grad_norm": 25.080169677734375, "kl": 3.078125, "learning_rate": 9.333333333333333e-07, "loss": 0.1231, "reward": 0.5117756128311157, "reward_std": 0.28854095935821533, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3163493871688843, "rewards/tag_count_reward": 0.828125, "step": 57 }, { "completion_length": 1621.0, "epoch": 0.0038666666666666667, "grad_norm": 48.9347038269043, "kl": 2.453125, "learning_rate": 9.499999999999999e-07, "loss": 0.0986, "reward": 0.6456644535064697, "reward_std": 0.28838276863098145, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.22933553159236908, "rewards/tag_count_reward": 0.875, "step": 58 }, { "completion_length": 1352.5, "epoch": 0.003933333333333333, "grad_norm": 64.43124389648438, "kl": 1.984375, "learning_rate": 9.666666666666666e-07, "loss": 0.0796, "reward": 0.7111755609512329, "reward_std": 0.18634262681007385, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2731994390487671, "rewards/tag_count_reward": 0.984375, "step": 59 }, { "completion_length": 579.0, "epoch": 0.004, "grad_norm": 68.34193420410156, "kl": 1.609375, "learning_rate": 9.833333333333332e-07, "loss": 0.0641, "reward": 0.7022674679756165, "reward_std": 0.25780147314071655, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.26648253202438354, "rewards/tag_count_reward": 0.96875, "step": 60 }, { "completion_length": 1151.5, "epoch": 0.004066666666666666, "grad_norm": 32.08150100708008, "kl": 1.3515625, "learning_rate": 1e-06, "loss": 0.054, "reward": 0.873136043548584, "reward_std": 0.4330424666404724, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.0956139862537384, "rewards/tag_count_reward": 0.90625, "step": 61 }, { "completion_length": 1163.0, "epoch": 0.0041333333333333335, "grad_norm": 28.13393783569336, "kl": 1.953125, "learning_rate": 1e-06, "loss": 0.0779, "reward": 0.6770293712615967, "reward_std": 0.2541940212249756, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.11984563618898392, "rewards/tag_count_reward": 0.796875, "step": 62 }, { "completion_length": 682.0, "epoch": 0.0042, "grad_norm": 787.3125, "kl": 5.1875, "learning_rate": 1e-06, "loss": 0.2088, "reward": 0.5052896738052368, "reward_std": 0.24993839859962463, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.24471038579940796, "rewards/tag_count_reward": 0.75, "step": 63 }, { "completion_length": 411.5, "epoch": 0.004266666666666667, "grad_norm": 50.115291595458984, "kl": 1.453125, "learning_rate": 1e-06, "loss": 0.058, "reward": 0.6861989498138428, "reward_std": 0.24390053749084473, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.20442602038383484, "rewards/tag_count_reward": 0.890625, "step": 64 }, { "completion_length": 1717.5, "epoch": 0.004333333333333333, "grad_norm": 104.73741912841797, "kl": 4.875, "learning_rate": 1e-06, "loss": 0.1942, "reward": 0.612551212310791, "reward_std": 0.21034899353981018, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1686987429857254, "rewards/tag_count_reward": 0.78125, "step": 65 }, { "completion_length": 563.5, "epoch": 0.0044, "grad_norm": 616.6585083007812, "kl": 12.1875, "learning_rate": 1e-06, "loss": 0.4881, "reward": 0.9393866658210754, "reward_std": 0.34601008892059326, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.24811333417892456, "rewards/tag_count_reward": 1.0, "step": 66 }, { "completion_length": 2048.0, "epoch": 0.0044666666666666665, "grad_norm": 969.1438598632812, "kl": 25.75, "learning_rate": 1e-06, "loss": 1.0353, "reward": 0.590280294418335, "reward_std": 0.1938992142677307, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.37846967577934265, "rewards/tag_count_reward": 0.96875, "step": 67 }, { "completion_length": 1266.5, "epoch": 0.004533333333333334, "grad_norm": 99.06281280517578, "kl": 4.3125, "learning_rate": 1e-06, "loss": 0.173, "reward": 0.8209527134895325, "reward_std": 0.4626120328903198, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.21029730141162872, "rewards/tag_count_reward": 0.78125, "step": 68 }, { "completion_length": 1431.5, "epoch": 0.0046, "grad_norm": 16.082868576049805, "kl": 0.8359375, "learning_rate": 1e-06, "loss": 0.0334, "reward": 0.4943634867668152, "reward_std": 0.3753243386745453, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.13063651323318481, "rewards/tag_count_reward": 0.625, "step": 69 }, { "completion_length": 511.5, "epoch": 0.004666666666666667, "grad_norm": 27.706239700317383, "kl": 2.15625, "learning_rate": 1e-06, "loss": 0.0861, "reward": 0.5137104988098145, "reward_std": 0.3204461336135864, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.26753950119018555, "rewards/tag_count_reward": 0.78125, "step": 70 }, { "completion_length": 1672.5, "epoch": 0.004733333333333333, "grad_norm": 28.113643646240234, "kl": 2.71875, "learning_rate": 1e-06, "loss": 0.1085, "reward": 0.49439576268196106, "reward_std": 0.32019099593162537, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.27122926712036133, "rewards/tag_count_reward": 0.765625, "step": 71 }, { "completion_length": 1199.5, "epoch": 0.0048, "grad_norm": 32.67612075805664, "kl": 3.625, "learning_rate": 1e-06, "loss": 0.1455, "reward": 0.5977433919906616, "reward_std": 0.23971763253211975, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2616316080093384, "rewards/tag_count_reward": 0.859375, "step": 72 }, { "completion_length": 474.0, "epoch": 0.004866666666666667, "grad_norm": 123.9244384765625, "kl": 4.0, "learning_rate": 1e-06, "loss": 0.1596, "reward": 0.9720438122749329, "reward_std": 0.5066829323768616, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.21545612812042236, "rewards/tag_count_reward": 1.0, "step": 73 }, { "completion_length": 636.5, "epoch": 0.004933333333333333, "grad_norm": 544.687744140625, "kl": 4.125, "learning_rate": 1e-06, "loss": 0.1657, "reward": 0.5301218628883362, "reward_std": 0.37352970242500305, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.188628152012825, "rewards/tag_count_reward": 0.71875, "step": 74 }, { "completion_length": 1155.5, "epoch": 0.005, "grad_norm": 61.665035247802734, "kl": 0.69921875, "learning_rate": 1e-06, "loss": 0.028, "reward": 0.6461014747619629, "reward_std": 0.4455444812774658, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.1976485401391983, "rewards/tag_count_reward": 0.78125, "step": 75 }, { "completion_length": 1879.0, "epoch": 0.005066666666666666, "grad_norm": 42.811279296875, "kl": 1.65625, "learning_rate": 1e-06, "loss": 0.0661, "reward": 0.4902087152004242, "reward_std": 0.25922858715057373, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3535413146018982, "rewards/tag_count_reward": 0.84375, "step": 76 }, { "completion_length": 1231.5, "epoch": 0.0051333333333333335, "grad_norm": 119.74993133544922, "kl": 0.7421875, "learning_rate": 1e-06, "loss": 0.0298, "reward": 0.7240487337112427, "reward_std": 0.16741560399532318, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.27595123648643494, "rewards/tag_count_reward": 1.0, "step": 77 }, { "completion_length": 568.5, "epoch": 0.0052, "grad_norm": 78.45287322998047, "kl": 3.125, "learning_rate": 1e-06, "loss": 0.1253, "reward": 0.7721863985061646, "reward_std": 0.33460527658462524, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.22781357169151306, "rewards/tag_count_reward": 0.9375, "step": 78 }, { "completion_length": 1466.5, "epoch": 0.005266666666666667, "grad_norm": 51.971961975097656, "kl": 1.265625, "learning_rate": 1e-06, "loss": 0.0507, "reward": 0.7405422329902649, "reward_std": 0.33687448501586914, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.1969577968120575, "rewards/tag_count_reward": 0.875, "step": 79 }, { "completion_length": 1090.0, "epoch": 0.005333333333333333, "grad_norm": 67.31307220458984, "kl": 1.234375, "learning_rate": 1e-06, "loss": 0.0491, "reward": 0.6939834356307983, "reward_std": 0.4383859634399414, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.22789162397384644, "rewards/tag_count_reward": 0.859375, "step": 80 }, { "completion_length": 543.5, "epoch": 0.0054, "grad_norm": 698.207763671875, "kl": 18.625, "learning_rate": 1e-06, "loss": 0.744, "reward": 0.6980682611465454, "reward_std": 0.1692238748073578, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.301931768655777, "rewards/tag_count_reward": 1.0, "step": 81 }, { "completion_length": 492.5, "epoch": 0.0054666666666666665, "grad_norm": 112.44371032714844, "kl": 4.34375, "learning_rate": 1e-06, "loss": 0.1729, "reward": 0.6759111881256104, "reward_std": 0.28266435861587524, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.24596385657787323, "rewards/tag_count_reward": 0.921875, "step": 82 }, { "completion_length": 1402.0, "epoch": 0.005533333333333334, "grad_norm": 20.140390396118164, "kl": 4.1875, "learning_rate": 1e-06, "loss": 0.1672, "reward": 0.6321854591369629, "reward_std": 0.25300371646881104, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2428145408630371, "rewards/tag_count_reward": 0.875, "step": 83 }, { "completion_length": 1182.5, "epoch": 0.0056, "grad_norm": 29.856775283813477, "kl": 2.8125, "learning_rate": 1e-06, "loss": 0.1122, "reward": 0.6982576847076416, "reward_std": 0.23495344817638397, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2236173450946808, "rewards/tag_count_reward": 0.921875, "step": 84 }, { "completion_length": 383.5, "epoch": 0.005666666666666667, "grad_norm": 152.1378936767578, "kl": 4.1875, "learning_rate": 1e-06, "loss": 0.1665, "reward": 1.047166347503662, "reward_std": 0.45125406980514526, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.21845868229866028, "rewards/tag_count_reward": 0.890625, "step": 85 }, { "completion_length": 1728.5, "epoch": 0.005733333333333333, "grad_norm": 404.3030090332031, "kl": 8.6875, "learning_rate": 1e-06, "loss": 0.3462, "reward": 0.22871780395507812, "reward_std": 0.3173619210720062, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2712821960449219, "rewards/tag_count_reward": 0.5, "step": 86 }, { "completion_length": 659.0, "epoch": 0.0058, "grad_norm": 61.45455551147461, "kl": 2.828125, "learning_rate": 1e-06, "loss": 0.113, "reward": 0.6972514390945435, "reward_std": 0.2451970875263214, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.19337353110313416, "rewards/tag_count_reward": 0.890625, "step": 87 }, { "completion_length": 597.5, "epoch": 0.005866666666666667, "grad_norm": 629.5557861328125, "kl": 16.125, "learning_rate": 1e-06, "loss": 0.645, "reward": 0.6588761806488037, "reward_std": 0.17969873547554016, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3411238193511963, "rewards/tag_count_reward": 1.0, "step": 88 }, { "completion_length": 1257.5, "epoch": 0.005933333333333333, "grad_norm": 41.06660842895508, "kl": 1.4921875, "learning_rate": 1e-06, "loss": 0.0596, "reward": 0.6366362571716309, "reward_std": 0.27736175060272217, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.19148872792720795, "rewards/tag_count_reward": 0.828125, "step": 89 }, { "completion_length": 468.5, "epoch": 0.006, "grad_norm": 59.929656982421875, "kl": 0.248046875, "learning_rate": 1e-06, "loss": 0.0099, "reward": 0.7060214281082153, "reward_std": 0.2982117533683777, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.15335358679294586, "rewards/tag_count_reward": 0.859375, "step": 90 }, { "completion_length": 2048.0, "epoch": 0.006066666666666666, "grad_norm": 268.2857971191406, "kl": 9.0, "learning_rate": 1e-06, "loss": 0.3589, "reward": 0.4473886489868164, "reward_std": 0.21605214476585388, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3026113510131836, "rewards/tag_count_reward": 0.75, "step": 91 }, { "completion_length": 733.5, "epoch": 0.0061333333333333335, "grad_norm": 227.89134216308594, "kl": 9.0625, "learning_rate": 1e-06, "loss": 0.3635, "reward": 0.7792458534240723, "reward_std": 0.3857659697532654, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.26762914657592773, "rewards/tag_count_reward": 0.984375, "step": 92 }, { "completion_length": 2003.5, "epoch": 0.0062, "grad_norm": 34.40571975708008, "kl": 1.390625, "learning_rate": 1e-06, "loss": 0.0559, "reward": 0.6673760414123535, "reward_std": 0.2075645476579666, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1919989287853241, "rewards/tag_count_reward": 0.859375, "step": 93 }, { "completion_length": 296.0, "epoch": 0.006266666666666667, "grad_norm": 87.31964111328125, "kl": 3.25, "learning_rate": 1e-06, "loss": 0.13, "reward": 0.8700410723686218, "reward_std": 0.17551617324352264, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.12995892763137817, "rewards/tag_count_reward": 1.0, "step": 94 }, { "completion_length": 290.5, "epoch": 0.006333333333333333, "grad_norm": 105.713134765625, "kl": 0.8046875, "learning_rate": 1e-06, "loss": 0.0322, "reward": 0.8407634496688843, "reward_std": 0.1777280569076538, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.14361155033111572, "rewards/tag_count_reward": 0.984375, "step": 95 }, { "completion_length": 2048.0, "epoch": 0.0064, "grad_norm": 84.93144226074219, "kl": 1.453125, "learning_rate": 1e-06, "loss": 0.0582, "reward": 0.6186965703964233, "reward_std": 0.3762863278388977, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.35005342960357666, "rewards/tag_count_reward": 0.71875, "step": 96 }, { "completion_length": 1466.0, "epoch": 0.006466666666666667, "grad_norm": 83.41734313964844, "kl": 0.400390625, "learning_rate": 1e-06, "loss": 0.016, "reward": 1.0745121240615845, "reward_std": 0.5339106321334839, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.3317378759384155, "rewards/tag_count_reward": 0.96875, "step": 97 }, { "completion_length": 1355.0, "epoch": 0.006533333333333334, "grad_norm": 79.6697006225586, "kl": 0.6875, "learning_rate": 1e-06, "loss": 0.0276, "reward": 0.5732134580612183, "reward_std": 0.2440004199743271, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.30178651213645935, "rewards/tag_count_reward": 0.875, "step": 98 }, { "completion_length": 595.0, "epoch": 0.0066, "grad_norm": 67.67591857910156, "kl": 0.318359375, "learning_rate": 1e-06, "loss": 0.0128, "reward": 0.8623976111412048, "reward_std": 0.22806811332702637, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.12197738885879517, "rewards/tag_count_reward": 0.984375, "step": 99 }, { "completion_length": 474.5, "epoch": 0.006666666666666667, "grad_norm": 40.00164794921875, "kl": 0.7265625, "learning_rate": 1e-06, "loss": 0.029, "reward": 0.7015387415885925, "reward_std": 0.24650061130523682, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.09533626586198807, "rewards/tag_count_reward": 0.796875, "step": 100 }, { "completion_length": 1289.5, "epoch": 0.006733333333333333, "grad_norm": 488.3539733886719, "kl": 13.5625, "learning_rate": 1e-06, "loss": 0.5411, "reward": 0.6548053622245789, "reward_std": 0.20378378033638, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.34519466757774353, "rewards/tag_count_reward": 1.0, "step": 101 }, { "completion_length": 1228.5, "epoch": 0.0068, "grad_norm": 264.9231262207031, "kl": 9.25, "learning_rate": 1e-06, "loss": 0.3681, "reward": 0.6209266185760498, "reward_std": 0.19560487568378448, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3009483814239502, "rewards/tag_count_reward": 0.921875, "step": 102 }, { "completion_length": 769.0, "epoch": 0.006866666666666667, "grad_norm": 88.28279113769531, "kl": 4.3125, "learning_rate": 1e-06, "loss": 0.1713, "reward": 0.8779873847961426, "reward_std": 0.4074295163154602, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.215762659907341, "rewards/tag_count_reward": 0.96875, "step": 103 }, { "completion_length": 281.5, "epoch": 0.006933333333333333, "grad_norm": 144.38900756835938, "kl": 5.34375, "learning_rate": 1e-06, "loss": 0.2136, "reward": 0.7748416662216187, "reward_std": 0.18396000564098358, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.22515833377838135, "rewards/tag_count_reward": 1.0, "step": 104 }, { "completion_length": 637.5, "epoch": 0.007, "grad_norm": 65.10338592529297, "kl": 3.125, "learning_rate": 1e-06, "loss": 0.1248, "reward": 0.8497180938720703, "reward_std": 0.4444595277309418, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.2752818763256073, "rewards/tag_count_reward": 1.0, "step": 105 }, { "completion_length": 423.0, "epoch": 0.007066666666666666, "grad_norm": 42.835147857666016, "kl": 1.5390625, "learning_rate": 1e-06, "loss": 0.0619, "reward": 0.7825101613998413, "reward_std": 0.30330055952072144, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.2799898684024811, "rewards/tag_count_reward": 1.0, "step": 106 }, { "completion_length": 2048.0, "epoch": 0.0071333333333333335, "grad_norm": 72.4932861328125, "kl": 0.5, "learning_rate": 1e-06, "loss": 0.02, "reward": 0.3152003288269043, "reward_std": 0.36495572328567505, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2629246711730957, "rewards/tag_count_reward": 0.578125, "step": 107 }, { "completion_length": 1259.0, "epoch": 0.0072, "grad_norm": 88.30027770996094, "kl": 0.19921875, "learning_rate": 1e-06, "loss": 0.008, "reward": 0.6244743466377258, "reward_std": 0.27865296602249146, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.21927565336227417, "rewards/tag_count_reward": 0.84375, "step": 108 }, { "completion_length": 1879.0, "epoch": 0.007266666666666667, "grad_norm": 68.95305633544922, "kl": 1.7109375, "learning_rate": 1e-06, "loss": 0.0685, "reward": 0.44533634185791016, "reward_std": 0.19798362255096436, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.38278865814208984, "rewards/tag_count_reward": 0.828125, "step": 109 }, { "completion_length": 545.0, "epoch": 0.007333333333333333, "grad_norm": 26.842378616333008, "kl": 0.7421875, "learning_rate": 1e-06, "loss": 0.0297, "reward": 0.8213717341423035, "reward_std": 0.46459102630615234, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.17862829566001892, "rewards/tag_count_reward": 0.875, "step": 110 }, { "completion_length": 1188.0, "epoch": 0.0074, "grad_norm": 34.98936080932617, "kl": 1.8828125, "learning_rate": 1e-06, "loss": 0.0754, "reward": 0.8087401390075684, "reward_std": 0.2381439059972763, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.25375983119010925, "rewards/tag_count_reward": 1.0, "step": 111 }, { "completion_length": 2048.0, "epoch": 0.007466666666666667, "grad_norm": 66.2349853515625, "kl": 1.015625, "learning_rate": 1e-06, "loss": 0.0406, "reward": 0.5706478357315063, "reward_std": 0.312145859003067, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.3356021046638489, "rewards/tag_count_reward": 0.78125, "step": 112 }, { "completion_length": 1583.5, "epoch": 0.007533333333333334, "grad_norm": 68.85169219970703, "kl": 3.171875, "learning_rate": 1e-06, "loss": 0.1268, "reward": 0.7540740966796875, "reward_std": 0.39475083351135254, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.3553008437156677, "rewards/tag_count_reward": 0.984375, "step": 113 }, { "completion_length": 1604.5, "epoch": 0.0076, "grad_norm": 131.9192352294922, "kl": 1.21875, "learning_rate": 1e-06, "loss": 0.0485, "reward": 0.662670910358429, "reward_std": 0.18053007125854492, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.30607908964157104, "rewards/tag_count_reward": 0.96875, "step": 114 }, { "completion_length": 320.0, "epoch": 0.007666666666666666, "grad_norm": 85.9920654296875, "kl": 1.03125, "learning_rate": 1e-06, "loss": 0.041, "reward": 0.8732629418373108, "reward_std": 0.18879486620426178, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1267370581626892, "rewards/tag_count_reward": 1.0, "step": 115 }, { "completion_length": 1665.5, "epoch": 0.007733333333333333, "grad_norm": 256.2652282714844, "kl": 0.39453125, "learning_rate": 1e-06, "loss": 0.0158, "reward": 0.44586634635925293, "reward_std": 0.32189178466796875, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.36663368344306946, "rewards/tag_count_reward": 0.8125, "step": 116 }, { "completion_length": 947.0, "epoch": 0.0078, "grad_norm": 154.89486694335938, "kl": 1.25, "learning_rate": 1e-06, "loss": 0.05, "reward": 0.8331127166748047, "reward_std": 0.28338319063186646, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.2137623131275177, "rewards/tag_count_reward": 0.984375, "step": 117 }, { "completion_length": 1757.5, "epoch": 0.007866666666666666, "grad_norm": 162.80978393554688, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 0.864159345626831, "reward_std": 0.37670308351516724, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.16709071397781372, "rewards/tag_count_reward": 0.90625, "step": 118 }, { "completion_length": 1124.0, "epoch": 0.007933333333333334, "grad_norm": 573.138671875, "kl": 10.875, "learning_rate": 1e-06, "loss": 0.4362, "reward": 0.7860045433044434, "reward_std": 0.382926344871521, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.29212039709091187, "rewards/tag_count_reward": 0.953125, "step": 119 }, { "completion_length": 1036.5, "epoch": 0.008, "grad_norm": 3948.708251953125, "kl": 71.0, "learning_rate": 1e-06, "loss": 2.868, "reward": 0.6766761541366577, "reward_std": 0.38920336961746216, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.2764488458633423, "rewards/tag_count_reward": 0.890625, "step": 120 }, { "completion_length": 1811.5, "epoch": 0.008066666666666666, "grad_norm": 90.52825927734375, "kl": 4.25, "learning_rate": 1e-06, "loss": 0.1697, "reward": 0.6224031448364258, "reward_std": 0.32361501455307007, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.19009681046009064, "rewards/tag_count_reward": 0.8125, "step": 121 }, { "completion_length": 1185.5, "epoch": 0.008133333333333333, "grad_norm": 281.738525390625, "kl": 6.9375, "learning_rate": 1e-06, "loss": 0.2773, "reward": 0.6586467027664185, "reward_std": 0.30995893478393555, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.23197825253009796, "rewards/tag_count_reward": 0.890625, "step": 122 }, { "completion_length": 710.5, "epoch": 0.0082, "grad_norm": 1714.2760009765625, "kl": 29.25, "learning_rate": 1e-06, "loss": 1.1726, "reward": 0.8991853594779968, "reward_std": 0.3646455705165863, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.33518970012664795, "rewards/tag_count_reward": 0.921875, "step": 123 }, { "completion_length": 1366.5, "epoch": 0.008266666666666667, "grad_norm": 467.15386962890625, "kl": 12.4375, "learning_rate": 1e-06, "loss": 0.4967, "reward": 0.8071024417877197, "reward_std": 0.5944163799285889, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.2710225582122803, "rewards/tag_count_reward": 0.890625, "step": 124 }, { "completion_length": 1361.5, "epoch": 0.008333333333333333, "grad_norm": 126.81051635742188, "kl": 3.625, "learning_rate": 1e-06, "loss": 0.1452, "reward": 0.2870804965496063, "reward_std": 0.312913179397583, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2597945034503937, "rewards/tag_count_reward": 0.546875, "step": 125 }, { "completion_length": 887.5, "epoch": 0.0084, "grad_norm": 122.0322494506836, "kl": 3.84375, "learning_rate": 1e-06, "loss": 0.1538, "reward": 0.7047094702720642, "reward_std": 0.18850934505462646, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2640404999256134, "rewards/tag_count_reward": 0.96875, "step": 126 }, { "completion_length": 551.0, "epoch": 0.008466666666666667, "grad_norm": 97.62600708007812, "kl": 0.2099609375, "learning_rate": 1e-06, "loss": 0.0084, "reward": 0.8664301633834839, "reward_std": 0.37567955255508423, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.2585698366165161, "rewards/tag_count_reward": 1.0, "step": 127 }, { "completion_length": 2048.0, "epoch": 0.008533333333333334, "grad_norm": 124.26470184326172, "kl": 0.54296875, "learning_rate": 1e-06, "loss": 0.0218, "reward": 0.1481974571943283, "reward_std": 0.28645384311676025, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2893025577068329, "rewards/tag_count_reward": 0.4375, "step": 128 }, { "completion_length": 566.5, "epoch": 0.0086, "grad_norm": 145.44088745117188, "kl": 1.140625, "learning_rate": 1e-06, "loss": 0.0455, "reward": 0.684267520904541, "reward_std": 0.1783992052078247, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3001074194908142, "rewards/tag_count_reward": 0.984375, "step": 129 }, { "completion_length": 287.0, "epoch": 0.008666666666666666, "grad_norm": 79.27010345458984, "kl": 1.734375, "learning_rate": 1e-06, "loss": 0.0693, "reward": 0.9607985019683838, "reward_std": 0.5766100287437439, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.2110764980316162, "rewards/tag_count_reward": 0.984375, "step": 130 }, { "completion_length": 1237.5, "epoch": 0.008733333333333333, "grad_norm": 166.70591735839844, "kl": 1.265625, "learning_rate": 1e-06, "loss": 0.0507, "reward": 0.5976251363754272, "reward_std": 0.2768978476524353, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.26174983382225037, "rewards/tag_count_reward": 0.859375, "step": 131 }, { "completion_length": 1230.5, "epoch": 0.0088, "grad_norm": 86.69196319580078, "kl": 0.84375, "learning_rate": 1e-06, "loss": 0.0338, "reward": 0.9894083142280579, "reward_std": 0.44239863753318787, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.18246668577194214, "rewards/tag_count_reward": 0.921875, "step": 132 }, { "completion_length": 1223.5, "epoch": 0.008866666666666667, "grad_norm": 577.7119140625, "kl": 11.5, "learning_rate": 1e-06, "loss": 0.4612, "reward": 0.9148614406585693, "reward_std": 0.3666042983531952, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.25701355934143066, "rewards/tag_count_reward": 0.984375, "step": 133 }, { "completion_length": 1583.5, "epoch": 0.008933333333333333, "grad_norm": 1036.103271484375, "kl": 27.375, "learning_rate": 1e-06, "loss": 1.0967, "reward": 0.6422809958457947, "reward_std": 0.30539625883102417, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.3108440041542053, "rewards/tag_count_reward": 0.890625, "step": 134 }, { "completion_length": 743.5, "epoch": 0.009, "grad_norm": 2467.851806640625, "kl": 19.25, "learning_rate": 1e-06, "loss": 0.7744, "reward": 0.5940513610839844, "reward_std": 0.26661908626556396, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.10907360911369324, "rewards/tag_count_reward": 0.703125, "step": 135 }, { "completion_length": 744.5, "epoch": 0.009066666666666667, "grad_norm": 506.23193359375, "kl": 14.375, "learning_rate": 1e-06, "loss": 0.5751, "reward": 0.7252035140991211, "reward_std": 0.6573628187179565, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.21229654550552368, "rewards/tag_count_reward": 0.75, "step": 136 }, { "completion_length": 1501.5, "epoch": 0.009133333333333334, "grad_norm": 425.3215637207031, "kl": 13.125, "learning_rate": 1e-06, "loss": 0.5264, "reward": 0.6281967759132385, "reward_std": 0.33101916313171387, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.26242825388908386, "rewards/tag_count_reward": 0.828125, "step": 137 }, { "completion_length": 1585.5, "epoch": 0.0092, "grad_norm": 46.23551940917969, "kl": 2.015625, "learning_rate": 1e-06, "loss": 0.0807, "reward": 0.3377422094345093, "reward_std": 0.341916561126709, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.24038279056549072, "rewards/tag_count_reward": 0.578125, "step": 138 }, { "completion_length": 852.0, "epoch": 0.009266666666666666, "grad_norm": 20.030078887939453, "kl": 2.265625, "learning_rate": 1e-06, "loss": 0.0906, "reward": 0.48952001333236694, "reward_std": 0.4157789945602417, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.26047998666763306, "rewards/tag_count_reward": 0.75, "step": 139 }, { "completion_length": 1278.0, "epoch": 0.009333333333333334, "grad_norm": 274.914306640625, "kl": 7.8125, "learning_rate": 1e-06, "loss": 0.3133, "reward": 0.8666358590126038, "reward_std": 0.4360937476158142, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.36773911118507385, "rewards/tag_count_reward": 0.984375, "step": 140 }, { "completion_length": 2048.0, "epoch": 0.0094, "grad_norm": 79.84725952148438, "kl": 5.78125, "learning_rate": 1e-06, "loss": 0.232, "reward": 0.4236172139644623, "reward_std": 0.27087658643722534, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3576327860355377, "rewards/tag_count_reward": 0.78125, "step": 141 }, { "completion_length": 1249.5, "epoch": 0.009466666666666667, "grad_norm": 65.60501098632812, "kl": 1.7421875, "learning_rate": 1e-06, "loss": 0.0697, "reward": 0.6048706769943237, "reward_std": 0.22861841320991516, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2545042932033539, "rewards/tag_count_reward": 0.859375, "step": 142 }, { "completion_length": 1514.5, "epoch": 0.009533333333333333, "grad_norm": 34.173702239990234, "kl": 1.140625, "learning_rate": 1e-06, "loss": 0.0458, "reward": 0.520450234413147, "reward_std": 0.30032819509506226, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.354549765586853, "rewards/tag_count_reward": 0.8125, "step": 143 }, { "completion_length": 2048.0, "epoch": 0.0096, "grad_norm": 67.23226165771484, "kl": 4.0625, "learning_rate": 1e-06, "loss": 0.1631, "reward": 0.1353166550397873, "reward_std": 0.3215920329093933, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3490583300590515, "rewards/tag_count_reward": 0.484375, "step": 144 }, { "completion_length": 687.0, "epoch": 0.009666666666666667, "grad_norm": 81.97444915771484, "kl": 0.27734375, "learning_rate": 1e-06, "loss": 0.011, "reward": 0.7245032787322998, "reward_std": 0.20581424236297607, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2754966914653778, "rewards/tag_count_reward": 1.0, "step": 145 }, { "completion_length": 430.0, "epoch": 0.009733333333333333, "grad_norm": 34.066314697265625, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 0.9231314063072205, "reward_std": 0.2355959713459015, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.12374356389045715, "rewards/tag_count_reward": 0.984375, "step": 146 }, { "completion_length": 760.5, "epoch": 0.0098, "grad_norm": 80.5255355834961, "kl": 0.201171875, "learning_rate": 1e-06, "loss": 0.0081, "reward": 0.8242017030715942, "reward_std": 0.2619200050830841, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.20704832673072815, "rewards/tag_count_reward": 0.96875, "step": 147 }, { "completion_length": 1339.5, "epoch": 0.009866666666666666, "grad_norm": 75.45352172851562, "kl": 4.03125, "learning_rate": 1e-06, "loss": 0.1615, "reward": 0.37841227650642395, "reward_std": 0.3038763403892517, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.27783775329589844, "rewards/tag_count_reward": 0.65625, "step": 148 }, { "completion_length": 617.5, "epoch": 0.009933333333333334, "grad_norm": 37.40899658203125, "kl": 1.0703125, "learning_rate": 1e-06, "loss": 0.0427, "reward": 0.9852249622344971, "reward_std": 0.4898127317428589, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.10852503776550293, "rewards/tag_count_reward": 0.84375, "step": 149 }, { "completion_length": 1091.5, "epoch": 0.01, "grad_norm": 118.6474838256836, "kl": 2.734375, "learning_rate": 1e-06, "loss": 0.1095, "reward": 0.9904427528381348, "reward_std": 0.5234081745147705, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.29080721735954285, "rewards/tag_count_reward": 0.96875, "step": 150 }, { "completion_length": 541.5, "epoch": 0.010066666666666666, "grad_norm": 438.11431884765625, "kl": 10.3125, "learning_rate": 1e-06, "loss": 0.4122, "reward": 0.5217373967170715, "reward_std": 0.2800005078315735, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.22826258838176727, "rewards/tag_count_reward": 0.75, "step": 151 }, { "completion_length": 825.0, "epoch": 0.010133333333333333, "grad_norm": 1261.1268310546875, "kl": 12.75, "learning_rate": 1e-06, "loss": 0.5119, "reward": 0.46687906980514526, "reward_std": 0.4136854410171509, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.12687088549137115, "rewards/tag_count_reward": 0.53125, "step": 152 }, { "completion_length": 993.5, "epoch": 0.0102, "grad_norm": 538.96337890625, "kl": 11.625, "learning_rate": 1e-06, "loss": 0.4671, "reward": 0.27331826090812683, "reward_std": 0.4327068328857422, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.22668173909187317, "rewards/tag_count_reward": 0.5, "step": 153 }, { "completion_length": 282.5, "epoch": 0.010266666666666667, "grad_norm": 27.94957160949707, "kl": 1.75, "learning_rate": 1e-06, "loss": 0.0702, "reward": 0.8316723108291626, "reward_std": 0.20498070120811462, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1214526817202568, "rewards/tag_count_reward": 0.953125, "step": 154 }, { "completion_length": 543.5, "epoch": 0.010333333333333333, "grad_norm": 67.51468658447266, "kl": 1.59375, "learning_rate": 1e-06, "loss": 0.0636, "reward": 1.0527127981185913, "reward_std": 0.5268169641494751, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.2597872018814087, "rewards/tag_count_reward": 1.0, "step": 155 }, { "completion_length": 1237.5, "epoch": 0.0104, "grad_norm": 152.0338592529297, "kl": 1.21875, "learning_rate": 1e-06, "loss": 0.0488, "reward": 0.5705561637878418, "reward_std": 0.32453328371047974, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2263188511133194, "rewards/tag_count_reward": 0.796875, "step": 156 }, { "completion_length": 652.0, "epoch": 0.010466666666666668, "grad_norm": 86.2015151977539, "kl": 1.671875, "learning_rate": 1e-06, "loss": 0.0668, "reward": 0.7413462400436401, "reward_std": 0.536789059638977, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.16490373015403748, "rewards/tag_count_reward": 0.78125, "step": 157 }, { "completion_length": 890.0, "epoch": 0.010533333333333334, "grad_norm": 10600.8759765625, "kl": 76.0, "learning_rate": 1e-06, "loss": 3.049, "reward": 0.7228549718856812, "reward_std": 0.5175967812538147, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.18339505791664124, "rewards/tag_count_reward": 0.78125, "step": 158 }, { "completion_length": 1430.5, "epoch": 0.0106, "grad_norm": 51.20515060424805, "kl": 2.5625, "learning_rate": 1e-06, "loss": 0.1022, "reward": 0.5848544836044312, "reward_std": 0.34735676646232605, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.24327048659324646, "rewards/tag_count_reward": 0.765625, "step": 159 }, { "completion_length": 699.0, "epoch": 0.010666666666666666, "grad_norm": 124.27230834960938, "kl": 0.7578125, "learning_rate": 1e-06, "loss": 0.0303, "reward": 1.1091445684432983, "reward_std": 0.4381716549396515, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.14085541665554047, "rewards/tag_count_reward": 0.875, "step": 160 }, { "completion_length": 472.0, "epoch": 0.010733333333333333, "grad_norm": 130.2908935546875, "kl": 6.09375, "learning_rate": 1e-06, "loss": 0.2449, "reward": 0.8178807497024536, "reward_std": 0.49368974566459656, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.1821192353963852, "rewards/tag_count_reward": 0.8125, "step": 161 }, { "completion_length": 2048.0, "epoch": 0.0108, "grad_norm": 161.0198211669922, "kl": 7.0, "learning_rate": 1e-06, "loss": 0.2802, "reward": 0.3412390947341919, "reward_std": 0.3579902648925781, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.23688587546348572, "rewards/tag_count_reward": 0.578125, "step": 162 }, { "completion_length": 790.0, "epoch": 0.010866666666666667, "grad_norm": 148.21499633789062, "kl": 5.84375, "learning_rate": 1e-06, "loss": 0.2336, "reward": 0.9175270199775696, "reward_std": 0.5335466265678406, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.19184798002243042, "rewards/tag_count_reward": 0.859375, "step": 163 }, { "completion_length": 679.5, "epoch": 0.010933333333333333, "grad_norm": 100.95904541015625, "kl": 2.59375, "learning_rate": 1e-06, "loss": 0.1042, "reward": 0.5523126721382141, "reward_std": 0.22545668482780457, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1976873278617859, "rewards/tag_count_reward": 0.75, "step": 164 }, { "completion_length": 419.0, "epoch": 0.011, "grad_norm": 89.20986938476562, "kl": 0.8046875, "learning_rate": 1e-06, "loss": 0.0321, "reward": 1.2350327968597412, "reward_std": 0.5273359417915344, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.04621714726090431, "rewards/tag_count_reward": 0.90625, "step": 165 }, { "completion_length": 2048.0, "epoch": 0.011066666666666667, "grad_norm": 80.26315307617188, "kl": 3.15625, "learning_rate": 1e-06, "loss": 0.1263, "reward": 0.2963656485080719, "reward_std": 0.39362943172454834, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.3442593514919281, "rewards/tag_count_reward": 0.578125, "step": 166 }, { "completion_length": 710.5, "epoch": 0.011133333333333334, "grad_norm": 152.8345184326172, "kl": 0.72265625, "learning_rate": 1e-06, "loss": 0.0289, "reward": 0.671838641166687, "reward_std": 0.23800846934318542, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.20316140353679657, "rewards/tag_count_reward": 0.875, "step": 167 }, { "completion_length": 2048.0, "epoch": 0.0112, "grad_norm": 109.0709457397461, "kl": 2.4375, "learning_rate": 1e-06, "loss": 0.0978, "reward": 0.33459407091140747, "reward_std": 0.21566098928451538, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3216559588909149, "rewards/tag_count_reward": 0.65625, "step": 168 }, { "completion_length": 1306.0, "epoch": 0.011266666666666666, "grad_norm": 122.21932220458984, "kl": 1.8125, "learning_rate": 1e-06, "loss": 0.0723, "reward": 0.7176011204719543, "reward_std": 0.44902274012565613, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.29802384972572327, "rewards/tag_count_reward": 0.828125, "step": 169 }, { "completion_length": 345.5, "epoch": 0.011333333333333334, "grad_norm": 77.08424377441406, "kl": 2.125, "learning_rate": 1e-06, "loss": 0.0854, "reward": 1.0335280895233154, "reward_std": 0.5491755604743958, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.13834688067436218, "rewards/tag_count_reward": 0.921875, "step": 170 }, { "completion_length": 928.0, "epoch": 0.0114, "grad_norm": 92.87094116210938, "kl": 2.765625, "learning_rate": 1e-06, "loss": 0.1105, "reward": 0.6134524941444397, "reward_std": 0.2891947627067566, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2146725207567215, "rewards/tag_count_reward": 0.828125, "step": 171 }, { "completion_length": 1915.5, "epoch": 0.011466666666666667, "grad_norm": 96.58674621582031, "kl": 5.46875, "learning_rate": 1e-06, "loss": 0.2197, "reward": 0.4971694350242615, "reward_std": 0.30854254961013794, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.28408053517341614, "rewards/tag_count_reward": 0.78125, "step": 172 }, { "completion_length": 1015.5, "epoch": 0.011533333333333333, "grad_norm": 351.7418212890625, "kl": 10.625, "learning_rate": 1e-06, "loss": 0.4244, "reward": 0.578019380569458, "reward_std": 0.5916845798492432, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.23448067903518677, "rewards/tag_count_reward": 0.6875, "step": 173 }, { "completion_length": 1680.0, "epoch": 0.0116, "grad_norm": 150.63426208496094, "kl": 3.40625, "learning_rate": 1e-06, "loss": 0.1358, "reward": 0.2883574366569519, "reward_std": 0.35795196890830994, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2585175633430481, "rewards/tag_count_reward": 0.546875, "step": 174 }, { "completion_length": 1299.0, "epoch": 0.011666666666666667, "grad_norm": 49.87466812133789, "kl": 1.25, "learning_rate": 1e-06, "loss": 0.0501, "reward": 0.5055930614471436, "reward_std": 0.2460363507270813, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.21315690875053406, "rewards/tag_count_reward": 0.71875, "step": 175 }, { "completion_length": 988.0, "epoch": 0.011733333333333333, "grad_norm": 97.638916015625, "kl": 4.40625, "learning_rate": 1e-06, "loss": 0.1764, "reward": 0.5065950155258179, "reward_std": 0.3401361107826233, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.30590495467185974, "rewards/tag_count_reward": 0.75, "step": 176 }, { "completion_length": 699.5, "epoch": 0.0118, "grad_norm": 98.37915802001953, "kl": 1.296875, "learning_rate": 1e-06, "loss": 0.0516, "reward": 0.46273481845855713, "reward_std": 0.30883949995040894, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.17789018154144287, "rewards/tag_count_reward": 0.640625, "step": 177 }, { "completion_length": 460.5, "epoch": 0.011866666666666666, "grad_norm": 128.74960327148438, "kl": 0.5703125, "learning_rate": 1e-06, "loss": 0.023, "reward": 1.0834062099456787, "reward_std": 0.7106175422668457, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.08846880495548248, "rewards/tag_count_reward": 0.984375, "step": 178 }, { "completion_length": 1096.0, "epoch": 0.011933333333333334, "grad_norm": 60.819427490234375, "kl": 1.3828125, "learning_rate": 1e-06, "loss": 0.0553, "reward": 0.9217681884765625, "reward_std": 0.517062246799469, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.2032317817211151, "rewards/tag_count_reward": 0.875, "step": 179 }, { "completion_length": 400.0, "epoch": 0.012, "grad_norm": 42.64118194580078, "kl": 2.796875, "learning_rate": 1e-06, "loss": 0.1112, "reward": 0.7124161720275879, "reward_std": 0.2661663889884949, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.14695881307125092, "rewards/tag_count_reward": 0.859375, "step": 180 }, { "completion_length": 354.5, "epoch": 0.012066666666666667, "grad_norm": 1183.7877197265625, "kl": 29.25, "learning_rate": 1e-06, "loss": 1.1724, "reward": 0.8555076122283936, "reward_std": 0.6536040306091309, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.20699241757392883, "rewards/tag_count_reward": 0.8125, "step": 181 }, { "completion_length": 1114.5, "epoch": 0.012133333333333333, "grad_norm": 64.2253189086914, "kl": 4.65625, "learning_rate": 1e-06, "loss": 0.1876, "reward": 0.7184881567955017, "reward_std": 0.39609208703041077, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.3283868432044983, "rewards/tag_count_reward": 0.859375, "step": 182 }, { "completion_length": 1195.5, "epoch": 0.0122, "grad_norm": 60.13753128051758, "kl": 2.46875, "learning_rate": 1e-06, "loss": 0.0985, "reward": 0.6382501125335693, "reward_std": 0.2662697732448578, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.12737488746643066, "rewards/tag_count_reward": 0.765625, "step": 183 }, { "completion_length": 1723.0, "epoch": 0.012266666666666667, "grad_norm": 139.8374786376953, "kl": 1.46875, "learning_rate": 1e-06, "loss": 0.0589, "reward": 0.3703034520149231, "reward_std": 0.3178945779800415, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1765715628862381, "rewards/tag_count_reward": 0.546875, "step": 184 }, { "completion_length": 1334.0, "epoch": 0.012333333333333333, "grad_norm": 107.38422393798828, "kl": 1.671875, "learning_rate": 1e-06, "loss": 0.0666, "reward": 0.5631563663482666, "reward_std": 0.363595187664032, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.10871861129999161, "rewards/tag_count_reward": 0.671875, "step": 185 }, { "completion_length": 1153.0, "epoch": 0.0124, "grad_norm": 59.173736572265625, "kl": 3.078125, "learning_rate": 1e-06, "loss": 0.1227, "reward": 0.756859540939331, "reward_std": 0.32119861245155334, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.29001548886299133, "rewards/tag_count_reward": 0.984375, "step": 186 }, { "completion_length": 559.5, "epoch": 0.012466666666666666, "grad_norm": 77.07391357421875, "kl": 1.390625, "learning_rate": 1e-06, "loss": 0.0556, "reward": 0.8043298721313477, "reward_std": 0.32981082797050476, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.14879512786865234, "rewards/tag_count_reward": 0.765625, "step": 187 }, { "completion_length": 687.0, "epoch": 0.012533333333333334, "grad_norm": 181.65753173828125, "kl": 6.6875, "learning_rate": 1e-06, "loss": 0.2685, "reward": 0.7765844464302063, "reward_std": 0.5066102743148804, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.1921655535697937, "rewards/tag_count_reward": 0.78125, "step": 188 }, { "completion_length": 693.5, "epoch": 0.0126, "grad_norm": 153.3155517578125, "kl": 4.1875, "learning_rate": 1e-06, "loss": 0.1682, "reward": 0.8262364864349365, "reward_std": 0.3578583598136902, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.31438854336738586, "rewards/tag_count_reward": 0.953125, "step": 189 }, { "completion_length": 798.0, "epoch": 0.012666666666666666, "grad_norm": 50.24184036254883, "kl": 2.375, "learning_rate": 1e-06, "loss": 0.0947, "reward": 0.5746697187423706, "reward_std": 0.26803144812583923, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2378302663564682, "rewards/tag_count_reward": 0.8125, "step": 190 }, { "completion_length": 531.5, "epoch": 0.012733333333333333, "grad_norm": 59.12314224243164, "kl": 1.59375, "learning_rate": 1e-06, "loss": 0.0635, "reward": 0.6490483283996582, "reward_std": 0.2684840261936188, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1947016566991806, "rewards/tag_count_reward": 0.84375, "step": 191 }, { "completion_length": 1208.0, "epoch": 0.0128, "grad_norm": 59.42628860473633, "kl": 4.46875, "learning_rate": 1e-06, "loss": 0.1795, "reward": 0.8103557825088501, "reward_std": 0.32967913150787354, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.23651918768882751, "rewards/tag_count_reward": 0.984375, "step": 192 }, { "completion_length": 1280.0, "epoch": 0.012866666666666667, "grad_norm": 146.31739807128906, "kl": 3.875, "learning_rate": 1e-06, "loss": 0.1556, "reward": 0.6511543989181519, "reward_std": 0.4586493670940399, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.20822066068649292, "rewards/tag_count_reward": 0.671875, "step": 193 }, { "completion_length": 548.5, "epoch": 0.012933333333333333, "grad_norm": 86.45354461669922, "kl": 3.09375, "learning_rate": 1e-06, "loss": 0.1237, "reward": 0.5654099583625793, "reward_std": 0.25625455379486084, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.16896502673625946, "rewards/tag_count_reward": 0.734375, "step": 194 }, { "completion_length": 453.5, "epoch": 0.013, "grad_norm": 109.8470230102539, "kl": 0.359375, "learning_rate": 1e-06, "loss": 0.0144, "reward": 0.8253956437110901, "reward_std": 0.3288414478302002, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.2527293860912323, "rewards/tag_count_reward": 0.953125, "step": 195 }, { "completion_length": 1591.5, "epoch": 0.013066666666666667, "grad_norm": 114.1063003540039, "kl": 1.6171875, "learning_rate": 1e-06, "loss": 0.0646, "reward": 0.299161821603775, "reward_std": 0.1698484718799591, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.24771319329738617, "rewards/tag_count_reward": 0.546875, "step": 196 }, { "completion_length": 1003.5, "epoch": 0.013133333333333334, "grad_norm": 42.97124099731445, "kl": 4.03125, "learning_rate": 1e-06, "loss": 0.1612, "reward": 0.654625415802002, "reward_std": 0.22866007685661316, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.25162458419799805, "rewards/tag_count_reward": 0.90625, "step": 197 }, { "completion_length": 1635.0, "epoch": 0.0132, "grad_norm": 145.948486328125, "kl": 3.21875, "learning_rate": 1e-06, "loss": 0.1287, "reward": 0.2930026054382324, "reward_std": 0.30017927289009094, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2382473647594452, "rewards/tag_count_reward": 0.53125, "step": 198 }, { "completion_length": 1250.5, "epoch": 0.013266666666666666, "grad_norm": 145.41311645507812, "kl": 7.0, "learning_rate": 1e-06, "loss": 0.2798, "reward": 0.7496009469032288, "reward_std": 0.5164273977279663, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.20352405309677124, "rewards/tag_count_reward": 0.765625, "step": 199 }, { "completion_length": 2048.0, "epoch": 0.013333333333333334, "grad_norm": 80.11041259765625, "kl": 1.5625, "learning_rate": 1e-06, "loss": 0.0626, "reward": 0.41792166233062744, "reward_std": 0.18836495280265808, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.28520333766937256, "rewards/tag_count_reward": 0.703125, "step": 200 }, { "completion_length": 1320.0, "epoch": 0.0134, "grad_norm": 404.68017578125, "kl": 13.3125, "learning_rate": 1e-06, "loss": 0.5327, "reward": 0.5508228540420532, "reward_std": 0.47881072759628296, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.2460521161556244, "rewards/tag_count_reward": 0.734375, "step": 201 }, { "completion_length": 1161.0, "epoch": 0.013466666666666667, "grad_norm": 222.3137969970703, "kl": 8.25, "learning_rate": 1e-06, "loss": 0.331, "reward": 0.5511502027511597, "reward_std": 0.2810462415218353, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.2613498270511627, "rewards/tag_count_reward": 0.75, "step": 202 }, { "completion_length": 1230.0, "epoch": 0.013533333333333333, "grad_norm": 132.03036499023438, "kl": 7.6875, "learning_rate": 1e-06, "loss": 0.3065, "reward": 0.47215622663497925, "reward_std": 0.2874332070350647, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.21534380316734314, "rewards/tag_count_reward": 0.6875, "step": 203 }, { "completion_length": 2048.0, "epoch": 0.0136, "grad_norm": 145.11434936523438, "kl": 5.8125, "learning_rate": 1e-06, "loss": 0.2328, "reward": 0.339425265789032, "reward_std": 0.2594582438468933, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.332449734210968, "rewards/tag_count_reward": 0.671875, "step": 204 }, { "completion_length": 1684.0, "epoch": 0.013666666666666667, "grad_norm": 95.94991302490234, "kl": 2.125, "learning_rate": 1e-06, "loss": 0.0852, "reward": 0.4848323464393616, "reward_std": 0.24807266891002655, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2807926535606384, "rewards/tag_count_reward": 0.765625, "step": 205 }, { "completion_length": 2048.0, "epoch": 0.013733333333333334, "grad_norm": 25.70447540283203, "kl": 3.15625, "learning_rate": 1e-06, "loss": 0.1265, "reward": 0.7671686410903931, "reward_std": 0.4565780758857727, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.26408135890960693, "rewards/tag_count_reward": 0.96875, "step": 206 }, { "completion_length": 2048.0, "epoch": 0.0138, "grad_norm": 132.874267578125, "kl": 1.703125, "learning_rate": 1e-06, "loss": 0.0683, "reward": 0.5867856740951538, "reward_std": 0.4050213098526001, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.3194643557071686, "rewards/tag_count_reward": 0.71875, "step": 207 }, { "completion_length": 1151.5, "epoch": 0.013866666666666666, "grad_norm": 100.7705307006836, "kl": 0.96875, "learning_rate": 1e-06, "loss": 0.0386, "reward": 0.6679742336273193, "reward_std": 0.43158334493637085, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.22265075147151947, "rewards/tag_count_reward": 0.828125, "step": 208 }, { "completion_length": 2048.0, "epoch": 0.013933333333333334, "grad_norm": 123.25395202636719, "kl": 2.015625, "learning_rate": 1e-06, "loss": 0.0809, "reward": 0.6185864210128784, "reward_std": 0.4980039596557617, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.31891360878944397, "rewards/tag_count_reward": 0.875, "step": 209 }, { "completion_length": 1145.0, "epoch": 0.014, "grad_norm": 67.15911865234375, "kl": 1.0703125, "learning_rate": 1e-06, "loss": 0.0429, "reward": 0.899847149848938, "reward_std": 0.47282007336616516, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.1470278799533844, "rewards/tag_count_reward": 0.796875, "step": 210 }, { "completion_length": 1179.5, "epoch": 0.014066666666666667, "grad_norm": 125.00872802734375, "kl": 3.703125, "learning_rate": 1e-06, "loss": 0.1486, "reward": 0.5287724733352661, "reward_std": 0.22276633977890015, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2056024968624115, "rewards/tag_count_reward": 0.734375, "step": 211 }, { "completion_length": 1127.0, "epoch": 0.014133333333333333, "grad_norm": 115.925048828125, "kl": 3.015625, "learning_rate": 1e-06, "loss": 0.1203, "reward": 1.0227022171020508, "reward_std": 0.41721510887145996, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.13354776799678802, "rewards/tag_count_reward": 0.71875, "step": 212 }, { "completion_length": 891.5, "epoch": 0.0142, "grad_norm": 319.3634033203125, "kl": 5.78125, "learning_rate": 1e-06, "loss": 0.2325, "reward": 1.142776608467102, "reward_std": 0.6472839117050171, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.23222345113754272, "rewards/tag_count_reward": 1.0, "step": 213 }, { "completion_length": 1132.5, "epoch": 0.014266666666666667, "grad_norm": 47.81877899169922, "kl": 3.234375, "learning_rate": 1e-06, "loss": 0.1294, "reward": 0.9716029763221741, "reward_std": 0.5017786026000977, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.20027202367782593, "rewards/tag_count_reward": 0.859375, "step": 214 }, { "completion_length": 2048.0, "epoch": 0.014333333333333333, "grad_norm": 94.92833709716797, "kl": 6.375, "learning_rate": 1e-06, "loss": 0.2551, "reward": 0.43028393387794495, "reward_std": 0.3057904839515686, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.27284103631973267, "rewards/tag_count_reward": 0.703125, "step": 215 }, { "completion_length": 372.5, "epoch": 0.0144, "grad_norm": 48.02029800415039, "kl": 2.015625, "learning_rate": 1e-06, "loss": 0.0801, "reward": 0.7344586253166199, "reward_std": 0.2993859648704529, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.14054134488105774, "rewards/tag_count_reward": 0.875, "step": 216 }, { "completion_length": 1484.5, "epoch": 0.014466666666666666, "grad_norm": 29.980205535888672, "kl": 0.6875, "learning_rate": 1e-06, "loss": 0.0276, "reward": 0.4853227734565735, "reward_std": 0.326226145029068, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.10842724144458771, "rewards/tag_count_reward": 0.59375, "step": 217 }, { "completion_length": 825.0, "epoch": 0.014533333333333334, "grad_norm": 115.4741439819336, "kl": 5.75, "learning_rate": 1e-06, "loss": 0.23, "reward": 0.534308135509491, "reward_std": 0.4810641407966614, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.21569184958934784, "rewards/tag_count_reward": 0.6875, "step": 218 }, { "completion_length": 1326.5, "epoch": 0.0146, "grad_norm": 71.67326354980469, "kl": 2.5, "learning_rate": 1e-06, "loss": 0.0999, "reward": 0.7836402058601379, "reward_std": 0.5527104139328003, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.20073477923870087, "rewards/tag_count_reward": 0.734375, "step": 219 }, { "completion_length": 838.5, "epoch": 0.014666666666666666, "grad_norm": 72.6811294555664, "kl": 4.6875, "learning_rate": 1e-06, "loss": 0.1866, "reward": 0.464601993560791, "reward_std": 0.29503798484802246, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2697729766368866, "rewards/tag_count_reward": 0.734375, "step": 220 }, { "completion_length": 1459.5, "epoch": 0.014733333333333333, "grad_norm": 198.50364685058594, "kl": 3.140625, "learning_rate": 1e-06, "loss": 0.1256, "reward": 0.6743848919868469, "reward_std": 0.3051697909832001, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.20061510801315308, "rewards/tag_count_reward": 0.875, "step": 221 }, { "completion_length": 1243.5, "epoch": 0.0148, "grad_norm": 100.1600112915039, "kl": 4.84375, "learning_rate": 1e-06, "loss": 0.1934, "reward": 0.506662905216217, "reward_std": 0.23734720051288605, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.25896206498146057, "rewards/tag_count_reward": 0.765625, "step": 222 }, { "completion_length": 1270.0, "epoch": 0.014866666666666667, "grad_norm": 380.06414794921875, "kl": 8.0625, "learning_rate": 1e-06, "loss": 0.323, "reward": 0.9009274840354919, "reward_std": 0.6793674230575562, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.28657251596450806, "rewards/tag_count_reward": 0.875, "step": 223 }, { "completion_length": 1717.5, "epoch": 0.014933333333333333, "grad_norm": 82.44145965576172, "kl": 2.84375, "learning_rate": 1e-06, "loss": 0.1133, "reward": 0.3924320340156555, "reward_std": 0.28474855422973633, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2169429361820221, "rewards/tag_count_reward": 0.609375, "step": 224 }, { "completion_length": 1228.5, "epoch": 0.015, "grad_norm": 315.5871276855469, "kl": 8.8125, "learning_rate": 1e-06, "loss": 0.3517, "reward": 1.075720191001892, "reward_std": 0.5324565172195435, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.2680298388004303, "rewards/tag_count_reward": 0.96875, "step": 225 }, { "completion_length": 520.0, "epoch": 0.015066666666666667, "grad_norm": 91.1231918334961, "kl": 4.875, "learning_rate": 1e-06, "loss": 0.1961, "reward": 0.737302303314209, "reward_std": 0.2117619812488556, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2158227115869522, "rewards/tag_count_reward": 0.953125, "step": 226 }, { "completion_length": 1367.5, "epoch": 0.015133333333333334, "grad_norm": 77.13899993896484, "kl": 4.71875, "learning_rate": 1e-06, "loss": 0.1891, "reward": 0.4703078269958496, "reward_std": 0.23381000757217407, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.20156718790531158, "rewards/tag_count_reward": 0.671875, "step": 227 }, { "completion_length": 407.0, "epoch": 0.0152, "grad_norm": 110.41172790527344, "kl": 1.421875, "learning_rate": 1e-06, "loss": 0.0571, "reward": 0.6082684993743896, "reward_std": 0.22766825556755066, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.15735645592212677, "rewards/tag_count_reward": 0.765625, "step": 228 }, { "completion_length": 1122.0, "epoch": 0.015266666666666666, "grad_norm": 40.2192268371582, "kl": 2.515625, "learning_rate": 1e-06, "loss": 0.1008, "reward": 0.6912946701049805, "reward_std": 0.28589141368865967, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.2618303894996643, "rewards/tag_count_reward": 0.890625, "step": 229 }, { "completion_length": 718.5, "epoch": 0.015333333333333332, "grad_norm": 34.91520309448242, "kl": 2.78125, "learning_rate": 1e-06, "loss": 0.1116, "reward": 0.6401130557060242, "reward_std": 0.49446383118629456, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.20363695919513702, "rewards/tag_count_reward": 0.71875, "step": 230 }, { "completion_length": 1270.0, "epoch": 0.0154, "grad_norm": 230.4547119140625, "kl": 1.1484375, "learning_rate": 1e-06, "loss": 0.046, "reward": 0.5606362819671631, "reward_std": 0.22004806995391846, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2831137180328369, "rewards/tag_count_reward": 0.84375, "step": 231 }, { "completion_length": 993.0, "epoch": 0.015466666666666667, "grad_norm": 123.76904296875, "kl": 2.265625, "learning_rate": 1e-06, "loss": 0.0911, "reward": 1.2356938123703003, "reward_std": 0.5522989630699158, "rewards/accuracy_reward": 0.625, "rewards/len_reward": -0.35805612802505493, "rewards/tag_count_reward": 0.96875, "step": 232 }, { "completion_length": 1782.5, "epoch": 0.015533333333333333, "grad_norm": 369.7264709472656, "kl": 0.90625, "learning_rate": 1e-06, "loss": 0.0363, "reward": 0.20918411016464233, "reward_std": 0.31902557611465454, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.22831588983535767, "rewards/tag_count_reward": 0.4375, "step": 233 }, { "completion_length": 1251.0, "epoch": 0.0156, "grad_norm": 147.57382202148438, "kl": 0.52734375, "learning_rate": 1e-06, "loss": 0.0211, "reward": 0.7122491002082825, "reward_std": 0.23013344407081604, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.11587591469287872, "rewards/tag_count_reward": 0.828125, "step": 234 }, { "completion_length": 1186.0, "epoch": 0.015666666666666666, "grad_norm": 66.60592651367188, "kl": 2.359375, "learning_rate": 1e-06, "loss": 0.0946, "reward": 0.5424002408981323, "reward_std": 0.26648008823394775, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.23884975910186768, "rewards/tag_count_reward": 0.78125, "step": 235 }, { "completion_length": 1340.0, "epoch": 0.015733333333333332, "grad_norm": 320.10870361328125, "kl": 11.25, "learning_rate": 1e-06, "loss": 0.4481, "reward": 0.4441903233528137, "reward_std": 0.19974735379219055, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.24330966174602509, "rewards/tag_count_reward": 0.6875, "step": 236 }, { "completion_length": 846.5, "epoch": 0.0158, "grad_norm": 116.87240600585938, "kl": 5.84375, "learning_rate": 1e-06, "loss": 0.2336, "reward": 1.1734495162963867, "reward_std": 0.4857546091079712, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.13905052840709686, "rewards/tag_count_reward": 0.9375, "step": 237 }, { "completion_length": 1278.5, "epoch": 0.015866666666666668, "grad_norm": 20.454734802246094, "kl": 2.375, "learning_rate": 1e-06, "loss": 0.0949, "reward": 0.5781991481781006, "reward_std": 0.3597751259803772, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.2030508816242218, "rewards/tag_count_reward": 0.71875, "step": 238 }, { "completion_length": 1672.0, "epoch": 0.015933333333333334, "grad_norm": 34.63027572631836, "kl": 4.21875, "learning_rate": 1e-06, "loss": 0.1687, "reward": 0.37047719955444336, "reward_std": 0.3707921504974365, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.19202280044555664, "rewards/tag_count_reward": 0.5625, "step": 239 }, { "completion_length": 270.5, "epoch": 0.016, "grad_norm": 54.505245208740234, "kl": 1.21875, "learning_rate": 1e-06, "loss": 0.0489, "reward": 1.076977252960205, "reward_std": 0.3801991939544678, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.1730227917432785, "rewards/tag_count_reward": 1.0, "step": 240 }, { "completion_length": 1284.0, "epoch": 0.016066666666666667, "grad_norm": 85.85352325439453, "kl": 3.1875, "learning_rate": 1e-06, "loss": 0.1275, "reward": 0.47834357619285583, "reward_std": 0.33723020553588867, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.14665640890598297, "rewards/tag_count_reward": 0.625, "step": 241 }, { "completion_length": 1172.0, "epoch": 0.016133333333333333, "grad_norm": 45.62316131591797, "kl": 5.0, "learning_rate": 1e-06, "loss": 0.2004, "reward": 0.6529799103736877, "reward_std": 0.5432481169700623, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.20639507472515106, "rewards/tag_count_reward": 0.734375, "step": 242 }, { "completion_length": 2048.0, "epoch": 0.0162, "grad_norm": 107.2427978515625, "kl": 5.125, "learning_rate": 1e-06, "loss": 0.2056, "reward": 0.25203531980514526, "reward_std": 0.327674925327301, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.29483968019485474, "rewards/tag_count_reward": 0.484375, "step": 243 }, { "completion_length": 1438.0, "epoch": 0.016266666666666665, "grad_norm": 135.52386474609375, "kl": 1.140625, "learning_rate": 1e-06, "loss": 0.0456, "reward": 0.6074755191802979, "reward_std": 0.307578980922699, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.18939949572086334, "rewards/tag_count_reward": 0.796875, "step": 244 }, { "completion_length": 1279.5, "epoch": 0.01633333333333333, "grad_norm": 72.26471710205078, "kl": 0.8125, "learning_rate": 1e-06, "loss": 0.0324, "reward": 0.637866735458374, "reward_std": 0.388823926448822, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.25275832414627075, "rewards/tag_count_reward": 0.828125, "step": 245 }, { "completion_length": 1477.5, "epoch": 0.0164, "grad_norm": 222.29373168945312, "kl": 2.109375, "learning_rate": 1e-06, "loss": 0.0841, "reward": 0.23366467654705048, "reward_std": 0.26289576292037964, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.25071030855178833, "rewards/tag_count_reward": 0.484375, "step": 246 }, { "completion_length": 1201.0, "epoch": 0.016466666666666668, "grad_norm": 36.81657409667969, "kl": 1.25, "learning_rate": 1e-06, "loss": 0.0501, "reward": 0.4497566819190979, "reward_std": 0.36574527621269226, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2689933180809021, "rewards/tag_count_reward": 0.71875, "step": 247 }, { "completion_length": 1144.0, "epoch": 0.016533333333333334, "grad_norm": 95.82453155517578, "kl": 1.1875, "learning_rate": 1e-06, "loss": 0.0475, "reward": 0.8179270625114441, "reward_std": 0.6401329040527344, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.1351979672908783, "rewards/tag_count_reward": 0.828125, "step": 248 }, { "completion_length": 1120.5, "epoch": 0.0166, "grad_norm": 82.85420227050781, "kl": 4.28125, "learning_rate": 1e-06, "loss": 0.1707, "reward": 0.37731966376304626, "reward_std": 0.30782580375671387, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.26330533623695374, "rewards/tag_count_reward": 0.640625, "step": 249 }, { "completion_length": 1404.5, "epoch": 0.016666666666666666, "grad_norm": 101.45523071289062, "kl": 3.640625, "learning_rate": 1e-06, "loss": 0.1453, "reward": 0.5992006063461304, "reward_std": 0.5270397663116455, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.18204940855503082, "rewards/tag_count_reward": 0.65625, "step": 250 }, { "completion_length": 1210.5, "epoch": 0.016733333333333333, "grad_norm": 202.80154418945312, "kl": 8.875, "learning_rate": 1e-06, "loss": 0.3557, "reward": 0.6146788001060486, "reward_std": 0.30628567934036255, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.30719617009162903, "rewards/tag_count_reward": 0.609375, "step": 251 }, { "completion_length": 343.5, "epoch": 0.0168, "grad_norm": 116.09471893310547, "kl": 3.1875, "learning_rate": 1e-06, "loss": 0.1277, "reward": 0.9948755502700806, "reward_std": 0.40214404463768005, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.08324943482875824, "rewards/tag_count_reward": 0.953125, "step": 252 }, { "completion_length": 907.5, "epoch": 0.016866666666666665, "grad_norm": 59.3304328918457, "kl": 5.40625, "learning_rate": 1e-06, "loss": 0.2164, "reward": 0.8052083253860474, "reward_std": 0.4671739339828491, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.30416664481163025, "rewards/tag_count_reward": 0.921875, "step": 253 }, { "completion_length": 1012.0, "epoch": 0.016933333333333335, "grad_norm": 105.59696960449219, "kl": 5.46875, "learning_rate": 1e-06, "loss": 0.219, "reward": 0.8351148366928101, "reward_std": 0.5238816142082214, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.19613513350486755, "rewards/tag_count_reward": 0.84375, "step": 254 }, { "completion_length": 1186.0, "epoch": 0.017, "grad_norm": 10.558053970336914, "kl": 0.50390625, "learning_rate": 1e-06, "loss": 0.0202, "reward": 0.6605161428451538, "reward_std": 0.3056912124156952, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.15198387205600739, "rewards/tag_count_reward": 0.8125, "step": 255 }, { "completion_length": 619.5, "epoch": 0.017066666666666667, "grad_norm": 368.22821044921875, "kl": 1.203125, "learning_rate": 1e-06, "loss": 0.048, "reward": 0.9540891647338867, "reward_std": 0.4605638086795807, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.23341085016727448, "rewards/tag_count_reward": 1.0, "step": 256 }, { "completion_length": 851.0, "epoch": 0.017133333333333334, "grad_norm": 60.82518005371094, "kl": 1.1484375, "learning_rate": 1e-06, "loss": 0.0458, "reward": 0.7120445370674133, "reward_std": 0.1923874169588089, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.24108049273490906, "rewards/tag_count_reward": 0.953125, "step": 257 }, { "completion_length": 955.0, "epoch": 0.0172, "grad_norm": 88.13372802734375, "kl": 3.640625, "learning_rate": 1e-06, "loss": 0.1454, "reward": 0.5048485994338989, "reward_std": 0.2593071162700653, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.18265138566493988, "rewards/tag_count_reward": 0.6875, "step": 258 }, { "completion_length": 1299.0, "epoch": 0.017266666666666666, "grad_norm": 50.409542083740234, "kl": 2.59375, "learning_rate": 1e-06, "loss": 0.104, "reward": 0.6450666785240173, "reward_std": 0.39017635583877563, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.21430832147598267, "rewards/tag_count_reward": 0.796875, "step": 259 }, { "completion_length": 746.0, "epoch": 0.017333333333333333, "grad_norm": 75.39633178710938, "kl": 4.75, "learning_rate": 1e-06, "loss": 0.19, "reward": 0.8463404178619385, "reward_std": 0.5164816379547119, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.20053458213806152, "rewards/tag_count_reward": 0.796875, "step": 260 }, { "completion_length": 720.5, "epoch": 0.0174, "grad_norm": 160.54527282714844, "kl": 3.84375, "learning_rate": 1e-06, "loss": 0.1536, "reward": 1.109606385231018, "reward_std": 0.5404841303825378, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.20289357006549835, "rewards/tag_count_reward": 1.0, "step": 261 }, { "completion_length": 1248.0, "epoch": 0.017466666666666665, "grad_norm": 339.8331604003906, "kl": 5.125, "learning_rate": 1e-06, "loss": 0.2044, "reward": 0.5907869338989258, "reward_std": 0.255514919757843, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.12796306610107422, "rewards/tag_count_reward": 0.71875, "step": 262 }, { "completion_length": 2011.0, "epoch": 0.017533333333333335, "grad_norm": 78.27387237548828, "kl": 2.796875, "learning_rate": 1e-06, "loss": 0.1118, "reward": 0.557160496711731, "reward_std": 0.25057724118232727, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.20846451818943024, "rewards/tag_count_reward": 0.765625, "step": 263 }, { "completion_length": 1214.5, "epoch": 0.0176, "grad_norm": 116.71304321289062, "kl": 1.65625, "learning_rate": 1e-06, "loss": 0.0659, "reward": 0.880974292755127, "reward_std": 0.3218742609024048, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.27527573704719543, "rewards/tag_count_reward": 0.96875, "step": 264 }, { "completion_length": 1242.0, "epoch": 0.017666666666666667, "grad_norm": 141.15890502929688, "kl": 3.0625, "learning_rate": 1e-06, "loss": 0.1228, "reward": 0.9699192643165588, "reward_std": 0.4596714377403259, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.21758070588111877, "rewards/tag_count_reward": 1.0, "step": 265 }, { "completion_length": 1939.0, "epoch": 0.017733333333333334, "grad_norm": 193.25926208496094, "kl": 6.375, "learning_rate": 1e-06, "loss": 0.2553, "reward": 0.35130447149276733, "reward_std": 0.2630960941314697, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.25807052850723267, "rewards/tag_count_reward": 0.609375, "step": 266 }, { "completion_length": 724.0, "epoch": 0.0178, "grad_norm": 447.6954650878906, "kl": 5.71875, "learning_rate": 1e-06, "loss": 0.228, "reward": 0.9759852886199951, "reward_std": 0.4168573021888733, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.14901477098464966, "rewards/tag_count_reward": 0.6875, "step": 267 }, { "completion_length": 1770.0, "epoch": 0.017866666666666666, "grad_norm": 105.21097564697266, "kl": 4.3125, "learning_rate": 1e-06, "loss": 0.1719, "reward": 0.5311283469200134, "reward_std": 0.27244049310684204, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.18762168288230896, "rewards/tag_count_reward": 0.71875, "step": 268 }, { "completion_length": 1002.0, "epoch": 0.017933333333333332, "grad_norm": 195.61874389648438, "kl": 1.78125, "learning_rate": 1e-06, "loss": 0.0713, "reward": 0.4463762640953064, "reward_std": 0.15696293115615845, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2879987955093384, "rewards/tag_count_reward": 0.734375, "step": 269 }, { "completion_length": 715.5, "epoch": 0.018, "grad_norm": 138.14581298828125, "kl": 1.203125, "learning_rate": 1e-06, "loss": 0.0482, "reward": 1.0527253150939941, "reward_std": 0.32684525847435, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.22852468490600586, "rewards/tag_count_reward": 0.96875, "step": 270 }, { "completion_length": 650.0, "epoch": 0.01806666666666667, "grad_norm": 78.0746841430664, "kl": 0.84375, "learning_rate": 1e-06, "loss": 0.0337, "reward": 1.0004079341888428, "reward_std": 0.6274251341819763, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.093342125415802, "rewards/tag_count_reward": 0.90625, "step": 271 }, { "completion_length": 1242.0, "epoch": 0.018133333333333335, "grad_norm": 143.22390747070312, "kl": 1.3671875, "learning_rate": 1e-06, "loss": 0.0548, "reward": 0.3881889581680298, "reward_std": 0.20968960225582123, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2993110120296478, "rewards/tag_count_reward": 0.6875, "step": 272 }, { "completion_length": 1436.0, "epoch": 0.0182, "grad_norm": 115.1573715209961, "kl": 1.1015625, "learning_rate": 1e-06, "loss": 0.0439, "reward": 0.6134010553359985, "reward_std": 0.3705349862575531, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.16784895956516266, "rewards/tag_count_reward": 0.71875, "step": 273 }, { "completion_length": 1050.0, "epoch": 0.018266666666666667, "grad_norm": 48.76930618286133, "kl": 1.359375, "learning_rate": 1e-06, "loss": 0.0542, "reward": 0.7366237044334412, "reward_std": 0.2547900378704071, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.13837629556655884, "rewards/tag_count_reward": 0.875, "step": 274 }, { "completion_length": 1306.5, "epoch": 0.018333333333333333, "grad_norm": 65.57711791992188, "kl": 2.15625, "learning_rate": 1e-06, "loss": 0.0863, "reward": 0.6077868342399597, "reward_std": 0.20309025049209595, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2515881657600403, "rewards/tag_count_reward": 0.859375, "step": 275 }, { "completion_length": 539.5, "epoch": 0.0184, "grad_norm": 47.25210189819336, "kl": 1.2265625, "learning_rate": 1e-06, "loss": 0.0491, "reward": 1.2373601198196411, "reward_std": 0.45193660259246826, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.09076492488384247, "rewards/tag_count_reward": 0.890625, "step": 276 }, { "completion_length": 1508.0, "epoch": 0.018466666666666666, "grad_norm": 63.90336990356445, "kl": 5.0625, "learning_rate": 1e-06, "loss": 0.2025, "reward": 0.39768722653388977, "reward_std": 0.34046733379364014, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.16481277346611023, "rewards/tag_count_reward": 0.5625, "step": 277 }, { "completion_length": 304.0, "epoch": 0.018533333333333332, "grad_norm": 190.8180694580078, "kl": 7.34375, "learning_rate": 1e-06, "loss": 0.293, "reward": 0.985604465007782, "reward_std": 0.49539294838905334, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.13939552009105682, "rewards/tag_count_reward": 1.0, "step": 278 }, { "completion_length": 878.5, "epoch": 0.0186, "grad_norm": 267.1549072265625, "kl": 9.625, "learning_rate": 1e-06, "loss": 0.3847, "reward": 0.7716439962387085, "reward_std": 0.4284132421016693, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.1502310335636139, "rewards/tag_count_reward": 0.859375, "step": 279 }, { "completion_length": 1235.0, "epoch": 0.018666666666666668, "grad_norm": 213.63856506347656, "kl": 11.125, "learning_rate": 1e-06, "loss": 0.443, "reward": 0.884079098701477, "reward_std": 0.45604008436203003, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.22529590129852295, "rewards/tag_count_reward": 0.734375, "step": 280 }, { "completion_length": 288.5, "epoch": 0.018733333333333334, "grad_norm": 86.7800521850586, "kl": 4.0625, "learning_rate": 1e-06, "loss": 0.1625, "reward": 1.1818442344665527, "reward_std": 0.4051699638366699, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.13065576553344727, "rewards/tag_count_reward": 1.0, "step": 281 }, { "completion_length": 1323.0, "epoch": 0.0188, "grad_norm": 106.07861328125, "kl": 1.8359375, "learning_rate": 1e-06, "loss": 0.0733, "reward": 0.3720395565032959, "reward_std": 0.2879902720451355, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2529604434967041, "rewards/tag_count_reward": 0.625, "step": 282 }, { "completion_length": 1189.5, "epoch": 0.018866666666666667, "grad_norm": 162.4084930419922, "kl": 6.4375, "learning_rate": 1e-06, "loss": 0.2579, "reward": 0.7242240905761719, "reward_std": 0.2956833243370056, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.29140087962150574, "rewards/tag_count_reward": 0.953125, "step": 283 }, { "completion_length": 927.5, "epoch": 0.018933333333333333, "grad_norm": 127.29533386230469, "kl": 2.28125, "learning_rate": 1e-06, "loss": 0.0912, "reward": 1.453515648841858, "reward_std": 0.6606445908546448, "rewards/accuracy_reward": 0.625, "rewards/len_reward": -0.1558593511581421, "rewards/tag_count_reward": 0.984375, "step": 284 }, { "completion_length": 1550.5, "epoch": 0.019, "grad_norm": 90.6812515258789, "kl": 2.34375, "learning_rate": 1e-06, "loss": 0.0938, "reward": 0.31715595722198486, "reward_std": 0.30243024230003357, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3390940725803375, "rewards/tag_count_reward": 0.65625, "step": 285 }, { "completion_length": 277.5, "epoch": 0.019066666666666666, "grad_norm": 60.94807052612305, "kl": 0.171875, "learning_rate": 1e-06, "loss": 0.0069, "reward": 1.2874398231506348, "reward_std": 0.2819579541683197, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.16568517684936523, "rewards/tag_count_reward": 0.953125, "step": 286 }, { "completion_length": 1065.0, "epoch": 0.019133333333333332, "grad_norm": 78.6766128540039, "kl": 1.2890625, "learning_rate": 1e-06, "loss": 0.0517, "reward": 0.28757408261299133, "reward_std": 0.2649262547492981, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.25930091738700867, "rewards/tag_count_reward": 0.546875, "step": 287 }, { "completion_length": 1198.0, "epoch": 0.0192, "grad_norm": 26.8408260345459, "kl": 3.28125, "learning_rate": 1e-06, "loss": 0.131, "reward": 0.44327443838119507, "reward_std": 0.3510250449180603, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.33797556161880493, "rewards/tag_count_reward": 0.78125, "step": 288 }, { "completion_length": 771.0, "epoch": 0.019266666666666668, "grad_norm": 101.416259765625, "kl": 1.1796875, "learning_rate": 1e-06, "loss": 0.047, "reward": 0.7894538044929504, "reward_std": 0.1728997826576233, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.17929621040821075, "rewards/tag_count_reward": 0.96875, "step": 289 }, { "completion_length": 1305.0, "epoch": 0.019333333333333334, "grad_norm": 92.49710845947266, "kl": 1.34375, "learning_rate": 1e-06, "loss": 0.0537, "reward": 0.9623291492462158, "reward_std": 0.44925224781036377, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.22517085075378418, "rewards/tag_count_reward": 0.75, "step": 290 }, { "completion_length": 1113.5, "epoch": 0.0194, "grad_norm": 221.7279510498047, "kl": 1.7109375, "learning_rate": 1e-06, "loss": 0.0685, "reward": 1.2709362506866455, "reward_std": 0.2591468393802643, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.21343865990638733, "rewards/tag_count_reward": 0.984375, "step": 291 }, { "completion_length": 406.5, "epoch": 0.019466666666666667, "grad_norm": 38.19606399536133, "kl": 1.4375, "learning_rate": 1e-06, "loss": 0.0577, "reward": 0.5383668541908264, "reward_std": 0.3133370578289032, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.19600816071033478, "rewards/tag_count_reward": 0.734375, "step": 292 }, { "completion_length": 1367.0, "epoch": 0.019533333333333333, "grad_norm": 171.46334838867188, "kl": 10.5, "learning_rate": 1e-06, "loss": 0.4187, "reward": 0.433968186378479, "reward_std": 0.2997130751609802, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.300406813621521, "rewards/tag_count_reward": 0.734375, "step": 293 }, { "completion_length": 1106.0, "epoch": 0.0196, "grad_norm": 78.32112121582031, "kl": 6.5625, "learning_rate": 1e-06, "loss": 0.2641, "reward": 0.29133471846580505, "reward_std": 0.3040623068809509, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.14616529643535614, "rewards/tag_count_reward": 0.4375, "step": 294 }, { "completion_length": 523.0, "epoch": 0.019666666666666666, "grad_norm": 193.3687744140625, "kl": 4.96875, "learning_rate": 1e-06, "loss": 0.198, "reward": 1.0770788192749023, "reward_std": 0.3124271333217621, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.26667121052742004, "rewards/tag_count_reward": 0.96875, "step": 295 }, { "completion_length": 1180.5, "epoch": 0.019733333333333332, "grad_norm": 115.18844604492188, "kl": 6.21875, "learning_rate": 1e-06, "loss": 0.2484, "reward": 0.7966092824935913, "reward_std": 0.47257116436958313, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.2033906877040863, "rewards/tag_count_reward": 0.75, "step": 296 }, { "completion_length": 1163.0, "epoch": 0.0198, "grad_norm": 25.589466094970703, "kl": 4.21875, "learning_rate": 1e-06, "loss": 0.169, "reward": 0.6315311193466187, "reward_std": 0.541244387626648, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.27471891045570374, "rewards/tag_count_reward": 0.78125, "step": 297 }, { "completion_length": 1623.5, "epoch": 0.019866666666666668, "grad_norm": 223.26202392578125, "kl": 2.21875, "learning_rate": 1e-06, "loss": 0.0888, "reward": 0.6494944095611572, "reward_std": 0.5408167839050293, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.2567555606365204, "rewards/tag_count_reward": 0.71875, "step": 298 }, { "completion_length": 396.0, "epoch": 0.019933333333333334, "grad_norm": 72.4623794555664, "kl": 1.8671875, "learning_rate": 1e-06, "loss": 0.0748, "reward": 0.8335331082344055, "reward_std": 0.1918882131576538, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1352168619632721, "rewards/tag_count_reward": 0.96875, "step": 299 }, { "completion_length": 1047.0, "epoch": 0.02, "grad_norm": 53.39625930786133, "kl": 1.9296875, "learning_rate": 1e-06, "loss": 0.0774, "reward": 0.8796528577804565, "reward_std": 0.5016852617263794, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.22972214221954346, "rewards/tag_count_reward": 0.921875, "step": 300 }, { "completion_length": 826.0, "epoch": 0.020066666666666667, "grad_norm": 22.02448844909668, "kl": 0.84375, "learning_rate": 1e-06, "loss": 0.0337, "reward": 1.2643147706985474, "reward_std": 0.3837927579879761, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.15756019949913025, "rewards/tag_count_reward": 0.984375, "step": 301 }, { "completion_length": 1103.0, "epoch": 0.020133333333333333, "grad_norm": 74.74524688720703, "kl": 1.6484375, "learning_rate": 1e-06, "loss": 0.0659, "reward": 0.6457744240760803, "reward_std": 0.35607969760894775, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.16672557592391968, "rewards/tag_count_reward": 0.8125, "step": 302 }, { "completion_length": 493.5, "epoch": 0.0202, "grad_norm": 43.90092849731445, "kl": 1.8125, "learning_rate": 1e-06, "loss": 0.0724, "reward": 1.023780345916748, "reward_std": 0.5628347992897034, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.22621965408325195, "rewards/tag_count_reward": 1.0, "step": 303 }, { "completion_length": 1063.0, "epoch": 0.020266666666666665, "grad_norm": 35.594295501708984, "kl": 2.0, "learning_rate": 1e-06, "loss": 0.0801, "reward": 1.0189833641052246, "reward_std": 0.5456187725067139, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.10601675510406494, "rewards/tag_count_reward": 0.8125, "step": 304 }, { "completion_length": 2048.0, "epoch": 0.02033333333333333, "grad_norm": 107.36284637451172, "kl": 6.25, "learning_rate": 1e-06, "loss": 0.2504, "reward": 0.26232224702835083, "reward_std": 0.2836293578147888, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.31580275297164917, "rewards/tag_count_reward": 0.578125, "step": 305 }, { "completion_length": 1701.0, "epoch": 0.0204, "grad_norm": 303.72821044921875, "kl": 9.5, "learning_rate": 1e-06, "loss": 0.3803, "reward": 0.5319410562515259, "reward_std": 0.1759571135044098, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.4055589437484741, "rewards/tag_count_reward": 0.9375, "step": 306 }, { "completion_length": 771.5, "epoch": 0.020466666666666668, "grad_norm": 59.3670539855957, "kl": 2.953125, "learning_rate": 1e-06, "loss": 0.1183, "reward": 1.1603425741195679, "reward_std": 0.40957415103912354, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.19903242588043213, "rewards/tag_count_reward": 0.921875, "step": 307 }, { "completion_length": 2048.0, "epoch": 0.020533333333333334, "grad_norm": 380.81689453125, "kl": 17.5, "learning_rate": 1e-06, "loss": 0.7023, "reward": 0.24851681292057037, "reward_std": 0.20674453675746918, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.22023317217826843, "rewards/tag_count_reward": 0.46875, "step": 308 }, { "completion_length": 832.5, "epoch": 0.0206, "grad_norm": 132.42506408691406, "kl": 8.1875, "learning_rate": 1e-06, "loss": 0.3275, "reward": 0.7047885656356812, "reward_std": 0.343413770198822, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.18583643436431885, "rewards/tag_count_reward": 0.890625, "step": 309 }, { "completion_length": 777.0, "epoch": 0.020666666666666667, "grad_norm": 76.12239837646484, "kl": 3.6875, "learning_rate": 1e-06, "loss": 0.1471, "reward": 0.8980907201766968, "reward_std": 0.4372199773788452, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.1175343245267868, "rewards/tag_count_reward": 0.953125, "step": 310 }, { "completion_length": 1257.5, "epoch": 0.020733333333333333, "grad_norm": 271.6349182128906, "kl": 6.65625, "learning_rate": 1e-06, "loss": 0.2665, "reward": 1.0332744121551514, "reward_std": 0.33565282821655273, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.2323506772518158, "rewards/tag_count_reward": 0.953125, "step": 311 }, { "completion_length": 1716.5, "epoch": 0.0208, "grad_norm": 340.9157409667969, "kl": 9.25, "learning_rate": 1e-06, "loss": 0.3681, "reward": 0.30537861585617065, "reward_std": 0.32000964879989624, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.27274638414382935, "rewards/tag_count_reward": 0.578125, "step": 312 }, { "completion_length": 430.0, "epoch": 0.020866666666666665, "grad_norm": 58.16603088378906, "kl": 1.84375, "learning_rate": 1e-06, "loss": 0.0737, "reward": 1.3933343887329102, "reward_std": 0.26193928718566895, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.10666556656360626, "rewards/tag_count_reward": 1.0, "step": 313 }, { "completion_length": 1505.0, "epoch": 0.020933333333333335, "grad_norm": 42.11033248901367, "kl": 4.3125, "learning_rate": 1e-06, "loss": 0.1722, "reward": 0.6482540369033813, "reward_std": 0.22455057501792908, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.38299596309661865, "rewards/tag_count_reward": 0.96875, "step": 314 }, { "completion_length": 302.0, "epoch": 0.021, "grad_norm": 42.860755920410156, "kl": 1.796875, "learning_rate": 1e-06, "loss": 0.0718, "reward": 1.214234471321106, "reward_std": 0.6082682609558105, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.1295156180858612, "rewards/tag_count_reward": 0.96875, "step": 315 }, { "completion_length": 590.0, "epoch": 0.021066666666666668, "grad_norm": 62.87367248535156, "kl": 0.76953125, "learning_rate": 1e-06, "loss": 0.0308, "reward": 0.9708921909332275, "reward_std": 0.5262269973754883, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.10723280906677246, "rewards/tag_count_reward": 0.828125, "step": 316 }, { "completion_length": 687.5, "epoch": 0.021133333333333334, "grad_norm": 32.127357482910156, "kl": 0.34765625, "learning_rate": 1e-06, "loss": 0.014, "reward": 0.9708912372589111, "reward_std": 0.4935838580131531, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.21660876274108887, "rewards/tag_count_reward": 0.9375, "step": 317 }, { "completion_length": 1581.0, "epoch": 0.0212, "grad_norm": 104.86175537109375, "kl": 0.7421875, "learning_rate": 1e-06, "loss": 0.0296, "reward": 0.49017536640167236, "reward_std": 0.22941234707832336, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.18169961869716644, "rewards/tag_count_reward": 0.671875, "step": 318 }, { "completion_length": 1262.5, "epoch": 0.021266666666666666, "grad_norm": 116.81327819824219, "kl": 0.419921875, "learning_rate": 1e-06, "loss": 0.0168, "reward": 0.8134942054748535, "reward_std": 0.4755869209766388, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.15525580942630768, "rewards/tag_count_reward": 0.71875, "step": 319 }, { "completion_length": 849.5, "epoch": 0.021333333333333333, "grad_norm": 57.31502914428711, "kl": 0.6875, "learning_rate": 1e-06, "loss": 0.0276, "reward": 1.1615235805511475, "reward_std": 0.528767466545105, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.041601456701755524, "rewards/tag_count_reward": 0.765625, "step": 320 }, { "completion_length": 1183.5, "epoch": 0.0214, "grad_norm": 27.733661651611328, "kl": 1.3125, "learning_rate": 1e-06, "loss": 0.0528, "reward": 1.0065999031066895, "reward_std": 0.43920642137527466, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.2590251564979553, "rewards/tag_count_reward": 0.890625, "step": 321 }, { "completion_length": 696.5, "epoch": 0.021466666666666665, "grad_norm": 228.986572265625, "kl": 0.953125, "learning_rate": 1e-06, "loss": 0.0381, "reward": 1.0592111349105835, "reward_std": 0.7358847856521606, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.23766390979290009, "rewards/tag_count_reward": 0.984375, "step": 322 }, { "completion_length": 1235.0, "epoch": 0.021533333333333335, "grad_norm": 80.84215545654297, "kl": 1.390625, "learning_rate": 1e-06, "loss": 0.0558, "reward": 0.6002234220504761, "reward_std": 0.2300834059715271, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2435266226530075, "rewards/tag_count_reward": 0.84375, "step": 323 }, { "completion_length": 1185.0, "epoch": 0.0216, "grad_norm": 40.54029083251953, "kl": 2.3125, "learning_rate": 1e-06, "loss": 0.0924, "reward": 1.0903337001800537, "reward_std": 0.5135502815246582, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.15966621041297913, "rewards/tag_count_reward": 0.875, "step": 324 }, { "completion_length": 1124.5, "epoch": 0.021666666666666667, "grad_norm": 96.52243041992188, "kl": 2.96875, "learning_rate": 1e-06, "loss": 0.1185, "reward": 0.8188521862030029, "reward_std": 0.4867573380470276, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.22802278399467468, "rewards/tag_count_reward": 0.796875, "step": 325 }, { "completion_length": 629.5, "epoch": 0.021733333333333334, "grad_norm": 52.54798126220703, "kl": 1.78125, "learning_rate": 1e-06, "loss": 0.0713, "reward": 0.6936306953430176, "reward_std": 0.2830107510089874, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.243869349360466, "rewards/tag_count_reward": 0.875, "step": 326 }, { "completion_length": 1273.0, "epoch": 0.0218, "grad_norm": 553.539306640625, "kl": 24.0, "learning_rate": 1e-06, "loss": 0.9609, "reward": 0.5443458557128906, "reward_std": 0.2812609076499939, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.23690414428710938, "rewards/tag_count_reward": 0.78125, "step": 327 }, { "completion_length": 1118.0, "epoch": 0.021866666666666666, "grad_norm": 15.364245414733887, "kl": 1.671875, "learning_rate": 1e-06, "loss": 0.0669, "reward": 1.1107473373413086, "reward_std": 0.29542076587677, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.21737776696681976, "rewards/tag_count_reward": 0.953125, "step": 328 }, { "completion_length": 304.0, "epoch": 0.021933333333333332, "grad_norm": 151.0471649169922, "kl": 7.3125, "learning_rate": 1e-06, "loss": 0.2924, "reward": 1.7604129314422607, "reward_std": 0.4952170252799988, "rewards/accuracy_reward": 0.875, "rewards/len_reward": -0.11458698660135269, "rewards/tag_count_reward": 1.0, "step": 329 }, { "completion_length": 1306.5, "epoch": 0.022, "grad_norm": 956.6917724609375, "kl": 37.25, "learning_rate": 1e-06, "loss": 1.491, "reward": 0.5319682359695435, "reward_std": 0.287946492433548, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.20240673422813416, "rewards/tag_count_reward": 0.734375, "step": 330 }, { "completion_length": 1410.5, "epoch": 0.022066666666666665, "grad_norm": 397.27294921875, "kl": 21.875, "learning_rate": 1e-06, "loss": 0.8763, "reward": 0.913928210735321, "reward_std": 0.5697895288467407, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.19544677436351776, "rewards/tag_count_reward": 0.734375, "step": 331 }, { "completion_length": 2048.0, "epoch": 0.022133333333333335, "grad_norm": 85.96682739257812, "kl": 9.375, "learning_rate": 1e-06, "loss": 0.3737, "reward": 0.4683043956756592, "reward_std": 0.2435683161020279, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2504456043243408, "rewards/tag_count_reward": 0.71875, "step": 332 }, { "completion_length": 774.5, "epoch": 0.0222, "grad_norm": 28.857770919799805, "kl": 1.6328125, "learning_rate": 1e-06, "loss": 0.0654, "reward": 1.199403166770935, "reward_std": 0.4144977331161499, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.06622181087732315, "rewards/tag_count_reward": 0.953125, "step": 333 }, { "completion_length": 1293.0, "epoch": 0.022266666666666667, "grad_norm": 76.90970611572266, "kl": 1.9921875, "learning_rate": 1e-06, "loss": 0.0795, "reward": 0.8091883063316345, "reward_std": 0.4681074619293213, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.22206170856952667, "rewards/tag_count_reward": 0.84375, "step": 334 }, { "completion_length": 1191.5, "epoch": 0.022333333333333334, "grad_norm": 72.85279846191406, "kl": 2.5, "learning_rate": 1e-06, "loss": 0.1002, "reward": 1.10751211643219, "reward_std": 0.5649300217628479, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.25186288356781006, "rewards/tag_count_reward": 0.984375, "step": 335 }, { "completion_length": 319.5, "epoch": 0.0224, "grad_norm": 46.205345153808594, "kl": 1.21875, "learning_rate": 1e-06, "loss": 0.0487, "reward": 1.3281185626983643, "reward_std": 0.45367008447647095, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.09375645965337753, "rewards/tag_count_reward": 0.984375, "step": 336 }, { "completion_length": 375.0, "epoch": 0.022466666666666666, "grad_norm": 31.738218307495117, "kl": 1.6875, "learning_rate": 1e-06, "loss": 0.0671, "reward": 1.096465826034546, "reward_std": 0.48942404985427856, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.1379092037677765, "rewards/tag_count_reward": 0.984375, "step": 337 }, { "completion_length": 537.5, "epoch": 0.022533333333333332, "grad_norm": 154.14036560058594, "kl": 2.421875, "learning_rate": 1e-06, "loss": 0.0972, "reward": 0.854365348815918, "reward_std": 0.3954009413719177, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.30188465118408203, "rewards/tag_count_reward": 0.96875, "step": 338 }, { "completion_length": 1025.0, "epoch": 0.0226, "grad_norm": 210.96768188476562, "kl": 1.203125, "learning_rate": 1e-06, "loss": 0.0479, "reward": 0.7532867789268494, "reward_std": 0.35458308458328247, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.24671322107315063, "rewards/tag_count_reward": 0.9375, "step": 339 }, { "completion_length": 1158.5, "epoch": 0.02266666666666667, "grad_norm": 37.851768493652344, "kl": 1.3125, "learning_rate": 1e-06, "loss": 0.0525, "reward": 0.7112846970558167, "reward_std": 0.26829543709754944, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.13246530294418335, "rewards/tag_count_reward": 0.84375, "step": 340 }, { "completion_length": 892.5, "epoch": 0.022733333333333335, "grad_norm": 24.23723602294922, "kl": 0.671875, "learning_rate": 1e-06, "loss": 0.027, "reward": 0.9460461735725403, "reward_std": 0.41133737564086914, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.06957884132862091, "rewards/tag_count_reward": 0.953125, "step": 341 }, { "completion_length": 681.0, "epoch": 0.0228, "grad_norm": 33.455047607421875, "kl": 1.1328125, "learning_rate": 1e-06, "loss": 0.0454, "reward": 0.9595123529434204, "reward_std": 0.5215348601341248, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.14986258745193481, "rewards/tag_count_reward": 0.859375, "step": 342 }, { "completion_length": 1745.5, "epoch": 0.022866666666666667, "grad_norm": 46.849483489990234, "kl": 2.28125, "learning_rate": 1e-06, "loss": 0.0912, "reward": 0.3965286910533905, "reward_std": 0.2530428469181061, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3065963387489319, "rewards/tag_count_reward": 0.703125, "step": 343 }, { "completion_length": 1157.0, "epoch": 0.022933333333333333, "grad_norm": 144.76483154296875, "kl": 4.4375, "learning_rate": 1e-06, "loss": 0.1764, "reward": 0.6540185213088989, "reward_std": 0.1685352623462677, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3303564786911011, "rewards/tag_count_reward": 0.984375, "step": 344 }, { "completion_length": 598.0, "epoch": 0.023, "grad_norm": 28.438884735107422, "kl": 4.8125, "learning_rate": 1e-06, "loss": 0.1917, "reward": 0.9016408324241638, "reward_std": 0.42892104387283325, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.1452341079711914, "rewards/tag_count_reward": 0.921875, "step": 345 }, { "completion_length": 1474.5, "epoch": 0.023066666666666666, "grad_norm": 68.08282470703125, "kl": 9.75, "learning_rate": 1e-06, "loss": 0.3908, "reward": 1.0328866243362427, "reward_std": 0.4325779974460602, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.3264884054660797, "rewards/tag_count_reward": 0.984375, "step": 346 }, { "completion_length": 2048.0, "epoch": 0.023133333333333332, "grad_norm": 49.9034309387207, "kl": 7.0, "learning_rate": 1e-06, "loss": 0.2797, "reward": 0.33096492290496826, "reward_std": 0.2335730493068695, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.27841007709503174, "rewards/tag_count_reward": 0.609375, "step": 347 }, { "completion_length": 818.5, "epoch": 0.0232, "grad_norm": 19.60797691345215, "kl": 4.8125, "learning_rate": 1e-06, "loss": 0.1927, "reward": 1.0055264234542847, "reward_std": 0.5644086599349976, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.10384850949048996, "rewards/tag_count_reward": 0.796875, "step": 348 }, { "completion_length": 374.5, "epoch": 0.023266666666666668, "grad_norm": 94.12194061279297, "kl": 1.015625, "learning_rate": 1e-06, "loss": 0.0406, "reward": 1.3110102415084839, "reward_std": 0.3672176003456116, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.1264897584915161, "rewards/tag_count_reward": 1.0, "step": 349 }, { "completion_length": 1221.0, "epoch": 0.023333333333333334, "grad_norm": 324.7854309082031, "kl": 1.40625, "learning_rate": 1e-06, "loss": 0.0564, "reward": 0.5963558554649353, "reward_std": 0.23897361755371094, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1848941296339035, "rewards/tag_count_reward": 0.78125, "step": 350 }, { "completion_length": 389.5, "epoch": 0.0234, "grad_norm": 26.179595947265625, "kl": 1.6953125, "learning_rate": 1e-06, "loss": 0.0679, "reward": 1.3193278312683105, "reward_std": 0.7391749620437622, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.008797142654657364, "rewards/tag_count_reward": 0.890625, "step": 351 }, { "completion_length": 1109.5, "epoch": 0.023466666666666667, "grad_norm": 188.46897888183594, "kl": 1.203125, "learning_rate": 1e-06, "loss": 0.0482, "reward": 0.5779841542243958, "reward_std": 0.22797349095344543, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.29701584577560425, "rewards/tag_count_reward": 0.875, "step": 352 }, { "completion_length": 1664.5, "epoch": 0.023533333333333333, "grad_norm": 27.162336349487305, "kl": 1.4375, "learning_rate": 1e-06, "loss": 0.0573, "reward": 0.6244074106216431, "reward_std": 0.3301810026168823, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.23496761918067932, "rewards/tag_count_reward": 0.859375, "step": 353 }, { "completion_length": 540.5, "epoch": 0.0236, "grad_norm": 76.66584777832031, "kl": 1.78125, "learning_rate": 1e-06, "loss": 0.0715, "reward": 1.0277178287506104, "reward_std": 0.3999354839324951, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.23790714144706726, "rewards/tag_count_reward": 0.953125, "step": 354 }, { "completion_length": 730.0, "epoch": 0.023666666666666666, "grad_norm": 120.81816864013672, "kl": 2.015625, "learning_rate": 1e-06, "loss": 0.0808, "reward": 1.1916608810424805, "reward_std": 0.7615381479263306, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.10521401464939117, "rewards/tag_count_reward": 0.921875, "step": 355 }, { "completion_length": 758.0, "epoch": 0.023733333333333332, "grad_norm": 37.24180221557617, "kl": 5.6875, "learning_rate": 1e-06, "loss": 0.2278, "reward": 0.8624981641769409, "reward_std": 0.5419960021972656, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.26250186562538147, "rewards/tag_count_reward": 1.0, "step": 356 }, { "completion_length": 1211.5, "epoch": 0.0238, "grad_norm": 81.53892517089844, "kl": 5.0625, "learning_rate": 1e-06, "loss": 0.2026, "reward": 1.141159176826477, "reward_std": 0.481996089220047, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.14009076356887817, "rewards/tag_count_reward": 0.90625, "step": 357 }, { "completion_length": 1180.0, "epoch": 0.023866666666666668, "grad_norm": 93.55269622802734, "kl": 5.4375, "learning_rate": 1e-06, "loss": 0.2177, "reward": 0.537026047706604, "reward_std": 0.20559044182300568, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.322348952293396, "rewards/tag_count_reward": 0.859375, "step": 358 }, { "completion_length": 2048.0, "epoch": 0.023933333333333334, "grad_norm": 194.85621643066406, "kl": 1.4765625, "learning_rate": 1e-06, "loss": 0.0592, "reward": 0.3867694139480591, "reward_std": 0.34691348671913147, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.16010558605194092, "rewards/tag_count_reward": 0.546875, "step": 359 }, { "completion_length": 2048.0, "epoch": 0.024, "grad_norm": 172.2584686279297, "kl": 1.328125, "learning_rate": 1e-06, "loss": 0.053, "reward": 0.42840299010276794, "reward_std": 0.3341846466064453, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.16534700989723206, "rewards/tag_count_reward": 0.59375, "step": 360 }, { "completion_length": 1219.5, "epoch": 0.024066666666666667, "grad_norm": 111.23456573486328, "kl": 1.3125, "learning_rate": 1e-06, "loss": 0.0526, "reward": 0.6558999419212341, "reward_std": 0.4681927561759949, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.21910005807876587, "rewards/tag_count_reward": 0.75, "step": 361 }, { "completion_length": 2048.0, "epoch": 0.024133333333333333, "grad_norm": 164.71762084960938, "kl": 1.2890625, "learning_rate": 1e-06, "loss": 0.0516, "reward": 0.41724294424057007, "reward_std": 0.24640598893165588, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.28588205575942993, "rewards/tag_count_reward": 0.703125, "step": 362 }, { "completion_length": 1185.0, "epoch": 0.0242, "grad_norm": 113.31256103515625, "kl": 1.1328125, "learning_rate": 1e-06, "loss": 0.0454, "reward": 1.023838758468628, "reward_std": 0.38074183464050293, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.17928624153137207, "rewards/tag_count_reward": 0.953125, "step": 363 }, { "completion_length": 1585.0, "epoch": 0.024266666666666666, "grad_norm": 168.71810913085938, "kl": 0.984375, "learning_rate": 1e-06, "loss": 0.0392, "reward": 0.7161804437637329, "reward_std": 0.5223672389984131, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.22131958603858948, "rewards/tag_count_reward": 0.75, "step": 364 }, { "completion_length": 393.5, "epoch": 0.024333333333333332, "grad_norm": 60.523162841796875, "kl": 0.66796875, "learning_rate": 1e-06, "loss": 0.0268, "reward": 1.1848499774932861, "reward_std": 0.6366668939590454, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.06514997035264969, "rewards/tag_count_reward": 1.0, "step": 365 }, { "completion_length": 1315.0, "epoch": 0.0244, "grad_norm": 80.97102355957031, "kl": 1.5, "learning_rate": 1e-06, "loss": 0.06, "reward": 1.1217434406280518, "reward_std": 0.6317075490951538, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.31575655937194824, "rewards/tag_count_reward": 1.0, "step": 366 }, { "completion_length": 1144.0, "epoch": 0.024466666666666668, "grad_norm": 135.45469665527344, "kl": 0.88671875, "learning_rate": 1e-06, "loss": 0.0354, "reward": 1.0854109525680542, "reward_std": 0.27328023314476013, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.2583390176296234, "rewards/tag_count_reward": 0.84375, "step": 367 }, { "completion_length": 610.0, "epoch": 0.024533333333333334, "grad_norm": 30.466493606567383, "kl": 1.140625, "learning_rate": 1e-06, "loss": 0.0456, "reward": 0.7019455432891846, "reward_std": 0.171876460313797, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2668044865131378, "rewards/tag_count_reward": 0.96875, "step": 368 }, { "completion_length": 423.0, "epoch": 0.0246, "grad_norm": 230.95326232910156, "kl": 0.8046875, "learning_rate": 1e-06, "loss": 0.0323, "reward": 1.2531075477600098, "reward_std": 0.44291990995407104, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.10626740008592606, "rewards/tag_count_reward": 0.984375, "step": 369 }, { "completion_length": 1254.5, "epoch": 0.024666666666666667, "grad_norm": 38.18868637084961, "kl": 0.8515625, "learning_rate": 1e-06, "loss": 0.0341, "reward": 0.7070168852806091, "reward_std": 0.25675168633461, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.18360809981822968, "rewards/tag_count_reward": 0.890625, "step": 370 }, { "completion_length": 1166.0, "epoch": 0.024733333333333333, "grad_norm": 58.44999694824219, "kl": 0.828125, "learning_rate": 1e-06, "loss": 0.0332, "reward": 0.9627978801727295, "reward_std": 0.5553252696990967, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.13095210492610931, "rewards/tag_count_reward": 0.78125, "step": 371 }, { "completion_length": 336.0, "epoch": 0.0248, "grad_norm": 31.88998794555664, "kl": 1.015625, "learning_rate": 1e-06, "loss": 0.0404, "reward": 1.3841114044189453, "reward_std": 0.3197627663612366, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.11588859558105469, "rewards/tag_count_reward": 1.0, "step": 372 }, { "completion_length": 2048.0, "epoch": 0.024866666666666665, "grad_norm": 85.23079681396484, "kl": 2.46875, "learning_rate": 1e-06, "loss": 0.0992, "reward": 0.5388705730438232, "reward_std": 0.177922785282135, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.32050439715385437, "rewards/tag_count_reward": 0.859375, "step": 373 }, { "completion_length": 1783.5, "epoch": 0.02493333333333333, "grad_norm": 33.19438552856445, "kl": 2.09375, "learning_rate": 1e-06, "loss": 0.0834, "reward": 0.5565052628517151, "reward_std": 0.3148510456085205, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1934947371482849, "rewards/tag_count_reward": 0.75, "step": 374 }, { "completion_length": 471.5, "epoch": 0.025, "grad_norm": 162.33657836914062, "kl": 8.0, "learning_rate": 1e-06, "loss": 0.3206, "reward": 1.2218003273010254, "reward_std": 0.6198499798774719, "rewards/accuracy_reward": 0.5625, "rewards/len_reward": -0.184449702501297, "rewards/tag_count_reward": 0.84375, "step": 375 }, { "completion_length": 1204.5, "epoch": 0.025066666666666668, "grad_norm": 131.4541015625, "kl": 11.625, "learning_rate": 1e-06, "loss": 0.4628, "reward": 0.6212787628173828, "reward_std": 0.25988760590553284, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.19122126698493958, "rewards/tag_count_reward": 0.8125, "step": 376 }, { "completion_length": 1207.5, "epoch": 0.025133333333333334, "grad_norm": 189.53819274902344, "kl": 11.625, "learning_rate": 1e-06, "loss": 0.4687, "reward": 0.6212553977966309, "reward_std": 0.3231739401817322, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.22249463200569153, "rewards/tag_count_reward": 0.78125, "step": 377 }, { "completion_length": 1216.5, "epoch": 0.0252, "grad_norm": 268.6429138183594, "kl": 19.875, "learning_rate": 1e-06, "loss": 0.7935, "reward": 0.6878746747970581, "reward_std": 0.23541045188903809, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2340003401041031, "rewards/tag_count_reward": 0.921875, "step": 378 }, { "completion_length": 808.0, "epoch": 0.025266666666666666, "grad_norm": 178.39500427246094, "kl": 14.125, "learning_rate": 1e-06, "loss": 0.5631, "reward": 0.7823457717895508, "reward_std": 0.34783172607421875, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.2176542580127716, "rewards/tag_count_reward": 0.875, "step": 379 }, { "completion_length": 473.5, "epoch": 0.025333333333333333, "grad_norm": 119.30746459960938, "kl": 9.625, "learning_rate": 1e-06, "loss": 0.3831, "reward": 1.4035418033599854, "reward_std": 0.501310408115387, "rewards/accuracy_reward": 0.625, "rewards/len_reward": -0.19020821154117584, "rewards/tag_count_reward": 0.96875, "step": 380 }, { "completion_length": 2048.0, "epoch": 0.0254, "grad_norm": 85.62954711914062, "kl": 5.125, "learning_rate": 1e-06, "loss": 0.2055, "reward": 0.5226788520812988, "reward_std": 0.311183899641037, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.19607111811637878, "rewards/tag_count_reward": 0.71875, "step": 381 }, { "completion_length": 1282.5, "epoch": 0.025466666666666665, "grad_norm": 11.626707077026367, "kl": 2.171875, "learning_rate": 1e-06, "loss": 0.0865, "reward": 0.5935724377632141, "reward_std": 0.27991387248039246, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1564275622367859, "rewards/tag_count_reward": 0.75, "step": 382 }, { "completion_length": 1230.5, "epoch": 0.025533333333333335, "grad_norm": 99.81382751464844, "kl": 1.7578125, "learning_rate": 1e-06, "loss": 0.0703, "reward": 0.6769819855690002, "reward_std": 0.3789372444152832, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.15114302933216095, "rewards/tag_count_reward": 0.765625, "step": 383 }, { "completion_length": 1200.5, "epoch": 0.0256, "grad_norm": 33.18088150024414, "kl": 1.1875, "learning_rate": 1e-06, "loss": 0.0475, "reward": 1.2311782836914062, "reward_std": 0.5231276750564575, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.050071749836206436, "rewards/tag_count_reward": 0.96875, "step": 384 }, { "completion_length": 1315.5, "epoch": 0.025666666666666667, "grad_norm": 104.46115112304688, "kl": 2.109375, "learning_rate": 1e-06, "loss": 0.0843, "reward": 0.6653253436088562, "reward_std": 0.4485510289669037, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.209674671292305, "rewards/tag_count_reward": 0.75, "step": 385 }, { "completion_length": 2048.0, "epoch": 0.025733333333333334, "grad_norm": 30.411439895629883, "kl": 1.296875, "learning_rate": 1e-06, "loss": 0.0519, "reward": 0.29646408557891846, "reward_std": 0.305771142244339, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.18791092932224274, "rewards/tag_count_reward": 0.484375, "step": 386 }, { "completion_length": 1025.5, "epoch": 0.0258, "grad_norm": 33.216915130615234, "kl": 1.7421875, "learning_rate": 1e-06, "loss": 0.0697, "reward": 0.619266927242279, "reward_std": 0.30492904782295227, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.25573304295539856, "rewards/tag_count_reward": 0.8125, "step": 387 }, { "completion_length": 1230.5, "epoch": 0.025866666666666666, "grad_norm": 27.491132736206055, "kl": 0.70703125, "learning_rate": 1e-06, "loss": 0.0283, "reward": 0.8222702741622925, "reward_std": 0.47413820028305054, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.08397974073886871, "rewards/tag_count_reward": 0.71875, "step": 388 }, { "completion_length": 970.0, "epoch": 0.025933333333333333, "grad_norm": 146.60391235351562, "kl": 1.7265625, "learning_rate": 1e-06, "loss": 0.069, "reward": 0.8505053520202637, "reward_std": 0.37996864318847656, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.3057446777820587, "rewards/tag_count_reward": 0.96875, "step": 389 }, { "completion_length": 302.0, "epoch": 0.026, "grad_norm": 29.51785659790039, "kl": 0.494140625, "learning_rate": 1e-06, "loss": 0.0198, "reward": 1.5183930397033691, "reward_std": 0.4361667037010193, "rewards/accuracy_reward": 0.625, "rewards/len_reward": -0.10660697519779205, "rewards/tag_count_reward": 1.0, "step": 390 }, { "completion_length": 1902.0, "epoch": 0.026066666666666665, "grad_norm": 27.1646671295166, "kl": 1.7421875, "learning_rate": 1e-06, "loss": 0.07, "reward": 0.3741278648376465, "reward_std": 0.4004511833190918, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1727471500635147, "rewards/tag_count_reward": 0.546875, "step": 391 }, { "completion_length": 1133.5, "epoch": 0.026133333333333335, "grad_norm": 47.449825286865234, "kl": 1.578125, "learning_rate": 1e-06, "loss": 0.0632, "reward": 1.2241830825805664, "reward_std": 0.5153764486312866, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.2601918578147888, "rewards/tag_count_reward": 0.984375, "step": 392 }, { "completion_length": 2048.0, "epoch": 0.0262, "grad_norm": 69.18492126464844, "kl": 3.0625, "learning_rate": 1e-06, "loss": 0.1226, "reward": 0.22290146350860596, "reward_std": 0.32242223620414734, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.23022352159023285, "rewards/tag_count_reward": 0.453125, "step": 393 }, { "completion_length": 1254.5, "epoch": 0.026266666666666667, "grad_norm": 75.0274658203125, "kl": 1.25, "learning_rate": 1e-06, "loss": 0.0502, "reward": 0.6211944222450256, "reward_std": 0.2534863352775574, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.11318057775497437, "rewards/tag_count_reward": 0.734375, "step": 394 }, { "completion_length": 1313.5, "epoch": 0.026333333333333334, "grad_norm": 109.3488998413086, "kl": 10.125, "learning_rate": 1e-06, "loss": 0.4038, "reward": 0.37912631034851074, "reward_std": 0.24370431900024414, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.33962368965148926, "rewards/tag_count_reward": 0.71875, "step": 395 }, { "completion_length": 2048.0, "epoch": 0.0264, "grad_norm": 85.0662612915039, "kl": 6.65625, "learning_rate": 1e-06, "loss": 0.2659, "reward": 0.2979457676410675, "reward_std": 0.2590464949607849, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1864292174577713, "rewards/tag_count_reward": 0.484375, "step": 396 }, { "completion_length": 1397.0, "epoch": 0.026466666666666666, "grad_norm": 143.81922912597656, "kl": 5.8125, "learning_rate": 1e-06, "loss": 0.232, "reward": 0.4804523289203644, "reward_std": 0.3569881319999695, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.332047700881958, "rewards/tag_count_reward": 0.75, "step": 397 }, { "completion_length": 1368.5, "epoch": 0.026533333333333332, "grad_norm": 109.19786834716797, "kl": 3.375, "learning_rate": 1e-06, "loss": 0.1349, "reward": 0.5585415363311768, "reward_std": 0.44386589527130127, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.22270843386650085, "rewards/tag_count_reward": 0.71875, "step": 398 }, { "completion_length": 1180.5, "epoch": 0.0266, "grad_norm": 61.98153305053711, "kl": 2.9375, "learning_rate": 1e-06, "loss": 0.1172, "reward": 1.0090720653533936, "reward_std": 0.4242977499961853, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.13155291974544525, "rewards/tag_count_reward": 0.765625, "step": 399 }, { "completion_length": 1199.5, "epoch": 0.02666666666666667, "grad_norm": 107.26168060302734, "kl": 4.71875, "learning_rate": 1e-06, "loss": 0.189, "reward": 1.1559706926345825, "reward_std": 0.2972278594970703, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.14090432226657867, "rewards/tag_count_reward": 0.796875, "step": 400 }, { "completion_length": 1264.5, "epoch": 0.026733333333333335, "grad_norm": 57.82908248901367, "kl": 5.21875, "learning_rate": 1e-06, "loss": 0.2085, "reward": 0.454619437456131, "reward_std": 0.21859362721443176, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.20163056254386902, "rewards/tag_count_reward": 0.65625, "step": 401 }, { "completion_length": 2048.0, "epoch": 0.0268, "grad_norm": 40.06622314453125, "kl": 3.34375, "learning_rate": 1e-06, "loss": 0.1344, "reward": 0.4545770287513733, "reward_std": 0.31251251697540283, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2485479712486267, "rewards/tag_count_reward": 0.703125, "step": 402 }, { "completion_length": 1084.5, "epoch": 0.026866666666666667, "grad_norm": 64.30767059326172, "kl": 6.0625, "learning_rate": 1e-06, "loss": 0.2421, "reward": 0.4500392973423004, "reward_std": 0.3190680742263794, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.22183571755886078, "rewards/tag_count_reward": 0.671875, "step": 403 }, { "completion_length": 488.5, "epoch": 0.026933333333333333, "grad_norm": 16.057842254638672, "kl": 0.87109375, "learning_rate": 1e-06, "loss": 0.0349, "reward": 1.2301216125488281, "reward_std": 0.3597370386123657, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.17612846195697784, "rewards/tag_count_reward": 0.96875, "step": 404 }, { "completion_length": 431.0, "epoch": 0.027, "grad_norm": 13.406998634338379, "kl": 1.53125, "learning_rate": 1e-06, "loss": 0.0608, "reward": 1.2892142534255981, "reward_std": 0.5871255993843079, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.07016070187091827, "rewards/tag_count_reward": 0.984375, "step": 405 }, { "completion_length": 1112.5, "epoch": 0.027066666666666666, "grad_norm": 12.13965129852295, "kl": 2.796875, "learning_rate": 1e-06, "loss": 0.1119, "reward": 0.812352180480957, "reward_std": 0.5745599269866943, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.09389783442020416, "rewards/tag_count_reward": 0.84375, "step": 406 }, { "completion_length": 559.0, "epoch": 0.027133333333333332, "grad_norm": 54.97040557861328, "kl": 1.5078125, "learning_rate": 1e-06, "loss": 0.0601, "reward": 1.1173365116119385, "reward_std": 0.5239236354827881, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.24203848838806152, "rewards/tag_count_reward": 0.984375, "step": 407 }, { "completion_length": 2048.0, "epoch": 0.0272, "grad_norm": 78.96199035644531, "kl": 4.5625, "learning_rate": 1e-06, "loss": 0.1822, "reward": 0.23337377607822418, "reward_std": 0.2871261239051819, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.329126238822937, "rewards/tag_count_reward": 0.5625, "step": 408 }, { "completion_length": 376.0, "epoch": 0.027266666666666668, "grad_norm": 68.48465728759766, "kl": 3.0625, "learning_rate": 1e-06, "loss": 0.1221, "reward": 1.1270508766174316, "reward_std": 0.49426913261413574, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.16982409358024597, "rewards/tag_count_reward": 0.984375, "step": 409 }, { "completion_length": 892.0, "epoch": 0.027333333333333334, "grad_norm": 55.446083068847656, "kl": 3.65625, "learning_rate": 1e-06, "loss": 0.1462, "reward": 1.3488011360168457, "reward_std": 0.569901704788208, "rewards/accuracy_reward": 0.625, "rewards/len_reward": -0.2761989235877991, "rewards/tag_count_reward": 1.0, "step": 410 }, { "completion_length": 1137.0, "epoch": 0.0274, "grad_norm": 116.13448333740234, "kl": 5.53125, "learning_rate": 1e-06, "loss": 0.2212, "reward": 0.6423210501670837, "reward_std": 0.47986292839050293, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.17017894983291626, "rewards/tag_count_reward": 0.625, "step": 411 }, { "completion_length": 2048.0, "epoch": 0.027466666666666667, "grad_norm": 81.79183959960938, "kl": 4.8125, "learning_rate": 1e-06, "loss": 0.1923, "reward": 0.30772507190704346, "reward_std": 0.2979990243911743, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.36414992809295654, "rewards/tag_count_reward": 0.671875, "step": 412 }, { "completion_length": 1155.5, "epoch": 0.027533333333333333, "grad_norm": 17.789812088012695, "kl": 1.1015625, "learning_rate": 1e-06, "loss": 0.044, "reward": 0.6761860847473145, "reward_std": 0.23097765445709229, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.18318893015384674, "rewards/tag_count_reward": 0.859375, "step": 413 }, { "completion_length": 760.5, "epoch": 0.0276, "grad_norm": 140.0294952392578, "kl": 2.265625, "learning_rate": 1e-06, "loss": 0.0901, "reward": 1.1233935356140137, "reward_std": 0.19936342537403107, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.3141065239906311, "rewards/tag_count_reward": 1.0, "step": 414 }, { "completion_length": 2048.0, "epoch": 0.027666666666666666, "grad_norm": 95.42269134521484, "kl": 4.65625, "learning_rate": 1e-06, "loss": 0.1864, "reward": 0.12715484201908112, "reward_std": 0.24717658758163452, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3103451728820801, "rewards/tag_count_reward": 0.4375, "step": 415 }, { "completion_length": 1188.0, "epoch": 0.027733333333333332, "grad_norm": 22.912418365478516, "kl": 1.203125, "learning_rate": 1e-06, "loss": 0.0481, "reward": 1.0479087829589844, "reward_std": 0.39824777841567993, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.21771611273288727, "rewards/tag_count_reward": 0.953125, "step": 416 }, { "completion_length": 459.5, "epoch": 0.0278, "grad_norm": 88.90909576416016, "kl": 3.765625, "learning_rate": 1e-06, "loss": 0.1509, "reward": 0.6286050081253052, "reward_std": 0.3392166495323181, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1682700514793396, "rewards/tag_count_reward": 0.796875, "step": 417 }, { "completion_length": 2048.0, "epoch": 0.027866666666666668, "grad_norm": 17.91539192199707, "kl": 2.15625, "learning_rate": 1e-06, "loss": 0.0867, "reward": 0.3854790925979614, "reward_std": 0.2980719208717346, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.23952090740203857, "rewards/tag_count_reward": 0.625, "step": 418 }, { "completion_length": 1353.0, "epoch": 0.027933333333333334, "grad_norm": 19.329565048217773, "kl": 2.5625, "learning_rate": 1e-06, "loss": 0.102, "reward": 0.4859287738800049, "reward_std": 0.24047116935253143, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2640712261199951, "rewards/tag_count_reward": 0.75, "step": 419 }, { "completion_length": 680.5, "epoch": 0.028, "grad_norm": 29.351930618286133, "kl": 1.90625, "learning_rate": 1e-06, "loss": 0.0763, "reward": 1.1519155502319336, "reward_std": 0.2905888557434082, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.17620949447155, "rewards/tag_count_reward": 0.828125, "step": 420 }, { "completion_length": 477.0, "epoch": 0.028066666666666667, "grad_norm": 4.9814300537109375, "kl": 1.140625, "learning_rate": 1e-06, "loss": 0.0455, "reward": 1.3871073722839355, "reward_std": 0.49295300245285034, "rewards/accuracy_reward": 0.5625, "rewards/len_reward": -0.14414256811141968, "rewards/tag_count_reward": 0.96875, "step": 421 }, { "completion_length": 691.0, "epoch": 0.028133333333333333, "grad_norm": 23.486236572265625, "kl": 2.421875, "learning_rate": 1e-06, "loss": 0.097, "reward": 1.2017005681991577, "reward_std": 0.33617621660232544, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.23579944670200348, "rewards/tag_count_reward": 0.9375, "step": 422 }, { "completion_length": 397.5, "epoch": 0.0282, "grad_norm": 172.2124786376953, "kl": 0.9765625, "learning_rate": 1e-06, "loss": 0.039, "reward": 1.824448823928833, "reward_std": 0.5625327229499817, "rewards/accuracy_reward": 0.8125, "rewards/len_reward": 0.01194883044809103, "rewards/tag_count_reward": 1.0, "step": 423 }, { "completion_length": 1932.5, "epoch": 0.028266666666666666, "grad_norm": 70.06598663330078, "kl": 6.9375, "learning_rate": 1e-06, "loss": 0.2758, "reward": 0.1855810135602951, "reward_std": 0.3005276918411255, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2675439715385437, "rewards/tag_count_reward": 0.453125, "step": 424 }, { "completion_length": 1108.0, "epoch": 0.028333333333333332, "grad_norm": 241.70701599121094, "kl": 1.4609375, "learning_rate": 1e-06, "loss": 0.0585, "reward": 0.6527888774871826, "reward_std": 0.2796846628189087, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.19096116721630096, "rewards/tag_count_reward": 0.84375, "step": 425 }, { "completion_length": 1694.0, "epoch": 0.0284, "grad_norm": 38.58192825317383, "kl": 3.796875, "learning_rate": 1e-06, "loss": 0.1519, "reward": 0.3635767698287964, "reward_std": 0.33601176738739014, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1832982301712036, "rewards/tag_count_reward": 0.546875, "step": 426 }, { "completion_length": 729.0, "epoch": 0.028466666666666668, "grad_norm": 5.525942325592041, "kl": 3.484375, "learning_rate": 1e-06, "loss": 0.1392, "reward": 0.8202959299087524, "reward_std": 0.5015732645988464, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.19532907009124756, "rewards/tag_count_reward": 0.828125, "step": 427 }, { "completion_length": 1260.5, "epoch": 0.028533333333333334, "grad_norm": 24.404516220092773, "kl": 2.84375, "learning_rate": 1e-06, "loss": 0.1135, "reward": 0.5966103076934814, "reward_std": 0.26570066809654236, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.15338970720767975, "rewards/tag_count_reward": 0.75, "step": 428 }, { "completion_length": 1728.0, "epoch": 0.0286, "grad_norm": 24.004438400268555, "kl": 4.5625, "learning_rate": 1e-06, "loss": 0.1825, "reward": 0.1035110354423523, "reward_std": 0.3564677834510803, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3496139645576477, "rewards/tag_count_reward": 0.453125, "step": 429 }, { "completion_length": 2012.0, "epoch": 0.028666666666666667, "grad_norm": 210.22830200195312, "kl": 2.96875, "learning_rate": 1e-06, "loss": 0.1191, "reward": 0.4417126774787903, "reward_std": 0.2711828649044037, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2145373374223709, "rewards/tag_count_reward": 0.65625, "step": 430 }, { "completion_length": 761.5, "epoch": 0.028733333333333333, "grad_norm": 177.37310791015625, "kl": 2.03125, "learning_rate": 1e-06, "loss": 0.0812, "reward": 1.385439395904541, "reward_std": 0.45246297121047974, "rewards/accuracy_reward": 0.5625, "rewards/len_reward": -0.17706066370010376, "rewards/tag_count_reward": 1.0, "step": 431 }, { "completion_length": 1500.0, "epoch": 0.0288, "grad_norm": 25.036670684814453, "kl": 3.0625, "learning_rate": 1e-06, "loss": 0.1224, "reward": 0.506252110004425, "reward_std": 0.25970926880836487, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.13437287509441376, "rewards/tag_count_reward": 0.640625, "step": 432 }, { "completion_length": 463.0, "epoch": 0.028866666666666665, "grad_norm": 98.45127868652344, "kl": 2.265625, "learning_rate": 1e-06, "loss": 0.0906, "reward": 0.9743916392326355, "reward_std": 0.5563583970069885, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.10373339056968689, "rewards/tag_count_reward": 0.953125, "step": 433 }, { "completion_length": 1468.5, "epoch": 0.028933333333333332, "grad_norm": 61.40692901611328, "kl": 5.15625, "learning_rate": 1e-06, "loss": 0.2059, "reward": 0.7104079127311707, "reward_std": 0.26031026244163513, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.22709208726882935, "rewards/tag_count_reward": 0.9375, "step": 434 }, { "completion_length": 597.5, "epoch": 0.029, "grad_norm": 120.39767456054688, "kl": 4.28125, "learning_rate": 1e-06, "loss": 0.1715, "reward": 1.5235540866851807, "reward_std": 0.5599320530891418, "rewards/accuracy_reward": 0.75, "rewards/len_reward": -0.17957091331481934, "rewards/tag_count_reward": 0.953125, "step": 435 }, { "completion_length": 1170.5, "epoch": 0.029066666666666668, "grad_norm": 59.792537689208984, "kl": 4.25, "learning_rate": 1e-06, "loss": 0.1692, "reward": 0.570354700088501, "reward_std": 0.3200645446777344, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.11714527010917664, "rewards/tag_count_reward": 0.6875, "step": 436 }, { "completion_length": 801.0, "epoch": 0.029133333333333334, "grad_norm": 85.89793395996094, "kl": 3.0625, "learning_rate": 1e-06, "loss": 0.1229, "reward": 0.7610937356948853, "reward_std": 0.19216570258140564, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.22328129410743713, "rewards/tag_count_reward": 0.984375, "step": 437 }, { "completion_length": 1052.5, "epoch": 0.0292, "grad_norm": 33.27427673339844, "kl": 2.84375, "learning_rate": 1e-06, "loss": 0.1139, "reward": 0.5006247162818909, "reward_std": 0.42141035199165344, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.2493753433227539, "rewards/tag_count_reward": 0.6875, "step": 438 }, { "completion_length": 1325.5, "epoch": 0.029266666666666667, "grad_norm": 124.21721649169922, "kl": 1.78125, "learning_rate": 1e-06, "loss": 0.0716, "reward": 0.8362706899642944, "reward_std": 0.4848806858062744, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.21060431003570557, "rewards/tag_count_reward": 0.859375, "step": 439 }, { "completion_length": 1308.0, "epoch": 0.029333333333333333, "grad_norm": 101.55350494384766, "kl": 1.0625, "learning_rate": 1e-06, "loss": 0.0427, "reward": 0.8429738879203796, "reward_std": 0.3363175094127655, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.21952611207962036, "rewards/tag_count_reward": 1.0, "step": 440 }, { "completion_length": 1663.0, "epoch": 0.0294, "grad_norm": 272.64434814453125, "kl": 1.2421875, "learning_rate": 1e-06, "loss": 0.0497, "reward": 0.5526822805404663, "reward_std": 0.23517288267612457, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2441927045583725, "rewards/tag_count_reward": 0.796875, "step": 441 }, { "completion_length": 376.5, "epoch": 0.029466666666666665, "grad_norm": 19.101346969604492, "kl": 0.85546875, "learning_rate": 1e-06, "loss": 0.0342, "reward": 1.2024941444396973, "reward_std": 0.47450199723243713, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.14125582575798035, "rewards/tag_count_reward": 0.96875, "step": 442 }, { "completion_length": 355.0, "epoch": 0.029533333333333335, "grad_norm": 39.646183013916016, "kl": 0.9453125, "learning_rate": 1e-06, "loss": 0.0377, "reward": 1.3451943397521973, "reward_std": 0.27902498841285706, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.1548057198524475, "rewards/tag_count_reward": 1.0, "step": 443 }, { "completion_length": 584.0, "epoch": 0.0296, "grad_norm": 42.00813293457031, "kl": 0.95703125, "learning_rate": 1e-06, "loss": 0.0383, "reward": 0.9564022421836853, "reward_std": 0.5531929731369019, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.1685977727174759, "rewards/tag_count_reward": 1.0, "step": 444 }, { "completion_length": 459.5, "epoch": 0.029666666666666668, "grad_norm": 143.62857055664062, "kl": 1.0078125, "learning_rate": 1e-06, "loss": 0.0403, "reward": 1.3355096578598022, "reward_std": 0.41100364923477173, "rewards/accuracy_reward": 0.5625, "rewards/len_reward": -0.13324040174484253, "rewards/tag_count_reward": 0.90625, "step": 445 }, { "completion_length": 1615.0, "epoch": 0.029733333333333334, "grad_norm": 39.87702941894531, "kl": 2.671875, "learning_rate": 1e-06, "loss": 0.1069, "reward": 0.5858045816421509, "reward_std": 0.3898976445198059, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.22669541835784912, "rewards/tag_count_reward": 0.6875, "step": 446 }, { "completion_length": 1805.0, "epoch": 0.0298, "grad_norm": 117.98473358154297, "kl": 2.515625, "learning_rate": 1e-06, "loss": 0.1008, "reward": 0.18895292282104492, "reward_std": 0.19049280881881714, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.23292209208011627, "rewards/tag_count_reward": 0.421875, "step": 447 }, { "completion_length": 1490.0, "epoch": 0.029866666666666666, "grad_norm": 73.97403717041016, "kl": 2.25, "learning_rate": 1e-06, "loss": 0.0899, "reward": 0.24734440445899963, "reward_std": 0.2963607907295227, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.25265559554100037, "rewards/tag_count_reward": 0.5, "step": 448 }, { "completion_length": 1220.0, "epoch": 0.029933333333333333, "grad_norm": 36.06205749511719, "kl": 3.1875, "learning_rate": 1e-06, "loss": 0.1276, "reward": 0.6867581009864807, "reward_std": 0.38040876388549805, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.1882418543100357, "rewards/tag_count_reward": 0.6875, "step": 449 }, { "completion_length": 2048.0, "epoch": 0.03, "grad_norm": 155.3662872314453, "kl": 5.3125, "learning_rate": 1e-06, "loss": 0.2113, "reward": 0.3176334500312805, "reward_std": 0.2072276473045349, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3542415499687195, "rewards/tag_count_reward": 0.671875, "step": 450 }, { "completion_length": 1347.0, "epoch": 0.030066666666666665, "grad_norm": 47.34263610839844, "kl": 2.46875, "learning_rate": 1e-06, "loss": 0.0993, "reward": 1.1042916774749756, "reward_std": 0.5063524842262268, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.31758344173431396, "rewards/tag_count_reward": 0.984375, "step": 451 }, { "completion_length": 1206.5, "epoch": 0.030133333333333335, "grad_norm": 45.91167068481445, "kl": 5.3125, "learning_rate": 1e-06, "loss": 0.2126, "reward": 0.4902770221233368, "reward_std": 0.2801367938518524, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3065979778766632, "rewards/tag_count_reward": 0.796875, "step": 452 }, { "completion_length": 2048.0, "epoch": 0.0302, "grad_norm": 31.02351188659668, "kl": 3.46875, "learning_rate": 1e-06, "loss": 0.1384, "reward": 0.15944889187812805, "reward_std": 0.33201614022254944, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.19992613792419434, "rewards/tag_count_reward": 0.359375, "step": 453 }, { "completion_length": 1568.0, "epoch": 0.030266666666666667, "grad_norm": 2.735738515853882, "kl": 1.296875, "learning_rate": 1e-06, "loss": 0.0517, "reward": 0.6618618965148926, "reward_std": 0.2902847230434418, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.16626305878162384, "rewards/tag_count_reward": 0.828125, "step": 454 }, { "completion_length": 1286.0, "epoch": 0.030333333333333334, "grad_norm": 108.56804656982422, "kl": 1.828125, "learning_rate": 1e-06, "loss": 0.0731, "reward": 1.221577525138855, "reward_std": 0.56404048204422, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.24717243015766144, "rewards/tag_count_reward": 0.96875, "step": 455 }, { "completion_length": 1626.0, "epoch": 0.0304, "grad_norm": 260.0552978515625, "kl": 3.34375, "learning_rate": 1e-06, "loss": 0.134, "reward": 0.4680972397327423, "reward_std": 0.3092855215072632, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3287777900695801, "rewards/tag_count_reward": 0.796875, "step": 456 }, { "completion_length": 929.5, "epoch": 0.030466666666666666, "grad_norm": 7.954984188079834, "kl": 2.46875, "learning_rate": 1e-06, "loss": 0.099, "reward": 0.6224107146263123, "reward_std": 0.2342901974916458, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.26821425557136536, "rewards/tag_count_reward": 0.890625, "step": 457 }, { "completion_length": 680.5, "epoch": 0.030533333333333332, "grad_norm": 311.03179931640625, "kl": 1.46875, "learning_rate": 1e-06, "loss": 0.0587, "reward": 0.8654361963272095, "reward_std": 0.2987762689590454, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.18143880367279053, "rewards/tag_count_reward": 0.984375, "step": 458 }, { "completion_length": 2048.0, "epoch": 0.0306, "grad_norm": 364.86083984375, "kl": 2.15625, "learning_rate": 1e-06, "loss": 0.086, "reward": 0.6716626882553101, "reward_std": 0.31359678506851196, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.37521231174468994, "rewards/tag_count_reward": 0.984375, "step": 459 }, { "completion_length": 482.5, "epoch": 0.030666666666666665, "grad_norm": 203.19033813476562, "kl": 2.328125, "learning_rate": 1e-06, "loss": 0.0935, "reward": 1.6676654815673828, "reward_std": 0.43626925349235535, "rewards/accuracy_reward": 0.8125, "rewards/len_reward": -0.1448344886302948, "rewards/tag_count_reward": 1.0, "step": 460 }, { "completion_length": 369.5, "epoch": 0.030733333333333335, "grad_norm": 94.32376861572266, "kl": 4.5625, "learning_rate": 1e-06, "loss": 0.1816, "reward": 1.7893270254135132, "reward_std": 0.4871293008327484, "rewards/accuracy_reward": 0.875, "rewards/len_reward": -0.08567289263010025, "rewards/tag_count_reward": 1.0, "step": 461 }, { "completion_length": 899.5, "epoch": 0.0308, "grad_norm": 26.289045333862305, "kl": 2.0, "learning_rate": 1e-06, "loss": 0.0802, "reward": 0.5772469639778137, "reward_std": 0.33308613300323486, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.15712805092334747, "rewards/tag_count_reward": 0.734375, "step": 462 }, { "completion_length": 1619.5, "epoch": 0.030866666666666667, "grad_norm": 45.15757751464844, "kl": 1.765625, "learning_rate": 1e-06, "loss": 0.0708, "reward": 1.156102180480957, "reward_std": 0.2771090269088745, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.2657727897167206, "rewards/tag_count_reward": 0.984375, "step": 463 }, { "completion_length": 1230.5, "epoch": 0.030933333333333334, "grad_norm": 251.4700927734375, "kl": 19.125, "learning_rate": 1e-06, "loss": 0.764, "reward": 0.4844090938568115, "reward_std": 0.26258766651153564, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2655909061431885, "rewards/tag_count_reward": 0.75, "step": 464 }, { "completion_length": 1297.0, "epoch": 0.031, "grad_norm": 151.19776916503906, "kl": 11.375, "learning_rate": 1e-06, "loss": 0.4533, "reward": 0.826347291469574, "reward_std": 0.5434819459915161, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.07990271598100662, "rewards/tag_count_reward": 0.65625, "step": 465 }, { "completion_length": 453.5, "epoch": 0.031066666666666666, "grad_norm": 65.11819458007812, "kl": 5.0, "learning_rate": 1e-06, "loss": 0.2003, "reward": 0.833638072013855, "reward_std": 0.20467182993888855, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1507369577884674, "rewards/tag_count_reward": 0.984375, "step": 466 }, { "completion_length": 1464.5, "epoch": 0.031133333333333332, "grad_norm": 121.26884460449219, "kl": 8.5, "learning_rate": 1e-06, "loss": 0.3395, "reward": 0.6761493682861328, "reward_std": 0.1760469675064087, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2926006019115448, "rewards/tag_count_reward": 0.96875, "step": 467 }, { "completion_length": 1127.5, "epoch": 0.0312, "grad_norm": 77.94124603271484, "kl": 4.71875, "learning_rate": 1e-06, "loss": 0.1894, "reward": 1.0224368572235107, "reward_std": 0.4776157736778259, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.24318821728229523, "rewards/tag_count_reward": 0.828125, "step": 468 }, { "completion_length": 1644.0, "epoch": 0.031266666666666665, "grad_norm": 76.69334411621094, "kl": 3.75, "learning_rate": 1e-06, "loss": 0.1496, "reward": 0.41094398498535156, "reward_std": 0.30710700154304504, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.24530604481697083, "rewards/tag_count_reward": 0.65625, "step": 469 }, { "completion_length": 366.5, "epoch": 0.03133333333333333, "grad_norm": 38.32693862915039, "kl": 2.671875, "learning_rate": 1e-06, "loss": 0.1065, "reward": 1.4753447771072388, "reward_std": 0.39756450057029724, "rewards/accuracy_reward": 0.625, "rewards/len_reward": -0.14965522289276123, "rewards/tag_count_reward": 1.0, "step": 470 }, { "completion_length": 1699.5, "epoch": 0.0314, "grad_norm": 140.01515197753906, "kl": 0.2890625, "learning_rate": 1e-06, "loss": 0.0116, "reward": 0.8597678542137146, "reward_std": 0.2188059687614441, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1089821383357048, "rewards/tag_count_reward": 0.96875, "step": 471 }, { "completion_length": 483.5, "epoch": 0.031466666666666664, "grad_norm": 4.880711555480957, "kl": 2.078125, "learning_rate": 1e-06, "loss": 0.0831, "reward": 0.916806161403656, "reward_std": 0.40184321999549866, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.1925688534975052, "rewards/tag_count_reward": 0.984375, "step": 472 }, { "completion_length": 1589.0, "epoch": 0.03153333333333333, "grad_norm": 248.47195434570312, "kl": 1.765625, "learning_rate": 1e-06, "loss": 0.0704, "reward": 0.28684836626052856, "reward_std": 0.29903551936149597, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.30690163373947144, "rewards/tag_count_reward": 0.59375, "step": 473 }, { "completion_length": 951.0, "epoch": 0.0316, "grad_norm": 13.834942817687988, "kl": 0.259765625, "learning_rate": 1e-06, "loss": 0.0104, "reward": 1.1863715648651123, "reward_std": 0.26852947473526, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.2355034500360489, "rewards/tag_count_reward": 0.984375, "step": 474 }, { "completion_length": 1147.0, "epoch": 0.03166666666666667, "grad_norm": 36.051719665527344, "kl": 0.31640625, "learning_rate": 1e-06, "loss": 0.0127, "reward": 0.7529718279838562, "reward_std": 0.32631242275238037, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.2782781720161438, "rewards/tag_count_reward": 0.96875, "step": 475 }, { "completion_length": 2048.0, "epoch": 0.031733333333333336, "grad_norm": 15.62546443939209, "kl": 1.671875, "learning_rate": 1e-06, "loss": 0.0671, "reward": 0.7044357061386108, "reward_std": 0.39865821599960327, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.35806435346603394, "rewards/tag_count_reward": 0.9375, "step": 476 }, { "completion_length": 1054.5, "epoch": 0.0318, "grad_norm": 27.04443359375, "kl": 0.55078125, "learning_rate": 1e-06, "loss": 0.0221, "reward": 0.7698061466217041, "reward_std": 0.17838923633098602, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.2301938831806183, "rewards/tag_count_reward": 0.9375, "step": 477 }, { "completion_length": 1930.0, "epoch": 0.03186666666666667, "grad_norm": 16.326013565063477, "kl": 1.484375, "learning_rate": 1e-06, "loss": 0.0593, "reward": 0.614019513130188, "reward_std": 0.34445932507514954, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.3391055464744568, "rewards/tag_count_reward": 0.890625, "step": 478 }, { "completion_length": 1036.5, "epoch": 0.031933333333333334, "grad_norm": 23.876251220703125, "kl": 0.369140625, "learning_rate": 1e-06, "loss": 0.0147, "reward": 0.9702253341674805, "reward_std": 0.34339791536331177, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.06102466583251953, "rewards/tag_count_reward": 0.96875, "step": 479 }, { "completion_length": 653.5, "epoch": 0.032, "grad_norm": 29.604717254638672, "kl": 2.3125, "learning_rate": 1e-06, "loss": 0.0923, "reward": 1.1755359172821045, "reward_std": 0.43715834617614746, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.2619641125202179, "rewards/tag_count_reward": 1.0, "step": 480 }, { "completion_length": 1473.0, "epoch": 0.03206666666666667, "grad_norm": 65.69683074951172, "kl": 3.71875, "learning_rate": 1e-06, "loss": 0.1485, "reward": 0.2577321529388428, "reward_std": 0.24535515904426575, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2735178470611572, "rewards/tag_count_reward": 0.53125, "step": 481 }, { "completion_length": 1509.0, "epoch": 0.03213333333333333, "grad_norm": 67.13792419433594, "kl": 2.71875, "learning_rate": 1e-06, "loss": 0.108, "reward": 0.3775395154953003, "reward_std": 0.2782382071018219, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2943354845046997, "rewards/tag_count_reward": 0.671875, "step": 482 }, { "completion_length": 1161.0, "epoch": 0.0322, "grad_norm": 162.6373291015625, "kl": 2.78125, "learning_rate": 1e-06, "loss": 0.1118, "reward": 0.9634556174278259, "reward_std": 0.40102633833885193, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.23966939747333527, "rewards/tag_count_reward": 0.765625, "step": 483 }, { "completion_length": 951.0, "epoch": 0.032266666666666666, "grad_norm": 36.65139389038086, "kl": 1.0546875, "learning_rate": 1e-06, "loss": 0.0422, "reward": 1.1727631092071533, "reward_std": 0.47382399439811707, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.12411190569400787, "rewards/tag_count_reward": 0.859375, "step": 484 }, { "completion_length": 1401.0, "epoch": 0.03233333333333333, "grad_norm": 54.291259765625, "kl": 0.439453125, "learning_rate": 1e-06, "loss": 0.0176, "reward": 0.5725162625312805, "reward_std": 0.21343138813972473, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3493587374687195, "rewards/tag_count_reward": 0.921875, "step": 485 }, { "completion_length": 496.5, "epoch": 0.0324, "grad_norm": 63.599273681640625, "kl": 0.8671875, "learning_rate": 1e-06, "loss": 0.0345, "reward": 1.1292247772216797, "reward_std": 0.4646027684211731, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.1364002227783203, "rewards/tag_count_reward": 0.953125, "step": 486 }, { "completion_length": 1243.5, "epoch": 0.032466666666666665, "grad_norm": 116.7339096069336, "kl": 0.73828125, "learning_rate": 1e-06, "loss": 0.0296, "reward": 0.8797893524169922, "reward_std": 0.483071506023407, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.24521055817604065, "rewards/tag_count_reward": 0.9375, "step": 487 }, { "completion_length": 446.0, "epoch": 0.03253333333333333, "grad_norm": 35.85505676269531, "kl": 0.2001953125, "learning_rate": 1e-06, "loss": 0.008, "reward": 1.3932549953460693, "reward_std": 0.26571404933929443, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.10674503445625305, "rewards/tag_count_reward": 1.0, "step": 488 }, { "completion_length": 1034.5, "epoch": 0.0326, "grad_norm": 81.21532440185547, "kl": 1.453125, "learning_rate": 1e-06, "loss": 0.0584, "reward": 1.1355910301208496, "reward_std": 0.5194476246833801, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.1612839698791504, "rewards/tag_count_reward": 0.921875, "step": 489 }, { "completion_length": 994.0, "epoch": 0.03266666666666666, "grad_norm": 22.129722595214844, "kl": 0.6484375, "learning_rate": 1e-06, "loss": 0.0259, "reward": 1.3354647159576416, "reward_std": 0.4170573949813843, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.03953530639410019, "rewards/tag_count_reward": 0.9375, "step": 490 }, { "completion_length": 854.0, "epoch": 0.032733333333333337, "grad_norm": 27.246200561523438, "kl": 2.53125, "learning_rate": 1e-06, "loss": 0.101, "reward": 1.1956677436828613, "reward_std": 0.24232608079910278, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.30433231592178345, "rewards/tag_count_reward": 1.0, "step": 491 }, { "completion_length": 1069.0, "epoch": 0.0328, "grad_norm": 38.06834030151367, "kl": 2.171875, "learning_rate": 1e-06, "loss": 0.087, "reward": 0.6666276454925537, "reward_std": 0.1747668981552124, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2552473545074463, "rewards/tag_count_reward": 0.921875, "step": 492 }, { "completion_length": 939.0, "epoch": 0.03286666666666667, "grad_norm": 46.978397369384766, "kl": 5.34375, "learning_rate": 1e-06, "loss": 0.2134, "reward": 0.9970148205757141, "reward_std": 0.41218793392181396, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.2529851794242859, "rewards/tag_count_reward": 1.0, "step": 493 }, { "completion_length": 837.5, "epoch": 0.032933333333333335, "grad_norm": 16.7191219329834, "kl": 1.734375, "learning_rate": 1e-06, "loss": 0.0697, "reward": 1.2853515148162842, "reward_std": 0.30677878856658936, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.1677735447883606, "rewards/tag_count_reward": 0.953125, "step": 494 }, { "completion_length": 880.5, "epoch": 0.033, "grad_norm": 103.61772155761719, "kl": 5.96875, "learning_rate": 1e-06, "loss": 0.2386, "reward": 0.5835074186325073, "reward_std": 0.4315335750579834, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.2289925515651703, "rewards/tag_count_reward": 0.75, "step": 495 }, { "completion_length": 1742.5, "epoch": 0.03306666666666667, "grad_norm": 152.2026824951172, "kl": 11.0, "learning_rate": 1e-06, "loss": 0.4405, "reward": 0.16734576225280762, "reward_std": 0.21883532404899597, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.3326542377471924, "rewards/tag_count_reward": 0.5, "step": 496 }, { "completion_length": 1249.0, "epoch": 0.033133333333333334, "grad_norm": 100.76626586914062, "kl": 3.21875, "learning_rate": 1e-06, "loss": 0.1287, "reward": 0.8781791925430298, "reward_std": 0.39289015531539917, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.3249457776546478, "rewards/tag_count_reward": 0.953125, "step": 497 }, { "completion_length": 1309.0, "epoch": 0.0332, "grad_norm": 23.7191162109375, "kl": 2.65625, "learning_rate": 1e-06, "loss": 0.106, "reward": 0.6000075936317444, "reward_std": 0.27661073207855225, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.21249239146709442, "rewards/tag_count_reward": 0.8125, "step": 498 }, { "completion_length": 872.5, "epoch": 0.03326666666666667, "grad_norm": 205.3312225341797, "kl": 3.890625, "learning_rate": 1e-06, "loss": 0.1562, "reward": 0.5276904702186584, "reward_std": 0.20058870315551758, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.22230952978134155, "rewards/tag_count_reward": 0.75, "step": 499 }, { "completion_length": 1393.5, "epoch": 0.03333333333333333, "grad_norm": 34.567054748535156, "kl": 5.0, "learning_rate": 1e-06, "loss": 0.1995, "reward": 0.3974929451942444, "reward_std": 0.34111130237579346, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.16500705480575562, "rewards/tag_count_reward": 0.5625, "step": 500 }, { "completion_length": 915.5, "epoch": 0.0334, "grad_norm": 68.95124053955078, "kl": 6.6875, "learning_rate": 1e-06, "loss": 0.2672, "reward": 0.48827648162841797, "reward_std": 0.2557675242424011, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.23047351837158203, "rewards/tag_count_reward": 0.71875, "step": 501 }, { "completion_length": 1179.0, "epoch": 0.033466666666666665, "grad_norm": 91.65396118164062, "kl": 4.4375, "learning_rate": 1e-06, "loss": 0.1769, "reward": 1.28905189037323, "reward_std": 0.4957786798477173, "rewards/accuracy_reward": 0.5625, "rewards/len_reward": -0.19532307982444763, "rewards/tag_count_reward": 0.921875, "step": 502 }, { "completion_length": 406.0, "epoch": 0.03353333333333333, "grad_norm": 34.242103576660156, "kl": 2.84375, "learning_rate": 1e-06, "loss": 0.1132, "reward": 1.4901211261749268, "reward_std": 0.4295300543308258, "rewards/accuracy_reward": 0.625, "rewards/len_reward": -0.11925392597913742, "rewards/tag_count_reward": 0.984375, "step": 503 }, { "completion_length": 1280.0, "epoch": 0.0336, "grad_norm": 70.93849182128906, "kl": 1.4296875, "learning_rate": 1e-06, "loss": 0.0571, "reward": 0.742384135723114, "reward_std": 0.19301137328147888, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.17949089407920837, "rewards/tag_count_reward": 0.921875, "step": 504 }, { "completion_length": 1247.5, "epoch": 0.033666666666666664, "grad_norm": 30.341981887817383, "kl": 1.625, "learning_rate": 1e-06, "loss": 0.065, "reward": 0.9364212155342102, "reward_std": 0.376028835773468, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.17295382916927338, "rewards/tag_count_reward": 0.671875, "step": 505 }, { "completion_length": 505.0, "epoch": 0.03373333333333333, "grad_norm": 40.31377029418945, "kl": 2.109375, "learning_rate": 1e-06, "loss": 0.0841, "reward": 0.802386999130249, "reward_std": 0.32473599910736084, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.1976129710674286, "rewards/tag_count_reward": 0.9375, "step": 506 }, { "completion_length": 574.5, "epoch": 0.0338, "grad_norm": 43.77798080444336, "kl": 0.5625, "learning_rate": 1e-06, "loss": 0.0225, "reward": 0.8121465444564819, "reward_std": 0.19152596592903137, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.14097848534584045, "rewards/tag_count_reward": 0.953125, "step": 507 }, { "completion_length": 1466.0, "epoch": 0.03386666666666667, "grad_norm": 63.456520080566406, "kl": 2.1875, "learning_rate": 1e-06, "loss": 0.0877, "reward": 0.5162305235862732, "reward_std": 0.22059300541877747, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2962694764137268, "rewards/tag_count_reward": 0.8125, "step": 508 }, { "completion_length": 2048.0, "epoch": 0.033933333333333336, "grad_norm": 54.614505767822266, "kl": 1.96875, "learning_rate": 1e-06, "loss": 0.079, "reward": 0.4008627235889435, "reward_std": 0.2848988473415375, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2710123062133789, "rewards/tag_count_reward": 0.671875, "step": 509 }, { "completion_length": 1095.0, "epoch": 0.034, "grad_norm": 41.41899490356445, "kl": 1.0703125, "learning_rate": 1e-06, "loss": 0.0429, "reward": 0.7443307042121887, "reward_std": 0.28202205896377563, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.13066931068897247, "rewards/tag_count_reward": 0.875, "step": 510 }, { "completion_length": 502.0, "epoch": 0.03406666666666667, "grad_norm": 166.49993896484375, "kl": 1.015625, "learning_rate": 1e-06, "loss": 0.0406, "reward": 1.0923429727554321, "reward_std": 0.49536144733428955, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.2045319825410843, "rewards/tag_count_reward": 0.984375, "step": 511 }, { "completion_length": 2048.0, "epoch": 0.034133333333333335, "grad_norm": 97.36649322509766, "kl": 3.15625, "learning_rate": 1e-06, "loss": 0.1261, "reward": 0.47508713603019714, "reward_std": 0.2021721601486206, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.27491286396980286, "rewards/tag_count_reward": 0.75, "step": 512 }, { "completion_length": 1942.5, "epoch": 0.0342, "grad_norm": 43.3392448425293, "kl": 3.890625, "learning_rate": 1e-06, "loss": 0.1556, "reward": 0.3828532099723816, "reward_std": 0.25550660490989685, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2421467900276184, "rewards/tag_count_reward": 0.625, "step": 513 }, { "completion_length": 806.0, "epoch": 0.03426666666666667, "grad_norm": 24.938037872314453, "kl": 5.375, "learning_rate": 1e-06, "loss": 0.2155, "reward": 0.39125797152519226, "reward_std": 0.3487168550491333, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.21811704337596893, "rewards/tag_count_reward": 0.609375, "step": 514 }, { "completion_length": 492.0, "epoch": 0.034333333333333334, "grad_norm": 234.27557373046875, "kl": 12.25, "learning_rate": 1e-06, "loss": 0.4907, "reward": 1.2218936681747437, "reward_std": 0.3230869174003601, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.16873131692409515, "rewards/tag_count_reward": 0.953125, "step": 515 }, { "completion_length": 1637.0, "epoch": 0.0344, "grad_norm": 41.388702392578125, "kl": 6.78125, "learning_rate": 1e-06, "loss": 0.2706, "reward": 0.5030000805854797, "reward_std": 0.25090712308883667, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.35637491941452026, "rewards/tag_count_reward": 0.859375, "step": 516 }, { "completion_length": 455.0, "epoch": 0.034466666666666666, "grad_norm": 106.74304962158203, "kl": 8.625, "learning_rate": 1e-06, "loss": 0.3451, "reward": 1.1737362146377563, "reward_std": 0.458330363035202, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.15438883006572723, "rewards/tag_count_reward": 0.890625, "step": 517 }, { "completion_length": 1055.0, "epoch": 0.03453333333333333, "grad_norm": 203.50892639160156, "kl": 8.125, "learning_rate": 1e-06, "loss": 0.3261, "reward": 0.9100844264030457, "reward_std": 0.40734437108039856, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.29304054379463196, "rewards/tag_count_reward": 0.890625, "step": 518 }, { "completion_length": 706.0, "epoch": 0.0346, "grad_norm": 75.58966064453125, "kl": 6.90625, "learning_rate": 1e-06, "loss": 0.2768, "reward": 0.48510849475860596, "reward_std": 0.19676175713539124, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.24926650524139404, "rewards/tag_count_reward": 0.734375, "step": 519 }, { "completion_length": 936.0, "epoch": 0.034666666666666665, "grad_norm": 32.32265853881836, "kl": 2.53125, "learning_rate": 1e-06, "loss": 0.101, "reward": 0.5524517893791199, "reward_std": 0.35804426670074463, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.18192319571971893, "rewards/tag_count_reward": 0.734375, "step": 520 }, { "completion_length": 612.0, "epoch": 0.03473333333333333, "grad_norm": 51.99385070800781, "kl": 2.3125, "learning_rate": 1e-06, "loss": 0.0924, "reward": 0.9991387724876404, "reward_std": 0.6344587802886963, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.11023623496294022, "rewards/tag_count_reward": 0.921875, "step": 521 }, { "completion_length": 836.5, "epoch": 0.0348, "grad_norm": 54.89631271362305, "kl": 1.9921875, "learning_rate": 1e-06, "loss": 0.0796, "reward": 1.1614923477172852, "reward_std": 0.4174689054489136, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.22913259267807007, "rewards/tag_count_reward": 0.953125, "step": 522 }, { "completion_length": 946.5, "epoch": 0.034866666666666664, "grad_norm": 62.0288200378418, "kl": 2.03125, "learning_rate": 1e-06, "loss": 0.0809, "reward": 1.1289585828781128, "reward_std": 0.3178820013999939, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.08979140222072601, "rewards/tag_count_reward": 0.71875, "step": 523 }, { "completion_length": 598.0, "epoch": 0.03493333333333333, "grad_norm": 83.11853790283203, "kl": 1.609375, "learning_rate": 1e-06, "loss": 0.0646, "reward": 1.320982813835144, "reward_std": 0.3107944130897522, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.17901712656021118, "rewards/tag_count_reward": 1.0, "step": 524 }, { "completion_length": 1150.5, "epoch": 0.035, "grad_norm": 141.64254760742188, "kl": 1.71875, "learning_rate": 1e-06, "loss": 0.0685, "reward": 0.6263207793235779, "reward_std": 0.19915956258773804, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2018042504787445, "rewards/tag_count_reward": 0.828125, "step": 525 }, { "completion_length": 643.0, "epoch": 0.03506666666666667, "grad_norm": 52.120609283447266, "kl": 2.3125, "learning_rate": 1e-06, "loss": 0.093, "reward": 0.5838699340820312, "reward_std": 0.2685253620147705, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.22863011062145233, "rewards/tag_count_reward": 0.8125, "step": 526 }, { "completion_length": 534.0, "epoch": 0.035133333333333336, "grad_norm": 8086.248046875, "kl": 106.5, "learning_rate": 1e-06, "loss": 4.2705, "reward": 0.9970443248748779, "reward_std": 0.3563997149467468, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.23733067512512207, "rewards/tag_count_reward": 0.984375, "step": 527 }, { "completion_length": 1125.0, "epoch": 0.0352, "grad_norm": 86.96722412109375, "kl": 2.15625, "learning_rate": 1e-06, "loss": 0.0864, "reward": 0.6056922674179077, "reward_std": 0.2982494533061981, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.23805776238441467, "rewards/tag_count_reward": 0.84375, "step": 528 }, { "completion_length": 1553.5, "epoch": 0.03526666666666667, "grad_norm": 44.041141510009766, "kl": 3.65625, "learning_rate": 1e-06, "loss": 0.1466, "reward": 0.6474529504776001, "reward_std": 0.37525323033332825, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.2587970197200775, "rewards/tag_count_reward": 0.84375, "step": 529 }, { "completion_length": 1848.5, "epoch": 0.035333333333333335, "grad_norm": 147.4516143798828, "kl": 6.25, "learning_rate": 1e-06, "loss": 0.2493, "reward": 0.2758605480194092, "reward_std": 0.26490387320518494, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.31788942217826843, "rewards/tag_count_reward": 0.59375, "step": 530 }, { "completion_length": 1511.5, "epoch": 0.0354, "grad_norm": 93.3603744506836, "kl": 8.125, "learning_rate": 1e-06, "loss": 0.3259, "reward": 0.455405056476593, "reward_std": 0.23336824774742126, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.357094943523407, "rewards/tag_count_reward": 0.8125, "step": 531 }, { "completion_length": 1739.5, "epoch": 0.03546666666666667, "grad_norm": 202.96273803710938, "kl": 4.25, "learning_rate": 1e-06, "loss": 0.1705, "reward": 0.9460640549659729, "reward_std": 0.34083792567253113, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.2414359152317047, "rewards/tag_count_reward": 0.875, "step": 532 }, { "completion_length": 2048.0, "epoch": 0.03553333333333333, "grad_norm": 100.5988998413086, "kl": 3.28125, "learning_rate": 1e-06, "loss": 0.1312, "reward": 0.28255584836006165, "reward_std": 0.268098920583725, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.20181915163993835, "rewards/tag_count_reward": 0.484375, "step": 533 }, { "completion_length": 594.0, "epoch": 0.0356, "grad_norm": 59.22998046875, "kl": 4.5625, "learning_rate": 1e-06, "loss": 0.1823, "reward": 0.9103326797485352, "reward_std": 0.38058894872665405, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.27716729044914246, "rewards/tag_count_reward": 1.0, "step": 534 }, { "completion_length": 424.5, "epoch": 0.035666666666666666, "grad_norm": 41.5837516784668, "kl": 0.8046875, "learning_rate": 1e-06, "loss": 0.0322, "reward": 1.3207809925079346, "reward_std": 0.5327502489089966, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.08546902239322662, "rewards/tag_count_reward": 0.90625, "step": 535 }, { "completion_length": 1182.5, "epoch": 0.03573333333333333, "grad_norm": 82.11896514892578, "kl": 3.3125, "learning_rate": 1e-06, "loss": 0.1324, "reward": 0.8846542239189148, "reward_std": 0.6816003918647766, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.2247207760810852, "rewards/tag_count_reward": 0.921875, "step": 536 }, { "completion_length": 626.0, "epoch": 0.0358, "grad_norm": 77.67766571044922, "kl": 1.484375, "learning_rate": 1e-06, "loss": 0.0593, "reward": 1.4307773113250732, "reward_std": 0.6569211483001709, "rewards/accuracy_reward": 0.5625, "rewards/len_reward": -0.13172270357608795, "rewards/tag_count_reward": 1.0, "step": 537 }, { "completion_length": 438.0, "epoch": 0.035866666666666665, "grad_norm": 41.48287582397461, "kl": 0.89453125, "learning_rate": 1e-06, "loss": 0.0357, "reward": 1.1362364292144775, "reward_std": 0.4892275929450989, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.14501355588436127, "rewards/tag_count_reward": 0.96875, "step": 538 }, { "completion_length": 670.0, "epoch": 0.03593333333333333, "grad_norm": 58.74188232421875, "kl": 1.328125, "learning_rate": 1e-06, "loss": 0.0533, "reward": 0.8157967329025269, "reward_std": 0.4505951404571533, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.18420328199863434, "rewards/tag_count_reward": 0.875, "step": 539 }, { "completion_length": 459.5, "epoch": 0.036, "grad_norm": 114.96290588378906, "kl": 1.484375, "learning_rate": 1e-06, "loss": 0.0591, "reward": 1.6360636949539185, "reward_std": 0.5903237462043762, "rewards/accuracy_reward": 0.75, "rewards/len_reward": -0.11393626034259796, "rewards/tag_count_reward": 1.0, "step": 540 }, { "completion_length": 372.5, "epoch": 0.036066666666666664, "grad_norm": 79.44792175292969, "kl": 1.5078125, "learning_rate": 1e-06, "loss": 0.0602, "reward": 0.9838788509368896, "reward_std": 0.5077656507492065, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.14112117886543274, "rewards/tag_count_reward": 0.875, "step": 541 }, { "completion_length": 493.0, "epoch": 0.03613333333333334, "grad_norm": 98.48267364501953, "kl": 1.21875, "learning_rate": 1e-06, "loss": 0.0489, "reward": 1.4731286764144897, "reward_std": 0.6060855388641357, "rewards/accuracy_reward": 0.5625, "rewards/len_reward": -0.042496293783187866, "rewards/tag_count_reward": 0.953125, "step": 542 }, { "completion_length": 808.5, "epoch": 0.0362, "grad_norm": 187.82749938964844, "kl": 1.0, "learning_rate": 1e-06, "loss": 0.04, "reward": 1.1587413549423218, "reward_std": 0.3639298379421234, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.09125865995883942, "rewards/tag_count_reward": 0.8125, "step": 543 }, { "completion_length": 1173.0, "epoch": 0.03626666666666667, "grad_norm": 268.3917236328125, "kl": 2.03125, "learning_rate": 1e-06, "loss": 0.081, "reward": 1.0823649168014526, "reward_std": 0.4572429060935974, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.19888514280319214, "rewards/tag_count_reward": 0.84375, "step": 544 }, { "completion_length": 1165.0, "epoch": 0.036333333333333336, "grad_norm": 211.13243103027344, "kl": 1.2421875, "learning_rate": 1e-06, "loss": 0.0497, "reward": 0.5632283687591553, "reward_std": 0.22577616572380066, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.17114657163619995, "rewards/tag_count_reward": 0.734375, "step": 545 }, { "completion_length": 729.0, "epoch": 0.0364, "grad_norm": 7.84573221206665, "kl": 1.2265625, "learning_rate": 1e-06, "loss": 0.0489, "reward": 0.7691013216972351, "reward_std": 0.23391905426979065, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.2465236932039261, "rewards/tag_count_reward": 0.953125, "step": 546 }, { "completion_length": 756.5, "epoch": 0.03646666666666667, "grad_norm": 35.259178161621094, "kl": 2.15625, "learning_rate": 1e-06, "loss": 0.0864, "reward": 0.5018031597137451, "reward_std": 0.24412024021148682, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2169468104839325, "rewards/tag_count_reward": 0.71875, "step": 547 }, { "completion_length": 1541.5, "epoch": 0.036533333333333334, "grad_norm": 33.61385726928711, "kl": 1.8828125, "learning_rate": 1e-06, "loss": 0.075, "reward": 0.5826287269592285, "reward_std": 0.4832291007041931, "rewards/accuracy_reward": 0.1875, "rewards/len_reward": -0.1829962581396103, "rewards/tag_count_reward": 0.578125, "step": 548 }, { "completion_length": 416.5, "epoch": 0.0366, "grad_norm": 414.3907470703125, "kl": 1.1953125, "learning_rate": 1e-06, "loss": 0.0479, "reward": 1.6081717014312744, "reward_std": 0.761642575263977, "rewards/accuracy_reward": 0.6875, "rewards/len_reward": -0.04807830601930618, "rewards/tag_count_reward": 0.96875, "step": 549 }, { "completion_length": 1300.5, "epoch": 0.03666666666666667, "grad_norm": 67.41233825683594, "kl": 2.734375, "learning_rate": 1e-06, "loss": 0.1091, "reward": 0.5682820677757263, "reward_std": 0.3351229429244995, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.19734293222427368, "rewards/tag_count_reward": 0.703125, "step": 550 }, { "completion_length": 459.0, "epoch": 0.03673333333333333, "grad_norm": 81.06475830078125, "kl": 10.0, "learning_rate": 1e-06, "loss": 0.4002, "reward": 0.8082788586616516, "reward_std": 0.38192644715309143, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.16047115623950958, "rewards/tag_count_reward": 0.90625, "step": 551 }, { "completion_length": 1919.0, "epoch": 0.0368, "grad_norm": 190.73329162597656, "kl": 6.375, "learning_rate": 1e-06, "loss": 0.2547, "reward": 0.4826861619949341, "reward_std": 0.422154039144516, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.3298138678073883, "rewards/tag_count_reward": 0.75, "step": 552 }, { "completion_length": 395.0, "epoch": 0.036866666666666666, "grad_norm": 25.296106338500977, "kl": 1.875, "learning_rate": 1e-06, "loss": 0.0747, "reward": 1.1998919248580933, "reward_std": 0.40744996070861816, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.14385807514190674, "rewards/tag_count_reward": 0.96875, "step": 553 }, { "completion_length": 1485.5, "epoch": 0.03693333333333333, "grad_norm": 41.97356033325195, "kl": 5.25, "learning_rate": 1e-06, "loss": 0.2101, "reward": 1.2462836503982544, "reward_std": 0.27198371291160583, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.2537163496017456, "rewards/tag_count_reward": 1.0, "step": 554 }, { "completion_length": 669.0, "epoch": 0.037, "grad_norm": 60.60013961791992, "kl": 5.4375, "learning_rate": 1e-06, "loss": 0.2175, "reward": 0.7133312225341797, "reward_std": 0.3600277304649353, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.22416874766349792, "rewards/tag_count_reward": 0.8125, "step": 555 }, { "completion_length": 866.0, "epoch": 0.037066666666666664, "grad_norm": 283.12860107421875, "kl": 1.25, "learning_rate": 1e-06, "loss": 0.05, "reward": 0.7453024387359619, "reward_std": 0.19285044074058533, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.22344759106636047, "rewards/tag_count_reward": 0.96875, "step": 556 }, { "completion_length": 835.0, "epoch": 0.03713333333333333, "grad_norm": 46.98485565185547, "kl": 4.125, "learning_rate": 1e-06, "loss": 0.1644, "reward": 0.6793530583381653, "reward_std": 0.28530818223953247, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.21127192676067352, "rewards/tag_count_reward": 0.890625, "step": 557 }, { "completion_length": 314.0, "epoch": 0.0372, "grad_norm": 79.10865783691406, "kl": 5.90625, "learning_rate": 1e-06, "loss": 0.2369, "reward": 1.6027872562408447, "reward_std": 0.4462547302246094, "rewards/accuracy_reward": 0.75, "rewards/len_reward": -0.1472126841545105, "rewards/tag_count_reward": 1.0, "step": 558 }, { "completion_length": 630.5, "epoch": 0.03726666666666666, "grad_norm": 62.99226760864258, "kl": 8.1875, "learning_rate": 1e-06, "loss": 0.3277, "reward": 0.7975714206695557, "reward_std": 0.3944025933742523, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.26492857933044434, "rewards/tag_count_reward": 1.0, "step": 559 }, { "completion_length": 1226.5, "epoch": 0.037333333333333336, "grad_norm": 61.31340026855469, "kl": 3.9375, "learning_rate": 1e-06, "loss": 0.1574, "reward": 0.5960041284561157, "reward_std": 0.20416629314422607, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.24774591624736786, "rewards/tag_count_reward": 0.84375, "step": 560 }, { "completion_length": 1597.5, "epoch": 0.0374, "grad_norm": 98.62857818603516, "kl": 1.3046875, "learning_rate": 1e-06, "loss": 0.0522, "reward": 0.6311689019203186, "reward_std": 0.23550403118133545, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.134456068277359, "rewards/tag_count_reward": 0.765625, "step": 561 }, { "completion_length": 500.0, "epoch": 0.03746666666666667, "grad_norm": 95.38880157470703, "kl": 1.234375, "learning_rate": 1e-06, "loss": 0.0495, "reward": 0.9405548572540283, "reward_std": 0.4175971746444702, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.12194512784481049, "rewards/tag_count_reward": 0.9375, "step": 562 }, { "completion_length": 594.0, "epoch": 0.037533333333333335, "grad_norm": 99.58438873291016, "kl": 1.1171875, "learning_rate": 1e-06, "loss": 0.0447, "reward": 0.7160016298294067, "reward_std": 0.16242673993110657, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.26837339997291565, "rewards/tag_count_reward": 0.984375, "step": 563 }, { "completion_length": 511.0, "epoch": 0.0376, "grad_norm": 72.11625671386719, "kl": 1.515625, "learning_rate": 1e-06, "loss": 0.0607, "reward": 0.7955929040908813, "reward_std": 0.3326777517795563, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.28253206610679626, "rewards/tag_count_reward": 0.953125, "step": 564 }, { "completion_length": 993.0, "epoch": 0.03766666666666667, "grad_norm": 36.98861312866211, "kl": 2.15625, "learning_rate": 1e-06, "loss": 0.086, "reward": 0.7062109708786011, "reward_std": 0.19238844513893127, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.29378896951675415, "rewards/tag_count_reward": 1.0, "step": 565 }, { "completion_length": 1724.0, "epoch": 0.037733333333333334, "grad_norm": 56.63056564331055, "kl": 1.078125, "learning_rate": 1e-06, "loss": 0.0432, "reward": 0.919497013092041, "reward_std": 0.3396682143211365, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.2836279571056366, "rewards/tag_count_reward": 0.953125, "step": 566 }, { "completion_length": 1594.5, "epoch": 0.0378, "grad_norm": 66.82796478271484, "kl": 1.7578125, "learning_rate": 1e-06, "loss": 0.0702, "reward": 0.5739055275917053, "reward_std": 0.25649651885032654, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.16046947240829468, "rewards/tag_count_reward": 0.734375, "step": 567 }, { "completion_length": 1549.0, "epoch": 0.037866666666666667, "grad_norm": 92.17615509033203, "kl": 1.7890625, "learning_rate": 1e-06, "loss": 0.0715, "reward": 0.6099597811698914, "reward_std": 0.2543640434741974, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.21816526353359222, "rewards/tag_count_reward": 0.828125, "step": 568 }, { "completion_length": 1274.5, "epoch": 0.03793333333333333, "grad_norm": 60.14912796020508, "kl": 1.2734375, "learning_rate": 1e-06, "loss": 0.051, "reward": 0.7341867685317993, "reward_std": 0.21600724756717682, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2033131867647171, "rewards/tag_count_reward": 0.9375, "step": 569 }, { "completion_length": 1292.5, "epoch": 0.038, "grad_norm": 19.547534942626953, "kl": 4.28125, "learning_rate": 1e-06, "loss": 0.1717, "reward": 0.8135941624641418, "reward_std": 0.40806543827056885, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.20203083753585815, "rewards/tag_count_reward": 0.953125, "step": 570 }, { "completion_length": 1476.5, "epoch": 0.038066666666666665, "grad_norm": 55.32059097290039, "kl": 2.03125, "learning_rate": 1e-06, "loss": 0.0817, "reward": 0.6646462678909302, "reward_std": 0.3652288317680359, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.3041037619113922, "rewards/tag_count_reward": 0.84375, "step": 571 }, { "completion_length": 1136.5, "epoch": 0.03813333333333333, "grad_norm": 76.0602798461914, "kl": 0.84765625, "learning_rate": 1e-06, "loss": 0.0339, "reward": 0.7735757827758789, "reward_std": 0.30608898401260376, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.2420491874217987, "rewards/tag_count_reward": 0.953125, "step": 572 }, { "completion_length": 1641.0, "epoch": 0.0382, "grad_norm": 51.28430938720703, "kl": 4.4375, "learning_rate": 1e-06, "loss": 0.178, "reward": 0.5504544973373413, "reward_std": 0.24401339888572693, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2776705026626587, "rewards/tag_count_reward": 0.828125, "step": 573 }, { "completion_length": 1398.0, "epoch": 0.038266666666666664, "grad_norm": 27.796009063720703, "kl": 2.34375, "learning_rate": 1e-06, "loss": 0.0941, "reward": 0.7129076719284058, "reward_std": 0.18738889694213867, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.27146732807159424, "rewards/tag_count_reward": 0.984375, "step": 574 }, { "completion_length": 1515.0, "epoch": 0.03833333333333333, "grad_norm": 16.124732971191406, "kl": 1.359375, "learning_rate": 1e-06, "loss": 0.0546, "reward": 0.7237770557403564, "reward_std": 0.2166043221950531, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.19809791445732117, "rewards/tag_count_reward": 0.921875, "step": 575 }, { "completion_length": 1228.0, "epoch": 0.0384, "grad_norm": 83.39096069335938, "kl": 2.75, "learning_rate": 1e-06, "loss": 0.1097, "reward": 1.0372037887573242, "reward_std": 0.4406978487968445, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.2596711814403534, "rewards/tag_count_reward": 0.984375, "step": 576 }, { "completion_length": 1316.0, "epoch": 0.03846666666666667, "grad_norm": 43.13058853149414, "kl": 1.625, "learning_rate": 1e-06, "loss": 0.0653, "reward": 0.5740906000137329, "reward_std": 0.34053653478622437, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.3165343701839447, "rewards/tag_count_reward": 0.828125, "step": 577 }, { "completion_length": 1529.0, "epoch": 0.038533333333333336, "grad_norm": 36.661808013916016, "kl": 1.875, "learning_rate": 1e-06, "loss": 0.0752, "reward": 0.4301720857620239, "reward_std": 0.29682278633117676, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.14795291423797607, "rewards/tag_count_reward": 0.578125, "step": 578 }, { "completion_length": 1967.5, "epoch": 0.0386, "grad_norm": 72.27436065673828, "kl": 2.53125, "learning_rate": 1e-06, "loss": 0.1015, "reward": 0.5470969676971436, "reward_std": 0.3998749256134033, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.18727803230285645, "rewards/tag_count_reward": 0.671875, "step": 579 }, { "completion_length": 376.5, "epoch": 0.03866666666666667, "grad_norm": 40.38228988647461, "kl": 0.98828125, "learning_rate": 1e-06, "loss": 0.0396, "reward": 1.3723690509796143, "reward_std": 0.4802526831626892, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.12763085961341858, "rewards/tag_count_reward": 1.0, "step": 580 }, { "completion_length": 1755.5, "epoch": 0.038733333333333335, "grad_norm": 105.07771301269531, "kl": 4.25, "learning_rate": 1e-06, "loss": 0.1703, "reward": 0.4931444525718689, "reward_std": 0.33035823702812195, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.3037305474281311, "rewards/tag_count_reward": 0.734375, "step": 581 }, { "completion_length": 1228.0, "epoch": 0.0388, "grad_norm": 174.9635467529297, "kl": 2.78125, "learning_rate": 1e-06, "loss": 0.1113, "reward": 0.9252910614013672, "reward_std": 0.5327416062355042, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.26220887899398804, "rewards/tag_count_reward": 0.9375, "step": 582 }, { "completion_length": 1432.0, "epoch": 0.03886666666666667, "grad_norm": 66.7169418334961, "kl": 4.4375, "learning_rate": 1e-06, "loss": 0.1769, "reward": 0.6800612211227417, "reward_std": 0.22471702098846436, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.2886887788772583, "rewards/tag_count_reward": 0.96875, "step": 583 }, { "completion_length": 1371.5, "epoch": 0.038933333333333334, "grad_norm": 167.17843627929688, "kl": 1.3359375, "learning_rate": 1e-06, "loss": 0.0533, "reward": 0.8494129180908203, "reward_std": 0.1625041514635086, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1193370521068573, "rewards/tag_count_reward": 0.96875, "step": 584 }, { "completion_length": 791.5, "epoch": 0.039, "grad_norm": 50.34352493286133, "kl": 2.21875, "learning_rate": 1e-06, "loss": 0.0892, "reward": 0.8169752359390259, "reward_std": 0.2861425578594208, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.22989973425865173, "rewards/tag_count_reward": 0.984375, "step": 585 }, { "completion_length": 1327.0, "epoch": 0.039066666666666666, "grad_norm": 114.80569458007812, "kl": 1.3125, "learning_rate": 1e-06, "loss": 0.0525, "reward": 0.978561520576477, "reward_std": 0.35342609882354736, "rewards/accuracy_reward": 0.3125, "rewards/len_reward": -0.31831347942352295, "rewards/tag_count_reward": 0.984375, "step": 586 }, { "completion_length": 576.0, "epoch": 0.03913333333333333, "grad_norm": 31.026878356933594, "kl": 1.3125, "learning_rate": 1e-06, "loss": 0.0524, "reward": 0.7971070408821106, "reward_std": 0.31213486194610596, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.1716429740190506, "rewards/tag_count_reward": 0.90625, "step": 587 }, { "completion_length": 544.5, "epoch": 0.0392, "grad_norm": 72.87944793701172, "kl": 1.125, "learning_rate": 1e-06, "loss": 0.0449, "reward": 1.0887908935546875, "reward_std": 0.2934584617614746, "rewards/accuracy_reward": 0.4375, "rewards/len_reward": -0.1924591362476349, "rewards/tag_count_reward": 0.84375, "step": 588 }, { "completion_length": 1214.0, "epoch": 0.039266666666666665, "grad_norm": 26.02140235900879, "kl": 2.28125, "learning_rate": 1e-06, "loss": 0.0911, "reward": 1.1631091833114624, "reward_std": 0.526539146900177, "rewards/accuracy_reward": 0.375, "rewards/len_reward": -0.008765812031924725, "rewards/tag_count_reward": 0.796875, "step": 589 }, { "completion_length": 999.0, "epoch": 0.03933333333333333, "grad_norm": 101.99327850341797, "kl": 1.765625, "learning_rate": 1e-06, "loss": 0.0705, "reward": 0.5965220332145691, "reward_std": 0.23640874028205872, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.1847279667854309, "rewards/tag_count_reward": 0.78125, "step": 590 }, { "completion_length": 797.5, "epoch": 0.0394, "grad_norm": 129.76966857910156, "kl": 2.390625, "learning_rate": 1e-06, "loss": 0.0952, "reward": 1.2040002346038818, "reward_std": 0.31418269872665405, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.202249675989151, "rewards/tag_count_reward": 0.90625, "step": 591 }, { "completion_length": 692.5, "epoch": 0.039466666666666664, "grad_norm": 79.64732360839844, "kl": 1.8125, "learning_rate": 1e-06, "loss": 0.0726, "reward": 0.5536153316497803, "reward_std": 0.21468180418014526, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.14950962364673615, "rewards/tag_count_reward": 0.703125, "step": 592 }, { "completion_length": 1234.5, "epoch": 0.03953333333333333, "grad_norm": 217.17437744140625, "kl": 2.46875, "learning_rate": 1e-06, "loss": 0.0983, "reward": 0.7479342222213745, "reward_std": 0.18500711023807526, "rewards/accuracy_reward": 0.0, "rewards/len_reward": -0.22081580758094788, "rewards/tag_count_reward": 0.96875, "step": 593 }, { "completion_length": 1058.5, "epoch": 0.0396, "grad_norm": 24.286544799804688, "kl": 2.828125, "learning_rate": 1e-06, "loss": 0.1131, "reward": 0.7602176666259766, "reward_std": 0.3844819664955139, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.19290727376937866, "rewards/tag_count_reward": 0.890625, "step": 594 }, { "completion_length": 1172.5, "epoch": 0.03966666666666667, "grad_norm": 67.3233413696289, "kl": 6.34375, "learning_rate": 1e-06, "loss": 0.2544, "reward": 0.7233742475509644, "reward_std": 0.49401748180389404, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.13600073754787445, "rewards/tag_count_reward": 0.796875, "step": 595 }, { "completion_length": 259.0, "epoch": 0.039733333333333336, "grad_norm": 26.167888641357422, "kl": 1.3046875, "learning_rate": 1e-06, "loss": 0.0522, "reward": 1.4047433137893677, "reward_std": 0.27290037274360657, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.09525654464960098, "rewards/tag_count_reward": 1.0, "step": 596 }, { "completion_length": 1156.0, "epoch": 0.0398, "grad_norm": 68.4002685546875, "kl": 6.1875, "learning_rate": 1e-06, "loss": 0.248, "reward": 0.8152648210525513, "reward_std": 0.3991209864616394, "rewards/accuracy_reward": 0.125, "rewards/len_reward": -0.27848517894744873, "rewards/tag_count_reward": 0.96875, "step": 597 }, { "completion_length": 707.0, "epoch": 0.03986666666666667, "grad_norm": 75.64336395263672, "kl": 3.515625, "learning_rate": 1e-06, "loss": 0.1403, "reward": 0.9840830564498901, "reward_std": 0.4521138668060303, "rewards/accuracy_reward": 0.25, "rewards/len_reward": -0.2190418690443039, "rewards/tag_count_reward": 0.953125, "step": 598 }, { "completion_length": 1399.5, "epoch": 0.039933333333333335, "grad_norm": 49.26580047607422, "kl": 3.6875, "learning_rate": 1e-06, "loss": 0.1476, "reward": 0.6098353862762451, "reward_std": 0.25023001432418823, "rewards/accuracy_reward": 0.0625, "rewards/len_reward": -0.29641464352607727, "rewards/tag_count_reward": 0.84375, "step": 599 }, { "completion_length": 1176.0, "epoch": 0.04, "grad_norm": 48.72330093383789, "kl": 3.90625, "learning_rate": 1e-06, "loss": 0.1564, "reward": 1.256005048751831, "reward_std": 0.3112409710884094, "rewards/accuracy_reward": 0.5, "rewards/len_reward": -0.11899499595165253, "rewards/tag_count_reward": 0.875, "step": 600 }, { "epoch": 0.04, "step": 600, "total_flos": 0.0, "train_loss": 0.15968478212249465, "train_runtime": 15013.7279, "train_samples_per_second": 0.639, "train_steps_per_second": 0.04 } ], "logging_steps": 1, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }