| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 500, | |
| "global_step": 180, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 3512.2977294921875, | |
| "epoch": 0.027972027972027972, | |
| "grad_norm": 0.8593507409095764, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.0, | |
| "reward": 0.4687500149011612, | |
| "reward_std": 0.12072492484003305, | |
| "rewards/accuracy_reward": 0.08333333441987634, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3854166716337204, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 3691.6666870117188, | |
| "epoch": 0.055944055944055944, | |
| "grad_norm": 0.3791194260120392, | |
| "kl": 0.0, | |
| "learning_rate": 5.555555555555555e-07, | |
| "loss": 0.0, | |
| "reward": 0.4680059477686882, | |
| "reward_std": 0.1798743773251772, | |
| "rewards/accuracy_reward": 0.09226190531626344, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3757440596818924, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 3474.386962890625, | |
| "epoch": 0.08391608391608392, | |
| "grad_norm": 0.8631621599197388, | |
| "kl": 0.0010442733764648438, | |
| "learning_rate": 1.111111111111111e-06, | |
| "loss": 0.0, | |
| "reward": 0.5632440596818924, | |
| "reward_std": 0.14932114072144032, | |
| "rewards/accuracy_reward": 0.14880952634848654, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4144345298409462, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 3347.2440795898438, | |
| "epoch": 0.11188811188811189, | |
| "grad_norm": 0.5367617011070251, | |
| "kl": 0.0007762908935546875, | |
| "learning_rate": 1.6666666666666667e-06, | |
| "loss": 0.0, | |
| "reward": 0.5282738134264946, | |
| "reward_std": 0.10862548742443323, | |
| "rewards/accuracy_reward": 0.12202381063252687, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4062500149011612, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 3752.8154907226562, | |
| "epoch": 0.13986013986013987, | |
| "grad_norm": 8.54397964477539, | |
| "kl": 0.004771232604980469, | |
| "learning_rate": 2.222222222222222e-06, | |
| "loss": 0.0002, | |
| "reward": 0.500744067132473, | |
| "reward_std": 0.16472498513758183, | |
| "rewards/accuracy_reward": 0.11904762173071504, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3816964328289032, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 3802.7738647460938, | |
| "epoch": 0.16783216783216784, | |
| "grad_norm": 0.41414809226989746, | |
| "kl": 0.0012989044189453125, | |
| "learning_rate": 2.7777777777777783e-06, | |
| "loss": 0.0001, | |
| "reward": 0.4546131044626236, | |
| "reward_std": 0.16401701793074608, | |
| "rewards/accuracy_reward": 0.08333333674818277, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3712797686457634, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 3980.6995239257812, | |
| "epoch": 0.1958041958041958, | |
| "grad_norm": 0.267231285572052, | |
| "kl": 0.0023193359375, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 0.0001, | |
| "reward": 0.4702381044626236, | |
| "reward_std": 0.17164652980864048, | |
| "rewards/accuracy_reward": 0.10119047830812633, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3690476268529892, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 3679.449462890625, | |
| "epoch": 0.22377622377622378, | |
| "grad_norm": 0.41959381103515625, | |
| "kl": 0.00553131103515625, | |
| "learning_rate": 3.88888888888889e-06, | |
| "loss": 0.0002, | |
| "reward": 0.5014881119132042, | |
| "reward_std": 0.19226571172475815, | |
| "rewards/accuracy_reward": 0.0773809552192688, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4241071566939354, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 3754.1875610351562, | |
| "epoch": 0.2517482517482518, | |
| "grad_norm": 0.3485651910305023, | |
| "kl": 0.00905609130859375, | |
| "learning_rate": 4.444444444444444e-06, | |
| "loss": 0.0004, | |
| "reward": 0.5498512014746666, | |
| "reward_std": 0.20580468513071537, | |
| "rewards/accuracy_reward": 0.1547619104385376, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3950892984867096, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 3627.5863647460938, | |
| "epoch": 0.27972027972027974, | |
| "grad_norm": 0.3600512146949768, | |
| "kl": 0.01141357421875, | |
| "learning_rate": 5e-06, | |
| "loss": 0.0005, | |
| "reward": 0.6197916865348816, | |
| "reward_std": 0.26382705941796303, | |
| "rewards/accuracy_reward": 0.1696428656578064, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4501488134264946, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 3484.1519165039062, | |
| "epoch": 0.3076923076923077, | |
| "grad_norm": 0.506460964679718, | |
| "kl": 0.020751953125, | |
| "learning_rate": 5.555555555555557e-06, | |
| "loss": 0.0008, | |
| "reward": 0.607886902987957, | |
| "reward_std": 0.17454461008310318, | |
| "rewards/accuracy_reward": 0.13392857857979834, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.473958358168602, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 3566.0863647460938, | |
| "epoch": 0.3356643356643357, | |
| "grad_norm": 0.2500086724758148, | |
| "kl": 0.019134521484375, | |
| "learning_rate": 6.111111111111112e-06, | |
| "loss": 0.0008, | |
| "reward": 0.4992559626698494, | |
| "reward_std": 0.18833242915570736, | |
| "rewards/accuracy_reward": 0.08035714481957257, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4188988134264946, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 3528.726318359375, | |
| "epoch": 0.36363636363636365, | |
| "grad_norm": 0.9317561984062195, | |
| "kl": 0.03057861328125, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 0.0012, | |
| "reward": 0.7031250074505806, | |
| "reward_std": 0.20333649218082428, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3906250074505806, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 3629.4702758789062, | |
| "epoch": 0.3916083916083916, | |
| "grad_norm": 0.35670071840286255, | |
| "kl": 0.03338623046875, | |
| "learning_rate": 7.222222222222223e-06, | |
| "loss": 0.0013, | |
| "reward": 0.7373512238264084, | |
| "reward_std": 0.3180300444364548, | |
| "rewards/accuracy_reward": 0.3571428693830967, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3802083507180214, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 3948.136962890625, | |
| "epoch": 0.4195804195804196, | |
| "grad_norm": 0.27876967191696167, | |
| "kl": 0.03790283203125, | |
| "learning_rate": 7.77777777777778e-06, | |
| "loss": 0.0015, | |
| "reward": 0.5714285671710968, | |
| "reward_std": 0.18836580589413643, | |
| "rewards/accuracy_reward": 0.2440476268529892, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3273809552192688, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 3863.0506591796875, | |
| "epoch": 0.44755244755244755, | |
| "grad_norm": 0.3702605366706848, | |
| "kl": 0.04669189453125, | |
| "learning_rate": 8.333333333333334e-06, | |
| "loss": 0.0019, | |
| "reward": 0.6302083507180214, | |
| "reward_std": 0.2468692809343338, | |
| "rewards/accuracy_reward": 0.28273809887468815, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3474702537059784, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 3905.9791870117188, | |
| "epoch": 0.4755244755244755, | |
| "grad_norm": 0.37722447514533997, | |
| "kl": 0.0635986328125, | |
| "learning_rate": 8.888888888888888e-06, | |
| "loss": 0.0025, | |
| "reward": 0.5989583432674408, | |
| "reward_std": 0.23427820205688477, | |
| "rewards/accuracy_reward": 0.26190476957708597, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3370535746216774, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 3940.4078369140625, | |
| "epoch": 0.5034965034965035, | |
| "grad_norm": 0.1647387593984604, | |
| "kl": 0.06719970703125, | |
| "learning_rate": 9.444444444444445e-06, | |
| "loss": 0.0027, | |
| "reward": 0.5595238283276558, | |
| "reward_std": 0.1750249993056059, | |
| "rewards/accuracy_reward": 0.23214286216534674, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3273809552192688, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 3544.8065795898438, | |
| "epoch": 0.5314685314685315, | |
| "grad_norm": 0.22612161934375763, | |
| "kl": 0.0860595703125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0034, | |
| "reward": 0.7909226268529892, | |
| "reward_std": 0.1969960704445839, | |
| "rewards/accuracy_reward": 0.422619067132473, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3683035746216774, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 4051.9525146484375, | |
| "epoch": 0.5594405594405595, | |
| "grad_norm": 0.18135209381580353, | |
| "kl": 0.1053466796875, | |
| "learning_rate": 9.999153867018256e-06, | |
| "loss": 0.0042, | |
| "reward": 0.5111607164144516, | |
| "reward_std": 0.1593446908518672, | |
| "rewards/accuracy_reward": 0.1964285783469677, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3147321492433548, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 3969.1458129882812, | |
| "epoch": 0.5874125874125874, | |
| "grad_norm": 0.19610409438610077, | |
| "kl": 0.10791015625, | |
| "learning_rate": 9.996615786269036e-06, | |
| "loss": 0.0043, | |
| "reward": 0.5543154925107956, | |
| "reward_std": 0.2091393768787384, | |
| "rewards/accuracy_reward": 0.22619047947227955, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3281250074505806, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 4012.9880981445312, | |
| "epoch": 0.6153846153846154, | |
| "grad_norm": 0.1830030381679535, | |
| "kl": 0.119873046875, | |
| "learning_rate": 9.99238671222071e-06, | |
| "loss": 0.0048, | |
| "reward": 0.569940485060215, | |
| "reward_std": 0.1295820251107216, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.319940485060215, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 3812.71435546875, | |
| "epoch": 0.6433566433566433, | |
| "grad_norm": 0.15088728070259094, | |
| "kl": 0.1474609375, | |
| "learning_rate": 9.986468235255065e-06, | |
| "loss": 0.0059, | |
| "reward": 0.6748512089252472, | |
| "reward_std": 0.19087868556380272, | |
| "rewards/accuracy_reward": 0.3363095335662365, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3385416716337204, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 3547.1428833007812, | |
| "epoch": 0.6713286713286714, | |
| "grad_norm": 0.1391303539276123, | |
| "kl": 0.15625, | |
| "learning_rate": 9.978862581069247e-06, | |
| "loss": 0.0062, | |
| "reward": 0.7068452537059784, | |
| "reward_std": 0.16218139231204987, | |
| "rewards/accuracy_reward": 0.3511904813349247, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3556547611951828, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 3582.3483276367188, | |
| "epoch": 0.6993006993006993, | |
| "grad_norm": 0.16375645995140076, | |
| "kl": 0.17431640625, | |
| "learning_rate": 9.969572609838745e-06, | |
| "loss": 0.007, | |
| "reward": 0.7254464477300644, | |
| "reward_std": 0.2593443766236305, | |
| "rewards/accuracy_reward": 0.3571428656578064, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3683035746216774, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 3646.1101684570312, | |
| "epoch": 0.7272727272727273, | |
| "grad_norm": 0.1331067532300949, | |
| "kl": 0.19384765625, | |
| "learning_rate": 9.958601815141804e-06, | |
| "loss": 0.0078, | |
| "reward": 0.6919643133878708, | |
| "reward_std": 0.1708707083016634, | |
| "rewards/accuracy_reward": 0.345238097012043, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3467262014746666, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 3312.107177734375, | |
| "epoch": 0.7552447552447552, | |
| "grad_norm": 0.16016700863838196, | |
| "kl": 0.2138671875, | |
| "learning_rate": 9.945954322645643e-06, | |
| "loss": 0.0086, | |
| "reward": 0.7827381193637848, | |
| "reward_std": 0.14833381958305836, | |
| "rewards/accuracy_reward": 0.3779761902987957, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4047619104385376, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 3692.27978515625, | |
| "epoch": 0.7832167832167832, | |
| "grad_norm": 0.16688044369220734, | |
| "kl": 0.218017578125, | |
| "learning_rate": 9.931634888554937e-06, | |
| "loss": 0.0087, | |
| "reward": 0.494791679084301, | |
| "reward_std": 0.12484246864914894, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3489583432674408, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 3611.77392578125, | |
| "epoch": 0.8111888111888111, | |
| "grad_norm": 0.1440156251192093, | |
| "kl": 0.201416015625, | |
| "learning_rate": 9.915648897823232e-06, | |
| "loss": 0.0081, | |
| "reward": 0.6674107313156128, | |
| "reward_std": 0.19944895431399345, | |
| "rewards/accuracy_reward": 0.28273810259997845, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3846726343035698, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 3664.08935546875, | |
| "epoch": 0.8391608391608392, | |
| "grad_norm": 0.1561511754989624, | |
| "kl": 0.247314453125, | |
| "learning_rate": 9.89800236212786e-06, | |
| "loss": 0.0099, | |
| "reward": 0.5520833358168602, | |
| "reward_std": 0.15778076276183128, | |
| "rewards/accuracy_reward": 0.17857143469154835, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3735119104385376, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 3394.3392944335938, | |
| "epoch": 0.8671328671328671, | |
| "grad_norm": 0.885275661945343, | |
| "kl": 0.255126953125, | |
| "learning_rate": 9.878701917609208e-06, | |
| "loss": 0.0102, | |
| "reward": 0.6629464328289032, | |
| "reward_std": 0.24217551946640015, | |
| "rewards/accuracy_reward": 0.2708333395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.392113097012043, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 3195.0714721679688, | |
| "epoch": 0.8951048951048951, | |
| "grad_norm": 0.21199384331703186, | |
| "kl": 0.2421875, | |
| "learning_rate": 9.857754822375127e-06, | |
| "loss": 0.0097, | |
| "reward": 0.6875000149011612, | |
| "reward_std": 0.1642789300531149, | |
| "rewards/accuracy_reward": 0.2916666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3958333432674408, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 3266.6904907226562, | |
| "epoch": 0.9230769230769231, | |
| "grad_norm": 0.17293396592140198, | |
| "kl": 0.28955078125, | |
| "learning_rate": 9.835168953771463e-06, | |
| "loss": 0.0116, | |
| "reward": 0.7313988357782364, | |
| "reward_std": 0.19151451624929905, | |
| "rewards/accuracy_reward": 0.3363095298409462, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.395089291036129, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 2827.1220703125, | |
| "epoch": 0.951048951048951, | |
| "grad_norm": 0.17569643259048462, | |
| "kl": 0.285888671875, | |
| "learning_rate": 9.810952805419701e-06, | |
| "loss": 0.0114, | |
| "reward": 0.8117559850215912, | |
| "reward_std": 0.1866137869656086, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.43675597012043, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 3854.8453369140625, | |
| "epoch": 0.9790209790209791, | |
| "grad_norm": 0.20866483449935913, | |
| "kl": 0.32470703125, | |
| "learning_rate": 9.78511548402287e-06, | |
| "loss": 0.013, | |
| "reward": 0.4598214253783226, | |
| "reward_std": 0.1464038034901023, | |
| "rewards/accuracy_reward": 0.13095238478854299, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3288690596818924, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 4199.79833984375, | |
| "epoch": 1.0, | |
| "grad_norm": 0.20866483449935913, | |
| "kl": 0.3567708333333333, | |
| "learning_rate": 9.757666705940879e-06, | |
| "loss": 0.0107, | |
| "reward": 0.4285714427630107, | |
| "reward_std": 0.11506069699923198, | |
| "rewards/accuracy_reward": 0.11507936815420787, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3134920696417491, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 3277.7470703125, | |
| "epoch": 1.027972027972028, | |
| "grad_norm": 0.22707578539848328, | |
| "kl": 0.29443359375, | |
| "learning_rate": 9.728616793536588e-06, | |
| "loss": 0.0118, | |
| "reward": 0.4918154701590538, | |
| "reward_std": 0.12884833849966526, | |
| "rewards/accuracy_reward": 0.10714285774156451, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3846726343035698, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 3351.4137573242188, | |
| "epoch": 1.055944055944056, | |
| "grad_norm": 0.20524734258651733, | |
| "kl": 0.31640625, | |
| "learning_rate": 9.697976671294004e-06, | |
| "loss": 0.0127, | |
| "reward": 0.4799107238650322, | |
| "reward_std": 0.14404969848692417, | |
| "rewards/accuracy_reward": 0.09226190787740052, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3876488134264946, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 2960.2083740234375, | |
| "epoch": 1.083916083916084, | |
| "grad_norm": 0.2950814664363861, | |
| "kl": 0.6025390625, | |
| "learning_rate": 9.665757861710008e-06, | |
| "loss": 0.0241, | |
| "reward": 0.5669642984867096, | |
| "reward_std": 0.16924606263637543, | |
| "rewards/accuracy_reward": 0.13690476352348924, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4300595298409462, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 2929.4702758789062, | |
| "epoch": 1.1118881118881119, | |
| "grad_norm": 62.54094314575195, | |
| "kl": 0.326171875, | |
| "learning_rate": 9.631972480961235e-06, | |
| "loss": 0.0131, | |
| "reward": 0.5580357238650322, | |
| "reward_std": 0.12265495210886002, | |
| "rewards/accuracy_reward": 0.13392857275903225, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4241071566939354, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 3358.2232055664062, | |
| "epoch": 1.1398601398601398, | |
| "grad_norm": 0.316780686378479, | |
| "kl": 0.39453125, | |
| "learning_rate": 9.596633234347661e-06, | |
| "loss": 0.0158, | |
| "reward": 0.5625, | |
| "reward_std": 0.16730434074997902, | |
| "rewards/accuracy_reward": 0.16369048319756985, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3988095298409462, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 3222.4703369140625, | |
| "epoch": 1.167832167832168, | |
| "grad_norm": 24.193492889404297, | |
| "kl": 0.609375, | |
| "learning_rate": 9.55975341151467e-06, | |
| "loss": 0.0244, | |
| "reward": 0.5245535746216774, | |
| "reward_std": 0.1717843282967806, | |
| "rewards/accuracy_reward": 0.10714285867288709, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4174107238650322, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 3550.6250610351562, | |
| "epoch": 1.1958041958041958, | |
| "grad_norm": 0.47472667694091797, | |
| "kl": 0.44384765625, | |
| "learning_rate": 9.521346881455356e-06, | |
| "loss": 0.0177, | |
| "reward": 0.5491071492433548, | |
| "reward_std": 0.1706194244325161, | |
| "rewards/accuracy_reward": 0.15476190764456987, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3943452462553978, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 3420.3126220703125, | |
| "epoch": 1.2237762237762237, | |
| "grad_norm": 0.37584051489830017, | |
| "kl": 0.5224609375, | |
| "learning_rate": 9.48142808729496e-06, | |
| "loss": 0.0209, | |
| "reward": 0.5186012014746666, | |
| "reward_std": 0.1755489856004715, | |
| "rewards/accuracy_reward": 0.11904762405902147, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.399553582072258, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 3507.6400146484375, | |
| "epoch": 1.2517482517482517, | |
| "grad_norm": 0.49210748076438904, | |
| "kl": 0.66796875, | |
| "learning_rate": 9.44001204085941e-06, | |
| "loss": 0.0267, | |
| "reward": 0.5327381119132042, | |
| "reward_std": 0.20459075924009085, | |
| "rewards/accuracy_reward": 0.1488095298409462, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.383928582072258, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 3493.9286499023438, | |
| "epoch": 1.2797202797202798, | |
| "grad_norm": 1.0954947471618652, | |
| "kl": 0.8056640625, | |
| "learning_rate": 9.397114317029975e-06, | |
| "loss": 0.0322, | |
| "reward": 0.5386904776096344, | |
| "reward_std": 0.20434119179844856, | |
| "rewards/accuracy_reward": 0.15178571827709675, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3869047686457634, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 3490.360107421875, | |
| "epoch": 1.3076923076923077, | |
| "grad_norm": 141.8840789794922, | |
| "kl": 4.5302734375, | |
| "learning_rate": 9.3527510478862e-06, | |
| "loss": 0.1815, | |
| "reward": 0.4858631193637848, | |
| "reward_std": 0.1842249110341072, | |
| "rewards/accuracy_reward": 0.08333333604969084, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4025297686457634, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 3363.1935424804688, | |
| "epoch": 1.3356643356643356, | |
| "grad_norm": 1.6688289642333984, | |
| "kl": 1.64453125, | |
| "learning_rate": 9.306938916639285e-06, | |
| "loss": 0.0657, | |
| "reward": 0.5513392984867096, | |
| "reward_std": 0.23358215764164925, | |
| "rewards/accuracy_reward": 0.10119047714397311, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4501488134264946, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 2769.7887573242188, | |
| "epoch": 1.3636363636363638, | |
| "grad_norm": 0.9288725256919861, | |
| "kl": 1.892578125, | |
| "learning_rate": 9.259695151358215e-06, | |
| "loss": 0.0757, | |
| "reward": 0.7202381193637848, | |
| "reward_std": 0.28254037350416183, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.511904776096344, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 2399.5387573242188, | |
| "epoch": 1.3916083916083917, | |
| "grad_norm": 1.414353847503662, | |
| "kl": 1.958984375, | |
| "learning_rate": 9.211037518490981e-06, | |
| "loss": 0.0784, | |
| "reward": 0.909226194024086, | |
| "reward_std": 0.3613435998558998, | |
| "rewards/accuracy_reward": 0.3630952462553978, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.54613097012043, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 2236.949462890625, | |
| "epoch": 1.4195804195804196, | |
| "grad_norm": 2.3309292793273926, | |
| "kl": 1.921875, | |
| "learning_rate": 9.160984316183354e-06, | |
| "loss": 0.0769, | |
| "reward": 0.7492559850215912, | |
| "reward_std": 0.3186347261071205, | |
| "rewards/accuracy_reward": 0.16369047947227955, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5855654776096344, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 2093.497039794922, | |
| "epoch": 1.4475524475524475, | |
| "grad_norm": 6.4610066413879395, | |
| "kl": 1.353515625, | |
| "learning_rate": 9.109554367397699e-06, | |
| "loss": 0.0542, | |
| "reward": 0.8705357313156128, | |
| "reward_std": 0.3492353707551956, | |
| "rewards/accuracy_reward": 0.2976190485060215, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5729166567325592, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 1978.5565795898438, | |
| "epoch": 1.4755244755244754, | |
| "grad_norm": 1.0276720523834229, | |
| "kl": 1.486328125, | |
| "learning_rate": 9.056767012834417e-06, | |
| "loss": 0.0594, | |
| "reward": 0.85863097012043, | |
| "reward_std": 0.3453834652900696, | |
| "rewards/accuracy_reward": 0.2738095261156559, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5848214328289032, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 2361.6160888671875, | |
| "epoch": 1.5034965034965035, | |
| "grad_norm": 8.49375057220459, | |
| "kl": 1.091796875, | |
| "learning_rate": 9.00264210365872e-06, | |
| "loss": 0.0437, | |
| "reward": 0.864583358168602, | |
| "reward_std": 0.3236875534057617, | |
| "rewards/accuracy_reward": 0.24404762405902147, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6205357164144516, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 2253.2886962890625, | |
| "epoch": 1.5314685314685315, | |
| "grad_norm": 0.7754113674163818, | |
| "kl": 0.78515625, | |
| "learning_rate": 8.947199994035402e-06, | |
| "loss": 0.0314, | |
| "reward": 0.8921131044626236, | |
| "reward_std": 0.33837051689624786, | |
| "rewards/accuracy_reward": 0.2976190559566021, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5944940596818924, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 2794.6668090820312, | |
| "epoch": 1.5594405594405596, | |
| "grad_norm": 0.8983144760131836, | |
| "kl": 0.7841796875, | |
| "learning_rate": 8.890461533474473e-06, | |
| "loss": 0.0314, | |
| "reward": 0.8772321790456772, | |
| "reward_std": 0.35824262350797653, | |
| "rewards/accuracy_reward": 0.3065476268529892, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5706845372915268, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 3274.985107421875, | |
| "epoch": 1.5874125874125875, | |
| "grad_norm": 0.6588050723075867, | |
| "kl": 0.619140625, | |
| "learning_rate": 8.832448058990522e-06, | |
| "loss": 0.0248, | |
| "reward": 0.8474702537059784, | |
| "reward_std": 0.38370462507009506, | |
| "rewards/accuracy_reward": 0.3214285708963871, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5260416716337204, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 3612.96728515625, | |
| "epoch": 1.6153846153846154, | |
| "grad_norm": 0.5346247553825378, | |
| "kl": 0.8251953125, | |
| "learning_rate": 8.77318138707872e-06, | |
| "loss": 0.033, | |
| "reward": 0.7187500298023224, | |
| "reward_std": 0.35608696937561035, | |
| "rewards/accuracy_reward": 0.2767857201397419, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4419642984867096, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 3529.2113647460938, | |
| "epoch": 1.6433566433566433, | |
| "grad_norm": 0.8232976198196411, | |
| "kl": 0.904296875, | |
| "learning_rate": 8.712683805510547e-06, | |
| "loss": 0.0362, | |
| "reward": 0.8043155074119568, | |
| "reward_std": 0.3637358583509922, | |
| "rewards/accuracy_reward": 0.3839285857975483, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4203869104385376, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 3394.2500610351562, | |
| "epoch": 1.6713286713286712, | |
| "grad_norm": 0.833419919013977, | |
| "kl": 1.0947265625, | |
| "learning_rate": 8.650978064952259e-06, | |
| "loss": 0.0439, | |
| "reward": 0.7477678805589676, | |
| "reward_std": 0.29748640581965446, | |
| "rewards/accuracy_reward": 0.3214285783469677, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4263392984867096, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 3472.0595703125, | |
| "epoch": 1.6993006993006992, | |
| "grad_norm": 0.6742235422134399, | |
| "kl": 1.1708984375, | |
| "learning_rate": 8.588087370409303e-06, | |
| "loss": 0.0469, | |
| "reward": 0.767857164144516, | |
| "reward_std": 0.3432356268167496, | |
| "rewards/accuracy_reward": 0.3392857238650322, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4285714477300644, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 3589.4345703125, | |
| "epoch": 1.7272727272727273, | |
| "grad_norm": 0.6556262969970703, | |
| "kl": 1.3671875, | |
| "learning_rate": 8.524035372499851e-06, | |
| "loss": 0.0548, | |
| "reward": 0.742559552192688, | |
| "reward_std": 0.26402388140559196, | |
| "rewards/accuracy_reward": 0.3065476231276989, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4360119104385376, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 3276.6161499023438, | |
| "epoch": 1.7552447552447552, | |
| "grad_norm": 1.2825839519500732, | |
| "kl": 0.7578125, | |
| "learning_rate": 8.458846158560787e-06, | |
| "loss": 0.0303, | |
| "reward": 0.8199404776096344, | |
| "reward_std": 0.2816154323518276, | |
| "rewards/accuracy_reward": 0.3363095372915268, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.48363097012043, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 3791.9851684570312, | |
| "epoch": 1.7832167832167833, | |
| "grad_norm": 0.8932743072509766, | |
| "kl": 1.0615234375, | |
| "learning_rate": 8.392544243589428e-06, | |
| "loss": 0.0425, | |
| "reward": 0.5357142835855484, | |
| "reward_std": 0.19767598249018192, | |
| "rewards/accuracy_reward": 0.13988095708191395, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3958333432674408, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 3556.7501220703125, | |
| "epoch": 1.8111888111888113, | |
| "grad_norm": 0.4717884957790375, | |
| "kl": 0.6962890625, | |
| "learning_rate": 8.325154561024445e-06, | |
| "loss": 0.0278, | |
| "reward": 0.7752976417541504, | |
| "reward_std": 0.3101865127682686, | |
| "rewards/accuracy_reward": 0.3273809626698494, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.447916679084301, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 3541.2708740234375, | |
| "epoch": 1.8391608391608392, | |
| "grad_norm": 1.9377450942993164, | |
| "kl": 0.85009765625, | |
| "learning_rate": 8.256702453369413e-06, | |
| "loss": 0.034, | |
| "reward": 0.65550597012043, | |
| "reward_std": 0.28243885561823845, | |
| "rewards/accuracy_reward": 0.21726190485060215, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4382440596818924, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 3365.2470703125, | |
| "epoch": 1.867132867132867, | |
| "grad_norm": 8.12527847290039, | |
| "kl": 1.16015625, | |
| "learning_rate": 8.187213662662539e-06, | |
| "loss": 0.0464, | |
| "reward": 0.7477678805589676, | |
| "reward_std": 0.2514929175376892, | |
| "rewards/accuracy_reward": 0.2827381007373333, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.465029776096344, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 3002.2918090820312, | |
| "epoch": 1.895104895104895, | |
| "grad_norm": 1.0385046005249023, | |
| "kl": 0.8564453125, | |
| "learning_rate": 8.11671432079612e-06, | |
| "loss": 0.0342, | |
| "reward": 0.8169642984867096, | |
| "reward_std": 0.2804528884589672, | |
| "rewards/accuracy_reward": 0.2976190485060215, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5193452537059784, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 3085.89892578125, | |
| "epoch": 1.9230769230769231, | |
| "grad_norm": 0.8841953873634338, | |
| "kl": 1.638671875, | |
| "learning_rate": 8.045230939689425e-06, | |
| "loss": 0.0656, | |
| "reward": 0.7678571492433548, | |
| "reward_std": 0.2939384840428829, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.517857164144516, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 2596.6726684570312, | |
| "epoch": 1.951048951048951, | |
| "grad_norm": 1.013146996498108, | |
| "kl": 1.80859375, | |
| "learning_rate": 7.972790401318627e-06, | |
| "loss": 0.0725, | |
| "reward": 0.9523809850215912, | |
| "reward_std": 0.31720103323459625, | |
| "rewards/accuracy_reward": 0.3630952388048172, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5892857164144516, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 2973.0327758789062, | |
| "epoch": 1.9790209790209792, | |
| "grad_norm": 2.256330966949463, | |
| "kl": 2.23046875, | |
| "learning_rate": 7.899419947607611e-06, | |
| "loss": 0.0892, | |
| "reward": 0.7790178805589676, | |
| "reward_std": 0.3002399504184723, | |
| "rewards/accuracy_reward": 0.1964285708963871, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5825892835855484, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 3074.5263671875, | |
| "epoch": 2.0, | |
| "grad_norm": 4.588380336761475, | |
| "kl": 1.8671875, | |
| "learning_rate": 7.825147170183384e-06, | |
| "loss": 0.056, | |
| "reward": 0.6795635024706522, | |
| "reward_std": 0.2360253930091858, | |
| "rewards/accuracy_reward": 0.12698413183291754, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.552579383055369, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 2728.1428833007812, | |
| "epoch": 2.027972027972028, | |
| "grad_norm": 2.954068422317505, | |
| "kl": 1.978515625, | |
| "learning_rate": 7.75e-06, | |
| "loss": 0.0792, | |
| "reward": 0.6919642984867096, | |
| "reward_std": 0.30895066261291504, | |
| "rewards/accuracy_reward": 0.1250000004656613, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5669643133878708, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 2740.3035888671875, | |
| "epoch": 2.055944055944056, | |
| "grad_norm": 2.6688222885131836, | |
| "kl": 1.734375, | |
| "learning_rate": 7.674006696834874e-06, | |
| "loss": 0.0695, | |
| "reward": 0.7194940596818924, | |
| "reward_std": 0.2921901308000088, | |
| "rewards/accuracy_reward": 0.125, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5944940745830536, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 2415.52685546875, | |
| "epoch": 2.0839160839160837, | |
| "grad_norm": 0.6129941344261169, | |
| "kl": 3.77734375, | |
| "learning_rate": 7.597195838661426e-06, | |
| "loss": 0.1507, | |
| "reward": 0.8467262089252472, | |
| "reward_std": 0.33736108988523483, | |
| "rewards/accuracy_reward": 0.16666666930541396, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6800595372915268, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 2300.232208251953, | |
| "epoch": 2.111888111888112, | |
| "grad_norm": 77.38348388671875, | |
| "kl": 0.85107421875, | |
| "learning_rate": 7.519596310902081e-06, | |
| "loss": 0.0341, | |
| "reward": 0.802083358168602, | |
| "reward_std": 0.2800351716578007, | |
| "rewards/accuracy_reward": 0.11607143376022577, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6860119253396988, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 2576.9226684570312, | |
| "epoch": 2.13986013986014, | |
| "grad_norm": 2.116926908493042, | |
| "kl": 0.953125, | |
| "learning_rate": 7.441237295565642e-06, | |
| "loss": 0.038, | |
| "reward": 0.8288690596818924, | |
| "reward_std": 0.29788894951343536, | |
| "rewards/accuracy_reward": 0.16964286006987095, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.659226194024086, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 2475.0149536132812, | |
| "epoch": 2.167832167832168, | |
| "grad_norm": 12.25960922241211, | |
| "kl": 0.62890625, | |
| "learning_rate": 7.362148260273128e-06, | |
| "loss": 0.0251, | |
| "reward": 0.808779776096344, | |
| "reward_std": 0.3128742165863514, | |
| "rewards/accuracy_reward": 0.11011905036866665, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6986607164144516, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 2879.7262573242188, | |
| "epoch": 2.195804195804196, | |
| "grad_norm": 4.687532901763916, | |
| "kl": 0.728515625, | |
| "learning_rate": 7.282358947176207e-06, | |
| "loss": 0.0291, | |
| "reward": 0.8385416716337204, | |
| "reward_std": 0.3654126450419426, | |
| "rewards/accuracy_reward": 0.18154762219637632, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6569940596818924, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 2604.5892944335938, | |
| "epoch": 2.2237762237762237, | |
| "grad_norm": 412.8674011230469, | |
| "kl": 5.9697265625, | |
| "learning_rate": 7.201899361772392e-06, | |
| "loss": 0.238, | |
| "reward": 0.7812500149011612, | |
| "reward_std": 0.3256835453212261, | |
| "rewards/accuracy_reward": 0.1279761923942715, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6532738208770752, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 2611.827392578125, | |
| "epoch": 2.2517482517482517, | |
| "grad_norm": 2.032397985458374, | |
| "kl": 0.85546875, | |
| "learning_rate": 7.120799761621198e-06, | |
| "loss": 0.0342, | |
| "reward": 0.8177083432674408, | |
| "reward_std": 0.29930025711655617, | |
| "rewards/accuracy_reward": 0.1547619067132473, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6629464328289032, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 2559.6697387695312, | |
| "epoch": 2.2797202797202796, | |
| "grad_norm": 3.929804801940918, | |
| "kl": 0.849609375, | |
| "learning_rate": 7.0390906449655104e-06, | |
| "loss": 0.034, | |
| "reward": 0.8764881044626236, | |
| "reward_std": 0.36110614240169525, | |
| "rewards/accuracy_reward": 0.19642857648432255, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6800595223903656, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 2797.1458740234375, | |
| "epoch": 2.3076923076923075, | |
| "grad_norm": 2.691180944442749, | |
| "kl": 1.2119140625, | |
| "learning_rate": 6.956802739262446e-06, | |
| "loss": 0.0484, | |
| "reward": 0.778273805975914, | |
| "reward_std": 0.3232569172978401, | |
| "rewards/accuracy_reward": 0.13392857369035482, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6443452388048172, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 2735.65185546875, | |
| "epoch": 2.335664335664336, | |
| "grad_norm": 0.9807026386260986, | |
| "kl": 1.646484375, | |
| "learning_rate": 6.873966989628011e-06, | |
| "loss": 0.0659, | |
| "reward": 0.85788694024086, | |
| "reward_std": 0.3320453241467476, | |
| "rewards/accuracy_reward": 0.16071429220028222, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6971726268529892, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 2845.1012573242188, | |
| "epoch": 2.3636363636363638, | |
| "grad_norm": 1.2658172845840454, | |
| "kl": 2.30078125, | |
| "learning_rate": 6.790614547199908e-06, | |
| "loss": 0.092, | |
| "reward": 0.8273809850215912, | |
| "reward_std": 0.35115472227334976, | |
| "rewards/accuracy_reward": 0.2440476231276989, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5833333432674408, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 3053.264892578125, | |
| "epoch": 2.3916083916083917, | |
| "grad_norm": 2.7772088050842285, | |
| "kl": 2.6328125, | |
| "learning_rate": 6.7067767574228695e-06, | |
| "loss": 0.1054, | |
| "reward": 0.9248512089252472, | |
| "reward_std": 0.4560338780283928, | |
| "rewards/accuracy_reward": 0.351190485060215, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5736607313156128, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 3551.6250610351562, | |
| "epoch": 2.4195804195804196, | |
| "grad_norm": 2.6345677375793457, | |
| "kl": 3.49609375, | |
| "learning_rate": 6.622485148260916e-06, | |
| "loss": 0.14, | |
| "reward": 0.6949404925107956, | |
| "reward_std": 0.3689125031232834, | |
| "rewards/accuracy_reward": 0.1755952425301075, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5193452388048172, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 3338.6429443359375, | |
| "epoch": 2.4475524475524475, | |
| "grad_norm": 3.476290702819824, | |
| "kl": 2.296875, | |
| "learning_rate": 6.537771418340981e-06, | |
| "loss": 0.0918, | |
| "reward": 0.821428582072258, | |
| "reward_std": 0.3828240856528282, | |
| "rewards/accuracy_reward": 0.285714291036129, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5357142984867096, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 3541.5267944335938, | |
| "epoch": 2.4755244755244754, | |
| "grad_norm": 1.3453477621078491, | |
| "kl": 3.4296875, | |
| "learning_rate": 6.45266742503235e-06, | |
| "loss": 0.1373, | |
| "reward": 0.7574404925107956, | |
| "reward_std": 0.40839895606040955, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5282738208770752, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 3516.6727294921875, | |
| "epoch": 2.5034965034965033, | |
| "grad_norm": 21.825538635253906, | |
| "kl": 2.98828125, | |
| "learning_rate": 6.367205172466404e-06, | |
| "loss": 0.1196, | |
| "reward": 0.715029776096344, | |
| "reward_std": 0.36236244440078735, | |
| "rewards/accuracy_reward": 0.2053571422584355, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5096726343035698, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 3277.6250610351562, | |
| "epoch": 2.5314685314685317, | |
| "grad_norm": 38.884925842285156, | |
| "kl": 1.423828125, | |
| "learning_rate": 6.281416799501188e-06, | |
| "loss": 0.057, | |
| "reward": 0.8058035969734192, | |
| "reward_std": 0.3809823840856552, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5141369253396988, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 3536.5803833007812, | |
| "epoch": 2.5594405594405596, | |
| "grad_norm": 2.8481810092926025, | |
| "kl": 1.0390625, | |
| "learning_rate": 6.1953345676352835e-06, | |
| "loss": 0.0415, | |
| "reward": 0.8251488208770752, | |
| "reward_std": 0.4142955169081688, | |
| "rewards/accuracy_reward": 0.2946428656578064, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5305059403181076, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 3367.8631591796875, | |
| "epoch": 2.5874125874125875, | |
| "grad_norm": 2.616717576980591, | |
| "kl": 1.26953125, | |
| "learning_rate": 6.108990848875591e-06, | |
| "loss": 0.0509, | |
| "reward": 0.8214285969734192, | |
| "reward_std": 0.4105582758784294, | |
| "rewards/accuracy_reward": 0.2886904813349247, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5327381044626236, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 3568.5804443359375, | |
| "epoch": 2.6153846153846154, | |
| "grad_norm": 2.0899202823638916, | |
| "kl": 1.24609375, | |
| "learning_rate": 6.022418113563536e-06, | |
| "loss": 0.0499, | |
| "reward": 0.7953869253396988, | |
| "reward_std": 0.41642241925001144, | |
| "rewards/accuracy_reward": 0.3154761902987957, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4799107164144516, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 3176.202392578125, | |
| "epoch": 2.6433566433566433, | |
| "grad_norm": 2.2424473762512207, | |
| "kl": 1.669921875, | |
| "learning_rate": 5.935648918164308e-06, | |
| "loss": 0.0668, | |
| "reward": 0.8839285969734192, | |
| "reward_std": 0.3916260041296482, | |
| "rewards/accuracy_reward": 0.3958333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4880952462553978, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 2926.9583740234375, | |
| "epoch": 2.6713286713286712, | |
| "grad_norm": 1.989888072013855, | |
| "kl": 2.158203125, | |
| "learning_rate": 5.84871589302369e-06, | |
| "loss": 0.0862, | |
| "reward": 0.8571428805589676, | |
| "reward_std": 0.3645229861140251, | |
| "rewards/accuracy_reward": 0.3184523917734623, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.538690485060215, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 2869.0267944335938, | |
| "epoch": 2.699300699300699, | |
| "grad_norm": 1.3042261600494385, | |
| "kl": 2.421875, | |
| "learning_rate": 5.761651730097142e-06, | |
| "loss": 0.0969, | |
| "reward": 0.9077381044626236, | |
| "reward_std": 0.3927621468901634, | |
| "rewards/accuracy_reward": 0.3928571492433548, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.51488097012043, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 3120.1517944335938, | |
| "epoch": 2.7272727272727275, | |
| "grad_norm": 3.1568398475646973, | |
| "kl": 2.81640625, | |
| "learning_rate": 5.674489170655676e-06, | |
| "loss": 0.1126, | |
| "reward": 0.7596726268529892, | |
| "reward_std": 0.31667689979076385, | |
| "rewards/accuracy_reward": 0.2767857164144516, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4828869104385376, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 2494.9970092773438, | |
| "epoch": 2.755244755244755, | |
| "grad_norm": 1.9581549167633057, | |
| "kl": 1.568359375, | |
| "learning_rate": 5.58726099297321e-06, | |
| "loss": 0.0627, | |
| "reward": 0.887648805975914, | |
| "reward_std": 0.3121208921074867, | |
| "rewards/accuracy_reward": 0.321428582072258, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5662202388048172, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 3111.6607666015625, | |
| "epoch": 2.7832167832167833, | |
| "grad_norm": 0.9897369146347046, | |
| "kl": 2.19921875, | |
| "learning_rate": 5.500000000000001e-06, | |
| "loss": 0.0881, | |
| "reward": 0.646577388048172, | |
| "reward_std": 0.2645460404455662, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5007440596818924, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 2862.7828369140625, | |
| "epoch": 2.8111888111888113, | |
| "grad_norm": 1.539971113204956, | |
| "kl": 1.580078125, | |
| "learning_rate": 5.412739007026791e-06, | |
| "loss": 0.0632, | |
| "reward": 0.8995535969734192, | |
| "reward_std": 0.3328116163611412, | |
| "rewards/accuracy_reward": 0.3452381119132042, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5543155074119568, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 2887.4553833007812, | |
| "epoch": 2.839160839160839, | |
| "grad_norm": 1.2075374126434326, | |
| "kl": 1.572265625, | |
| "learning_rate": 5.325510829344325e-06, | |
| "loss": 0.0629, | |
| "reward": 0.71800597012043, | |
| "reward_std": 0.2983597405254841, | |
| "rewards/accuracy_reward": 0.19642857322469354, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5215773954987526, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 2699.0684814453125, | |
| "epoch": 2.867132867132867, | |
| "grad_norm": 1.2193324565887451, | |
| "kl": 1.591796875, | |
| "learning_rate": 5.23834826990286e-06, | |
| "loss": 0.0637, | |
| "reward": 0.8020833432674408, | |
| "reward_std": 0.29492413625121117, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5312500149011612, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 2352.889892578125, | |
| "epoch": 2.895104895104895, | |
| "grad_norm": 1.231808066368103, | |
| "kl": 1.10546875, | |
| "learning_rate": 5.151284106976312e-06, | |
| "loss": 0.0443, | |
| "reward": 0.9270833432674408, | |
| "reward_std": 0.31997836381196976, | |
| "rewards/accuracy_reward": 0.3154761977493763, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6116071343421936, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 2591.3423461914062, | |
| "epoch": 2.9230769230769234, | |
| "grad_norm": 1.6210328340530396, | |
| "kl": 1.580078125, | |
| "learning_rate": 5.064351081835695e-06, | |
| "loss": 0.0632, | |
| "reward": 0.8043154925107956, | |
| "reward_std": 0.30041009932756424, | |
| "rewards/accuracy_reward": 0.25000000558793545, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5543154925107956, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 2248.0178833007812, | |
| "epoch": 2.951048951048951, | |
| "grad_norm": 0.8436636328697205, | |
| "kl": 1.2607421875, | |
| "learning_rate": 4.9775818864364635e-06, | |
| "loss": 0.0504, | |
| "reward": 1.0156250298023224, | |
| "reward_std": 0.3139919266104698, | |
| "rewards/accuracy_reward": 0.3660714440047741, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.649553582072258, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 2945.767852783203, | |
| "epoch": 2.979020979020979, | |
| "grad_norm": 0.9325153231620789, | |
| "kl": 2.052734375, | |
| "learning_rate": 4.8910091511244115e-06, | |
| "loss": 0.0822, | |
| "reward": 0.712053582072258, | |
| "reward_std": 0.28182170167565346, | |
| "rewards/accuracy_reward": 0.1785714291036129, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5334821492433548, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 3243.7850748697915, | |
| "epoch": 3.0, | |
| "grad_norm": 2.5548951625823975, | |
| "kl": 1.9192708333333333, | |
| "learning_rate": 4.804665432364719e-06, | |
| "loss": 0.0577, | |
| "reward": 0.6726190447807312, | |
| "reward_std": 0.23654385904471079, | |
| "rewards/accuracy_reward": 0.14682540049155554, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5257936517397562, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 2749.8274536132812, | |
| "epoch": 3.027972027972028, | |
| "grad_norm": 2.549743890762329, | |
| "kl": 1.861328125, | |
| "learning_rate": 4.718583200498814e-06, | |
| "loss": 0.0745, | |
| "reward": 0.7068452537059784, | |
| "reward_std": 0.28479718416929245, | |
| "rewards/accuracy_reward": 0.11011904943734407, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5967262014746666, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 2722.1517944335938, | |
| "epoch": 3.055944055944056, | |
| "grad_norm": 2.3009603023529053, | |
| "kl": 1.884765625, | |
| "learning_rate": 4.632794827533597e-06, | |
| "loss": 0.0754, | |
| "reward": 0.746279776096344, | |
| "reward_std": 0.3024236336350441, | |
| "rewards/accuracy_reward": 0.11011904897168279, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6361607164144516, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 2548.0089111328125, | |
| "epoch": 3.0839160839160837, | |
| "grad_norm": 10.268058776855469, | |
| "kl": 1.5048828125, | |
| "learning_rate": 4.547332574967653e-06, | |
| "loss": 0.0602, | |
| "reward": 0.8355654925107956, | |
| "reward_std": 0.3237708546221256, | |
| "rewards/accuracy_reward": 0.16964285960420966, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6659226417541504, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 2413.6905212402344, | |
| "epoch": 3.111888111888112, | |
| "grad_norm": 13.962904930114746, | |
| "kl": 1.51953125, | |
| "learning_rate": 4.462228581659019e-06, | |
| "loss": 0.0608, | |
| "reward": 0.7760416865348816, | |
| "reward_std": 0.280382689088583, | |
| "rewards/accuracy_reward": 0.12202381156384945, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6540178805589676, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 2638.9613037109375, | |
| "epoch": 3.13986013986014, | |
| "grad_norm": 4.123325347900391, | |
| "kl": 1.47265625, | |
| "learning_rate": 4.377514851739085e-06, | |
| "loss": 0.059, | |
| "reward": 0.7924107313156128, | |
| "reward_std": 0.26885663345456123, | |
| "rewards/accuracy_reward": 0.17559524066746235, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6168154925107956, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 2640.5358276367188, | |
| "epoch": 3.167832167832168, | |
| "grad_norm": 2.064690351486206, | |
| "kl": 1.400390625, | |
| "learning_rate": 4.293223242577131e-06, | |
| "loss": 0.0561, | |
| "reward": 0.7767857313156128, | |
| "reward_std": 0.32507994771003723, | |
| "rewards/accuracy_reward": 0.1339285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.642857164144516, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 2925.0834350585938, | |
| "epoch": 3.195804195804196, | |
| "grad_norm": 59.9140739440918, | |
| "kl": 1.56640625, | |
| "learning_rate": 4.2093854528000955e-06, | |
| "loss": 0.0626, | |
| "reward": 0.786458358168602, | |
| "reward_std": 0.36089181154966354, | |
| "rewards/accuracy_reward": 0.1934523843228817, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5930059552192688, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 2769.46728515625, | |
| "epoch": 3.2237762237762237, | |
| "grad_norm": 68.28800201416016, | |
| "kl": 2.8828125, | |
| "learning_rate": 4.1260330103719915e-06, | |
| "loss": 0.1151, | |
| "reward": 0.745535746216774, | |
| "reward_std": 0.2985537722706795, | |
| "rewards/accuracy_reward": 0.12500000186264515, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6205357313156128, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 2711.261962890625, | |
| "epoch": 3.2517482517482517, | |
| "grad_norm": 2.611356258392334, | |
| "kl": 1.1708984375, | |
| "learning_rate": 4.043197260737556e-06, | |
| "loss": 0.0468, | |
| "reward": 0.8154762238264084, | |
| "reward_std": 0.33052273094654083, | |
| "rewards/accuracy_reward": 0.20833333767950535, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6071428656578064, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 2561.4881591796875, | |
| "epoch": 3.2797202797202796, | |
| "grad_norm": 1.7765567302703857, | |
| "kl": 1.232421875, | |
| "learning_rate": 3.960909355034491e-06, | |
| "loss": 0.0493, | |
| "reward": 0.8623512089252472, | |
| "reward_std": 0.3735603392124176, | |
| "rewards/accuracy_reward": 0.21130952425301075, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6510416865348816, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 2737.0208740234375, | |
| "epoch": 3.3076923076923075, | |
| "grad_norm": 1.2620360851287842, | |
| "kl": 1.474609375, | |
| "learning_rate": 3.8792002383788044e-06, | |
| "loss": 0.0589, | |
| "reward": 0.7953869253396988, | |
| "reward_std": 0.3045261651277542, | |
| "rewards/accuracy_reward": 0.14880952704697847, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6465774029493332, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 2570.681640625, | |
| "epoch": 3.335664335664336, | |
| "grad_norm": 1.4667586088180542, | |
| "kl": 1.34765625, | |
| "learning_rate": 3.7981006382276097e-06, | |
| "loss": 0.0539, | |
| "reward": 0.821428582072258, | |
| "reward_std": 0.2918845936655998, | |
| "rewards/accuracy_reward": 0.1428571459837258, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6785714477300644, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 2684.2351684570312, | |
| "epoch": 3.3636363636363638, | |
| "grad_norm": 1.5117037296295166, | |
| "kl": 1.5498046875, | |
| "learning_rate": 3.717641052823795e-06, | |
| "loss": 0.0619, | |
| "reward": 0.9568452537059784, | |
| "reward_std": 0.39537160843610764, | |
| "rewards/accuracy_reward": 0.3125000037252903, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6443452388048172, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 2584.9761962890625, | |
| "epoch": 3.3916083916083917, | |
| "grad_norm": 0.769509494304657, | |
| "kl": 1.505859375, | |
| "learning_rate": 3.6378517397268744e-06, | |
| "loss": 0.0603, | |
| "reward": 1.0952381193637848, | |
| "reward_std": 0.424266554415226, | |
| "rewards/accuracy_reward": 0.419642873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6755952537059784, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 3175.181640625, | |
| "epoch": 3.4195804195804196, | |
| "grad_norm": 1.1727639436721802, | |
| "kl": 2.921875, | |
| "learning_rate": 3.558762704434361e-06, | |
| "loss": 0.1169, | |
| "reward": 0.755952388048172, | |
| "reward_std": 0.332593098282814, | |
| "rewards/accuracy_reward": 0.19345238246023655, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5625, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 3093.1905517578125, | |
| "epoch": 3.4475524475524475, | |
| "grad_norm": 4.137202739715576, | |
| "kl": 2.53125, | |
| "learning_rate": 3.4804036890979207e-06, | |
| "loss": 0.1011, | |
| "reward": 0.9211309552192688, | |
| "reward_std": 0.4176478758454323, | |
| "rewards/accuracy_reward": 0.348214291036129, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5729166865348816, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 3159.46435546875, | |
| "epoch": 3.4755244755244754, | |
| "grad_norm": 3.5323171615600586, | |
| "kl": 3.4296875, | |
| "learning_rate": 3.402804161338577e-06, | |
| "loss": 0.1371, | |
| "reward": 0.8549107313156128, | |
| "reward_std": 0.3986319825053215, | |
| "rewards/accuracy_reward": 0.2827381007373333, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5721726268529892, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 3065.3007202148438, | |
| "epoch": 3.5034965034965033, | |
| "grad_norm": 5.417646408081055, | |
| "kl": 3.1171875, | |
| "learning_rate": 3.325993303165127e-06, | |
| "loss": 0.1246, | |
| "reward": 0.8028274029493332, | |
| "reward_std": 0.34128784388303757, | |
| "rewards/accuracy_reward": 0.20238095615059137, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6004464477300644, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 2942.6726684570312, | |
| "epoch": 3.5314685314685317, | |
| "grad_norm": 3.8368661403656006, | |
| "kl": 2.546875, | |
| "learning_rate": 3.2500000000000015e-06, | |
| "loss": 0.102, | |
| "reward": 0.9203869253396988, | |
| "reward_std": 0.4298209026455879, | |
| "rewards/accuracy_reward": 0.3452381044626236, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5751488357782364, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 3157.21435546875, | |
| "epoch": 3.5594405594405596, | |
| "grad_norm": 1.6504576206207275, | |
| "kl": 2.37109375, | |
| "learning_rate": 3.174852829816617e-06, | |
| "loss": 0.0949, | |
| "reward": 0.8816964477300644, | |
| "reward_std": 0.4139380306005478, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6108631044626236, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 3150.074462890625, | |
| "epoch": 3.5874125874125875, | |
| "grad_norm": 3.4168283939361572, | |
| "kl": 2.41796875, | |
| "learning_rate": 3.1005800523923906e-06, | |
| "loss": 0.0968, | |
| "reward": 0.9196428805589676, | |
| "reward_std": 0.41364390403032303, | |
| "rewards/accuracy_reward": 0.3422619104385376, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5773809403181076, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 3319.4673461914062, | |
| "epoch": 3.6153846153846154, | |
| "grad_norm": 0.8300291299819946, | |
| "kl": 1.931640625, | |
| "learning_rate": 3.027209598681373e-06, | |
| "loss": 0.0772, | |
| "reward": 0.8906250149011612, | |
| "reward_std": 0.4191039651632309, | |
| "rewards/accuracy_reward": 0.321428582072258, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5691964328289032, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 2872.2589721679688, | |
| "epoch": 3.6433566433566433, | |
| "grad_norm": 2.4745171070098877, | |
| "kl": 1.271484375, | |
| "learning_rate": 2.9547690603105774e-06, | |
| "loss": 0.0509, | |
| "reward": 1.0081845223903656, | |
| "reward_std": 0.4375082179903984, | |
| "rewards/accuracy_reward": 0.410714291036129, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5974702537059784, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 2652.6607666015625, | |
| "epoch": 3.6713286713286712, | |
| "grad_norm": 1.8032488822937012, | |
| "kl": 1.2646484375, | |
| "learning_rate": 2.88328567920388e-06, | |
| "loss": 0.0505, | |
| "reward": 0.9300595372915268, | |
| "reward_std": 0.36631206423044205, | |
| "rewards/accuracy_reward": 0.2648809552192688, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.665178582072258, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 2565.232177734375, | |
| "epoch": 3.699300699300699, | |
| "grad_norm": 2.1931300163269043, | |
| "kl": 1.310546875, | |
| "learning_rate": 2.8127863373374637e-06, | |
| "loss": 0.0524, | |
| "reward": 0.9873512238264084, | |
| "reward_std": 0.4016312509775162, | |
| "rewards/accuracy_reward": 0.3154762014746666, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6718750149011612, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 2977.1369018554688, | |
| "epoch": 3.7272727272727275, | |
| "grad_norm": 1.6613937616348267, | |
| "kl": 1.912109375, | |
| "learning_rate": 2.743297546630588e-06, | |
| "loss": 0.0766, | |
| "reward": 0.776785746216774, | |
| "reward_std": 0.32996587827801704, | |
| "rewards/accuracy_reward": 0.17857143096625805, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5982142984867096, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 2323.5059509277344, | |
| "epoch": 3.755244755244755, | |
| "grad_norm": 1.0777219533920288, | |
| "kl": 1.11328125, | |
| "learning_rate": 2.6748454389755576e-06, | |
| "loss": 0.0445, | |
| "reward": 0.9226190596818924, | |
| "reward_std": 0.3410933315753937, | |
| "rewards/accuracy_reward": 0.20833333395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.714285746216774, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 2958.886962890625, | |
| "epoch": 3.7832167832167833, | |
| "grad_norm": 13.500560760498047, | |
| "kl": 1.955078125, | |
| "learning_rate": 2.607455756410573e-06, | |
| "loss": 0.0782, | |
| "reward": 0.7656250149011612, | |
| "reward_std": 0.33549799770116806, | |
| "rewards/accuracy_reward": 0.15178571664728224, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6138393133878708, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 2686.9553833007812, | |
| "epoch": 3.8111888111888113, | |
| "grad_norm": 1.549809217453003, | |
| "kl": 1.130859375, | |
| "learning_rate": 2.5411538414392146e-06, | |
| "loss": 0.0452, | |
| "reward": 1.0505952537059784, | |
| "reward_std": 0.3903638646006584, | |
| "rewards/accuracy_reward": 0.3898809663951397, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6607143133878708, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 2769.0001220703125, | |
| "epoch": 3.839160839160839, | |
| "grad_norm": 11.73907470703125, | |
| "kl": 1.48828125, | |
| "learning_rate": 2.4759646275001494e-06, | |
| "loss": 0.0597, | |
| "reward": 0.840029776096344, | |
| "reward_std": 0.35654914379119873, | |
| "rewards/accuracy_reward": 0.17857143096625805, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6614583432674408, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 2654.2351684570312, | |
| "epoch": 3.867132867132867, | |
| "grad_norm": 0.9885497093200684, | |
| "kl": 1.779296875, | |
| "learning_rate": 2.4119126295906997e-06, | |
| "loss": 0.0713, | |
| "reward": 0.9441964626312256, | |
| "reward_std": 0.3702985644340515, | |
| "rewards/accuracy_reward": 0.2767857201397419, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6674107164144516, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 2571.5625, | |
| "epoch": 3.895104895104895, | |
| "grad_norm": 2.12560772895813, | |
| "kl": 1.7060546875, | |
| "learning_rate": 2.349021935047742e-06, | |
| "loss": 0.0682, | |
| "reward": 0.9784226417541504, | |
| "reward_std": 0.37573204189538956, | |
| "rewards/accuracy_reward": 0.2916666753590107, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.68675597012043, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 2644.9464721679688, | |
| "epoch": 3.9230769230769234, | |
| "grad_norm": 0.8384514451026917, | |
| "kl": 1.78125, | |
| "learning_rate": 2.2873161944894552e-06, | |
| "loss": 0.0713, | |
| "reward": 0.9345238357782364, | |
| "reward_std": 0.4023793339729309, | |
| "rewards/accuracy_reward": 0.2648809552192688, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6696428656578064, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 2405.4851684570312, | |
| "epoch": 3.951048951048951, | |
| "grad_norm": 4.06576681137085, | |
| "kl": 1.271484375, | |
| "learning_rate": 2.226818612921281e-06, | |
| "loss": 0.0509, | |
| "reward": 1.1086309850215912, | |
| "reward_std": 0.29693298786878586, | |
| "rewards/accuracy_reward": 0.3779761977493763, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7306547611951828, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 3141.5178833007812, | |
| "epoch": 3.979020979020979, | |
| "grad_norm": 1.1091569662094116, | |
| "kl": 2.236328125, | |
| "learning_rate": 2.1675519410094803e-06, | |
| "loss": 0.0895, | |
| "reward": 0.7700893133878708, | |
| "reward_std": 0.39849065244197845, | |
| "rewards/accuracy_reward": 0.1815476194024086, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.588541679084301, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 3389.0745442708335, | |
| "epoch": 4.0, | |
| "grad_norm": 1.6339079141616821, | |
| "kl": 2.0390625, | |
| "learning_rate": 2.109538466525527e-06, | |
| "loss": 0.0613, | |
| "reward": 0.6617063482602438, | |
| "reward_std": 0.23624500632286072, | |
| "rewards/accuracy_reward": 0.1388888955116272, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5228174726168314, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 3073.3660888671875, | |
| "epoch": 4.027972027972028, | |
| "grad_norm": 2.455653429031372, | |
| "kl": 2.716796875, | |
| "learning_rate": 2.0528000059646e-06, | |
| "loss": 0.1088, | |
| "reward": 0.7105654925107956, | |
| "reward_std": 0.3298322930932045, | |
| "rewards/accuracy_reward": 0.08630952564999461, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6242559552192688, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 2989.4732666015625, | |
| "epoch": 4.055944055944056, | |
| "grad_norm": 2.6637086868286133, | |
| "kl": 2.578125, | |
| "learning_rate": 1.99735789634128e-06, | |
| "loss": 0.1031, | |
| "reward": 0.8325893133878708, | |
| "reward_std": 0.34651779383420944, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6659226268529892, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 2764.9137573242188, | |
| "epoch": 4.083916083916084, | |
| "grad_norm": 1.2049380540847778, | |
| "kl": 1.8046875, | |
| "learning_rate": 1.9432329871655837e-06, | |
| "loss": 0.0721, | |
| "reward": 0.8392857313156128, | |
| "reward_std": 0.34320978820323944, | |
| "rewards/accuracy_reward": 0.1398809519596398, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.699404776096344, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 2765.4702758789062, | |
| "epoch": 4.111888111888112, | |
| "grad_norm": 1.449282169342041, | |
| "kl": 2.341796875, | |
| "learning_rate": 1.890445632602303e-06, | |
| "loss": 0.0937, | |
| "reward": 0.7827381044626236, | |
| "reward_std": 0.29919303208589554, | |
| "rewards/accuracy_reward": 0.08928571734577417, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.693452388048172, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 2920.1964721679688, | |
| "epoch": 4.13986013986014, | |
| "grad_norm": 1.0743755102157593, | |
| "kl": 2.1796875, | |
| "learning_rate": 1.8390156838166464e-06, | |
| "loss": 0.0873, | |
| "reward": 0.82738097012043, | |
| "reward_std": 0.3419215455651283, | |
| "rewards/accuracy_reward": 0.15773809887468815, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6696428656578064, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 2747.6815795898438, | |
| "epoch": 4.1678321678321675, | |
| "grad_norm": 14.367873191833496, | |
| "kl": 2.328125, | |
| "learning_rate": 1.78896248150902e-06, | |
| "loss": 0.0933, | |
| "reward": 0.8325892984867096, | |
| "reward_std": 0.31476327776908875, | |
| "rewards/accuracy_reward": 0.12797619309276342, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7046131193637848, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 3094.21435546875, | |
| "epoch": 4.195804195804196, | |
| "grad_norm": 2.8251848220825195, | |
| "kl": 1.671875, | |
| "learning_rate": 1.740304848641787e-06, | |
| "loss": 0.0668, | |
| "reward": 0.8266369253396988, | |
| "reward_std": 0.3867366462945938, | |
| "rewards/accuracy_reward": 0.18154762499034405, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6450892984867096, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 2937.919677734375, | |
| "epoch": 4.223776223776224, | |
| "grad_norm": 1.2695749998092651, | |
| "kl": 1.755859375, | |
| "learning_rate": 1.6930610833607152e-06, | |
| "loss": 0.0703, | |
| "reward": 0.8020833432674408, | |
| "reward_std": 0.36031080409884453, | |
| "rewards/accuracy_reward": 0.11011905036866665, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6919642984867096, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 2880.4524536132812, | |
| "epoch": 4.251748251748252, | |
| "grad_norm": 12.877209663391113, | |
| "kl": 1.720703125, | |
| "learning_rate": 1.6472489521138016e-06, | |
| "loss": 0.0687, | |
| "reward": 0.9352678656578064, | |
| "reward_std": 0.37138979136943817, | |
| "rewards/accuracy_reward": 0.21428572107106447, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.720982164144516, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 2600.854248046875, | |
| "epoch": 4.27972027972028, | |
| "grad_norm": 125.01750946044922, | |
| "kl": 2.51171875, | |
| "learning_rate": 1.602885682970026e-06, | |
| "loss": 0.1003, | |
| "reward": 1.026785746216774, | |
| "reward_std": 0.37672605365514755, | |
| "rewards/accuracy_reward": 0.2589285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7678571492433548, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 2646.982177734375, | |
| "epoch": 4.3076923076923075, | |
| "grad_norm": 1.7407286167144775, | |
| "kl": 1.2509765625, | |
| "learning_rate": 1.5599879591405917e-06, | |
| "loss": 0.0501, | |
| "reward": 0.9434524029493332, | |
| "reward_std": 0.33248691260814667, | |
| "rewards/accuracy_reward": 0.17559524066746235, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.767857164144516, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 2717.7470703125, | |
| "epoch": 4.335664335664336, | |
| "grad_norm": 1.5020253658294678, | |
| "kl": 1.34765625, | |
| "learning_rate": 1.5185719127050399e-06, | |
| "loss": 0.0539, | |
| "reward": 0.9166666865348816, | |
| "reward_std": 0.2963341251015663, | |
| "rewards/accuracy_reward": 0.15178571664728224, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7648809552192688, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 2733.7709350585938, | |
| "epoch": 4.363636363636363, | |
| "grad_norm": 3.6608200073242188, | |
| "kl": 1.498046875, | |
| "learning_rate": 1.4786531185446455e-06, | |
| "loss": 0.0599, | |
| "reward": 1.0535714328289032, | |
| "reward_std": 0.3990664631128311, | |
| "rewards/accuracy_reward": 0.3303571492433548, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7232142984867096, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 2560.15478515625, | |
| "epoch": 4.391608391608392, | |
| "grad_norm": 3.7004573345184326, | |
| "kl": 1.154296875, | |
| "learning_rate": 1.4402465884853304e-06, | |
| "loss": 0.0461, | |
| "reward": 1.277529776096344, | |
| "reward_std": 0.5079735741019249, | |
| "rewards/accuracy_reward": 0.4851190522313118, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7924107313156128, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 3273.9732666015625, | |
| "epoch": 4.41958041958042, | |
| "grad_norm": 1.2642496824264526, | |
| "kl": 2.13671875, | |
| "learning_rate": 1.4033667656523405e-06, | |
| "loss": 0.0855, | |
| "reward": 0.9412202686071396, | |
| "reward_std": 0.43186940997838974, | |
| "rewards/accuracy_reward": 0.2857142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.65550597012043, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 3008.824462890625, | |
| "epoch": 4.4475524475524475, | |
| "grad_norm": 1.0765700340270996, | |
| "kl": 1.83984375, | |
| "learning_rate": 1.3680275190387677e-06, | |
| "loss": 0.0736, | |
| "reward": 1.0900297611951828, | |
| "reward_std": 0.45845719426870346, | |
| "rewards/accuracy_reward": 0.4196428656578064, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6703869253396988, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 3204.6161499023438, | |
| "epoch": 4.475524475524476, | |
| "grad_norm": 1.5021779537200928, | |
| "kl": 2.759765625, | |
| "learning_rate": 1.3342421382899936e-06, | |
| "loss": 0.1104, | |
| "reward": 0.9203869104385376, | |
| "reward_std": 0.4819802939891815, | |
| "rewards/accuracy_reward": 0.3005952462553978, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6197916865348816, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 3017.0774536132812, | |
| "epoch": 4.503496503496503, | |
| "grad_norm": 2.5586483478546143, | |
| "kl": 2.5078125, | |
| "learning_rate": 1.3020233287059976e-06, | |
| "loss": 0.1005, | |
| "reward": 0.8846726268529892, | |
| "reward_std": 0.3981989845633507, | |
| "rewards/accuracy_reward": 0.2053571422584355, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6793154925107956, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 2873.4286499023438, | |
| "epoch": 4.531468531468532, | |
| "grad_norm": 2.847804546356201, | |
| "kl": 2.2890625, | |
| "learning_rate": 1.2713832064634127e-06, | |
| "loss": 0.0917, | |
| "reward": 1.0126488357782364, | |
| "reward_std": 0.4107717126607895, | |
| "rewards/accuracy_reward": 0.3392857201397419, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6733630895614624, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 3004.6726684570312, | |
| "epoch": 4.559440559440559, | |
| "grad_norm": 1.9864062070846558, | |
| "kl": 2.361328125, | |
| "learning_rate": 1.242333294059124e-06, | |
| "loss": 0.0945, | |
| "reward": 1.001488134264946, | |
| "reward_std": 0.47794152796268463, | |
| "rewards/accuracy_reward": 0.3095238134264946, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6919643133878708, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 3040.9702758789062, | |
| "epoch": 4.5874125874125875, | |
| "grad_norm": 4.236363410949707, | |
| "kl": 2.9609375, | |
| "learning_rate": 1.2148845159771311e-06, | |
| "loss": 0.1186, | |
| "reward": 0.987351194024086, | |
| "reward_std": 0.437920942902565, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6540178656578064, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 3136.985107421875, | |
| "epoch": 4.615384615384615, | |
| "grad_norm": 4.410726070404053, | |
| "kl": 2.48828125, | |
| "learning_rate": 1.1890471945803e-06, | |
| "loss": 0.0996, | |
| "reward": 1.0252976566553116, | |
| "reward_std": 0.43048153072595596, | |
| "rewards/accuracy_reward": 0.3571428656578064, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.668154776096344, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 2729.3512573242188, | |
| "epoch": 4.643356643356643, | |
| "grad_norm": 1.0968337059020996, | |
| "kl": 2.01171875, | |
| "learning_rate": 1.1648310462285386e-06, | |
| "loss": 0.0804, | |
| "reward": 1.1755952537059784, | |
| "reward_std": 0.45762403309345245, | |
| "rewards/accuracy_reward": 0.4672619178891182, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7083333432674408, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 2629.4702758789062, | |
| "epoch": 4.671328671328672, | |
| "grad_norm": 1.0384118556976318, | |
| "kl": 1.98828125, | |
| "learning_rate": 1.1422451776248741e-06, | |
| "loss": 0.0795, | |
| "reward": 1.0312500149011612, | |
| "reward_std": 0.3993323966860771, | |
| "rewards/accuracy_reward": 0.2976190522313118, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.73363097012043, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 2584.9524536132812, | |
| "epoch": 4.699300699300699, | |
| "grad_norm": 1.431814432144165, | |
| "kl": 2.044921875, | |
| "learning_rate": 1.121298082390793e-06, | |
| "loss": 0.082, | |
| "reward": 1.0580357313156128, | |
| "reward_std": 0.435057669878006, | |
| "rewards/accuracy_reward": 0.3630952388048172, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6949404925107956, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 2954.7471313476562, | |
| "epoch": 4.7272727272727275, | |
| "grad_norm": 2.2138113975524902, | |
| "kl": 2.462890625, | |
| "learning_rate": 1.10199763787214e-06, | |
| "loss": 0.0986, | |
| "reward": 0.9293154776096344, | |
| "reward_std": 0.4329492375254631, | |
| "rewards/accuracy_reward": 0.2797619067132473, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.649553582072258, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 2202.3809814453125, | |
| "epoch": 4.755244755244755, | |
| "grad_norm": 1.4412554502487183, | |
| "kl": 1.197265625, | |
| "learning_rate": 1.084351102176769e-06, | |
| "loss": 0.0479, | |
| "reward": 1.0461309850215912, | |
| "reward_std": 0.35539595037698746, | |
| "rewards/accuracy_reward": 0.3035714253783226, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7425595372915268, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 2917.4584350585938, | |
| "epoch": 4.783216783216783, | |
| "grad_norm": 1.2288846969604492, | |
| "kl": 2.052734375, | |
| "learning_rate": 1.0683651114450641e-06, | |
| "loss": 0.0822, | |
| "reward": 0.8482142984867096, | |
| "reward_std": 0.3459298089146614, | |
| "rewards/accuracy_reward": 0.16666667023673654, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6815476417541504, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 2635.6786499023438, | |
| "epoch": 4.811188811188811, | |
| "grad_norm": 2.2013814449310303, | |
| "kl": 1.4228515625, | |
| "learning_rate": 1.0540456773543596e-06, | |
| "loss": 0.0569, | |
| "reward": 1.090029776096344, | |
| "reward_std": 0.4234035685658455, | |
| "rewards/accuracy_reward": 0.35416667349636555, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7358631044626236, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 2739.666748046875, | |
| "epoch": 4.839160839160839, | |
| "grad_norm": 2.3238914012908936, | |
| "kl": 1.884765625, | |
| "learning_rate": 1.0413981848581963e-06, | |
| "loss": 0.0755, | |
| "reward": 0.908482164144516, | |
| "reward_std": 0.38875988870859146, | |
| "rewards/accuracy_reward": 0.16666666907258332, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7418154925107956, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 2622.46728515625, | |
| "epoch": 4.867132867132867, | |
| "grad_norm": 1.6446164846420288, | |
| "kl": 2.046875, | |
| "learning_rate": 1.0304273901612566e-06, | |
| "loss": 0.0819, | |
| "reward": 0.9806547909975052, | |
| "reward_std": 0.3640812337398529, | |
| "rewards/accuracy_reward": 0.2529762014746666, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.727678582072258, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 2319.9107666015625, | |
| "epoch": 4.895104895104895, | |
| "grad_norm": 1.7031948566436768, | |
| "kl": 1.5048828125, | |
| "learning_rate": 1.021137418930754e-06, | |
| "loss": 0.0603, | |
| "reward": 1.0997024327516556, | |
| "reward_std": 0.36972612142562866, | |
| "rewards/accuracy_reward": 0.318452388048172, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7812500149011612, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 2425.166748046875, | |
| "epoch": 4.923076923076923, | |
| "grad_norm": 1.7434192895889282, | |
| "kl": 1.529296875, | |
| "learning_rate": 1.0135317647449362e-06, | |
| "loss": 0.0612, | |
| "reward": 1.0372024178504944, | |
| "reward_std": 0.37977878004312515, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.745535746216774, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 2256.047637939453, | |
| "epoch": 4.951048951048951, | |
| "grad_norm": 2.017437696456909, | |
| "kl": 1.2978515625, | |
| "learning_rate": 1.0076132877792933e-06, | |
| "loss": 0.0519, | |
| "reward": 1.1726190596818924, | |
| "reward_std": 0.3175879791378975, | |
| "rewards/accuracy_reward": 0.383928582072258, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7886904925107956, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 2973.77685546875, | |
| "epoch": 4.979020979020979, | |
| "grad_norm": 2.749056100845337, | |
| "kl": 1.736328125, | |
| "learning_rate": 1.0033842137309649e-06, | |
| "loss": 0.0694, | |
| "reward": 0.9427083283662796, | |
| "reward_std": 0.42725374549627304, | |
| "rewards/accuracy_reward": 0.2380952425301075, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7046131044626236, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 3272.8553059895835, | |
| "epoch": 5.0, | |
| "grad_norm": 2.749056100845337, | |
| "kl": 1.7526041666666667, | |
| "learning_rate": 1.000846132981744e-06, | |
| "loss": 0.0525, | |
| "reward": 0.7926587462425232, | |
| "reward_std": 0.31565434734026593, | |
| "rewards/accuracy_reward": 0.1230158768594265, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6696428656578064, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "step": 180, | |
| "total_flos": 0.0, | |
| "train_loss": 0.055402542805832566, | |
| "train_runtime": 70946.2323, | |
| "train_samples_per_second": 0.07, | |
| "train_steps_per_second": 0.003 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 180, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 12, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |