{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.856898029134533, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 2862.5695190429688, "epoch": 0.001713796058269066, "grad_norm": 0.16925157606601715, "kl": 0.0, "learning_rate": 2e-08, "loss": 0.0467, "reward": 0.12026740610599518, "reward_std": 0.47210293635725975, "rewards/cosine_scaled_reward": -0.1343107339926064, "rewards/format_reward": 0.3888888917863369, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 2739.5, "epoch": 0.003427592116538132, "grad_norm": 0.18508067727088928, "kl": 0.0, "learning_rate": 4e-08, "loss": 0.0391, "reward": -0.05314926430583, "reward_std": 0.36226021870970726, "rewards/cosine_scaled_reward": -0.21407463820651174, "rewards/format_reward": 0.3750000111758709, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 2816.1944580078125, "epoch": 0.005141388174807198, "grad_norm": 0.15574845671653748, "kl": 4.06801700592041e-05, "learning_rate": 6e-08, "loss": 0.024, "reward": -0.0735303945839405, "reward_std": 0.4152667075395584, "rewards/cosine_scaled_reward": -0.21037630829960108, "rewards/format_reward": 0.34722223225980997, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 2746.875, "epoch": 0.006855184233076264, "grad_norm": 0.18099600076675415, "kl": 3.692507743835449e-05, "learning_rate": 8e-08, "loss": 0.0516, "reward": 0.2664791904389858, "reward_std": 0.8305703550577164, "rewards/cosine_scaled_reward": -0.07509375014342368, "rewards/format_reward": 0.4166666716337204, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 2557.513916015625, "epoch": 0.00856898029134533, "grad_norm": 0.173630490899086, "kl": 2.3245811462402344e-05, "learning_rate": 1e-07, "loss": 0.0579, "reward": 0.4870211333036423, "reward_std": 0.6806018278002739, "rewards/cosine_scaled_reward": -0.006489435210824013, "rewards/format_reward": 0.5000000074505806, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 3163.8333129882812, "epoch": 0.010282776349614395, "grad_norm": 0.1903219074010849, "kl": 4.1365623474121094e-05, "learning_rate": 1.2e-07, "loss": 0.0699, "reward": 0.22140773385763168, "reward_std": 0.614318884909153, "rewards/cosine_scaled_reward": -0.07679613586515188, "rewards/format_reward": 0.37500001303851604, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 2238.3055725097656, "epoch": 0.011996572407883462, "grad_norm": 0.2037331461906433, "kl": 3.427267074584961e-05, "learning_rate": 1.4e-07, "loss": 0.0507, "reward": 0.39292821660637856, "reward_std": 0.6100749522447586, "rewards/cosine_scaled_reward": -0.08825810719281435, "rewards/format_reward": 0.5694444552063942, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 2888.4166870117188, "epoch": 0.013710368466152529, "grad_norm": 0.1671508252620697, "kl": 2.8967857360839844e-05, "learning_rate": 1.6e-07, "loss": 0.0888, "reward": 0.5700129643082619, "reward_std": 1.0805757492780685, "rewards/cosine_scaled_reward": 0.04195092432200909, "rewards/format_reward": 0.486111119389534, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 2740.638916015625, "epoch": 0.015424164524421594, "grad_norm": 0.2825331389904022, "kl": 3.212690353393555e-05, "learning_rate": 1.8e-07, "loss": 0.1025, "reward": 0.3288399577140808, "reward_std": 0.6967436075210571, "rewards/cosine_scaled_reward": -0.03696890315040946, "rewards/format_reward": 0.4027777733281255, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 3010.7916870117188, "epoch": 0.01713796058269066, "grad_norm": 0.17822624742984772, "kl": 4.1991472244262695e-05, "learning_rate": 2e-07, "loss": 0.0471, "reward": 0.09832120686769485, "reward_std": 0.6553668975830078, "rewards/cosine_scaled_reward": -0.1036171680316329, "rewards/format_reward": 0.3055555522441864, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 2748.486114501953, "epoch": 0.018851756640959727, "grad_norm": 0.2476479411125183, "kl": 3.9696693420410156e-05, "learning_rate": 2.1999999999999998e-07, "loss": 0.0491, "reward": 0.015873797237873077, "reward_std": 0.553259089589119, "rewards/cosine_scaled_reward": -0.16567421704530716, "rewards/format_reward": 0.3472222238779068, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 2954.3472290039062, "epoch": 0.02056555269922879, "grad_norm": 0.28294840455055237, "kl": 3.898143768310547e-05, "learning_rate": 2.4e-07, "loss": 0.1311, "reward": -0.11908636894077063, "reward_std": 0.6466177105903625, "rewards/cosine_scaled_reward": -0.22620984725654125, "rewards/format_reward": 0.3333333367481828, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 2818.986114501953, "epoch": 0.022279348757497857, "grad_norm": 0.18577341735363007, "kl": 4.303455352783203e-05, "learning_rate": 2.6e-07, "loss": 0.0007, "reward": 0.3697042800486088, "reward_std": 0.7059066146612167, "rewards/cosine_scaled_reward": -0.03042563726194203, "rewards/format_reward": 0.4305555559694767, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 2905.3333740234375, "epoch": 0.023993144815766924, "grad_norm": 0.226650208234787, "kl": 3.2275915145874023e-05, "learning_rate": 2.8e-07, "loss": 0.0212, "reward": 0.04198750853538513, "reward_std": 0.5741659551858902, "rewards/cosine_scaled_reward": -0.14567292109131813, "rewards/format_reward": 0.33333333395421505, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 3468.2222290039062, "epoch": 0.02570694087403599, "grad_norm": 0.1521635353565216, "kl": 4.279613494873047e-05, "learning_rate": 3e-07, "loss": 0.0233, "reward": -0.17704490013420582, "reward_std": 0.6536840051412582, "rewards/cosine_scaled_reward": -0.1996335554867983, "rewards/format_reward": 0.22222222574055195, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 2405.263916015625, "epoch": 0.027420736932305057, "grad_norm": 0.23728908598423004, "kl": 2.495013177394867e-05, "learning_rate": 3.2e-07, "loss": 0.0632, "reward": 0.7499620914459229, "reward_std": 0.9962631165981293, "rewards/cosine_scaled_reward": 0.07636993401683867, "rewards/format_reward": 0.5972222238779068, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 2764.875030517578, "epoch": 0.02913453299057412, "grad_norm": 0.21387562155723572, "kl": 2.6166439056396484e-05, "learning_rate": 3.4000000000000003e-07, "loss": 0.0416, "reward": 0.27334376238286495, "reward_std": 0.4753483533859253, "rewards/cosine_scaled_reward": -0.05082811089232564, "rewards/format_reward": 0.3750000111758709, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 3252.486083984375, "epoch": 0.030848329048843187, "grad_norm": 0.209347203373909, "kl": 4.25875186920166e-05, "learning_rate": 3.6e-07, "loss": 0.0587, "reward": -0.18576696328818798, "reward_std": 0.5022815316915512, "rewards/cosine_scaled_reward": -0.19010569993406534, "rewards/format_reward": 0.1944444514811039, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 3157.4166870117188, "epoch": 0.032562125107112254, "grad_norm": 0.22900572419166565, "kl": 3.084540367126465e-05, "learning_rate": 3.7999999999999996e-07, "loss": 0.0687, "reward": 0.03116392099764198, "reward_std": 0.7267041057348251, "rewards/cosine_scaled_reward": -0.14414026169106364, "rewards/format_reward": 0.3194444486871362, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 3228.5972290039062, "epoch": 0.03427592116538132, "grad_norm": 0.24043872952461243, "kl": 2.6807188987731934e-05, "learning_rate": 4e-07, "loss": 0.1293, "reward": -0.1261596381664276, "reward_std": 0.7229140102863312, "rewards/cosine_scaled_reward": -0.20196872018277645, "rewards/format_reward": 0.2777777835726738, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 2856.6805419921875, "epoch": 0.03598971722365039, "grad_norm": 0.19779175519943237, "kl": 3.987550735473633e-05, "learning_rate": 4.1999999999999995e-07, "loss": 0.0069, "reward": 0.11652377434074879, "reward_std": 0.8210525661706924, "rewards/cosine_scaled_reward": -0.12229366600513458, "rewards/format_reward": 0.3611111165955663, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 3298.3472290039062, "epoch": 0.037703513281919454, "grad_norm": 0.13437196612358093, "kl": 2.828240394592285e-05, "learning_rate": 4.3999999999999997e-07, "loss": 0.0123, "reward": 0.1601133793592453, "reward_std": 0.6881751976907253, "rewards/cosine_scaled_reward": -0.06577664241194725, "rewards/format_reward": 0.2916666753590107, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 3107.4583129882812, "epoch": 0.03941730934018852, "grad_norm": 0.1506253182888031, "kl": 2.2932887077331543e-05, "learning_rate": 4.6e-07, "loss": 0.0149, "reward": -0.13085854798555374, "reward_std": 0.5464130863547325, "rewards/cosine_scaled_reward": -0.20431815274059772, "rewards/format_reward": 0.2777777807787061, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 2710.6806030273438, "epoch": 0.04113110539845758, "grad_norm": 0.24692188203334808, "kl": 2.8967857360839844e-05, "learning_rate": 4.8e-07, "loss": 0.1012, "reward": 0.24628422083333135, "reward_std": 0.4773574620485306, "rewards/cosine_scaled_reward": -0.057413444737903774, "rewards/format_reward": 0.3611111268401146, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 2784.7361450195312, "epoch": 0.04284490145672665, "grad_norm": 0.25797340273857117, "kl": 2.6673078536987305e-05, "learning_rate": 5e-07, "loss": 0.106, "reward": 0.46540534496307373, "reward_std": 0.8211657330393791, "rewards/cosine_scaled_reward": -0.01729731634259224, "rewards/format_reward": 0.5000000037252903, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 3136.52783203125, "epoch": 0.044558697514995714, "grad_norm": 0.14968131482601166, "kl": 3.291666507720947e-05, "learning_rate": 5.2e-07, "loss": 0.0512, "reward": -0.09118526801466942, "reward_std": 0.5860454589128494, "rewards/cosine_scaled_reward": -0.21225931122899055, "rewards/format_reward": 0.33333334140479565, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 3040.0000610351562, "epoch": 0.04627249357326478, "grad_norm": 0.17181935906410217, "kl": 1.5079975128173828e-05, "learning_rate": 5.4e-07, "loss": 0.0738, "reward": 0.34727448783814907, "reward_std": 0.6153330877423286, "rewards/cosine_scaled_reward": -0.027751651592552662, "rewards/format_reward": 0.4027777947485447, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 2343.1111755371094, "epoch": 0.04798628963153385, "grad_norm": 0.2077193260192871, "kl": 2.5130808353424072e-05, "learning_rate": 5.6e-07, "loss": 0.0598, "reward": 0.6073902919888496, "reward_std": 0.6849471032619476, "rewards/cosine_scaled_reward": 0.018972909078001976, "rewards/format_reward": 0.5694444477558136, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 3073.7222290039062, "epoch": 0.049700085689802914, "grad_norm": 0.21480253338813782, "kl": 2.290681004524231e-05, "learning_rate": 5.8e-07, "loss": 0.0747, "reward": 0.17731062695384026, "reward_std": 0.8807300254702568, "rewards/cosine_scaled_reward": -0.07801135815680027, "rewards/format_reward": 0.3333333320915699, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 2768.02783203125, "epoch": 0.05141388174807198, "grad_norm": 0.25759172439575195, "kl": 2.993270754814148e-05, "learning_rate": 6e-07, "loss": 0.0674, "reward": 0.5063075462821871, "reward_std": 0.771463930606842, "rewards/cosine_scaled_reward": -0.010735094547271729, "rewards/format_reward": 0.5277777910232544, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 2695.6944580078125, "epoch": 0.05312767780634105, "grad_norm": 0.2701717019081116, "kl": 1.3127923011779785e-05, "learning_rate": 6.2e-07, "loss": 0.0971, "reward": 0.2706103939563036, "reward_std": 0.49449611082673073, "rewards/cosine_scaled_reward": -0.045250357885379344, "rewards/format_reward": 0.361111119389534, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 3051.52783203125, "epoch": 0.054841473864610114, "grad_norm": 0.17947925627231598, "kl": 2.6337802410125732e-05, "learning_rate": 6.4e-07, "loss": 0.057, "reward": 0.45089754834771156, "reward_std": 1.1203400194644928, "rewards/cosine_scaled_reward": -0.02455122536048293, "rewards/format_reward": 0.5000000074505806, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 2306.8750610351562, "epoch": 0.056555269922879174, "grad_norm": 0.21536274254322052, "kl": 5.3569674491882324e-05, "learning_rate": 6.6e-07, "loss": 0.0764, "reward": 0.8166992478072643, "reward_std": 0.8387185409665108, "rewards/cosine_scaled_reward": 0.12362739443778992, "rewards/format_reward": 0.5694444626569748, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 2698.7083740234375, "epoch": 0.05826906598114824, "grad_norm": 0.29884466528892517, "kl": 0.00017189979553222656, "learning_rate": 6.800000000000001e-07, "loss": 0.1617, "reward": 0.057983118342235684, "reward_std": 0.7621737122535706, "rewards/cosine_scaled_reward": -0.1585084507241845, "rewards/format_reward": 0.3750000074505806, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 3312.3055419921875, "epoch": 0.05998286203941731, "grad_norm": 0.1554093211889267, "kl": 9.316205978393555e-05, "learning_rate": 7e-07, "loss": 0.0273, "reward": -0.2900172360241413, "reward_std": 0.5383428931236267, "rewards/cosine_scaled_reward": -0.2700086124241352, "rewards/format_reward": 0.2500000074505806, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 2631.8055725097656, "epoch": 0.061696658097686374, "grad_norm": 0.19274435937404633, "kl": 0.0002084970474243164, "learning_rate": 7.2e-07, "loss": 0.0306, "reward": 0.006275304593145847, "reward_std": 0.46724043786525726, "rewards/cosine_scaled_reward": -0.18436234444379807, "rewards/format_reward": 0.37500000931322575, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 3124.5277709960938, "epoch": 0.06341045415595545, "grad_norm": 0.15709905326366425, "kl": 7.59810209274292e-05, "learning_rate": 7.4e-07, "loss": 0.0561, "reward": -0.008991474285721779, "reward_std": 0.5808551460504532, "rewards/cosine_scaled_reward": -0.1294957408681512, "rewards/format_reward": 0.25000000558793545, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 3045.90283203125, "epoch": 0.06512425021422451, "grad_norm": 0.2423790842294693, "kl": 0.00022971630096435547, "learning_rate": 7.599999999999999e-07, "loss": 0.1263, "reward": 0.1536001469939947, "reward_std": 0.7093052342534065, "rewards/cosine_scaled_reward": -0.07597769796848297, "rewards/format_reward": 0.305555559694767, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 3150.0833740234375, "epoch": 0.06683804627249357, "grad_norm": 0.13335144519805908, "kl": 0.0003066062927246094, "learning_rate": 7.799999999999999e-07, "loss": 0.0187, "reward": -0.01171512296423316, "reward_std": 0.48150157928466797, "rewards/cosine_scaled_reward": -0.1586353350430727, "rewards/format_reward": 0.3055555559694767, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 2782.27783203125, "epoch": 0.06855184233076264, "grad_norm": 0.1773526668548584, "kl": 0.0007457435131072998, "learning_rate": 8e-07, "loss": 0.0236, "reward": 0.19545890390872955, "reward_std": 0.5221360512077808, "rewards/cosine_scaled_reward": -0.08282610075548291, "rewards/format_reward": 0.361111112870276, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 2729.9722290039062, "epoch": 0.0702656383890317, "grad_norm": 0.2603820860385895, "kl": 0.0002143383026123047, "learning_rate": 8.199999999999999e-07, "loss": 0.1308, "reward": 0.5641986541450024, "reward_std": 0.7014989629387856, "rewards/cosine_scaled_reward": 0.05293265450745821, "rewards/format_reward": 0.4583333432674408, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 2622.0555419921875, "epoch": 0.07197943444730077, "grad_norm": 0.19547662138938904, "kl": 0.0008759498596191406, "learning_rate": 8.399999999999999e-07, "loss": 0.0788, "reward": 0.3987229084596038, "reward_std": 0.6764711476862431, "rewards/cosine_scaled_reward": -0.05063853319734335, "rewards/format_reward": 0.5, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 2757.3611450195312, "epoch": 0.07369323050556983, "grad_norm": 0.133390411734581, "kl": 0.00021369755268096924, "learning_rate": 8.599999999999999e-07, "loss": 0.0354, "reward": 0.5515957027673721, "reward_std": 0.6986619718372822, "rewards/cosine_scaled_reward": 0.04663117043673992, "rewards/format_reward": 0.4583333283662796, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 2743.763916015625, "epoch": 0.07540702656383891, "grad_norm": 0.17805209755897522, "kl": 0.0008558034896850586, "learning_rate": 8.799999999999999e-07, "loss": 0.1039, "reward": 0.06273656419944018, "reward_std": 0.7254525497555733, "rewards/cosine_scaled_reward": -0.18390950025059283, "rewards/format_reward": 0.4305555671453476, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 3018.1805419921875, "epoch": 0.07712082262210797, "grad_norm": 0.23340974748134613, "kl": 0.0007225275039672852, "learning_rate": 9e-07, "loss": 0.047, "reward": 0.12753370963037014, "reward_std": 0.5756559893488884, "rewards/cosine_scaled_reward": -0.09595536440610886, "rewards/format_reward": 0.31944444589316845, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 2453.77783203125, "epoch": 0.07883461868037704, "grad_norm": 0.25216469168663025, "kl": 0.0028772354125976562, "learning_rate": 9.2e-07, "loss": 0.0976, "reward": 0.4031712617725134, "reward_std": 0.5689256861805916, "rewards/cosine_scaled_reward": -0.05535881780087948, "rewards/format_reward": 0.5138888955116272, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 3180.0972290039062, "epoch": 0.0805484147386461, "grad_norm": 0.17415259778499603, "kl": 0.0014755725860595703, "learning_rate": 9.399999999999999e-07, "loss": 0.0718, "reward": -0.026270870119333267, "reward_std": 0.641656719148159, "rewards/cosine_scaled_reward": -0.15202434547245502, "rewards/format_reward": 0.27777778171002865, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 2680.7639770507812, "epoch": 0.08226221079691516, "grad_norm": 0.20438066124916077, "kl": 0.001586318016052246, "learning_rate": 9.6e-07, "loss": 0.0807, "reward": 0.6057721227407455, "reward_std": 0.7416700124740601, "rewards/cosine_scaled_reward": 0.05288607440888882, "rewards/format_reward": 0.5000000074505806, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 2346.055633544922, "epoch": 0.08397600685518423, "grad_norm": 0.35583311319351196, "kl": 0.018939971923828125, "learning_rate": 9.8e-07, "loss": 0.1404, "reward": 0.7048290632665157, "reward_std": 0.6792610064148903, "rewards/cosine_scaled_reward": 0.06074785813689232, "rewards/format_reward": 0.5833333432674408, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 2833.5833740234375, "epoch": 0.0856898029134533, "grad_norm": 0.2027311623096466, "kl": 0.0032949447631835938, "learning_rate": 1e-06, "loss": 0.0416, "reward": 0.07023209612816572, "reward_std": 0.6861855462193489, "rewards/cosine_scaled_reward": -0.16627284698188305, "rewards/format_reward": 0.4027777872979641, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 3051.2777709960938, "epoch": 0.08740359897172237, "grad_norm": 0.16748514771461487, "kl": 0.001615285873413086, "learning_rate": 9.999890338174275e-07, "loss": 0.069, "reward": 0.1449947228829842, "reward_std": 0.7090619504451752, "rewards/cosine_scaled_reward": -0.10111376643180847, "rewards/format_reward": 0.34722223225980997, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 3181.9583740234375, "epoch": 0.08911739502999143, "grad_norm": 0.16281543672084808, "kl": 0.0019249916076660156, "learning_rate": 9.999561358041868e-07, "loss": 0.0803, "reward": -0.03632636368274689, "reward_std": 0.5028033927083015, "rewards/cosine_scaled_reward": -0.12927428726106882, "rewards/format_reward": 0.22222222946584225, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 3303.1805419921875, "epoch": 0.0908311910882605, "grad_norm": 0.14455804228782654, "kl": 0.0005393028259277344, "learning_rate": 9.999013075636804e-07, "loss": 0.0318, "reward": -0.10013403557240963, "reward_std": 0.4606664590537548, "rewards/cosine_scaled_reward": -0.17506700940430164, "rewards/format_reward": 0.2500000046566129, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 3313.1944580078125, "epoch": 0.09254498714652956, "grad_norm": 0.13308647274971008, "kl": 0.0011081695556640625, "learning_rate": 9.998245517681593e-07, "loss": 0.0055, "reward": 0.10159287042915821, "reward_std": 0.6204735822975636, "rewards/cosine_scaled_reward": -0.060314678063150495, "rewards/format_reward": 0.2222222276031971, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 3481.15283203125, "epoch": 0.09425878320479864, "grad_norm": 0.13649359345436096, "kl": 0.0008268356323242188, "learning_rate": 9.997258721585931e-07, "loss": 0.0328, "reward": -0.12874329963233322, "reward_std": 0.5648706145584583, "rewards/cosine_scaled_reward": -0.1754827625118196, "rewards/format_reward": 0.22222222480922937, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 3232.7222900390625, "epoch": 0.0959725792630677, "grad_norm": 0.19132941961288452, "kl": 0.0013275146484375, "learning_rate": 9.996052735444862e-07, "loss": 0.1077, "reward": -0.17376804118975997, "reward_std": 0.749246733263135, "rewards/cosine_scaled_reward": -0.20493957586586475, "rewards/format_reward": 0.2361111156642437, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 3279.4584350585938, "epoch": 0.09768637532133675, "grad_norm": 0.15241067111492157, "kl": 0.000919342041015625, "learning_rate": 9.994627618036452e-07, "loss": 0.0282, "reward": 0.31643399875611067, "reward_std": 0.6422489807009697, "rewards/cosine_scaled_reward": 0.005439223721623421, "rewards/format_reward": 0.30555556155741215, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 3047.2916870117188, "epoch": 0.09940017137960583, "grad_norm": 0.22829630970954895, "kl": 0.0054931640625, "learning_rate": 9.992983438818915e-07, "loss": 0.0909, "reward": -0.17570834839716554, "reward_std": 0.4780988022685051, "rewards/cosine_scaled_reward": -0.23368750512599945, "rewards/format_reward": 0.2916666679084301, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 2918.5555419921875, "epoch": 0.10111396743787489, "grad_norm": 0.17409604787826538, "kl": 0.010187149047851562, "learning_rate": 9.991120277927223e-07, "loss": -0.0001, "reward": 0.6838416904211044, "reward_std": 0.7215724363923073, "rewards/cosine_scaled_reward": 0.1196986111899605, "rewards/format_reward": 0.4444444477558136, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 3240.90283203125, "epoch": 0.10282776349614396, "grad_norm": 0.21398130059242249, "kl": 0.0015239715576171875, "learning_rate": 9.989038226169207e-07, "loss": 0.0841, "reward": -0.013310029171407223, "reward_std": 0.6487029865384102, "rewards/cosine_scaled_reward": -0.13859945815056562, "rewards/format_reward": 0.2638888992369175, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 3323.3889770507812, "epoch": 0.10454155955441302, "grad_norm": 0.25011396408081055, "kl": 0.0015153884887695312, "learning_rate": 9.98673738502114e-07, "loss": 0.0677, "reward": -0.37927111238241196, "reward_std": 0.43354837596416473, "rewards/cosine_scaled_reward": -0.2799133397638798, "rewards/format_reward": 0.18055556155741215, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 2683.6250610351562, "epoch": 0.1062553556126821, "grad_norm": 0.17982754111289978, "kl": 0.00201416015625, "learning_rate": 9.98421786662277e-07, "loss": 0.0008, "reward": 0.40144167095422745, "reward_std": 0.5826155617833138, "rewards/cosine_scaled_reward": -0.02844582637771964, "rewards/format_reward": 0.4583333432674408, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 3426.2361450195312, "epoch": 0.10796915167095116, "grad_norm": 0.182517409324646, "kl": 0.00151824951171875, "learning_rate": 9.981479793771866e-07, "loss": 0.0294, "reward": -0.09498679265379906, "reward_std": 0.7008046992123127, "rewards/cosine_scaled_reward": -0.13777116686105728, "rewards/format_reward": 0.18055555690079927, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 2847.5972290039062, "epoch": 0.10968294772922023, "grad_norm": 0.31501731276512146, "kl": 0.0022530555725097656, "learning_rate": 9.97852329991824e-07, "loss": 0.1548, "reward": 0.009381972253322601, "reward_std": 0.36741600558161736, "rewards/cosine_scaled_reward": -0.16197567898780107, "rewards/format_reward": 0.3333333432674408, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 3167.236083984375, "epoch": 0.11139674378748929, "grad_norm": 0.4229466915130615, "kl": 0.0364532470703125, "learning_rate": 9.975348529157229e-07, "loss": 0.0659, "reward": -0.029949136078357697, "reward_std": 0.5782980695366859, "rewards/cosine_scaled_reward": -0.13997458899393678, "rewards/format_reward": 0.25000000838190317, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 2846.8334350585938, "epoch": 0.11311053984575835, "grad_norm": 0.1699674278497696, "kl": 0.0013065338134765625, "learning_rate": 9.971955636222684e-07, "loss": 0.0667, "reward": 0.2395001295953989, "reward_std": 0.3902180567383766, "rewards/cosine_scaled_reward": -0.053861052729189396, "rewards/format_reward": 0.3472222313284874, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 3227.4445190429688, "epoch": 0.11482433590402742, "grad_norm": 0.15845970809459686, "kl": 0.0022869110107421875, "learning_rate": 9.968344786479415e-07, "loss": 0.0416, "reward": 0.06229268200695515, "reward_std": 0.5577914118766785, "rewards/cosine_scaled_reward": -0.1285758875310421, "rewards/format_reward": 0.3194444552063942, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 2906.3472290039062, "epoch": 0.11653813196229648, "grad_norm": 0.17754817008972168, "kl": 0.0027103424072265625, "learning_rate": 9.964516155915151e-07, "loss": -0.0006, "reward": 0.000796053558588028, "reward_std": 0.5399865545332432, "rewards/cosine_scaled_reward": -0.15932418778538704, "rewards/format_reward": 0.3194444449618459, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 3079.4583740234375, "epoch": 0.11825192802056556, "grad_norm": 0.16689395904541016, "kl": 0.00244140625, "learning_rate": 9.960469931131936e-07, "loss": 0.0012, "reward": 0.40755608677864075, "reward_std": 0.592438168823719, "rewards/cosine_scaled_reward": 0.009333595633506775, "rewards/format_reward": 0.3888889029622078, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 2852.6388549804688, "epoch": 0.11996572407883462, "grad_norm": 0.14442802965641022, "kl": 0.0042266845703125, "learning_rate": 9.956206309337066e-07, "loss": 0.023, "reward": 0.44340329244732857, "reward_std": 0.43735441006720066, "rewards/cosine_scaled_reward": 0.00642385333776474, "rewards/format_reward": 0.4305555559694767, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 3119.8195190429688, "epoch": 0.12167952013710369, "grad_norm": 0.1541452407836914, "kl": 0.003391265869140625, "learning_rate": 9.951725498333448e-07, "loss": 0.0155, "reward": 0.49696624279022217, "reward_std": 0.9607885628938675, "rewards/cosine_scaled_reward": 0.07487202249467373, "rewards/format_reward": 0.3472222238779068, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 2584.513885498047, "epoch": 0.12339331619537275, "grad_norm": 0.16282722353935242, "kl": 0.007266998291015625, "learning_rate": 9.947027716509488e-07, "loss": 0.0302, "reward": 0.4334046132862568, "reward_std": 0.42579157277941704, "rewards/cosine_scaled_reward": -0.04024216299876571, "rewards/format_reward": 0.5138888955116272, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 3057.8611450195312, "epoch": 0.12510711225364182, "grad_norm": 0.19297440350055695, "kl": 0.004047393798828125, "learning_rate": 9.942113192828444e-07, "loss": -0.0268, "reward": 0.2504111938178539, "reward_std": 0.6320941485464573, "rewards/cosine_scaled_reward": -0.05534995626658201, "rewards/format_reward": 0.3611111268401146, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 2350.5000610351562, "epoch": 0.1268209083119109, "grad_norm": 0.25634145736694336, "kl": 0.004367828369140625, "learning_rate": 9.93698216681727e-07, "loss": 0.1227, "reward": 0.7754522487521172, "reward_std": 0.8430259823799133, "rewards/cosine_scaled_reward": 0.07522611878812313, "rewards/format_reward": 0.625, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 3078.013916015625, "epoch": 0.12853470437017994, "grad_norm": 0.15847010910511017, "kl": 0.004947662353515625, "learning_rate": 9.931634888554935e-07, "loss": 0.0447, "reward": 0.27387892454862595, "reward_std": 0.5773990303277969, "rewards/cosine_scaled_reward": -0.03667165897786617, "rewards/format_reward": 0.3472222276031971, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 2247.8194427490234, "epoch": 0.13024850042844902, "grad_norm": 0.28341227769851685, "kl": 0.014591217041015625, "learning_rate": 9.926071618660237e-07, "loss": 0.0403, "reward": 0.7070811688899994, "reward_std": 0.7020798400044441, "rewards/cosine_scaled_reward": 0.06881837674882263, "rewards/format_reward": 0.5694444440305233, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 3159.75, "epoch": 0.1319622964867181, "grad_norm": 0.13436463475227356, "kl": 0.0049896240234375, "learning_rate": 9.9202926282791e-07, "loss": 0.023, "reward": 0.35647532157599926, "reward_std": 0.7988947406411171, "rewards/cosine_scaled_reward": 0.011570994276553392, "rewards/format_reward": 0.33333334513008595, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 3150.0139770507812, "epoch": 0.13367609254498714, "grad_norm": 0.176174134016037, "kl": 0.004405975341796875, "learning_rate": 9.91429819907136e-07, "loss": 0.0747, "reward": -0.14098340552300215, "reward_std": 0.5686891078948975, "rewards/cosine_scaled_reward": -0.18854726571589708, "rewards/format_reward": 0.23611112032085657, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 2571.0694580078125, "epoch": 0.1353898886032562, "grad_norm": 0.1847277730703354, "kl": 0.008609771728515625, "learning_rate": 9.908088623197048e-07, "loss": -0.0106, "reward": 0.3892364539206028, "reward_std": 0.7569635957479477, "rewards/cosine_scaled_reward": -0.06927067344076931, "rewards/format_reward": 0.5277777835726738, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 3138.5555419921875, "epoch": 0.13710368466152528, "grad_norm": 0.21640530228614807, "kl": 0.005603790283203125, "learning_rate": 9.901664203302124e-07, "loss": 0.1324, "reward": -0.1231984393671155, "reward_std": 0.778315082192421, "rewards/cosine_scaled_reward": -0.2074325531721115, "rewards/format_reward": 0.2916666716337204, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 3191.916748046875, "epoch": 0.13881748071979436, "grad_norm": 0.1524638533592224, "kl": 0.014739990234375, "learning_rate": 9.895025252503755e-07, "loss": 0.0255, "reward": -0.14118600636720657, "reward_std": 0.3157992772758007, "rewards/cosine_scaled_reward": -0.17475967481732368, "rewards/format_reward": 0.20833334047347307, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 2925.013916015625, "epoch": 0.1405312767780634, "grad_norm": 0.21411970257759094, "kl": 0.00635528564453125, "learning_rate": 9.888172094375033e-07, "loss": 0.0735, "reward": -0.06351233087480068, "reward_std": 0.5284828841686249, "rewards/cosine_scaled_reward": -0.18453393690288067, "rewards/format_reward": 0.30555555783212185, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 2801.2638549804688, "epoch": 0.14224507283633248, "grad_norm": 0.18929333984851837, "kl": 0.0023746490478515625, "learning_rate": 9.881105062929221e-07, "loss": 0.0434, "reward": 0.5797148197889328, "reward_std": 0.8048742488026619, "rewards/cosine_scaled_reward": 0.03985740663483739, "rewards/format_reward": 0.5000000074505806, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 2881.999969482422, "epoch": 0.14395886889460155, "grad_norm": 0.16995370388031006, "kl": 0.00823211669921875, "learning_rate": 9.873824502603459e-07, "loss": 0.0417, "reward": 0.1579499295912683, "reward_std": 0.6737323254346848, "rewards/cosine_scaled_reward": -0.12935838662087917, "rewards/format_reward": 0.4166666716337204, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 2817.888916015625, "epoch": 0.1456726649528706, "grad_norm": 0.17163607478141785, "kl": 0.004947662353515625, "learning_rate": 9.866330768241983e-07, "loss": 0.0843, "reward": 0.14664312824606895, "reward_std": 0.6406831294298172, "rewards/cosine_scaled_reward": -0.10028954246081412, "rewards/format_reward": 0.3472222248092294, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 2666.0972595214844, "epoch": 0.14738646101113967, "grad_norm": 0.23853930830955505, "kl": 0.0075836181640625, "learning_rate": 9.85862422507884e-07, "loss": 0.184, "reward": 0.15615743398666382, "reward_std": 0.6508499458432198, "rewards/cosine_scaled_reward": -0.14414352551102638, "rewards/format_reward": 0.4444444440305233, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 3479.0000610351562, "epoch": 0.14910025706940874, "grad_norm": 0.13812494277954102, "kl": 0.003131866455078125, "learning_rate": 9.850705248720068e-07, "loss": 0.0273, "reward": -0.3952238578349352, "reward_std": 0.4180161654949188, "rewards/cosine_scaled_reward": -0.24622303992509842, "rewards/format_reward": 0.0972222238779068, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 3411.0556030273438, "epoch": 0.15081405312767782, "grad_norm": 0.14131076633930206, "kl": 0.00627899169921875, "learning_rate": 9.8425742251254e-07, "loss": 0.0242, "reward": -0.18497492372989655, "reward_std": 0.3112034276127815, "rewards/cosine_scaled_reward": -0.15498745813965797, "rewards/format_reward": 0.12500000186264515, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 2821.4305725097656, "epoch": 0.15252784918594686, "grad_norm": 0.23381026089191437, "kl": 0.00811767578125, "learning_rate": 9.83423155058946e-07, "loss": 0.1044, "reward": -0.15477947797626257, "reward_std": 0.3880116418004036, "rewards/cosine_scaled_reward": -0.257945304736495, "rewards/format_reward": 0.3611111082136631, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 2741.013885498047, "epoch": 0.15424164524421594, "grad_norm": 0.3015286326408386, "kl": 0.005706787109375, "learning_rate": 9.825677631722435e-07, "loss": 0.146, "reward": 0.32925539929419756, "reward_std": 0.5706463847309351, "rewards/cosine_scaled_reward": -0.01592785632237792, "rewards/format_reward": 0.3611111082136631, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 3004.1805419921875, "epoch": 0.155955441302485, "grad_norm": 0.2821044325828552, "kl": 0.013214111328125, "learning_rate": 9.816912885430258e-07, "loss": 0.1457, "reward": -0.23375913500785828, "reward_std": 0.6937631815671921, "rewards/cosine_scaled_reward": -0.25576844066381454, "rewards/format_reward": 0.2777777872979641, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 2889.15283203125, "epoch": 0.15766923736075408, "grad_norm": 0.19952206313610077, "kl": 0.01056671142578125, "learning_rate": 9.807937738894303e-07, "loss": -0.0327, "reward": 0.10378427803516388, "reward_std": 0.6779353246092796, "rewards/cosine_scaled_reward": -0.13560786750167608, "rewards/format_reward": 0.37500000186264515, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 2751.8055725097656, "epoch": 0.15938303341902313, "grad_norm": 0.18763676285743713, "kl": 0.00572967529296875, "learning_rate": 9.798752629550546e-07, "loss": 0.0734, "reward": 0.5665245279669762, "reward_std": 0.7802244201302528, "rewards/cosine_scaled_reward": 0.012428927002474666, "rewards/format_reward": 0.5416666604578495, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 2943.9722290039062, "epoch": 0.1610968294772922, "grad_norm": 0.17491032183170319, "kl": 0.005878448486328125, "learning_rate": 9.78935800506826e-07, "loss": 0.0466, "reward": 0.36631612479686737, "reward_std": 0.5951685793697834, "rewards/cosine_scaled_reward": -0.011286390479654074, "rewards/format_reward": 0.38888889737427235, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 2803.4862060546875, "epoch": 0.16281062553556128, "grad_norm": 0.2179604023694992, "kl": 0.0073699951171875, "learning_rate": 9.779754323328192e-07, "loss": 0.1111, "reward": 0.20993795804679394, "reward_std": 0.5628918968141079, "rewards/cosine_scaled_reward": -0.08253101143054664, "rewards/format_reward": 0.3750000074505806, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 3062.25, "epoch": 0.16452442159383032, "grad_norm": 0.1575266271829605, "kl": 0.005573272705078125, "learning_rate": 9.769942052400235e-07, "loss": 0.0192, "reward": 0.5143513884395361, "reward_std": 0.9291824996471405, "rewards/cosine_scaled_reward": 0.021064545959234238, "rewards/format_reward": 0.4722222350537777, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 3426.0555419921875, "epoch": 0.1662382176520994, "grad_norm": 0.152592271566391, "kl": 0.0096588134765625, "learning_rate": 9.759921670520634e-07, "loss": 0.0595, "reward": -0.316804476082325, "reward_std": 0.5735431797802448, "rewards/cosine_scaled_reward": -0.2209022343158722, "rewards/format_reward": 0.12500000186264515, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 2718.1806030273438, "epoch": 0.16795201371036847, "grad_norm": 0.19641156494617462, "kl": 0.00783538818359375, "learning_rate": 9.749693666068663e-07, "loss": 0.0871, "reward": 0.34513735864311457, "reward_std": 0.7377712428569794, "rewards/cosine_scaled_reward": -0.09826467745006084, "rewards/format_reward": 0.541666679084301, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 3183.8611450195312, "epoch": 0.16966580976863754, "grad_norm": 0.13990604877471924, "kl": 0.00958251953125, "learning_rate": 9.739258537542835e-07, "loss": 0.0408, "reward": 0.10082972631789744, "reward_std": 0.4568670317530632, "rewards/cosine_scaled_reward": -0.09541848301887512, "rewards/format_reward": 0.2916666669771075, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 2988.263916015625, "epoch": 0.1713796058269066, "grad_norm": 0.1574762910604477, "kl": 0.01104736328125, "learning_rate": 9.728616793536587e-07, "loss": 0.02, "reward": 0.05844925343990326, "reward_std": 0.4471042864024639, "rewards/cosine_scaled_reward": -0.13744205003604293, "rewards/format_reward": 0.3333333358168602, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 2955.6388549804688, "epoch": 0.17309340188517566, "grad_norm": 0.15706215798854828, "kl": 0.006420135498046875, "learning_rate": 9.717768952713511e-07, "loss": 0.0337, "reward": 0.032026506960392, "reward_std": 0.35832666605710983, "rewards/cosine_scaled_reward": -0.1298200935125351, "rewards/format_reward": 0.2916666753590107, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 2912.0972290039062, "epoch": 0.17480719794344474, "grad_norm": 0.1945251077413559, "kl": 0.0088043212890625, "learning_rate": 9.706715543782064e-07, "loss": 0.072, "reward": 0.22132272832095623, "reward_std": 0.4281787723302841, "rewards/cosine_scaled_reward": -0.09072753041982651, "rewards/format_reward": 0.40277779288589954, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 2903.9444580078125, "epoch": 0.17652099400171378, "grad_norm": 0.1475774347782135, "kl": 0.00759124755859375, "learning_rate": 9.695457105469804e-07, "loss": 0.0409, "reward": 0.16637181863188744, "reward_std": 0.6222990080714226, "rewards/cosine_scaled_reward": -0.10431409068405628, "rewards/format_reward": 0.3750000149011612, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 3320.1805419921875, "epoch": 0.17823479005998286, "grad_norm": 0.16452452540397644, "kl": 0.006443023681640625, "learning_rate": 9.683994186497132e-07, "loss": 0.073, "reward": -0.04724724031984806, "reward_std": 0.5820007584989071, "rewards/cosine_scaled_reward": -0.13473473582416773, "rewards/format_reward": 0.2222222276031971, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 2745.27783203125, "epoch": 0.17994858611825193, "grad_norm": 0.23044738173484802, "kl": 0.0102996826171875, "learning_rate": 9.672327345550543e-07, "loss": 0.0909, "reward": 0.48719315230846405, "reward_std": 0.9213617816567421, "rewards/cosine_scaled_reward": -0.01334787905216217, "rewards/format_reward": 0.5138888955116272, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 3009.0694580078125, "epoch": 0.181662382176521, "grad_norm": 0.25429767370224, "kl": 0.0078125, "learning_rate": 9.66045715125541e-07, "loss": 0.127, "reward": 0.27888505905866623, "reward_std": 0.7037396281957626, "rewards/cosine_scaled_reward": -0.048057474195957184, "rewards/format_reward": 0.3750000009313226, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 3172.5694580078125, "epoch": 0.18337617823479005, "grad_norm": 0.17300733923912048, "kl": 0.008148193359375, "learning_rate": 9.648384182148252e-07, "loss": 0.0446, "reward": 0.21187454462051392, "reward_std": 0.549411840736866, "rewards/cosine_scaled_reward": -0.06767383548867656, "rewards/format_reward": 0.34722222574055195, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 3347.52783203125, "epoch": 0.18508997429305912, "grad_norm": 0.17588993906974792, "kl": 0.00628662109375, "learning_rate": 9.636109026648554e-07, "loss": 0.06, "reward": -0.038673363626003265, "reward_std": 0.728736087679863, "rewards/cosine_scaled_reward": -0.15822557546198368, "rewards/format_reward": 0.2777777835726738, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 2627.263946533203, "epoch": 0.1868037703513282, "grad_norm": 0.29850271344184875, "kl": 0.01171875, "learning_rate": 9.623632283030077e-07, "loss": 0.0662, "reward": 0.19531617127358913, "reward_std": 0.4965377002954483, "rewards/cosine_scaled_reward": -0.09678636118769646, "rewards/format_reward": 0.38888889737427235, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 2958.625, "epoch": 0.18851756640959727, "grad_norm": 0.46270403265953064, "kl": 0.0089263916015625, "learning_rate": 9.610954559391704e-07, "loss": 0.1711, "reward": 0.08645874005742371, "reward_std": 0.9684502333402634, "rewards/cosine_scaled_reward": -0.1512150838971138, "rewards/format_reward": 0.3888888955116272, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 2400.3472900390625, "epoch": 0.19023136246786632, "grad_norm": 0.18343479931354523, "kl": 0.00701904296875, "learning_rate": 9.598076473627796e-07, "loss": 0.1083, "reward": 0.22095186542719603, "reward_std": 0.5088437423110008, "rewards/cosine_scaled_reward": -0.15341296698898077, "rewards/format_reward": 0.5277777835726738, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 2780.916748046875, "epoch": 0.1919451585261354, "grad_norm": 0.16234862804412842, "kl": 0.007965087890625, "learning_rate": 9.58499865339809e-07, "loss": 0.0115, "reward": 0.19807963073253632, "reward_std": 0.5584643110632896, "rewards/cosine_scaled_reward": -0.14401574060320854, "rewards/format_reward": 0.48611112777143717, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 2751.7361450195312, "epoch": 0.19365895458440446, "grad_norm": 0.20913416147232056, "kl": 0.00861358642578125, "learning_rate": 9.571721736097088e-07, "loss": 0.0851, "reward": 0.7618176154792309, "reward_std": 1.0328082591295242, "rewards/cosine_scaled_reward": 0.11007547879125923, "rewards/format_reward": 0.541666679084301, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 2235.9166564941406, "epoch": 0.1953727506426735, "grad_norm": 0.20926620066165924, "kl": 0.008697509765625, "learning_rate": 9.55824636882301e-07, "loss": 0.0327, "reward": 0.2064858078956604, "reward_std": 0.4848344102501869, "rewards/cosine_scaled_reward": -0.1675904355943203, "rewards/format_reward": 0.5416666753590107, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 2317.6250610351562, "epoch": 0.19708654670094258, "grad_norm": 0.4515492916107178, "kl": 0.0092010498046875, "learning_rate": 9.54457320834625e-07, "loss": 0.2531, "reward": 0.45756053365767, "reward_std": 0.7848574221134186, "rewards/cosine_scaled_reward": -0.04899751394987106, "rewards/format_reward": 0.5555555745959282, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 3182.0694580078125, "epoch": 0.19880034275921166, "grad_norm": 0.17537973821163177, "kl": 0.0128631591796875, "learning_rate": 9.530702921077358e-07, "loss": 0.0165, "reward": -0.06121325120329857, "reward_std": 0.4434010796248913, "rewards/cosine_scaled_reward": -0.1694955169223249, "rewards/format_reward": 0.2777777798473835, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 3030.7083129882812, "epoch": 0.20051413881748073, "grad_norm": 0.18003451824188232, "kl": 0.017120361328125, "learning_rate": 9.516636183034564e-07, "loss": 0.1071, "reward": 0.42929551005363464, "reward_std": 0.9132848009467125, "rewards/cosine_scaled_reward": 0.006314422586001456, "rewards/format_reward": 0.4166666651144624, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 2533.9443969726562, "epoch": 0.20222793487574978, "grad_norm": 0.2703484296798706, "kl": 0.011077880859375, "learning_rate": 9.502373679810839e-07, "loss": 0.0946, "reward": 0.4179135374724865, "reward_std": 0.8737296983599663, "rewards/cosine_scaled_reward": -0.04798768740147352, "rewards/format_reward": 0.5138889029622078, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 2823.5000610351562, "epoch": 0.20394173093401885, "grad_norm": 0.19636695086956024, "kl": 0.01312255859375, "learning_rate": 9.487916106540465e-07, "loss": 0.0303, "reward": 0.31334975361824036, "reward_std": 0.30826447159051895, "rewards/cosine_scaled_reward": -0.058602908393368125, "rewards/format_reward": 0.430555553175509, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 2826.5694580078125, "epoch": 0.20565552699228792, "grad_norm": 0.2075241059064865, "kl": 0.016265869140625, "learning_rate": 9.473264167865171e-07, "loss": 0.094, "reward": 0.4697803445160389, "reward_std": 0.7031994387507439, "rewards/cosine_scaled_reward": 0.005723495967686176, "rewards/format_reward": 0.4583333358168602, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 2887.3056030273438, "epoch": 0.207369323050557, "grad_norm": 0.19230371713638306, "kl": 0.0111083984375, "learning_rate": 9.458418577899774e-07, "loss": 0.086, "reward": 0.3282506223767996, "reward_std": 0.7738695293664932, "rewards/cosine_scaled_reward": -0.05809690523892641, "rewards/format_reward": 0.4444444589316845, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 3083.3611450195312, "epoch": 0.20908311910882604, "grad_norm": 0.17026208341121674, "kl": 0.01568603515625, "learning_rate": 9.443380060197385e-07, "loss": 0.0301, "reward": -0.03662687446922064, "reward_std": 0.5345718339085579, "rewards/cosine_scaled_reward": -0.19886899180710316, "rewards/format_reward": 0.361111119389534, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 2682.6805725097656, "epoch": 0.21079691516709512, "grad_norm": 0.19728592038154602, "kl": 0.012115478515625, "learning_rate": 9.428149347714143e-07, "loss": 0.0481, "reward": 0.3675118573009968, "reward_std": 1.058239296078682, "rewards/cosine_scaled_reward": -0.052355190739035606, "rewards/format_reward": 0.4722222248092294, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 3100.1666870117188, "epoch": 0.2125107112253642, "grad_norm": 0.19675055146217346, "kl": 0.013641357421875, "learning_rate": 9.412727182773486e-07, "loss": 0.0775, "reward": 0.28848724998533726, "reward_std": 0.5403149202466011, "rewards/cosine_scaled_reward": -0.04325637500733137, "rewards/format_reward": 0.3750000149011612, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 2862.0277709960938, "epoch": 0.21422450728363324, "grad_norm": 0.1939004808664322, "kl": 0.017242431640625, "learning_rate": 9.397114317029974e-07, "loss": 0.0453, "reward": 0.3707499373704195, "reward_std": 0.7198375910520554, "rewards/cosine_scaled_reward": -0.016013892367482185, "rewards/format_reward": 0.4027777807787061, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 2753.486083984375, "epoch": 0.2159383033419023, "grad_norm": 0.25714027881622314, "kl": 0.0194091796875, "learning_rate": 9.381311511432658e-07, "loss": 0.0648, "reward": 0.3369361013174057, "reward_std": 0.5913353934884071, "rewards/cosine_scaled_reward": -0.08847637102007866, "rewards/format_reward": 0.5138888889923692, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 2923.125, "epoch": 0.21765209940017138, "grad_norm": 0.2240990549325943, "kl": 0.016571044921875, "learning_rate": 9.36531953618799e-07, "loss": 0.076, "reward": -0.2184343640692532, "reward_std": 0.5479928515851498, "rewards/cosine_scaled_reward": -0.27588383853435516, "rewards/format_reward": 0.3333333395421505, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 3276.6111450195312, "epoch": 0.21936589545844046, "grad_norm": 0.15262338519096375, "kl": 0.0207061767578125, "learning_rate": 9.34913917072228e-07, "loss": -0.0001, "reward": -0.12921499274671078, "reward_std": 0.5691854059696198, "rewards/cosine_scaled_reward": -0.1757186003960669, "rewards/format_reward": 0.22222223225980997, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 2278.4305725097656, "epoch": 0.2210796915167095, "grad_norm": 0.3608929216861725, "kl": 0.019287109375, "learning_rate": 9.332771203643714e-07, "loss": 0.0927, "reward": 0.706303309649229, "reward_std": 0.7875337153673172, "rewards/cosine_scaled_reward": 0.04759608302265406, "rewards/format_reward": 0.6111111119389534, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 1965.999984741211, "epoch": 0.22279348757497858, "grad_norm": 0.18217293918132782, "kl": 0.018463134765625, "learning_rate": 9.316216432703916e-07, "loss": 0.0064, "reward": 1.0708431326784194, "reward_std": 0.7828814685344696, "rewards/cosine_scaled_reward": 0.17431045067496598, "rewards/format_reward": 0.722222238779068, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 3232.7222290039062, "epoch": 0.22450728363324765, "grad_norm": 0.1822432279586792, "kl": 0.0173797607421875, "learning_rate": 9.299475664759068e-07, "loss": 0.0286, "reward": -0.31177592277526855, "reward_std": 0.350917749106884, "rewards/cosine_scaled_reward": -0.27394353225827217, "rewards/format_reward": 0.23611112032085657, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 2963.8055419921875, "epoch": 0.2262210796915167, "grad_norm": 0.22750675678253174, "kl": 0.016204833984375, "learning_rate": 9.282549715730579e-07, "loss": 0.0406, "reward": 0.32277560234069824, "reward_std": 0.8804080411791801, "rewards/cosine_scaled_reward": -0.07472331821918488, "rewards/format_reward": 0.4722222238779068, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 3082.263916015625, "epoch": 0.22793487574978577, "grad_norm": 0.2046993225812912, "kl": 0.021820068359375, "learning_rate": 9.265439410565328e-07, "loss": 0.0353, "reward": 0.3385091759264469, "reward_std": 0.7099575102329254, "rewards/cosine_scaled_reward": -0.011300940066576004, "rewards/format_reward": 0.3611111156642437, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 2206.375030517578, "epoch": 0.22964867180805484, "grad_norm": 0.19563263654708862, "kl": 0.017303466796875, "learning_rate": 9.248145583195447e-07, "loss": 0.0577, "reward": 0.640228021889925, "reward_std": 0.7054692879319191, "rewards/cosine_scaled_reward": 0.0006695720367133617, "rewards/format_reward": 0.6388888955116272, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 2421.611114501953, "epoch": 0.23136246786632392, "grad_norm": 0.338701069355011, "kl": 0.0213623046875, "learning_rate": 9.230669076497687e-07, "loss": 0.1507, "reward": 0.6078107673674822, "reward_std": 0.8746988773345947, "rewards/cosine_scaled_reward": 0.0469609391366248, "rewards/format_reward": 0.5138888955116272, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 2651.125030517578, "epoch": 0.23307626392459296, "grad_norm": 0.28927695751190186, "kl": 0.0211334228515625, "learning_rate": 9.213010742252327e-07, "loss": 0.1053, "reward": 0.35874155908823013, "reward_std": 0.7097110822796822, "rewards/cosine_scaled_reward": -0.015073666349053383, "rewards/format_reward": 0.3888888992369175, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 3202.0972900390625, "epoch": 0.23479005998286204, "grad_norm": 0.17811518907546997, "kl": 0.02239990234375, "learning_rate": 9.195171441101668e-07, "loss": 0.0492, "reward": 0.5012375935912132, "reward_std": 0.9828417152166367, "rewards/cosine_scaled_reward": 0.03534099366515875, "rewards/format_reward": 0.43055555410683155, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 2946.4166870117188, "epoch": 0.2365038560411311, "grad_norm": 0.23094090819358826, "kl": 0.0191650390625, "learning_rate": 9.177152042508077e-07, "loss": 0.0193, "reward": 0.09741606749594212, "reward_std": 0.5724444687366486, "rewards/cosine_scaled_reward": -0.13184750825166702, "rewards/format_reward": 0.36111111007630825, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 2927.0972290039062, "epoch": 0.23821765209940018, "grad_norm": 0.19129879772663116, "kl": 0.024505615234375, "learning_rate": 9.158953424711624e-07, "loss": 0.0374, "reward": 0.1535217664204538, "reward_std": 0.4049301743507385, "rewards/cosine_scaled_reward": -0.08296133577823639, "rewards/format_reward": 0.3194444449618459, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 2202.6944580078125, "epoch": 0.23993144815766923, "grad_norm": 0.4724877178668976, "kl": 0.0255584716796875, "learning_rate": 9.140576474687263e-07, "loss": 0.1836, "reward": 0.3395635038614273, "reward_std": 0.6675402373075485, "rewards/cosine_scaled_reward": -0.11494047567248344, "rewards/format_reward": 0.5694444552063942, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 2910.916748046875, "epoch": 0.2416452442159383, "grad_norm": 0.18322300910949707, "kl": 0.02935791015625, "learning_rate": 9.122022088101613e-07, "loss": 0.0365, "reward": 0.045268273912370205, "reward_std": 0.6290135830640793, "rewards/cosine_scaled_reward": -0.1440325528383255, "rewards/format_reward": 0.3333333367481828, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 3141.638916015625, "epoch": 0.24335904027420738, "grad_norm": 0.1756112426519394, "kl": 0.031341552734375, "learning_rate": 9.103291169269299e-07, "loss": 0.0029, "reward": -0.12469126284122467, "reward_std": 0.39061762765049934, "rewards/cosine_scaled_reward": -0.18734563700854778, "rewards/format_reward": 0.2500000009313226, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 2654.4166259765625, "epoch": 0.24507283633247642, "grad_norm": 0.29079416394233704, "kl": 0.020843505859375, "learning_rate": 9.084384631108882e-07, "loss": 0.0697, "reward": 0.4159288965165615, "reward_std": 0.7245111912488937, "rewards/cosine_scaled_reward": -0.06981334753800184, "rewards/format_reward": 0.555555559694767, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 3201.9306030273438, "epoch": 0.2467866323907455, "grad_norm": 0.197592630982399, "kl": 0.0269775390625, "learning_rate": 9.065303395098358e-07, "loss": 0.0373, "reward": -0.11726564727723598, "reward_std": 0.6086189821362495, "rewards/cosine_scaled_reward": -0.21141060069203377, "rewards/format_reward": 0.30555556155741215, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 2885.9722900390625, "epoch": 0.24850042844901457, "grad_norm": 0.29763004183769226, "kl": 0.028472900390625, "learning_rate": 9.046048391230247e-07, "loss": 0.0677, "reward": 0.5742630921304226, "reward_std": 0.37366680055856705, "rewards/cosine_scaled_reward": 0.0649093296378851, "rewards/format_reward": 0.4444444440305233, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 2155.9444885253906, "epoch": 0.25021422450728364, "grad_norm": 0.3611903190612793, "kl": 0.024993896484375, "learning_rate": 9.026620557966279e-07, "loss": 0.1546, "reward": 0.5257812030613422, "reward_std": 0.9518508315086365, "rewards/cosine_scaled_reward": -0.049609407782554626, "rewards/format_reward": 0.625, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 2810.9583740234375, "epoch": 0.2519280205655527, "grad_norm": 0.2670803964138031, "kl": 0.031829833984375, "learning_rate": 9.007020842191634e-07, "loss": 0.0389, "reward": 0.11524944752454758, "reward_std": 0.6441401988267899, "rewards/cosine_scaled_reward": -0.12293083127588034, "rewards/format_reward": 0.36111112032085657, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 3006.0416259765625, "epoch": 0.2536418166238218, "grad_norm": 0.230261892080307, "kl": 0.0283203125, "learning_rate": 8.987250199168808e-07, "loss": 0.0866, "reward": -0.06906389445066452, "reward_std": 0.41436275094747543, "rewards/cosine_scaled_reward": -0.21508748084306717, "rewards/format_reward": 0.3611111156642437, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 2527.4305572509766, "epoch": 0.25535561268209084, "grad_norm": 0.2313620001077652, "kl": 0.028106689453125, "learning_rate": 8.967309592491052e-07, "loss": 0.05, "reward": 0.3055970072746277, "reward_std": 0.8265255615115166, "rewards/cosine_scaled_reward": -0.0972014885628596, "rewards/format_reward": 0.5000000074505806, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 2374.0416870117188, "epoch": 0.2570694087403599, "grad_norm": 0.7321764826774597, "kl": 0.028839111328125, "learning_rate": 8.9471999940354e-07, "loss": 0.1817, "reward": 0.8978928253054619, "reward_std": 0.7169746980071068, "rewards/cosine_scaled_reward": 0.16422418132424355, "rewards/format_reward": 0.5694444477558136, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 2437.0694580078125, "epoch": 0.258783204798629, "grad_norm": 0.7071412801742554, "kl": 0.041168212890625, "learning_rate": 8.926922383915315e-07, "loss": 0.2136, "reward": 0.06301388889551163, "reward_std": 0.4757090378552675, "rewards/cosine_scaled_reward": -0.21849306486546993, "rewards/format_reward": 0.5000000074505806, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 2845.4583129882812, "epoch": 0.26049700085689803, "grad_norm": 0.5604143738746643, "kl": 0.046142578125, "learning_rate": 8.906477750432903e-07, "loss": 0.1265, "reward": 0.16903822124004364, "reward_std": 0.5248951427638531, "rewards/cosine_scaled_reward": -0.09603646397590637, "rewards/format_reward": 0.36111111380159855, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 2768.4027709960938, "epoch": 0.2622107969151671, "grad_norm": 0.23171323537826538, "kl": 0.0499267578125, "learning_rate": 8.88586709003076e-07, "loss": 0.0185, "reward": 0.15082042291760445, "reward_std": 0.7368991822004318, "rewards/cosine_scaled_reward": -0.12597868964076042, "rewards/format_reward": 0.4027777798473835, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 2804.9861450195312, "epoch": 0.2639245929734362, "grad_norm": 0.40300193428993225, "kl": 0.05609130859375, "learning_rate": 8.865091407243394e-07, "loss": 0.0954, "reward": 0.4552767127752304, "reward_std": 0.7285914719104767, "rewards/cosine_scaled_reward": -0.00847275834530592, "rewards/format_reward": 0.472222238779068, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 3049.0416870117188, "epoch": 0.2656383890317052, "grad_norm": 0.23651528358459473, "kl": 0.0596923828125, "learning_rate": 8.844151714648274e-07, "loss": -0.0013, "reward": 0.12507159425877035, "reward_std": 0.8443149924278259, "rewards/cosine_scaled_reward": -0.10413086414337158, "rewards/format_reward": 0.3333333469927311, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 2847.4584350585938, "epoch": 0.26735218508997427, "grad_norm": 0.3277675211429596, "kl": 0.05987548828125, "learning_rate": 8.823049032816478e-07, "loss": 0.032, "reward": 0.31620367243885994, "reward_std": 0.7322921454906464, "rewards/cosine_scaled_reward": -0.10578705929219723, "rewards/format_reward": 0.5277777835726738, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 2717.8750610351562, "epoch": 0.26906598114824337, "grad_norm": 0.39394786953926086, "kl": 0.0775146484375, "learning_rate": 8.801784390262943e-07, "loss": 0.0925, "reward": 0.10540201608091593, "reward_std": 0.6488600596785545, "rewards/cosine_scaled_reward": -0.13479896634817123, "rewards/format_reward": 0.3750000037252903, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 2818.125, "epoch": 0.2707797772065124, "grad_norm": 0.40347573161125183, "kl": 0.0806884765625, "learning_rate": 8.780358823396352e-07, "loss": 0.0484, "reward": 0.07575460057705641, "reward_std": 0.6178670972585678, "rewards/cosine_scaled_reward": -0.15656715538352728, "rewards/format_reward": 0.3888888889923692, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 3166.1666259765625, "epoch": 0.27249357326478146, "grad_norm": 0.3342011868953705, "kl": 0.1055908203125, "learning_rate": 8.758773376468604e-07, "loss": 0.0491, "reward": -0.02998074982315302, "reward_std": 0.6097311675548553, "rewards/cosine_scaled_reward": -0.19554592855274677, "rewards/format_reward": 0.36111111380159855, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 3013.7222290039062, "epoch": 0.27420736932305056, "grad_norm": 0.4173794388771057, "kl": 0.106689453125, "learning_rate": 8.737029101523929e-07, "loss": 0.0339, "reward": 0.3507204055786133, "reward_std": 0.6021532118320465, "rewards/cosine_scaled_reward": 0.0017490852624177933, "rewards/format_reward": 0.34722222201526165, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 2572.8194274902344, "epoch": 0.2759211653813196, "grad_norm": 0.4282573163509369, "kl": 0.123046875, "learning_rate": 8.715127058347614e-07, "loss": 0.0645, "reward": 0.010059013031423092, "reward_std": 0.5160095170140266, "rewards/cosine_scaled_reward": -0.15469271643087268, "rewards/format_reward": 0.3194444514811039, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 2242.9444274902344, "epoch": 0.2776349614395887, "grad_norm": 0.39615368843078613, "kl": 0.1090087890625, "learning_rate": 8.693068314414344e-07, "loss": 0.0772, "reward": 0.16390804119873792, "reward_std": 0.5712290816009045, "rewards/cosine_scaled_reward": -0.18887930922210217, "rewards/format_reward": 0.5416666716337204, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 2832.9861450195312, "epoch": 0.27934875749785776, "grad_norm": 0.6331592798233032, "kl": 0.1380615234375, "learning_rate": 8.670853944836176e-07, "loss": 0.1022, "reward": 0.20613746903836727, "reward_std": 0.7383135333657265, "rewards/cosine_scaled_reward": -0.04276460176333785, "rewards/format_reward": 0.2916666753590107, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 2359.250030517578, "epoch": 0.2810625535561268, "grad_norm": 0.8423472046852112, "kl": 0.1591796875, "learning_rate": 8.648485032310144e-07, "loss": 0.0087, "reward": 0.1561935730278492, "reward_std": 0.8059368506073952, "rewards/cosine_scaled_reward": -0.10245877737179399, "rewards/format_reward": 0.3611111156642437, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 2781.041717529297, "epoch": 0.2827763496143959, "grad_norm": 0.5075474977493286, "kl": 0.17626953125, "learning_rate": 8.625962667065487e-07, "loss": 0.0061, "reward": 0.35288394801318645, "reward_std": 0.7819623723626137, "rewards/cosine_scaled_reward": -0.038835824467241764, "rewards/format_reward": 0.4305555634200573, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 2699.3611450195312, "epoch": 0.28449014567266495, "grad_norm": 0.41815418004989624, "kl": 0.1614990234375, "learning_rate": 8.603287946810513e-07, "loss": 0.0354, "reward": 0.2616021269932389, "reward_std": 0.8704780116677284, "rewards/cosine_scaled_reward": -0.07753227837383747, "rewards/format_reward": 0.416666672565043, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 2883.5972900390625, "epoch": 0.286203941730934, "grad_norm": 0.6100507378578186, "kl": 0.192626953125, "learning_rate": 8.580461976679099e-07, "loss": 0.1117, "reward": 0.6217167973518372, "reward_std": 1.1077049523591995, "rewards/cosine_scaled_reward": 0.08863616734743118, "rewards/format_reward": 0.4444444477558136, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 2844.77783203125, "epoch": 0.2879177377892031, "grad_norm": 1.0341858863830566, "kl": 0.193359375, "learning_rate": 8.557485869176825e-07, "loss": 0.1403, "reward": 0.44696745090186596, "reward_std": 0.8215643167495728, "rewards/cosine_scaled_reward": 0.008205945428926498, "rewards/format_reward": 0.4305555671453476, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 2149.9306030273438, "epoch": 0.28963153384747214, "grad_norm": 0.8718350529670715, "kl": 0.213134765625, "learning_rate": 8.534360744126753e-07, "loss": 0.0265, "reward": 0.24263115064240992, "reward_std": 0.7163522839546204, "rewards/cosine_scaled_reward": -0.08701775036752224, "rewards/format_reward": 0.416666679084301, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 2499.9722900390625, "epoch": 0.2913453299057412, "grad_norm": 0.7302869558334351, "kl": 0.233154296875, "learning_rate": 8.511087728614862e-07, "loss": 0.1123, "reward": 0.1794309187680483, "reward_std": 0.7098504453897476, "rewards/cosine_scaled_reward": -0.056117892265319824, "rewards/format_reward": 0.2916666679084301, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 2474.6250610351562, "epoch": 0.2930591259640103, "grad_norm": 0.5687596797943115, "kl": 0.2568359375, "learning_rate": 8.487667956935087e-07, "loss": 0.0829, "reward": 0.3411689009517431, "reward_std": 0.6550407111644745, "rewards/cosine_scaled_reward": -0.1280266623944044, "rewards/format_reward": 0.597222238779068, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 2846.763916015625, "epoch": 0.29477292202227934, "grad_norm": 0.48782670497894287, "kl": 0.252197265625, "learning_rate": 8.464102570534061e-07, "loss": 0.0672, "reward": 0.22427130304276943, "reward_std": 0.6338695511221886, "rewards/cosine_scaled_reward": -0.10314211621880531, "rewards/format_reward": 0.4305555559694767, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 2281.2638549804688, "epoch": 0.29648671808054844, "grad_norm": 1.23881196975708, "kl": 0.234375, "learning_rate": 8.440392717955475e-07, "loss": 0.1363, "reward": 0.24636091478168964, "reward_std": 0.725439690053463, "rewards/cosine_scaled_reward": -0.13376398687250912, "rewards/format_reward": 0.5138888899236917, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 2596.7500610351562, "epoch": 0.2982005141388175, "grad_norm": 0.9256901741027832, "kl": 0.32763671875, "learning_rate": 8.416539554784089e-07, "loss": 0.0993, "reward": 0.03723787656053901, "reward_std": 0.669374942779541, "rewards/cosine_scaled_reward": -0.18276994861662388, "rewards/format_reward": 0.4027777835726738, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 2802.7916870117188, "epoch": 0.29991431019708653, "grad_norm": 1.6108390092849731, "kl": 0.41748046875, "learning_rate": 8.392544243589427e-07, "loss": -0.0161, "reward": -0.026769233867526054, "reward_std": 0.7613073363900185, "rewards/cosine_scaled_reward": -0.1939401812851429, "rewards/format_reward": 0.3611111231148243, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 2250.7916564941406, "epoch": 0.30162810625535563, "grad_norm": 1.3027092218399048, "kl": 0.29345703125, "learning_rate": 8.368407953869103e-07, "loss": 0.1672, "reward": 0.34848211891949177, "reward_std": 0.8886565566062927, "rewards/cosine_scaled_reward": -0.07575894566252828, "rewards/format_reward": 0.5000000074505806, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 3187.25, "epoch": 0.3033419023136247, "grad_norm": 0.7333221435546875, "kl": 0.390625, "learning_rate": 8.344131861991828e-07, "loss": 0.0057, "reward": -0.06705992296338081, "reward_std": 0.5766744017601013, "rewards/cosine_scaled_reward": -0.16547441016882658, "rewards/format_reward": 0.26388889364898205, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 2658.9583129882812, "epoch": 0.3050556983718937, "grad_norm": 2.110689878463745, "kl": 0.40185546875, "learning_rate": 8.319717151140072e-07, "loss": 0.1018, "reward": 0.15619678050279617, "reward_std": 0.5456085540354252, "rewards/cosine_scaled_reward": -0.15106826776172966, "rewards/format_reward": 0.4583333358168602, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 3046.361083984375, "epoch": 0.3067694944301628, "grad_norm": 1.409805417060852, "kl": 0.44482421875, "learning_rate": 8.295165011252396e-07, "loss": 0.1019, "reward": -0.08309876918792725, "reward_std": 0.6837619245052338, "rewards/cosine_scaled_reward": -0.1665493929758668, "rewards/format_reward": 0.2500000027939677, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 2090.263916015625, "epoch": 0.30848329048843187, "grad_norm": 1.259092926979065, "kl": 0.4365234375, "learning_rate": 8.270476638965461e-07, "loss": 0.1312, "reward": 0.5355786010622978, "reward_std": 0.9339739978313446, "rewards/cosine_scaled_reward": -0.07248848024755716, "rewards/format_reward": 0.6805555671453476, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 2998.388916015625, "epoch": 0.3101970865467009, "grad_norm": 0.7514684796333313, "kl": 0.5283203125, "learning_rate": 8.245653237555705e-07, "loss": 0.0645, "reward": 0.0823521837592125, "reward_std": 0.6557292975485325, "rewards/cosine_scaled_reward": -0.1602128129452467, "rewards/format_reward": 0.4027777835726738, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 3348.6806030273438, "epoch": 0.31191088260497, "grad_norm": 1.1279796361923218, "kl": 0.6435546875, "learning_rate": 8.220696016880687e-07, "loss": 0.0509, "reward": -0.29229177720844746, "reward_std": 0.44720375537872314, "rewards/cosine_scaled_reward": -0.27809032425284386, "rewards/format_reward": 0.26388889364898205, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 2909.6806030273438, "epoch": 0.31362467866323906, "grad_norm": 0.8539410829544067, "kl": 0.5654296875, "learning_rate": 8.195606193320136e-07, "loss": 0.1078, "reward": 0.20359659614041448, "reward_std": 0.7151020988821983, "rewards/cosine_scaled_reward": -0.07875726278871298, "rewards/format_reward": 0.361111112870276, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 2720.6666870117188, "epoch": 0.31533847472150817, "grad_norm": 1.0726344585418701, "kl": 0.6005859375, "learning_rate": 8.170384989716657e-07, "loss": 0.0571, "reward": 0.45398143492639065, "reward_std": 0.8964811712503433, "rewards/cosine_scaled_reward": -0.04384262952953577, "rewards/format_reward": 0.5416666716337204, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 2828.9305419921875, "epoch": 0.3170522707797772, "grad_norm": 0.9460340142250061, "kl": 0.6220703125, "learning_rate": 8.145033635316128e-07, "loss": 0.1297, "reward": -0.03706150595098734, "reward_std": 0.7321052774786949, "rewards/cosine_scaled_reward": -0.1990863112732768, "rewards/format_reward": 0.3611111231148243, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 2725.8750610351562, "epoch": 0.31876606683804626, "grad_norm": 1.0472413301467896, "kl": 0.5556640625, "learning_rate": 8.119553365707802e-07, "loss": 0.0507, "reward": -0.008732129819691181, "reward_std": 0.43902990967035294, "rewards/cosine_scaled_reward": -0.2057549599558115, "rewards/format_reward": 0.4027777835726738, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 2761.0556030273438, "epoch": 0.32047986289631536, "grad_norm": 0.9237687587738037, "kl": 0.55126953125, "learning_rate": 8.093945422764069e-07, "loss": 0.1122, "reward": 0.34623236872721463, "reward_std": 0.8785705417394638, "rewards/cosine_scaled_reward": -0.07688381336629391, "rewards/format_reward": 0.5000000074505806, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 3188.4027709960938, "epoch": 0.3221936589545844, "grad_norm": 1.4287723302841187, "kl": 0.607421875, "learning_rate": 8.068211054579943e-07, "loss": 0.0457, "reward": -0.10439129918813705, "reward_std": 0.6522045210003853, "rewards/cosine_scaled_reward": -0.20497343130409718, "rewards/format_reward": 0.30555556807667017, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 2965.75, "epoch": 0.32390745501285345, "grad_norm": 1.0540153980255127, "kl": 0.6044921875, "learning_rate": 8.04235151541222e-07, "loss": 0.0926, "reward": 0.00805249996483326, "reward_std": 0.5005255490541458, "rewards/cosine_scaled_reward": -0.1904182005673647, "rewards/format_reward": 0.38888888992369175, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 2995.3611450195312, "epoch": 0.32562125107112255, "grad_norm": 2.005993604660034, "kl": 0.60546875, "learning_rate": 8.01636806561836e-07, "loss": 0.1571, "reward": -0.2390465196222067, "reward_std": 0.5108147040009499, "rewards/cosine_scaled_reward": -0.2792454734444618, "rewards/format_reward": 0.3194444449618459, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 2995.02783203125, "epoch": 0.3273350471293916, "grad_norm": 0.914374828338623, "kl": 0.537109375, "learning_rate": 7.990261971595048e-07, "loss": 0.0791, "reward": -0.008268387988209724, "reward_std": 0.7869899272918701, "rewards/cosine_scaled_reward": -0.19163418684911449, "rewards/format_reward": 0.3750000037252903, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 2657.3055725097656, "epoch": 0.32904884318766064, "grad_norm": 0.9198621511459351, "kl": 0.6298828125, "learning_rate": 7.964034505716476e-07, "loss": 0.1016, "reward": 0.14560853224247694, "reward_std": 0.44526704400777817, "rewards/cosine_scaled_reward": -0.177195742726326, "rewards/format_reward": 0.5000000074505806, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 2256.6666564941406, "epoch": 0.33076263924592975, "grad_norm": 0.9307562708854675, "kl": 0.5947265625, "learning_rate": 7.93768694627233e-07, "loss": 0.082, "reward": 0.184324630536139, "reward_std": 0.5673187747597694, "rewards/cosine_scaled_reward": -0.17867101542651653, "rewards/format_reward": 0.5416666679084301, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 2654.013916015625, "epoch": 0.3324764353041988, "grad_norm": 1.2104908227920532, "kl": 0.6787109375, "learning_rate": 7.911220577405484e-07, "loss": 0.0927, "reward": 0.5049788989126682, "reward_std": 0.6255298256874084, "rewards/cosine_scaled_reward": 0.0024894457310438156, "rewards/format_reward": 0.5000000074505806, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 3088.2222900390625, "epoch": 0.3341902313624679, "grad_norm": 2.0733349323272705, "kl": 0.787109375, "learning_rate": 7.884636689049422e-07, "loss": 0.0256, "reward": -0.1914132796227932, "reward_std": 0.39547703973948956, "rewards/cosine_scaled_reward": -0.23459553346037865, "rewards/format_reward": 0.2777777761220932, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 2572.5416259765625, "epoch": 0.33590402742073694, "grad_norm": 0.9238296151161194, "kl": 0.6484375, "learning_rate": 7.857936576865356e-07, "loss": 0.0799, "reward": 0.4742476176470518, "reward_std": 0.8941326662898064, "rewards/cosine_scaled_reward": -0.04759840480983257, "rewards/format_reward": 0.5694444477558136, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 2907.4583129882812, "epoch": 0.337617823479006, "grad_norm": 0.9024485945701599, "kl": 0.693359375, "learning_rate": 7.831121542179086e-07, "loss": 0.0713, "reward": 0.0948091521859169, "reward_std": 0.4578506797552109, "rewards/cosine_scaled_reward": -0.11926210392266512, "rewards/format_reward": 0.33333334513008595, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 2943.4305419921875, "epoch": 0.3393316195372751, "grad_norm": 1.3114806413650513, "kl": 0.7470703125, "learning_rate": 7.804192891917571e-07, "loss": 0.0493, "reward": 0.04198681065463461, "reward_std": 0.5121570453047752, "rewards/cosine_scaled_reward": -0.1595621556043625, "rewards/format_reward": 0.3611111156642437, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 3010.9166870117188, "epoch": 0.34104541559554413, "grad_norm": 0.6777936816215515, "kl": 0.697265625, "learning_rate": 7.777151938545235e-07, "loss": 0.0892, "reward": 0.12530913203954697, "reward_std": 0.5297227501869202, "rewards/cosine_scaled_reward": -0.145678770262748, "rewards/format_reward": 0.4166666716337204, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 2658.4722595214844, "epoch": 0.3427592116538132, "grad_norm": 1.0869694948196411, "kl": 0.5361328125, "learning_rate": 7.75e-07, "loss": 0.0285, "reward": 0.42103337205480784, "reward_std": 0.5303617715835571, "rewards/cosine_scaled_reward": -0.04642775317188352, "rewards/format_reward": 0.5138888992369175, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 2742.0833435058594, "epoch": 0.3444730077120823, "grad_norm": 0.6390620470046997, "kl": 0.56005859375, "learning_rate": 7.72273839962904e-07, "loss": 0.078, "reward": 0.13805552199482918, "reward_std": 0.5941917151212692, "rewards/cosine_scaled_reward": -0.18097224179655313, "rewards/format_reward": 0.5000000037252903, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 2622.611083984375, "epoch": 0.3461868037703513, "grad_norm": 1.5139989852905273, "kl": 0.51416015625, "learning_rate": 7.695368466124296e-07, "loss": 0.1322, "reward": 0.2749571923632175, "reward_std": 0.6380000561475754, "rewards/cosine_scaled_reward": -0.09863251959905028, "rewards/format_reward": 0.4722222313284874, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 2710.7361450195312, "epoch": 0.34790059982862037, "grad_norm": 1.517341136932373, "kl": 0.51171875, "learning_rate": 7.667891533457718e-07, "loss": 0.1124, "reward": 0.5119861587882042, "reward_std": 0.9760274440050125, "rewards/cosine_scaled_reward": -0.014840253628790379, "rewards/format_reward": 0.541666679084301, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 2822.6112060546875, "epoch": 0.3496143958868895, "grad_norm": 1.1272459030151367, "kl": 0.548828125, "learning_rate": 7.640308940816239e-07, "loss": 0.0668, "reward": 0.03917721984907985, "reward_std": 0.7430369108915329, "rewards/cosine_scaled_reward": -0.17485582828521729, "rewards/format_reward": 0.38888888992369175, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 2645.5694580078125, "epoch": 0.3513281919451585, "grad_norm": 1.7582755088806152, "kl": 0.5556640625, "learning_rate": 7.612622032536507e-07, "loss": 0.1055, "reward": 0.509862631559372, "reward_std": 0.7304475903511047, "rewards/cosine_scaled_reward": 0.004931296221911907, "rewards/format_reward": 0.5000000111758709, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 2748.02783203125, "epoch": 0.35304198800342756, "grad_norm": 13.779873847961426, "kl": 1.0166015625, "learning_rate": 7.584832158039378e-07, "loss": 0.0928, "reward": 0.13606557785533369, "reward_std": 0.5326481983065605, "rewards/cosine_scaled_reward": -0.2236338797956705, "rewards/format_reward": 0.5833333507180214, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 2727.9722900390625, "epoch": 0.35475578406169667, "grad_norm": 4.678215503692627, "kl": 0.8974609375, "learning_rate": 7.556940671764124e-07, "loss": 0.1124, "reward": 0.3662101551890373, "reward_std": 0.5158084109425545, "rewards/cosine_scaled_reward": -0.05995047930628061, "rewards/format_reward": 0.4861111268401146, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 2660.9722900390625, "epoch": 0.3564695801199657, "grad_norm": 2.2143702507019043, "kl": 0.7880859375, "learning_rate": 7.528948933102438e-07, "loss": 0.067, "reward": 0.29765829257667065, "reward_std": 0.7447296231985092, "rewards/cosine_scaled_reward": -0.1428375095129013, "rewards/format_reward": 0.5833333432674408, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 2533.4444580078125, "epoch": 0.3581833761782348, "grad_norm": 1.057923436164856, "kl": 0.626953125, "learning_rate": 7.500858306332172e-07, "loss": 0.1388, "reward": 0.22743514459580183, "reward_std": 0.8155356049537659, "rewards/cosine_scaled_reward": -0.15017131343483925, "rewards/format_reward": 0.5277777835726738, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 2513.763916015625, "epoch": 0.35989717223650386, "grad_norm": 3.4706244468688965, "kl": 0.6640625, "learning_rate": 7.472670160550848e-07, "loss": 0.2307, "reward": 0.32537855207920074, "reward_std": 0.6403735391795635, "rewards/cosine_scaled_reward": -0.11508850922109559, "rewards/format_reward": 0.555555559694767, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 3072.1806030273438, "epoch": 0.3616109682947729, "grad_norm": 0.867877721786499, "kl": 0.85009765625, "learning_rate": 7.444385869608921e-07, "loss": 0.1175, "reward": 0.021036310121417046, "reward_std": 0.5472413003444672, "rewards/cosine_scaled_reward": -0.170037392526865, "rewards/format_reward": 0.36111112125217915, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 2580.791717529297, "epoch": 0.363324764353042, "grad_norm": 1.1602129936218262, "kl": 0.8037109375, "learning_rate": 7.416006812042827e-07, "loss": 0.1343, "reward": 0.6223988421261311, "reward_std": 0.851245753467083, "rewards/cosine_scaled_reward": -0.0013005826622247696, "rewards/format_reward": 0.625, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 2920.2916870117188, "epoch": 0.36503856041131105, "grad_norm": 1.2226418256759644, "kl": 1.0029296875, "learning_rate": 7.387534371007797e-07, "loss": 0.1063, "reward": 0.10683083906769753, "reward_std": 0.6580070406198502, "rewards/cosine_scaled_reward": -0.21741791814565659, "rewards/format_reward": 0.5416666641831398, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 2951.166748046875, "epoch": 0.3667523564695801, "grad_norm": 1.302587628364563, "kl": 1.1484375, "learning_rate": 7.358969934210438e-07, "loss": 0.1102, "reward": 0.16499032359570265, "reward_std": 0.5117045789957047, "rewards/cosine_scaled_reward": -0.13278261446976103, "rewards/format_reward": 0.4305555671453476, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 2642.3056030273438, "epoch": 0.3684661525278492, "grad_norm": 1.028397560119629, "kl": 0.8671875, "learning_rate": 7.330314893841101e-07, "loss": 0.1035, "reward": 0.3645508070476353, "reward_std": 0.9228581190109253, "rewards/cosine_scaled_reward": -0.10939126997254789, "rewards/format_reward": 0.5833333283662796, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 2541.0833129882812, "epoch": 0.37017994858611825, "grad_norm": 1.578083872795105, "kl": 0.86865234375, "learning_rate": 7.301570646506027e-07, "loss": 0.1489, "reward": 0.26378826051950455, "reward_std": 0.6202561929821968, "rewards/cosine_scaled_reward": -0.13199475780129433, "rewards/format_reward": 0.5277777910232544, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 2184.9445190429688, "epoch": 0.3718937446443873, "grad_norm": 1.103194236755371, "kl": 0.6845703125, "learning_rate": 7.27273859315928e-07, "loss": 0.0844, "reward": 0.4086096244864166, "reward_std": 0.7625616788864136, "rewards/cosine_scaled_reward": -0.11513962969183922, "rewards/format_reward": 0.6388889029622078, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 2404.6805725097656, "epoch": 0.3736075407026564, "grad_norm": 2.3009181022644043, "kl": 0.8935546875, "learning_rate": 7.243820139034464e-07, "loss": 0.1892, "reward": 0.4591095373034477, "reward_std": 0.5642153918743134, "rewards/cosine_scaled_reward": -0.08294522017240524, "rewards/format_reward": 0.625, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 3044.9862060546875, "epoch": 0.37532133676092544, "grad_norm": 1.2761178016662598, "kl": 1.19921875, "learning_rate": 7.214816693576234e-07, "loss": 0.1195, "reward": 0.21450293064117432, "reward_std": 0.7603526711463928, "rewards/cosine_scaled_reward": -0.12191520072519779, "rewards/format_reward": 0.4583333358168602, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 2793.90283203125, "epoch": 0.37703513281919454, "grad_norm": 1.6576476097106934, "kl": 1.166015625, "learning_rate": 7.185729670371604e-07, "loss": 0.0977, "reward": 0.22772593423724174, "reward_std": 0.5124068222939968, "rewards/cosine_scaled_reward": -0.18474812898784876, "rewards/format_reward": 0.5972222313284874, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 2835.0, "epoch": 0.3787489288774636, "grad_norm": 3.882580280303955, "kl": 1.0927734375, "learning_rate": 7.156560487081051e-07, "loss": 0.0245, "reward": 0.10485807061195374, "reward_std": 0.5114092901349068, "rewards/cosine_scaled_reward": -0.20451541244983673, "rewards/format_reward": 0.5138888955116272, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 2310.9305725097656, "epoch": 0.38046272493573263, "grad_norm": 1.3096808195114136, "kl": 0.87890625, "learning_rate": 7.127310565369415e-07, "loss": 0.1143, "reward": 0.4324228148907423, "reward_std": 0.5727507174015045, "rewards/cosine_scaled_reward": -0.13101080805063248, "rewards/format_reward": 0.6944444477558136, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 2826.2222290039062, "epoch": 0.38217652099400173, "grad_norm": 1.182824730873108, "kl": 0.931640625, "learning_rate": 7.097981330836616e-07, "loss": 0.1159, "reward": 0.17814365401864052, "reward_std": 0.5051928982138634, "rewards/cosine_scaled_reward": -0.14703928492963314, "rewards/format_reward": 0.4722222238779068, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 2644.8611450195312, "epoch": 0.3838903170522708, "grad_norm": 1.271640658378601, "kl": 0.7939453125, "learning_rate": 7.068574212948169e-07, "loss": 0.0771, "reward": 0.21772570302709937, "reward_std": 0.5406957715749741, "rewards/cosine_scaled_reward": -0.17585936933755875, "rewards/format_reward": 0.5694444477558136, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 2389.749969482422, "epoch": 0.3856041131105398, "grad_norm": 2.586735486984253, "kl": 0.6689453125, "learning_rate": 7.039090644965509e-07, "loss": -0.0027, "reward": 0.5408617407083511, "reward_std": 0.7554269433021545, "rewards/cosine_scaled_reward": -0.03512469958513975, "rewards/format_reward": 0.6111111119389534, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 2435.250030517578, "epoch": 0.3873179091688089, "grad_norm": 1.4329172372817993, "kl": 0.59521484375, "learning_rate": 7.009532063876148e-07, "loss": 0.0425, "reward": 0.21151528507471085, "reward_std": 0.6967541426420212, "rewards/cosine_scaled_reward": -0.14424235187470913, "rewards/format_reward": 0.5000000149011612, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 2509.1945190429688, "epoch": 0.389031705227078, "grad_norm": 1.7964338064193726, "kl": 0.51220703125, "learning_rate": 6.979899910323624e-07, "loss": 0.0631, "reward": 0.3222038522362709, "reward_std": 0.7920150905847549, "rewards/cosine_scaled_reward": -0.0958425235003233, "rewards/format_reward": 0.5138888955116272, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 2343.888885498047, "epoch": 0.390745501285347, "grad_norm": 1.312915563583374, "kl": 0.48828125, "learning_rate": 6.950195628537299e-07, "loss": 0.0327, "reward": 0.72439269348979, "reward_std": 0.6716032773256302, "rewards/cosine_scaled_reward": 0.028863003477454185, "rewards/format_reward": 0.6666666865348816, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 2668.8333740234375, "epoch": 0.3924592973436161, "grad_norm": 1.1794544458389282, "kl": 0.591796875, "learning_rate": 6.920420666261961e-07, "loss": 0.0617, "reward": 0.4572554435580969, "reward_std": 0.6365808099508286, "rewards/cosine_scaled_reward": -0.028316727373749018, "rewards/format_reward": 0.5138888955116272, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 2405.111114501953, "epoch": 0.39417309340188517, "grad_norm": 2.7993645668029785, "kl": 0.52197265625, "learning_rate": 6.890576474687263e-07, "loss": 0.193, "reward": 0.6450787968933582, "reward_std": 0.6886177062988281, "rewards/cosine_scaled_reward": -0.017738381633535028, "rewards/format_reward": 0.6805555671453476, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 2889.763916015625, "epoch": 0.39588688946015427, "grad_norm": 0.9008044600486755, "kl": 0.59228515625, "learning_rate": 6.860664508377001e-07, "loss": 0.0832, "reward": 0.17632517218589783, "reward_std": 0.7136962860822678, "rewards/cosine_scaled_reward": -0.11322630103677511, "rewards/format_reward": 0.4027777835726738, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 2096.763885498047, "epoch": 0.3976006855184233, "grad_norm": 2.9937281608581543, "kl": 0.44482421875, "learning_rate": 6.83068622519821e-07, "loss": 0.0198, "reward": 0.45767842745408416, "reward_std": 0.6805157586932182, "rewards/cosine_scaled_reward": -0.0697719173040241, "rewards/format_reward": 0.5972222313284874, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 2095.625030517578, "epoch": 0.39931448157669236, "grad_norm": 3.1695449352264404, "kl": 0.52392578125, "learning_rate": 6.800643086250121e-07, "loss": 0.124, "reward": 0.8969383761286736, "reward_std": 0.8693148195743561, "rewards/cosine_scaled_reward": 0.10124696930870414, "rewards/format_reward": 0.6944444477558136, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 2397.9722290039062, "epoch": 0.40102827763496146, "grad_norm": 2.038714647293091, "kl": 0.60302734375, "learning_rate": 6.770536555792944e-07, "loss": 0.0803, "reward": 0.3801136128604412, "reward_std": 0.6368846967816353, "rewards/cosine_scaled_reward": -0.11549876257777214, "rewards/format_reward": 0.6111111044883728, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 2803.1805419921875, "epoch": 0.4027420736932305, "grad_norm": 1.1210250854492188, "kl": 0.76171875, "learning_rate": 6.740368101176495e-07, "loss": 0.0749, "reward": 0.051421504467725754, "reward_std": 0.46992237120866776, "rewards/cosine_scaled_reward": -0.21734481677412987, "rewards/format_reward": 0.4861111231148243, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 2735.9583129882812, "epoch": 0.40445586975149955, "grad_norm": 1.5609543323516846, "kl": 0.6220703125, "learning_rate": 6.710139192768694e-07, "loss": 0.1051, "reward": 0.33594064973294735, "reward_std": 0.5969594717025757, "rewards/cosine_scaled_reward": -0.1306407954543829, "rewards/format_reward": 0.597222238779068, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 2207.9444580078125, "epoch": 0.40616966580976865, "grad_norm": 3.293438673019409, "kl": 0.619140625, "learning_rate": 6.679851303883891e-07, "loss": 0.1014, "reward": 0.6933649554848671, "reward_std": 0.4978405013680458, "rewards/cosine_scaled_reward": -0.02831752598285675, "rewards/format_reward": 0.75, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 2911.0972900390625, "epoch": 0.4078834618680377, "grad_norm": 1.396133303642273, "kl": 0.689453125, "learning_rate": 6.649505910711058e-07, "loss": 0.1308, "reward": 0.23781822435557842, "reward_std": 0.5772198215126991, "rewards/cosine_scaled_reward": -0.1449797886889428, "rewards/format_reward": 0.5277777835726738, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 2865.1944580078125, "epoch": 0.40959725792630675, "grad_norm": 1.02251398563385, "kl": 0.775390625, "learning_rate": 6.619104492241847e-07, "loss": 0.1421, "reward": 0.38452258985489607, "reward_std": 0.7435072809457779, "rewards/cosine_scaled_reward": -0.0646831514313817, "rewards/format_reward": 0.5138888880610466, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 2345.4444274902344, "epoch": 0.41131105398457585, "grad_norm": 4.698256492614746, "kl": 0.6904296875, "learning_rate": 6.588648530198504e-07, "loss": 0.1594, "reward": 0.7729744166135788, "reward_std": 0.8151284381747246, "rewards/cosine_scaled_reward": 0.10176499933004379, "rewards/format_reward": 0.5694444477558136, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 2070.4166259765625, "epoch": 0.4130248500428449, "grad_norm": 8.216842651367188, "kl": 0.52490234375, "learning_rate": 6.558139508961654e-07, "loss": 0.1321, "reward": 0.48884235695004463, "reward_std": 0.5597383752465248, "rewards/cosine_scaled_reward": -0.06113438308238983, "rewards/format_reward": 0.6111111119389534, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 2937.0972290039062, "epoch": 0.414738646101114, "grad_norm": 1.0033913850784302, "kl": 0.5947265625, "learning_rate": 6.527578915497951e-07, "loss": 0.1027, "reward": 0.3329106804449111, "reward_std": 0.626296728849411, "rewards/cosine_scaled_reward": -0.09048910066485405, "rewards/format_reward": 0.5138888955116272, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 2632.3055419921875, "epoch": 0.41645244215938304, "grad_norm": 3.2918546199798584, "kl": 0.62109375, "learning_rate": 6.496968239287603e-07, "loss": 0.151, "reward": 0.423097662627697, "reward_std": 0.7703854739665985, "rewards/cosine_scaled_reward": -0.05234006140381098, "rewards/format_reward": 0.527777798473835, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 2488.8055725097656, "epoch": 0.4181662382176521, "grad_norm": 1.382688283920288, "kl": 0.51220703125, "learning_rate": 6.466308972251785e-07, "loss": 0.1239, "reward": 0.5373616181313992, "reward_std": 0.642534889280796, "rewards/cosine_scaled_reward": -0.036874750861898065, "rewards/format_reward": 0.6111111119389534, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 1840.0555877685547, "epoch": 0.4198800342759212, "grad_norm": 5.2921977043151855, "kl": 0.435546875, "learning_rate": 6.435602608679916e-07, "loss": 0.1668, "reward": 1.0207914784550667, "reward_std": 0.6237036064267159, "rewards/cosine_scaled_reward": 0.08678461611270905, "rewards/format_reward": 0.8472222536802292, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 2378.77783203125, "epoch": 0.42159383033419023, "grad_norm": 4.525283336639404, "kl": 0.7333984375, "learning_rate": 6.404850645156841e-07, "loss": 0.2341, "reward": 0.8125267028808594, "reward_std": 0.7737091481685638, "rewards/cosine_scaled_reward": 0.05209667468443513, "rewards/format_reward": 0.7083333432674408, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 2746.7916870117188, "epoch": 0.4233076263924593, "grad_norm": 1.215826392173767, "kl": 0.7509765625, "learning_rate": 6.374054580489873e-07, "loss": 0.124, "reward": 0.16153091937303543, "reward_std": 0.7042593955993652, "rewards/cosine_scaled_reward": -0.12756787613034248, "rewards/format_reward": 0.4166666716337204, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 2420.125, "epoch": 0.4250214224507284, "grad_norm": 2.161705732345581, "kl": 0.86328125, "learning_rate": 6.343215915635761e-07, "loss": 0.0674, "reward": 0.6162599250674248, "reward_std": 0.7196609973907471, "rewards/cosine_scaled_reward": -0.05298116838093847, "rewards/format_reward": 0.7222222238779068, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 2277.500030517578, "epoch": 0.4267352185089974, "grad_norm": 3.1015782356262207, "kl": 0.732421875, "learning_rate": 6.31233615362752e-07, "loss": 0.0179, "reward": 0.6064739339053631, "reward_std": 0.6056996583938599, "rewards/cosine_scaled_reward": -0.03704079985618591, "rewards/format_reward": 0.6805555671453476, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 2416.9583740234375, "epoch": 0.4284490145672665, "grad_norm": 8.199381828308105, "kl": 0.904296875, "learning_rate": 6.281416799501187e-07, "loss": 0.0707, "reward": 0.5718964412808418, "reward_std": 0.7699461728334427, "rewards/cosine_scaled_reward": -0.06127400905825198, "rewards/format_reward": 0.6944444477558136, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 2411.2916259765625, "epoch": 0.4301628106255356, "grad_norm": 7.757229328155518, "kl": 0.7177734375, "learning_rate": 6.25045936022246e-07, "loss": 0.0104, "reward": 0.6452328599989414, "reward_std": 0.8850838840007782, "rewards/cosine_scaled_reward": -0.010716899763792753, "rewards/format_reward": 0.6666666716337204, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 2250.1805725097656, "epoch": 0.4318766066838046, "grad_norm": 50.71971893310547, "kl": 1.673828125, "learning_rate": 6.219465344613258e-07, "loss": 0.2537, "reward": 0.41996366158127785, "reward_std": 0.630496121942997, "rewards/cosine_scaled_reward": -0.16501817479729652, "rewards/format_reward": 0.7500000149011612, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 2389.4861450195312, "epoch": 0.43359040274207367, "grad_norm": 76.95365142822266, "kl": 1.5126953125, "learning_rate": 6.188436263278172e-07, "loss": 0.1964, "reward": 0.5589244738221169, "reward_std": 0.8758179396390915, "rewards/cosine_scaled_reward": -0.019148872102960013, "rewards/format_reward": 0.597222238779068, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 2433.6944580078125, "epoch": 0.43530419880034277, "grad_norm": 89.30572509765625, "kl": 1.7294921875, "learning_rate": 6.157373628530852e-07, "loss": 0.1411, "reward": 0.3653869954869151, "reward_std": 0.6425384879112244, "rewards/cosine_scaled_reward": -0.11591762490570545, "rewards/format_reward": 0.5972222313284874, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 2666.02783203125, "epoch": 0.4370179948586118, "grad_norm": 9.923705101013184, "kl": 0.7724609375, "learning_rate": 6.126278954320294e-07, "loss": 0.1243, "reward": 0.3781815767288208, "reward_std": 0.6919418126344681, "rewards/cosine_scaled_reward": -0.06785366125404835, "rewards/format_reward": 0.5138888880610466, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 2432.8056030273438, "epoch": 0.4387317909168809, "grad_norm": 9.747496604919434, "kl": 0.7314453125, "learning_rate": 6.095153756157051e-07, "loss": 0.083, "reward": 0.2219482958316803, "reward_std": 0.47816336899995804, "rewards/cosine_scaled_reward": -0.20152585953474045, "rewards/format_reward": 0.6250000074505806, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 2205.027801513672, "epoch": 0.44044558697514996, "grad_norm": 3.7311699390411377, "kl": 0.53857421875, "learning_rate": 6.06399955103937e-07, "loss": 0.1269, "reward": 0.5375950075685978, "reward_std": 0.6251346915960312, "rewards/cosine_scaled_reward": -0.0923136118799448, "rewards/format_reward": 0.7222222238779068, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 2471.4584045410156, "epoch": 0.442159383033419, "grad_norm": 2.448763608932495, "kl": 0.91796875, "learning_rate": 6.032817857379256e-07, "loss": 0.1343, "reward": 0.33694631792604923, "reward_std": 0.4810459837317467, "rewards/cosine_scaled_reward": -0.1440268289297819, "rewards/format_reward": 0.6250000149011612, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 2130.763885498047, "epoch": 0.4438731790916881, "grad_norm": 1.8066775798797607, "kl": 0.68505859375, "learning_rate": 6.001610194928464e-07, "loss": 0.1045, "reward": 0.7608818560838699, "reward_std": 0.697891928255558, "rewards/cosine_scaled_reward": -0.008447982007055543, "rewards/format_reward": 0.7777777910232544, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 2359.7638549804688, "epoch": 0.44558697514995715, "grad_norm": 1.9958624839782715, "kl": 1.009765625, "learning_rate": 5.97037808470444e-07, "loss": 0.1355, "reward": 0.7400075197219849, "reward_std": 0.5138791352510452, "rewards/cosine_scaled_reward": 0.00889264652505517, "rewards/format_reward": 0.7222222313284874, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 2002.7361145019531, "epoch": 0.4473007712082262, "grad_norm": 5.525115966796875, "kl": 0.7490234375, "learning_rate": 5.939123048916173e-07, "loss": 0.1647, "reward": 0.7085682898759842, "reward_std": 0.6736738979816437, "rewards/cosine_scaled_reward": -0.041549197398126125, "rewards/format_reward": 0.7916666716337204, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 2599.9028930664062, "epoch": 0.4490145672664953, "grad_norm": 4.714324474334717, "kl": 0.9013671875, "learning_rate": 5.907846610890011e-07, "loss": 0.1464, "reward": 0.7315462306141853, "reward_std": 0.8794215172529221, "rewards/cosine_scaled_reward": 0.05327310296706855, "rewards/format_reward": 0.6250000149011612, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 2593.236083984375, "epoch": 0.45072836332476435, "grad_norm": 1.4935065507888794, "kl": 0.8330078125, "learning_rate": 5.87655029499542e-07, "loss": 0.1333, "reward": 0.26326372660696507, "reward_std": 0.3958895206451416, "rewards/cosine_scaled_reward": -0.11836813762784004, "rewards/format_reward": 0.5000000074505806, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 2746.1111450195312, "epoch": 0.4524421593830334, "grad_norm": 2.71354341506958, "kl": 0.833984375, "learning_rate": 5.845235626570683e-07, "loss": 0.0506, "reward": 0.34650287590920925, "reward_std": 0.6603603884577751, "rewards/cosine_scaled_reward": -0.11841523088514805, "rewards/format_reward": 0.5833333283662796, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 2810.6111450195312, "epoch": 0.4541559554413025, "grad_norm": 3.0615949630737305, "kl": 0.8046875, "learning_rate": 5.813904131848564e-07, "loss": 0.0807, "reward": 0.6215685978531837, "reward_std": 0.6345800720155239, "rewards/cosine_scaled_reward": 0.06078430451452732, "rewards/format_reward": 0.5000000018626451, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 2483.513916015625, "epoch": 0.45586975149957154, "grad_norm": 2.4526662826538086, "kl": 0.689453125, "learning_rate": 5.78255733788191e-07, "loss": 0.1832, "reward": 0.4189574085175991, "reward_std": 0.4973677098751068, "rewards/cosine_scaled_reward": -0.0544101782143116, "rewards/format_reward": 0.5277777835726738, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 2401.1805725097656, "epoch": 0.45758354755784064, "grad_norm": 1.3402016162872314, "kl": 0.7294921875, "learning_rate": 5.751196772469237e-07, "loss": 0.1157, "reward": 0.36370813054963946, "reward_std": 0.4258965626358986, "rewards/cosine_scaled_reward": -0.14453481137752533, "rewards/format_reward": 0.6527777910232544, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 2708.3194580078125, "epoch": 0.4592973436161097, "grad_norm": 3.217470407485962, "kl": 0.599609375, "learning_rate": 5.71982396408026e-07, "loss": 0.0051, "reward": 0.26998334005475044, "reward_std": 0.49185192957520485, "rewards/cosine_scaled_reward": -0.15667499974370003, "rewards/format_reward": 0.5833333432674408, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 2829.4583740234375, "epoch": 0.46101113967437873, "grad_norm": 1.457294225692749, "kl": 0.591796875, "learning_rate": 5.688440441781398e-07, "loss": 0.1149, "reward": 0.2393805852625519, "reward_std": 0.7379022389650345, "rewards/cosine_scaled_reward": -0.11642082477919757, "rewards/format_reward": 0.4722222313284874, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 2878.2916259765625, "epoch": 0.46272493573264784, "grad_norm": 1.101474642753601, "kl": 0.69921875, "learning_rate": 5.657047735161255e-07, "loss": 0.0498, "reward": 0.37158428877592087, "reward_std": 0.5783374309539795, "rewards/cosine_scaled_reward": -0.029485642910003662, "rewards/format_reward": 0.430555559694767, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 2793.6806030273438, "epoch": 0.4644387317909169, "grad_norm": 0.8390009999275208, "kl": 0.63916015625, "learning_rate": 5.625647374256061e-07, "loss": 0.0834, "reward": 0.38940694369375706, "reward_std": 0.6547586917877197, "rewards/cosine_scaled_reward": -0.06918542925268412, "rewards/format_reward": 0.5277777723968029, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 2366.638885498047, "epoch": 0.4661525278491859, "grad_norm": 4.141148090362549, "kl": 0.4765625, "learning_rate": 5.594240889475106e-07, "loss": 0.2224, "reward": 0.4617117829620838, "reward_std": 0.6572139859199524, "rewards/cosine_scaled_reward": -0.060810765251517296, "rewards/format_reward": 0.5833333358168602, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 2721.6805419921875, "epoch": 0.46786632390745503, "grad_norm": 1.906948208808899, "kl": 0.599609375, "learning_rate": 5.562829811526154e-07, "loss": 0.1188, "reward": 0.26052477210760117, "reward_std": 0.6570783406496048, "rewards/cosine_scaled_reward": -0.06418205983936787, "rewards/format_reward": 0.3888888992369175, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 2921.3750610351562, "epoch": 0.4695801199657241, "grad_norm": 1.8937734365463257, "kl": 0.52685546875, "learning_rate": 5.531415671340826e-07, "loss": 0.0185, "reward": 0.22513618250377476, "reward_std": 0.6058431342244148, "rewards/cosine_scaled_reward": -0.16520969779230654, "rewards/format_reward": 0.555555559694767, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 2988.15283203125, "epoch": 0.4712939160239931, "grad_norm": 1.208433985710144, "kl": 0.54931640625, "learning_rate": 5.5e-07, "loss": 0.0485, "reward": 0.11821263573256147, "reward_std": 0.391703762114048, "rewards/cosine_scaled_reward": -0.15617146715521812, "rewards/format_reward": 0.4305555494502187, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 2643.541748046875, "epoch": 0.4730077120822622, "grad_norm": 0.9594613313674927, "kl": 0.4326171875, "learning_rate": 5.468584328659172e-07, "loss": 0.0807, "reward": 0.32308289408683777, "reward_std": 0.7159284129738808, "rewards/cosine_scaled_reward": -0.08151410473510623, "rewards/format_reward": 0.4861111156642437, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 2926.013916015625, "epoch": 0.47472150814053127, "grad_norm": 1.3466960191726685, "kl": 0.5048828125, "learning_rate": 5.437170188473847e-07, "loss": 0.1283, "reward": 0.06485692039132118, "reward_std": 0.542218990623951, "rewards/cosine_scaled_reward": -0.17590487515553832, "rewards/format_reward": 0.4166666679084301, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 2993.1945190429688, "epoch": 0.47643530419880037, "grad_norm": 3.963604688644409, "kl": 0.5546875, "learning_rate": 5.405759110524894e-07, "loss": 0.2168, "reward": 0.03267951123416424, "reward_std": 0.7076856940984726, "rewards/cosine_scaled_reward": -0.15727136190980673, "rewards/format_reward": 0.3472222276031971, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 3138.6805419921875, "epoch": 0.4781491002570694, "grad_norm": 0.5958003997802734, "kl": 0.56201171875, "learning_rate": 5.37435262574394e-07, "loss": 0.0759, "reward": -0.051162030547857285, "reward_std": 0.5351358503103256, "rewards/cosine_scaled_reward": -0.2478032372891903, "rewards/format_reward": 0.4444444477558136, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 2737.1944885253906, "epoch": 0.47986289631533846, "grad_norm": 1.6760090589523315, "kl": 0.59228515625, "learning_rate": 5.342952264838747e-07, "loss": 0.1096, "reward": 0.45366813987493515, "reward_std": 0.7812162339687347, "rewards/cosine_scaled_reward": -0.0023325812071561813, "rewards/format_reward": 0.4583333367481828, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 2423.2222290039062, "epoch": 0.48157669237360756, "grad_norm": 2.0177345275878906, "kl": 0.461669921875, "learning_rate": 5.311559558218603e-07, "loss": 0.1262, "reward": 0.38226850144565105, "reward_std": 0.6752881184220314, "rewards/cosine_scaled_reward": -0.07969908323138952, "rewards/format_reward": 0.5416666585952044, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 2740.3611450195312, "epoch": 0.4832904884318766, "grad_norm": 1.8314719200134277, "kl": 0.6337890625, "learning_rate": 5.28017603591974e-07, "loss": 0.1455, "reward": 0.21622517937794328, "reward_std": 0.6346057131886482, "rewards/cosine_scaled_reward": -0.15577631071209908, "rewards/format_reward": 0.5277777910232544, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 2889.2500610351562, "epoch": 0.48500428449014565, "grad_norm": 0.8971331119537354, "kl": 0.6259765625, "learning_rate": 5.248803227530763e-07, "loss": 0.1266, "reward": 0.03935375134460628, "reward_std": 0.6441294327378273, "rewards/cosine_scaled_reward": -0.20254534482955933, "rewards/format_reward": 0.4444444477558136, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 2276.9584045410156, "epoch": 0.48671808054841476, "grad_norm": 6.356135368347168, "kl": 0.4833984375, "learning_rate": 5.21744266211809e-07, "loss": 0.3013, "reward": 0.32465188996866345, "reward_std": 0.6560942605137825, "rewards/cosine_scaled_reward": -0.09461849741637707, "rewards/format_reward": 0.5138889029622078, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 2624.9306640625, "epoch": 0.4884318766066838, "grad_norm": 1.9299744367599487, "kl": 0.5576171875, "learning_rate": 5.186095868151436e-07, "loss": 0.1493, "reward": 0.21753913909196854, "reward_std": 0.683107927441597, "rewards/cosine_scaled_reward": -0.16206377279013395, "rewards/format_reward": 0.5416666716337204, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 2555.7361450195312, "epoch": 0.49014567266495285, "grad_norm": 2.2400879859924316, "kl": 0.62890625, "learning_rate": 5.154764373429315e-07, "loss": 0.1583, "reward": 0.4158199355006218, "reward_std": 0.6385739594697952, "rewards/cosine_scaled_reward": -0.021256705978885293, "rewards/format_reward": 0.4583333320915699, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 2526.52783203125, "epoch": 0.49185946872322195, "grad_norm": 3.190502405166626, "kl": 0.916015625, "learning_rate": 5.123449705004581e-07, "loss": 0.2192, "reward": 0.4090676587074995, "reward_std": 0.7619837448000908, "rewards/cosine_scaled_reward": -0.0662995120510459, "rewards/format_reward": 0.5416666641831398, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 2134.65283203125, "epoch": 0.493573264781491, "grad_norm": 6.452578544616699, "kl": 1.103515625, "learning_rate": 5.09215338910999e-07, "loss": 0.368, "reward": 0.44275959208607674, "reward_std": 0.6151050329208374, "rewards/cosine_scaled_reward": -0.09806465543806553, "rewards/format_reward": 0.6388888955116272, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 2290.013885498047, "epoch": 0.4952870608397601, "grad_norm": 1.6855307817459106, "kl": 0.82275390625, "learning_rate": 5.060876951083828e-07, "loss": 0.1464, "reward": 0.4334092391654849, "reward_std": 0.6985170915722847, "rewards/cosine_scaled_reward": -0.0819065012037754, "rewards/format_reward": 0.5972222164273262, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 1840.6388549804688, "epoch": 0.49700085689802914, "grad_norm": 2.9146785736083984, "kl": 0.79296875, "learning_rate": 5.02962191529556e-07, "loss": 0.0283, "reward": 0.7537698708474636, "reward_std": 0.5962013602256775, "rewards/cosine_scaled_reward": 0.008829381316900253, "rewards/format_reward": 0.736111119389534, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 2200.4444274902344, "epoch": 0.4987146529562982, "grad_norm": 1.7983882427215576, "kl": 0.927734375, "learning_rate": 4.998389805071536e-07, "loss": 0.2134, "reward": 0.614590086042881, "reward_std": 0.9522670358419418, "rewards/cosine_scaled_reward": -0.012149423826485872, "rewards/format_reward": 0.638888880610466, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 1986.0694885253906, "epoch": 0.5004284490145673, "grad_norm": 2.5195839405059814, "kl": 0.9873046875, "learning_rate": 4.967182142620745e-07, "loss": 0.1778, "reward": 0.7197987511754036, "reward_std": 0.8348591700196266, "rewards/cosine_scaled_reward": 0.01267714286223054, "rewards/format_reward": 0.6944444477558136, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 1970.9999694824219, "epoch": 0.5021422450728363, "grad_norm": 3.093557596206665, "kl": 1.0625, "learning_rate": 4.93600044896063e-07, "loss": 0.1538, "reward": 0.29343970119953156, "reward_std": 0.49155813455581665, "rewards/cosine_scaled_reward": -0.13105794228613377, "rewards/format_reward": 0.555555559694767, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 1924.7916870117188, "epoch": 0.5038560411311054, "grad_norm": 6.1597208976745605, "kl": 0.87158203125, "learning_rate": 4.904846243842949e-07, "loss": 0.3554, "reward": 0.4494058433920145, "reward_std": 0.747850589454174, "rewards/cosine_scaled_reward": -0.08085263520479202, "rewards/format_reward": 0.6111111119389534, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 2000.6388854980469, "epoch": 0.5055698371893744, "grad_norm": 2.868744134902954, "kl": 0.8369140625, "learning_rate": 4.873721045679706e-07, "loss": 0.1929, "reward": 0.23984116781502962, "reward_std": 0.4878830164670944, "rewards/cosine_scaled_reward": -0.22730162646621466, "rewards/format_reward": 0.6944444477558136, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 2322.5556030273438, "epoch": 0.5072836332476436, "grad_norm": 3.0267629623413086, "kl": 1.2763671875, "learning_rate": 4.842626371469149e-07, "loss": 0.2229, "reward": 0.4730116240680218, "reward_std": 0.672569528222084, "rewards/cosine_scaled_reward": 0.0003947049845010042, "rewards/format_reward": 0.4722222276031971, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 1968.2222595214844, "epoch": 0.5089974293059126, "grad_norm": 3.3123202323913574, "kl": 1.0166015625, "learning_rate": 4.811563736721829e-07, "loss": 0.2608, "reward": 0.4829604886472225, "reward_std": 0.6525571122765541, "rewards/cosine_scaled_reward": -0.08490864699706435, "rewards/format_reward": 0.6527777686715126, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 2157.8055725097656, "epoch": 0.5107112253641817, "grad_norm": 4.5067524909973145, "kl": 1.609375, "learning_rate": 4.780534655386743e-07, "loss": 0.379, "reward": 0.673854373395443, "reward_std": 0.728430263698101, "rewards/cosine_scaled_reward": 0.024427177384495735, "rewards/format_reward": 0.6250000149011612, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 1613.5000305175781, "epoch": 0.5124250214224507, "grad_norm": 3.3076987266540527, "kl": 1.0537109375, "learning_rate": 4.749540639777539e-07, "loss": 0.2441, "reward": 0.8705739304423332, "reward_std": 0.7832073271274567, "rewards/cosine_scaled_reward": 0.053342508152127266, "rewards/format_reward": 0.7638888955116272, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 2283.0000610351562, "epoch": 0.5141388174807198, "grad_norm": 4.202121734619141, "kl": 1.794921875, "learning_rate": 4.7185832004988133e-07, "loss": 0.2233, "reward": 0.41079268511384726, "reward_std": 0.49303294718265533, "rewards/cosine_scaled_reward": -0.07238144427537918, "rewards/format_reward": 0.5555555522441864, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 1734.0139465332031, "epoch": 0.5158526135389888, "grad_norm": 2.733738660812378, "kl": 1.271484375, "learning_rate": 4.68766384637248e-07, "loss": 0.3623, "reward": 0.5950284972786903, "reward_std": 0.5468220561742783, "rewards/cosine_scaled_reward": -0.042763520032167435, "rewards/format_reward": 0.6805555671453476, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 2734.5138549804688, "epoch": 0.517566409597258, "grad_norm": 5.247328281402588, "kl": 1.837890625, "learning_rate": 4.656784084364238e-07, "loss": 0.1992, "reward": 0.1533558116061613, "reward_std": 0.5701670944690704, "rewards/cosine_scaled_reward": -0.10387765569612384, "rewards/format_reward": 0.36111111380159855, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 1955.3333435058594, "epoch": 0.519280205655527, "grad_norm": 2.1991233825683594, "kl": 1.4111328125, "learning_rate": 4.6259454195101267e-07, "loss": 0.2534, "reward": 0.4565839725546539, "reward_std": 0.5375222712755203, "rewards/cosine_scaled_reward": -0.13281912542879581, "rewards/format_reward": 0.7222222164273262, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 1990.6805419921875, "epoch": 0.5209940017137961, "grad_norm": 3.0468897819519043, "kl": 1.685546875, "learning_rate": 4.59514935484316e-07, "loss": 0.2866, "reward": 0.589899554848671, "reward_std": 0.6861986592411995, "rewards/cosine_scaled_reward": -0.03143910859944299, "rewards/format_reward": 0.6527777910232544, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 1996.5416870117188, "epoch": 0.5227077977720651, "grad_norm": 4.011932849884033, "kl": 1.462890625, "learning_rate": 4.5643973913200837e-07, "loss": 0.2531, "reward": 0.3674123687669635, "reward_std": 0.6027099043130875, "rewards/cosine_scaled_reward": -0.10796047560870647, "rewards/format_reward": 0.5833333283662796, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 1896.9166870117188, "epoch": 0.5244215938303342, "grad_norm": 2.977555751800537, "kl": 1.01171875, "learning_rate": 4.5336910277482155e-07, "loss": 0.1578, "reward": 0.43058543652296066, "reward_std": 0.5935798361897469, "rewards/cosine_scaled_reward": -0.12498506158590317, "rewards/format_reward": 0.6805555671453476, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 2022.4584045410156, "epoch": 0.5261353898886033, "grad_norm": 2.7941606044769287, "kl": 1.24609375, "learning_rate": 4.503031760712397e-07, "loss": 0.2441, "reward": 0.7062125951051712, "reward_std": 0.8090076595544815, "rewards/cosine_scaled_reward": -0.0010603656992316246, "rewards/format_reward": 0.7083333283662796, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 2287.125030517578, "epoch": 0.5278491859468724, "grad_norm": 1.6866544485092163, "kl": 1.287109375, "learning_rate": 4.4724210845020494e-07, "loss": 0.2517, "reward": 0.650560175999999, "reward_std": 0.742147371172905, "rewards/cosine_scaled_reward": 0.033613420091569424, "rewards/format_reward": 0.5833333507180214, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 2066.5694580078125, "epoch": 0.5295629820051414, "grad_norm": 2.5993359088897705, "kl": 1.2255859375, "learning_rate": 4.441860491038345e-07, "loss": 0.1916, "reward": 0.6525638625025749, "reward_std": 0.6665498167276382, "rewards/cosine_scaled_reward": 0.02767082443460822, "rewards/format_reward": 0.5972222313284874, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 2370.763885498047, "epoch": 0.5312767780634104, "grad_norm": 3.1745495796203613, "kl": 1.205078125, "learning_rate": 4.4113514698014953e-07, "loss": 0.167, "reward": 0.4759225994348526, "reward_std": 0.8299422115087509, "rewards/cosine_scaled_reward": -0.04676092881709337, "rewards/format_reward": 0.569444440305233, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 2503.5972290039062, "epoch": 0.5329905741216795, "grad_norm": 2.2356581687927246, "kl": 1.376953125, "learning_rate": 4.3808955077581546e-07, "loss": 0.2558, "reward": 0.2966647706925869, "reward_std": 0.5752375796437263, "rewards/cosine_scaled_reward": -0.15722317062318325, "rewards/format_reward": 0.6111111119389534, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 2278.0277404785156, "epoch": 0.5347043701799485, "grad_norm": 1.4038455486297607, "kl": 1.203125, "learning_rate": 4.350494089288943e-07, "loss": 0.241, "reward": 0.8164278883486986, "reward_std": 0.7621838673949242, "rewards/cosine_scaled_reward": 0.08876948896795511, "rewards/format_reward": 0.638888880610466, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 1964.8333435058594, "epoch": 0.5364181662382177, "grad_norm": 5.611269950866699, "kl": 1.6904296875, "learning_rate": 4.3201486961161093e-07, "loss": 0.1802, "reward": 0.5501147694885731, "reward_std": 0.48398981615900993, "rewards/cosine_scaled_reward": -0.07910929806530476, "rewards/format_reward": 0.7083333283662796, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 2596.0139770507812, "epoch": 0.5381319622964867, "grad_norm": 2.7361557483673096, "kl": 0.904296875, "learning_rate": 4.2898608072313045e-07, "loss": 0.1967, "reward": 0.43212154414504766, "reward_std": 0.8072051256895065, "rewards/cosine_scaled_reward": -0.04088366776704788, "rewards/format_reward": 0.5138888880610466, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 2164.986083984375, "epoch": 0.5398457583547558, "grad_norm": 2.377624988555908, "kl": 1.009765625, "learning_rate": 4.2596318988235037e-07, "loss": 0.2612, "reward": 0.345002256333828, "reward_std": 0.6840208172798157, "rewards/cosine_scaled_reward": -0.13305442477576435, "rewards/format_reward": 0.611111119389534, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 2112.861114501953, "epoch": 0.5415595544130248, "grad_norm": 2.4749066829681396, "kl": 1.1328125, "learning_rate": 4.2294634442070553e-07, "loss": 0.2521, "reward": 0.3993903249502182, "reward_std": 0.7594646960496902, "rewards/cosine_scaled_reward": -0.09891596343368292, "rewards/format_reward": 0.5972222238779068, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 2763.791717529297, "epoch": 0.5432733504712939, "grad_norm": 3.959906816482544, "kl": 1.5537109375, "learning_rate": 4.1993569137498776e-07, "loss": 0.1382, "reward": 0.3366717994213104, "reward_std": 0.5981347486376762, "rewards/cosine_scaled_reward": -0.10944187548011541, "rewards/format_reward": 0.5555555559694767, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 1693.9583129882812, "epoch": 0.5449871465295629, "grad_norm": 7.122504711151123, "kl": 1.0234375, "learning_rate": 4.1693137748017915e-07, "loss": 0.3755, "reward": 1.1441613137722015, "reward_std": 0.5525609478354454, "rewards/cosine_scaled_reward": 0.21791397035121918, "rewards/format_reward": 0.7083333432674408, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 2302.7222595214844, "epoch": 0.5467009425878321, "grad_norm": 1.7870283126831055, "kl": 1.1513671875, "learning_rate": 4.1393354916230005e-07, "loss": 0.2157, "reward": 0.44585637911222875, "reward_std": 0.5990823060274124, "rewards/cosine_scaled_reward": -0.09651626879349351, "rewards/format_reward": 0.638888880610466, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 2418.6112060546875, "epoch": 0.5484147386461011, "grad_norm": 4.37020206451416, "kl": 1.3271484375, "learning_rate": 4.1094235253127374e-07, "loss": 0.1373, "reward": 0.7411398887634277, "reward_std": 0.8346492722630501, "rewards/cosine_scaled_reward": 0.030292170122265816, "rewards/format_reward": 0.6805555522441864, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 2477.4305419921875, "epoch": 0.5501285347043702, "grad_norm": 2.8605029582977295, "kl": 0.9931640625, "learning_rate": 4.079579333738039e-07, "loss": 0.1566, "reward": 0.3635707888752222, "reward_std": 0.6505779251456261, "rewards/cosine_scaled_reward": -0.14460349176079035, "rewards/format_reward": 0.6527777835726738, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 1934.0694885253906, "epoch": 0.5518423307626392, "grad_norm": 2.2090539932250977, "kl": 1.033935546875, "learning_rate": 4.0498043714627006e-07, "loss": 0.1413, "reward": 0.43895523250102997, "reward_std": 0.6110691279172897, "rewards/cosine_scaled_reward": -0.14857794775161892, "rewards/format_reward": 0.7361111342906952, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 2659.2083129882812, "epoch": 0.5535561268209083, "grad_norm": 1.6186331510543823, "kl": 0.92578125, "learning_rate": 4.020100089676376e-07, "loss": 0.1545, "reward": 0.43137288000434637, "reward_std": 0.5587008334696293, "rewards/cosine_scaled_reward": -0.041258019395172596, "rewards/format_reward": 0.5138888955116272, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 2133.0, "epoch": 0.5552699228791774, "grad_norm": 2.309271812438965, "kl": 0.646484375, "learning_rate": 3.9904679361238526e-07, "loss": 0.2071, "reward": 0.8334337323904037, "reward_std": 0.7556089013814926, "rewards/cosine_scaled_reward": 0.0486613066168502, "rewards/format_reward": 0.7361111044883728, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 2852.5416259765625, "epoch": 0.5569837189374465, "grad_norm": 1.367803692817688, "kl": 1.1455078125, "learning_rate": 3.9609093550344907e-07, "loss": 0.2217, "reward": 0.30423190630972385, "reward_std": 0.6387892812490463, "rewards/cosine_scaled_reward": -0.13955070948577486, "rewards/format_reward": 0.5833333432674408, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 2254.2638549804688, "epoch": 0.5586975149957155, "grad_norm": 2.456948757171631, "kl": 0.90966796875, "learning_rate": 3.931425787051832e-07, "loss": 0.2717, "reward": 0.7499970353674144, "reward_std": 0.5177437886595726, "rewards/cosine_scaled_reward": 0.06249852292239666, "rewards/format_reward": 0.625, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 2643.263916015625, "epoch": 0.5604113110539846, "grad_norm": 2.975080728530884, "kl": 0.96875, "learning_rate": 3.902018669163384e-07, "loss": 0.2016, "reward": 0.37286074459552765, "reward_std": 0.6334929168224335, "rewards/cosine_scaled_reward": -0.06356962397694588, "rewards/format_reward": 0.5000000074505806, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 2430.6666564941406, "epoch": 0.5621251071122536, "grad_norm": 2.2276480197906494, "kl": 1.1484375, "learning_rate": 3.872689434630585e-07, "loss": 0.2054, "reward": 0.45167311653494835, "reward_std": 0.6782046630978584, "rewards/cosine_scaled_reward": -0.07971900515258312, "rewards/format_reward": 0.6111111268401146, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 2118.3194885253906, "epoch": 0.5638389031705227, "grad_norm": 1.439645767211914, "kl": 0.9873046875, "learning_rate": 3.843439512918949e-07, "loss": 0.1558, "reward": 0.3222229927778244, "reward_std": 0.5387292131781578, "rewards/cosine_scaled_reward": -0.186110720038414, "rewards/format_reward": 0.6944444477558136, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 2615.3750610351562, "epoch": 0.5655526992287918, "grad_norm": 1.3368608951568604, "kl": 0.822265625, "learning_rate": 3.8142703296283953e-07, "loss": 0.0814, "reward": 0.19894374161958694, "reward_std": 0.4315878227353096, "rewards/cosine_scaled_reward": -0.17136146454140544, "rewards/format_reward": 0.5416666641831398, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 2637.2084045410156, "epoch": 0.5672664952870609, "grad_norm": 1.4436960220336914, "kl": 0.78125, "learning_rate": 3.785183306423767e-07, "loss": 0.1763, "reward": 0.555847343057394, "reward_std": 0.7657169997692108, "rewards/cosine_scaled_reward": 0.020979220047593117, "rewards/format_reward": 0.5138888955116272, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 2674.9444580078125, "epoch": 0.5689802913453299, "grad_norm": 3.6536855697631836, "kl": 1.30078125, "learning_rate": 3.7561798609655373e-07, "loss": 0.1134, "reward": 0.44342901557683945, "reward_std": 0.6001273989677429, "rewards/cosine_scaled_reward": -0.09772994555532932, "rewards/format_reward": 0.6388888880610466, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 2896.0277709960938, "epoch": 0.570694087403599, "grad_norm": 1.9911452531814575, "kl": 1.0078125, "learning_rate": 3.72726140684072e-07, "loss": 0.1034, "reward": -0.17525828257203102, "reward_std": 0.4928872212767601, "rewards/cosine_scaled_reward": -0.24040691554546356, "rewards/format_reward": 0.30555556528270245, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 2810.861083984375, "epoch": 0.572407883461868, "grad_norm": 1.8073278665542603, "kl": 1.068359375, "learning_rate": 3.6984293534939737e-07, "loss": 0.1821, "reward": -0.006253276020288467, "reward_std": 0.5080604404211044, "rewards/cosine_scaled_reward": -0.2114599784836173, "rewards/format_reward": 0.4166666753590107, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 2462.65283203125, "epoch": 0.5741216795201372, "grad_norm": 1.5005158185958862, "kl": 0.771484375, "learning_rate": 3.6696851061588994e-07, "loss": 0.1649, "reward": 0.45248544216156006, "reward_std": 0.5002452582120895, "rewards/cosine_scaled_reward": -0.07931282371282578, "rewards/format_reward": 0.611111119389534, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 2160.125030517578, "epoch": 0.5758354755784062, "grad_norm": 3.1605849266052246, "kl": 0.548828125, "learning_rate": 3.641030065789562e-07, "loss": 0.1752, "reward": 0.24482397455722094, "reward_std": 0.515994019806385, "rewards/cosine_scaled_reward": -0.17619912140071392, "rewards/format_reward": 0.597222238779068, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 2403.3194580078125, "epoch": 0.5775492716366752, "grad_norm": 2.8763837814331055, "kl": 0.736328125, "learning_rate": 3.612465628992203e-07, "loss": 0.1929, "reward": 0.5475254282355309, "reward_std": 0.6548926681280136, "rewards/cosine_scaled_reward": -0.038737302646040916, "rewards/format_reward": 0.6249999925494194, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 2418.013946533203, "epoch": 0.5792630676949443, "grad_norm": 2.0177650451660156, "kl": 0.6171875, "learning_rate": 3.5839931879571725e-07, "loss": 0.2226, "reward": 0.5356792770326138, "reward_std": 0.9891562312841415, "rewards/cosine_scaled_reward": -0.01688259281218052, "rewards/format_reward": 0.5694444552063942, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 2717.888916015625, "epoch": 0.5809768637532133, "grad_norm": 2.011136293411255, "kl": 0.8603515625, "learning_rate": 3.555614130391079e-07, "loss": 0.0711, "reward": 0.31583554670214653, "reward_std": 0.6509723365306854, "rewards/cosine_scaled_reward": -0.14069335255771875, "rewards/format_reward": 0.5972222313284874, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 2609.1805114746094, "epoch": 0.5826906598114824, "grad_norm": 2.1439146995544434, "kl": 0.60546875, "learning_rate": 3.5273298394491515e-07, "loss": 0.1085, "reward": 0.26384788006544113, "reward_std": 0.5596405640244484, "rewards/cosine_scaled_reward": -0.12502050958573818, "rewards/format_reward": 0.5138888917863369, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 2366.8055725097656, "epoch": 0.5844044558697515, "grad_norm": 3.087810516357422, "kl": 1.16796875, "learning_rate": 3.4991416936678276e-07, "loss": 0.2266, "reward": 0.19622072577476501, "reward_std": 0.43179403990507126, "rewards/cosine_scaled_reward": -0.20050075091421604, "rewards/format_reward": 0.5972222313284874, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 2665.3055419921875, "epoch": 0.5861182519280206, "grad_norm": 2.5300474166870117, "kl": 0.7939453125, "learning_rate": 3.471051066897562e-07, "loss": 0.1537, "reward": 0.24374699965119362, "reward_std": 0.7871415168046951, "rewards/cosine_scaled_reward": -0.1350709474645555, "rewards/format_reward": 0.5138889029622078, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 2231.2222900390625, "epoch": 0.5878320479862896, "grad_norm": 6.685680389404297, "kl": 0.60302734375, "learning_rate": 3.4430593282358777e-07, "loss": 0.3184, "reward": 0.4801894012489356, "reward_std": 0.4852745458483696, "rewards/cosine_scaled_reward": -0.02379419095814228, "rewards/format_reward": 0.5277777835726738, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 2502.8472290039062, "epoch": 0.5895458440445587, "grad_norm": 2.49977970123291, "kl": 1.0634765625, "learning_rate": 3.4151678419606233e-07, "loss": 0.2075, "reward": 0.22286849096417427, "reward_std": 0.51853808760643, "rewards/cosine_scaled_reward": -0.18023241311311722, "rewards/format_reward": 0.5833333432674408, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 2553.8472290039062, "epoch": 0.5912596401028277, "grad_norm": 1.4922689199447632, "kl": 0.7724609375, "learning_rate": 3.387377967463493e-07, "loss": 0.11, "reward": 0.40987285412847996, "reward_std": 0.6866099908947945, "rewards/cosine_scaled_reward": -0.03117468417622149, "rewards/format_reward": 0.4722222313284874, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 2434.250030517578, "epoch": 0.5929734361610969, "grad_norm": 2.287896156311035, "kl": 0.73828125, "learning_rate": 3.359691059183761e-07, "loss": 0.1042, "reward": 0.566213920712471, "reward_std": 0.6637867465615273, "rewards/cosine_scaled_reward": -0.05022636614739895, "rewards/format_reward": 0.6666666865348816, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 2606.3056030273438, "epoch": 0.5946872322193659, "grad_norm": 5.195223808288574, "kl": 0.8203125, "learning_rate": 3.3321084665422803e-07, "loss": 0.0878, "reward": 0.30292151868343353, "reward_std": 0.6232884004712105, "rewards/cosine_scaled_reward": -0.09853924717754126, "rewards/format_reward": 0.5000000074505806, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 2294.3194580078125, "epoch": 0.596401028277635, "grad_norm": 2.7749757766723633, "kl": 1.12744140625, "learning_rate": 3.3046315338757026e-07, "loss": 0.113, "reward": 0.10566018056124449, "reward_std": 0.4547986686229706, "rewards/cosine_scaled_reward": -0.23883657529950142, "rewards/format_reward": 0.5833333507180214, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 2798.375, "epoch": 0.598114824335904, "grad_norm": 4.372980117797852, "kl": 0.76708984375, "learning_rate": 3.2772616003709616e-07, "loss": 0.0724, "reward": 0.19539665430784225, "reward_std": 0.547118715941906, "rewards/cosine_scaled_reward": -0.0898016735445708, "rewards/format_reward": 0.37500000186264515, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 2323.1806030273438, "epoch": 0.5998286203941731, "grad_norm": 0.9666920900344849, "kl": 0.42724609375, "learning_rate": 3.250000000000001e-07, "loss": 0.1139, "reward": 0.3154673893004656, "reward_std": 0.6931511759757996, "rewards/cosine_scaled_reward": -0.1408774359151721, "rewards/format_reward": 0.5972222238779068, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 2554.791717529297, "epoch": 0.6015424164524421, "grad_norm": 1.4160966873168945, "kl": 0.673828125, "learning_rate": 3.222848061454764e-07, "loss": 0.1226, "reward": 0.2959921658039093, "reward_std": 0.6351921036839485, "rewards/cosine_scaled_reward": -0.1922817062586546, "rewards/format_reward": 0.6805555522441864, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 2883.0972900390625, "epoch": 0.6032562125107113, "grad_norm": 1.3862693309783936, "kl": 0.57470703125, "learning_rate": 3.195807108082429e-07, "loss": 0.0944, "reward": -0.15852557588368654, "reward_std": 0.3618531711399555, "rewards/cosine_scaled_reward": -0.22509612515568733, "rewards/format_reward": 0.29166666977107525, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 2766.3611755371094, "epoch": 0.6049700085689803, "grad_norm": 2.0312767028808594, "kl": 0.47900390625, "learning_rate": 3.168878457820915e-07, "loss": 0.1735, "reward": 0.0013820715248584747, "reward_std": 0.4379913955926895, "rewards/cosine_scaled_reward": -0.2284756200388074, "rewards/format_reward": 0.4583333283662796, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 2303.4444885253906, "epoch": 0.6066838046272494, "grad_norm": 1.9564874172210693, "kl": 0.583984375, "learning_rate": 3.142063423134644e-07, "loss": 0.1414, "reward": 0.8931210651062429, "reward_std": 0.7761038094758987, "rewards/cosine_scaled_reward": 0.12017163541167974, "rewards/format_reward": 0.6527777649462223, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 2350.3333435058594, "epoch": 0.6083976006855184, "grad_norm": 3.0156519412994385, "kl": 0.501953125, "learning_rate": 3.115363310950578e-07, "loss": 0.0948, "reward": 0.21542136371135712, "reward_std": 0.4440060332417488, "rewards/cosine_scaled_reward": -0.15617820341140032, "rewards/format_reward": 0.5277777761220932, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 2587.5694580078125, "epoch": 0.6101113967437874, "grad_norm": 1.8885806798934937, "kl": 0.7080078125, "learning_rate": 3.0887794225945143e-07, "loss": 0.1358, "reward": 0.2837059774901718, "reward_std": 0.5896440669894218, "rewards/cosine_scaled_reward": -0.1637025810778141, "rewards/format_reward": 0.6111111119389534, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 2377.9722290039062, "epoch": 0.6118251928020566, "grad_norm": 2.1884989738464355, "kl": 0.7119140625, "learning_rate": 3.062313053727671e-07, "loss": 0.2069, "reward": 0.5028799092397094, "reward_std": 0.6587233692407608, "rewards/cosine_scaled_reward": -0.04022672958672047, "rewards/format_reward": 0.5833333358168602, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 2724.02783203125, "epoch": 0.6135389888603257, "grad_norm": 1.1445249319076538, "kl": 0.5654296875, "learning_rate": 3.0359654942835247e-07, "loss": 0.1018, "reward": 0.2546988914255053, "reward_std": 0.7167258933186531, "rewards/cosine_scaled_reward": -0.10876166447997093, "rewards/format_reward": 0.4722222238779068, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 2532.2638549804688, "epoch": 0.6152527849185947, "grad_norm": 2.5419840812683105, "kl": 0.68505859375, "learning_rate": 3.0097380284049523e-07, "loss": 0.0873, "reward": 0.21593652665615082, "reward_std": 0.6653575152158737, "rewards/cosine_scaled_reward": -0.11425395932747051, "rewards/format_reward": 0.4444444477558136, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 2583.513916015625, "epoch": 0.6169665809768637, "grad_norm": 3.3858025074005127, "kl": 0.7587890625, "learning_rate": 2.9836319343816397e-07, "loss": 0.0324, "reward": 0.317771688933135, "reward_std": 0.6041048616170883, "rewards/cosine_scaled_reward": -0.09111416153609753, "rewards/format_reward": 0.4999999962747097, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 2334.1944274902344, "epoch": 0.6186803770351328, "grad_norm": 0.9652746915817261, "kl": 0.50732421875, "learning_rate": 2.9576484845877793e-07, "loss": 0.0994, "reward": 0.6510396376252174, "reward_std": 0.8507343530654907, "rewards/cosine_scaled_reward": 0.019964261911809444, "rewards/format_reward": 0.6111111119389534, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 3043.9445190429688, "epoch": 0.6203941730934018, "grad_norm": 0.7930195927619934, "kl": 0.67626953125, "learning_rate": 2.931788945420058e-07, "loss": 0.0974, "reward": 0.13819648325443268, "reward_std": 0.5064943730831146, "rewards/cosine_scaled_reward": -0.1531239915639162, "rewards/format_reward": 0.4444444552063942, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 2825.7916259765625, "epoch": 0.622107969151671, "grad_norm": 0.6896006464958191, "kl": 0.64794921875, "learning_rate": 2.9060545772359305e-07, "loss": 0.1145, "reward": 0.08600431494414806, "reward_std": 0.5438744425773621, "rewards/cosine_scaled_reward": -0.17227561306208372, "rewards/format_reward": 0.4305555559694767, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 2422.3334045410156, "epoch": 0.62382176520994, "grad_norm": 1.5994491577148438, "kl": 0.5615234375, "learning_rate": 2.8804466342921987e-07, "loss": 0.0825, "reward": 0.5049359295517206, "reward_std": 0.7525297850370407, "rewards/cosine_scaled_reward": -0.06697649694979191, "rewards/format_reward": 0.6388888955116272, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 2465.0556030273438, "epoch": 0.6255355612682091, "grad_norm": 7.009267330169678, "kl": 0.66259765625, "learning_rate": 2.854966364683872e-07, "loss": 0.2849, "reward": 0.15274390950798988, "reward_std": 0.48094654455780983, "rewards/cosine_scaled_reward": -0.18057249579578638, "rewards/format_reward": 0.5138888880610466, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 2835.9861450195312, "epoch": 0.6272493573264781, "grad_norm": 1.8245147466659546, "kl": 0.6435546875, "learning_rate": 2.829615010283344e-07, "loss": 0.0872, "reward": 0.43825584976002574, "reward_std": 0.4411094859242439, "rewards/cosine_scaled_reward": 0.031627919524908066, "rewards/format_reward": 0.37500001210719347, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 2408.2500610351562, "epoch": 0.6289631533847472, "grad_norm": 1.2879042625427246, "kl": 0.64208984375, "learning_rate": 2.8043938066798645e-07, "loss": 0.1245, "reward": 0.5816475376486778, "reward_std": 0.8480053022503853, "rewards/cosine_scaled_reward": 0.026934866793453693, "rewards/format_reward": 0.5277777761220932, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 3011.8472290039062, "epoch": 0.6306769494430163, "grad_norm": 1.5342601537704468, "kl": 0.9501953125, "learning_rate": 2.7793039831193133e-07, "loss": 0.1117, "reward": -0.08722967363428324, "reward_std": 0.4681037962436676, "rewards/cosine_scaled_reward": -0.20333705097436905, "rewards/format_reward": 0.31944445334374905, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 2088.1111450195312, "epoch": 0.6323907455012854, "grad_norm": 5.395845413208008, "kl": 0.52392578125, "learning_rate": 2.7543467624442956e-07, "loss": 0.278, "reward": 0.8531668335199356, "reward_std": 0.7198526412248611, "rewards/cosine_scaled_reward": 0.07936117798089981, "rewards/format_reward": 0.6944444477558136, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 2144.4583587646484, "epoch": 0.6341045415595544, "grad_norm": 1.9326905012130737, "kl": 0.5986328125, "learning_rate": 2.729523361034538e-07, "loss": 0.165, "reward": 0.7016473673284054, "reward_std": 0.35017503798007965, "rewards/cosine_scaled_reward": 0.04526812210679054, "rewards/format_reward": 0.6111111268401146, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 2226.8333740234375, "epoch": 0.6358183376178235, "grad_norm": 6.064547538757324, "kl": 0.47265625, "learning_rate": 2.7048349887476037e-07, "loss": 0.2747, "reward": 0.36747913248836994, "reward_std": 0.47022923082113266, "rewards/cosine_scaled_reward": -0.12876042909920216, "rewards/format_reward": 0.6250000149011612, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 2646.166717529297, "epoch": 0.6375321336760925, "grad_norm": 1.8105882406234741, "kl": 0.66259765625, "learning_rate": 2.6802828488599294e-07, "loss": 0.1523, "reward": 0.2788702640682459, "reward_std": 0.7272945195436478, "rewards/cosine_scaled_reward": -0.082787093706429, "rewards/format_reward": 0.4444444440305233, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 2658.513916015625, "epoch": 0.6392459297343616, "grad_norm": 0.861741304397583, "kl": 0.56201171875, "learning_rate": 2.655868138008171e-07, "loss": 0.127, "reward": 0.25536923203617334, "reward_std": 0.549317829310894, "rewards/cosine_scaled_reward": -0.16398204606957734, "rewards/format_reward": 0.5833333358168602, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 2475.7916564941406, "epoch": 0.6409597257926307, "grad_norm": 5.120214462280273, "kl": 0.66943359375, "learning_rate": 2.631592046130896e-07, "loss": -0.0041, "reward": 0.4401531554758549, "reward_std": 0.5939441919326782, "rewards/cosine_scaled_reward": -0.10631232312880456, "rewards/format_reward": 0.6527777686715126, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 2241.4166564941406, "epoch": 0.6426735218508998, "grad_norm": 4.6052021980285645, "kl": 0.68896484375, "learning_rate": 2.6074557564105724e-07, "loss": 0.258, "reward": 0.40211474522948265, "reward_std": 0.6810158491134644, "rewards/cosine_scaled_reward": -0.13922041468322277, "rewards/format_reward": 0.6805555522441864, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 2345.541717529297, "epoch": 0.6443873179091688, "grad_norm": 1.593520164489746, "kl": 0.65869140625, "learning_rate": 2.583460445215911e-07, "loss": 0.1983, "reward": 0.4966873601078987, "reward_std": 0.6450872495770454, "rewards/cosine_scaled_reward": -0.0363785345107317, "rewards/format_reward": 0.569444440305233, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 2599.9583129882812, "epoch": 0.6461011139674379, "grad_norm": 1.0820523500442505, "kl": 0.54345703125, "learning_rate": 2.5596072820445254e-07, "loss": 0.1269, "reward": 0.3041490036994219, "reward_std": 0.5556300804018974, "rewards/cosine_scaled_reward": -0.11181438341736794, "rewards/format_reward": 0.5277777910232544, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 2423.1944274902344, "epoch": 0.6478149100257069, "grad_norm": 3.9577648639678955, "kl": 0.46435546875, "learning_rate": 2.5358974294659373e-07, "loss": 0.2539, "reward": 0.3343656752258539, "reward_std": 0.6136218756437302, "rewards/cosine_scaled_reward": -0.14531716238707304, "rewards/format_reward": 0.625, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 1991.0555419921875, "epoch": 0.6495287060839761, "grad_norm": 6.228683948516846, "kl": 0.7626953125, "learning_rate": 2.512332043064913e-07, "loss": 0.2113, "reward": 0.7852881997823715, "reward_std": 0.7995356619358063, "rewards/cosine_scaled_reward": 0.031532974913716316, "rewards/format_reward": 0.722222238779068, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 2142.0972595214844, "epoch": 0.6512425021422451, "grad_norm": 4.392513751983643, "kl": 0.8505859375, "learning_rate": 2.488912271385139e-07, "loss": 0.166, "reward": 0.774210050702095, "reward_std": 0.9235591739416122, "rewards/cosine_scaled_reward": 0.01904946379363537, "rewards/format_reward": 0.736111119389534, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 2721.166748046875, "epoch": 0.6529562982005142, "grad_norm": 0.7555143237113953, "kl": 0.54931640625, "learning_rate": 2.465639255873246e-07, "loss": 0.1157, "reward": 0.06699353083968163, "reward_std": 0.6024204641580582, "rewards/cosine_scaled_reward": -0.18872546032071114, "rewards/format_reward": 0.4444444477558136, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 2592.3472595214844, "epoch": 0.6546700942587832, "grad_norm": 1.4892374277114868, "kl": 0.5986328125, "learning_rate": 2.4425141308231765e-07, "loss": 0.164, "reward": 0.4388514533638954, "reward_std": 0.7740809172391891, "rewards/cosine_scaled_reward": -0.07918539177626371, "rewards/format_reward": 0.597222238779068, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 2303.125, "epoch": 0.6563838903170522, "grad_norm": 1.8696836233139038, "kl": 0.63427734375, "learning_rate": 2.4195380233209006e-07, "loss": 0.0839, "reward": 0.2414467092603445, "reward_std": 0.5401086919009686, "rewards/cosine_scaled_reward": -0.14316555112600327, "rewards/format_reward": 0.5277777835726738, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 2894.138916015625, "epoch": 0.6580976863753213, "grad_norm": 2.512624740600586, "kl": 0.7236328125, "learning_rate": 2.3967120531894857e-07, "loss": 0.188, "reward": -0.111133978003636, "reward_std": 0.4146636873483658, "rewards/cosine_scaled_reward": -0.256955873221159, "rewards/format_reward": 0.4027777798473835, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 2577.1805725097656, "epoch": 0.6598114824335904, "grad_norm": 1.5134508609771729, "kl": 0.75341796875, "learning_rate": 2.374037332934512e-07, "loss": 0.1359, "reward": 0.12209473713301122, "reward_std": 0.42869339138269424, "rewards/cosine_scaled_reward": -0.18200820498168468, "rewards/format_reward": 0.4861111156642437, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 2836.388916015625, "epoch": 0.6615252784918595, "grad_norm": 1.6320090293884277, "kl": 0.71435546875, "learning_rate": 2.3515149676898552e-07, "loss": 0.1314, "reward": 0.027245239354670048, "reward_std": 0.5338631048798561, "rewards/cosine_scaled_reward": -0.21554404497146606, "rewards/format_reward": 0.4583333432674408, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 2370.5416870117188, "epoch": 0.6632390745501285, "grad_norm": 2.790175437927246, "kl": 0.69384765625, "learning_rate": 2.3291460551638237e-07, "loss": 0.1291, "reward": 0.45274626836180687, "reward_std": 0.5044268742203712, "rewards/cosine_scaled_reward": -0.07223799102939665, "rewards/format_reward": 0.5972222313284874, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 2043.15283203125, "epoch": 0.6649528706083976, "grad_norm": 2.196779251098633, "kl": 0.91259765625, "learning_rate": 2.306931685585657e-07, "loss": 0.1933, "reward": 0.6870926842093468, "reward_std": 0.7499307841062546, "rewards/cosine_scaled_reward": -0.017564778798259795, "rewards/format_reward": 0.722222238779068, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 2507.999969482422, "epoch": 0.6666666666666666, "grad_norm": 4.833599090576172, "kl": 0.7890625, "learning_rate": 2.2848729416523859e-07, "loss": 0.1485, "reward": 0.48738833516836166, "reward_std": 0.3942640535533428, "rewards/cosine_scaled_reward": -0.047972507774829865, "rewards/format_reward": 0.5833333283662796, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 2180.5833740234375, "epoch": 0.6683804627249358, "grad_norm": 4.208037853240967, "kl": 0.57275390625, "learning_rate": 2.2629708984760706e-07, "loss": 0.2068, "reward": 0.6339845806360245, "reward_std": 0.8561032116413116, "rewards/cosine_scaled_reward": -0.009396598441526294, "rewards/format_reward": 0.6527777761220932, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 2664.0, "epoch": 0.6700942587832048, "grad_norm": 1.711565375328064, "kl": 0.57421875, "learning_rate": 2.2412266235313973e-07, "loss": 0.2027, "reward": 0.2511326225940138, "reward_std": 0.7724436074495316, "rewards/cosine_scaled_reward": -0.13137813284993172, "rewards/format_reward": 0.5138888955116272, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 2048.1945190429688, "epoch": 0.6718080548414739, "grad_norm": 7.40539026260376, "kl": 0.576171875, "learning_rate": 2.2196411766036487e-07, "loss": 0.2932, "reward": 0.390616811811924, "reward_std": 0.46938444674015045, "rewards/cosine_scaled_reward": -0.1519138067960739, "rewards/format_reward": 0.6944444552063942, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 2850.15283203125, "epoch": 0.6735218508997429, "grad_norm": 2.314105272293091, "kl": 0.712890625, "learning_rate": 2.1982156097370557e-07, "loss": 0.1382, "reward": 0.12740344926714897, "reward_std": 0.5854331143200397, "rewards/cosine_scaled_reward": -0.1307427268475294, "rewards/format_reward": 0.3888888992369175, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 2054.3334045410156, "epoch": 0.675235646958012, "grad_norm": 3.0562775135040283, "kl": 0.958984375, "learning_rate": 2.1769509671835223e-07, "loss": 0.1762, "reward": 0.33383211493492126, "reward_std": 0.6097967401146889, "rewards/cosine_scaled_reward": -0.15947283059358597, "rewards/format_reward": 0.6527777835726738, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 2001.4027709960938, "epoch": 0.676949443016281, "grad_norm": 2.749018907546997, "kl": 1.3720703125, "learning_rate": 2.1558482853517253e-07, "loss": 0.2474, "reward": 0.46511383540928364, "reward_std": 0.5483391135931015, "rewards/cosine_scaled_reward": -0.11466531874611974, "rewards/format_reward": 0.6944444626569748, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 2663.9722900390625, "epoch": 0.6786632390745502, "grad_norm": 1.3055802583694458, "kl": 0.8837890625, "learning_rate": 2.134908592756607e-07, "loss": 0.1475, "reward": 0.21666064485907555, "reward_std": 0.8081866502761841, "rewards/cosine_scaled_reward": -0.14861411787569523, "rewards/format_reward": 0.5138889029622078, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 2148.027801513672, "epoch": 0.6803770351328192, "grad_norm": 2.2016310691833496, "kl": 1.193359375, "learning_rate": 2.1141329099692406e-07, "loss": 0.2683, "reward": 0.3172401809133589, "reward_std": 0.5794945135712624, "rewards/cosine_scaled_reward": -0.13999101985245943, "rewards/format_reward": 0.5972222164273262, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 2772.2778930664062, "epoch": 0.6820908311910883, "grad_norm": 5.671627044677734, "kl": 0.994140625, "learning_rate": 2.0935222495670968e-07, "loss": 0.1316, "reward": 0.22605895064771175, "reward_std": 0.528959184885025, "rewards/cosine_scaled_reward": -0.12308163847774267, "rewards/format_reward": 0.4722222313284874, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 1907.2083740234375, "epoch": 0.6838046272493573, "grad_norm": 4.919534206390381, "kl": 1.0986328125, "learning_rate": 2.0730776160846853e-07, "loss": 0.1523, "reward": 0.8099863529205322, "reward_std": 0.7783814370632172, "rewards/cosine_scaled_reward": 0.03693760558962822, "rewards/format_reward": 0.7361111119389534, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 2289.3195190429688, "epoch": 0.6855184233076264, "grad_norm": 4.336697578430176, "kl": 0.9921875, "learning_rate": 2.0528000059645995e-07, "loss": 0.125, "reward": 0.08474167913664132, "reward_std": 0.4911258965730667, "rewards/cosine_scaled_reward": -0.21457360684871674, "rewards/format_reward": 0.5138888955116272, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 2537.0833740234375, "epoch": 0.6872322193658955, "grad_norm": 1.9845013618469238, "kl": 0.7470703125, "learning_rate": 2.032690407508949e-07, "loss": 0.1521, "reward": 0.20629934733733535, "reward_std": 0.5084620639681816, "rewards/cosine_scaled_reward": -0.17462810222059488, "rewards/format_reward": 0.5555555671453476, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 2364.2222595214844, "epoch": 0.6889460154241646, "grad_norm": 4.490449905395508, "kl": 0.85595703125, "learning_rate": 2.0127498008311922e-07, "loss": 0.0678, "reward": 0.2729727178812027, "reward_std": 0.40766991674900055, "rewards/cosine_scaled_reward": -0.16906920075416565, "rewards/format_reward": 0.6111111044883728, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 2377.500030517578, "epoch": 0.6906598114824336, "grad_norm": 2.0314667224884033, "kl": 0.9287109375, "learning_rate": 1.9929791578083655e-07, "loss": 0.1243, "reward": 0.7173348069190979, "reward_std": 0.6178643703460693, "rewards/cosine_scaled_reward": 0.03922295683878474, "rewards/format_reward": 0.6388888955116272, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 2395.3472595214844, "epoch": 0.6923736075407027, "grad_norm": 3.6266534328460693, "kl": 1.1142578125, "learning_rate": 1.9733794420337213e-07, "loss": 0.1186, "reward": 0.37652647122740746, "reward_std": 0.6333749815821648, "rewards/cosine_scaled_reward": -0.11034788191318512, "rewards/format_reward": 0.5972222238779068, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 2634.7222595214844, "epoch": 0.6940874035989717, "grad_norm": 1.629310131072998, "kl": 0.87353515625, "learning_rate": 1.9539516087697517e-07, "loss": 0.1284, "reward": 0.30899196676909924, "reward_std": 0.5874167829751968, "rewards/cosine_scaled_reward": -0.11633734963834286, "rewards/format_reward": 0.541666679084301, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 1720.5416870117188, "epoch": 0.6958011996572407, "grad_norm": 3.2341248989105225, "kl": 1.0625, "learning_rate": 1.934696604901642e-07, "loss": 0.1949, "reward": 0.5183681361377239, "reward_std": 0.5259700566530228, "rewards/cosine_scaled_reward": -0.08109369967132807, "rewards/format_reward": 0.680555559694767, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 2396.875, "epoch": 0.6975149957155099, "grad_norm": 2.575775146484375, "kl": 0.56005859375, "learning_rate": 1.915615368891117e-07, "loss": 0.1577, "reward": 0.16498053632676601, "reward_std": 0.6976238563656807, "rewards/cosine_scaled_reward": -0.17445417866110802, "rewards/format_reward": 0.5138888955116272, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 2211.986114501953, "epoch": 0.699228791773779, "grad_norm": 5.147465229034424, "kl": 0.939453125, "learning_rate": 1.8967088307307e-07, "loss": 0.264, "reward": 0.8435009941458702, "reward_std": 0.8539558947086334, "rewards/cosine_scaled_reward": 0.1370282769203186, "rewards/format_reward": 0.569444440305233, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 2221.791717529297, "epoch": 0.700942587832048, "grad_norm": 3.616407632827759, "kl": 1.2978515625, "learning_rate": 1.8779779118983867e-07, "loss": 0.2031, "reward": 0.5767598450183868, "reward_std": 0.6021636947989464, "rewards/cosine_scaled_reward": -0.01717562135308981, "rewards/format_reward": 0.611111119389534, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 2195.5833129882812, "epoch": 0.702656383890317, "grad_norm": 4.223770618438721, "kl": 0.9541015625, "learning_rate": 1.8594235253127372e-07, "loss": 0.0719, "reward": 0.4589345343410969, "reward_std": 0.5643011257052422, "rewards/cosine_scaled_reward": -0.09692162275314331, "rewards/format_reward": 0.6527777835726738, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 2020.15283203125, "epoch": 0.7043701799485861, "grad_norm": 4.778375148773193, "kl": 0.9111328125, "learning_rate": 1.8410465752883758e-07, "loss": 0.2462, "reward": 0.4322133334353566, "reward_std": 0.5240239724516869, "rewards/cosine_scaled_reward": -0.11722666956484318, "rewards/format_reward": 0.666666679084301, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 2159.5694580078125, "epoch": 0.7060839760068551, "grad_norm": 5.010425090789795, "kl": 1.271484375, "learning_rate": 1.822847957491922e-07, "loss": 0.1504, "reward": 0.27721285074949265, "reward_std": 0.3799732178449631, "rewards/cosine_scaled_reward": -0.19472691789269447, "rewards/format_reward": 0.6666666567325592, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 1994.1805419921875, "epoch": 0.7077977720651243, "grad_norm": 3.7414398193359375, "kl": 0.79296875, "learning_rate": 1.804828558898332e-07, "loss": 0.2209, "reward": 0.4823665115982294, "reward_std": 0.8085788935422897, "rewards/cosine_scaled_reward": -0.12687229178845882, "rewards/format_reward": 0.736111119389534, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 2582.513885498047, "epoch": 0.7095115681233933, "grad_norm": 2.3787803649902344, "kl": 0.955078125, "learning_rate": 1.7869892577476722e-07, "loss": 0.234, "reward": 0.35093772783875465, "reward_std": 0.7004451155662537, "rewards/cosine_scaled_reward": -0.06758668273687363, "rewards/format_reward": 0.4861111119389534, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 2286.027801513672, "epoch": 0.7112253641816624, "grad_norm": 2.242143154144287, "kl": 1.1669921875, "learning_rate": 1.7693309235023127e-07, "loss": 0.2089, "reward": 0.0793907418847084, "reward_std": 0.4775719493627548, "rewards/cosine_scaled_reward": -0.2519712895154953, "rewards/format_reward": 0.5833333320915699, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 2716.8194885253906, "epoch": 0.7129391602399314, "grad_norm": 1.0189129114151, "kl": 1.0341796875, "learning_rate": 1.7518544168045524e-07, "loss": 0.1319, "reward": 0.3344786912202835, "reward_std": 0.6283555030822754, "rewards/cosine_scaled_reward": -0.09664955246262252, "rewards/format_reward": 0.5277777835726738, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 2145.777786254883, "epoch": 0.7146529562982005, "grad_norm": 3.3594307899475098, "kl": 0.630859375, "learning_rate": 1.7345605894346726e-07, "loss": 0.1719, "reward": 0.29561759158968925, "reward_std": 0.45837917923927307, "rewards/cosine_scaled_reward": -0.16469121165573597, "rewards/format_reward": 0.6250000074505806, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 2316.4722595214844, "epoch": 0.7163667523564696, "grad_norm": 3.205843448638916, "kl": 0.9482421875, "learning_rate": 1.7174502842694212e-07, "loss": 0.1905, "reward": 0.1763996873050928, "reward_std": 0.35552147775888443, "rewards/cosine_scaled_reward": -0.16874459758400917, "rewards/format_reward": 0.5138888880610466, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 1993.9166564941406, "epoch": 0.7180805484147387, "grad_norm": 5.31653356552124, "kl": 1.2333984375, "learning_rate": 1.7005243352409333e-07, "loss": 0.1316, "reward": 0.38526383973658085, "reward_std": 0.3629095181822777, "rewards/cosine_scaled_reward": -0.16153474483871832, "rewards/format_reward": 0.7083333283662796, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 2585.3472290039062, "epoch": 0.7197943444730077, "grad_norm": 3.0050301551818848, "kl": 1.2138671875, "learning_rate": 1.6837835672960831e-07, "loss": 0.2609, "reward": 0.03709686268121004, "reward_std": 0.45924656093120575, "rewards/cosine_scaled_reward": -0.1967293554916978, "rewards/format_reward": 0.430555559694767, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 2077.4166870117188, "epoch": 0.7215081405312768, "grad_norm": 2.9638571739196777, "kl": 1.1162109375, "learning_rate": 1.6672287963562852e-07, "loss": 0.1967, "reward": 0.4120505638420582, "reward_std": 0.7001288831233978, "rewards/cosine_scaled_reward": -0.09258583001792431, "rewards/format_reward": 0.5972222313284874, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 1782.4722290039062, "epoch": 0.7232219365895458, "grad_norm": 2.496225595474243, "kl": 1.1123046875, "learning_rate": 1.6508608292777203e-07, "loss": 0.3012, "reward": 0.3580199657008052, "reward_std": 0.5790654197335243, "rewards/cosine_scaled_reward": -0.18904556892812252, "rewards/format_reward": 0.7361111044883728, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 2290.388916015625, "epoch": 0.7249357326478149, "grad_norm": 2.5555100440979004, "kl": 0.8837890625, "learning_rate": 1.6346804638120098e-07, "loss": 0.1268, "reward": 0.4261997193098068, "reward_std": 0.6714624091982841, "rewards/cosine_scaled_reward": -0.1341223642230034, "rewards/format_reward": 0.6944444552063942, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 2174.3055419921875, "epoch": 0.726649528706084, "grad_norm": 4.850281715393066, "kl": 0.9345703125, "learning_rate": 1.6186884885673413e-07, "loss": 0.2657, "reward": 0.31390602327883244, "reward_std": 0.5223901495337486, "rewards/cosine_scaled_reward": -0.1833247635513544, "rewards/format_reward": 0.6805555820465088, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 2507.8334045410156, "epoch": 0.7283633247643531, "grad_norm": 3.5151827335357666, "kl": 1.26953125, "learning_rate": 1.6028856829700258e-07, "loss": 0.1979, "reward": 0.6333100497722626, "reward_std": 0.7416208535432816, "rewards/cosine_scaled_reward": 0.031932787562254816, "rewards/format_reward": 0.569444440305233, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 1761.2083740234375, "epoch": 0.7300771208226221, "grad_norm": 3.2045891284942627, "kl": 1.076171875, "learning_rate": 1.5872728172265146e-07, "loss": 0.1263, "reward": 0.9971873387694359, "reward_std": 0.7048115953803062, "rewards/cosine_scaled_reward": 0.0749825444072485, "rewards/format_reward": 0.8472222238779068, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 2577.9861450195312, "epoch": 0.7317909168808912, "grad_norm": 1.8627033233642578, "kl": 1.14453125, "learning_rate": 1.5718506522858572e-07, "loss": 0.2595, "reward": 0.2531158346682787, "reward_std": 0.6184235513210297, "rewards/cosine_scaled_reward": -0.10955319553613663, "rewards/format_reward": 0.4722222276031971, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 1856.7222595214844, "epoch": 0.7335047129391602, "grad_norm": 4.033189296722412, "kl": 1.1201171875, "learning_rate": 1.5566199398026147e-07, "loss": 0.263, "reward": 0.7039023488759995, "reward_std": 0.8175256699323654, "rewards/cosine_scaled_reward": -0.009159944485872984, "rewards/format_reward": 0.7222222238779068, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 2663.4861450195312, "epoch": 0.7352185089974294, "grad_norm": 3.801396369934082, "kl": 1.130859375, "learning_rate": 1.5415814221002265e-07, "loss": 0.15, "reward": 0.34896907582879066, "reward_std": 0.5518276765942574, "rewards/cosine_scaled_reward": -0.12412657774984837, "rewards/format_reward": 0.597222238779068, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 2153.555618286133, "epoch": 0.7369323050556984, "grad_norm": 2.9870073795318604, "kl": 0.9130859375, "learning_rate": 1.5267358321348285e-07, "loss": 0.1497, "reward": 0.44994640722870827, "reward_std": 0.3946686089038849, "rewards/cosine_scaled_reward": -0.08752679079771042, "rewards/format_reward": 0.6250000074505806, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 2261.75, "epoch": 0.7386461011139674, "grad_norm": 6.578658580780029, "kl": 1.78515625, "learning_rate": 1.5120838934595337e-07, "loss": 0.1464, "reward": 0.35643661208450794, "reward_std": 0.5088120512664318, "rewards/cosine_scaled_reward": -0.12039280403405428, "rewards/format_reward": 0.5972222238779068, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 2386.652801513672, "epoch": 0.7403598971722365, "grad_norm": 4.385483741760254, "kl": 1.3447265625, "learning_rate": 1.4976263201891613e-07, "loss": 0.3579, "reward": 0.4007231565192342, "reward_std": 0.6231922283768654, "rewards/cosine_scaled_reward": -0.09824953693896532, "rewards/format_reward": 0.5972222238779068, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 2001.3611297607422, "epoch": 0.7420736932305055, "grad_norm": 4.371149063110352, "kl": 1.0439453125, "learning_rate": 1.483363816965435e-07, "loss": 0.0871, "reward": 0.6510265804827213, "reward_std": 0.4398561269044876, "rewards/cosine_scaled_reward": -0.04254225082695484, "rewards/format_reward": 0.736111119389534, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 1622.6666870117188, "epoch": 0.7437874892887746, "grad_norm": 6.787911891937256, "kl": 1.326171875, "learning_rate": 1.469297078922642e-07, "loss": 0.3842, "reward": 0.46582701057195663, "reward_std": 0.5145231448113918, "rewards/cosine_scaled_reward": -0.14208650775253773, "rewards/format_reward": 0.7500000149011612, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 2658.7500610351562, "epoch": 0.7455012853470437, "grad_norm": 2.6709697246551514, "kl": 1.232421875, "learning_rate": 1.4554267916537495e-07, "loss": 0.0821, "reward": 0.31399114802479744, "reward_std": 0.5854284539818764, "rewards/cosine_scaled_reward": -0.10689331218600273, "rewards/format_reward": 0.5277777835726738, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 1860.9583435058594, "epoch": 0.7472150814053128, "grad_norm": 3.8863277435302734, "kl": 0.91796875, "learning_rate": 1.4417536311769885e-07, "loss": 0.2564, "reward": 0.5377090591937304, "reward_std": 0.5195211619138718, "rewards/cosine_scaled_reward": -0.0853121317923069, "rewards/format_reward": 0.7083333283662796, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 2466.2361755371094, "epoch": 0.7489288774635818, "grad_norm": 2.9903695583343506, "kl": 1.16015625, "learning_rate": 1.4282782639029128e-07, "loss": 0.1351, "reward": 0.4385749250650406, "reward_std": 0.6242729276418686, "rewards/cosine_scaled_reward": -0.10015699185896665, "rewards/format_reward": 0.6388889029622078, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 2362.8611450195312, "epoch": 0.7506426735218509, "grad_norm": 1.599947214126587, "kl": 1.189453125, "learning_rate": 1.4150013466019114e-07, "loss": 0.1863, "reward": 0.6738657765090466, "reward_std": 0.6156510934233665, "rewards/cosine_scaled_reward": 0.03137733961921185, "rewards/format_reward": 0.611111119389534, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 2748.2916870117188, "epoch": 0.7523564695801199, "grad_norm": 2.867025136947632, "kl": 1.033203125, "learning_rate": 1.4019235263722034e-07, "loss": 0.1123, "reward": 0.22596902353689075, "reward_std": 0.5135553628206253, "rewards/cosine_scaled_reward": -0.13701549544930458, "rewards/format_reward": 0.5000000074505806, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 2215.277801513672, "epoch": 0.7540702656383891, "grad_norm": 5.796390533447266, "kl": 1.396484375, "learning_rate": 1.3890454406082956e-07, "loss": 0.1245, "reward": 0.6629978334531188, "reward_std": 0.5948286652565002, "rewards/cosine_scaled_reward": 0.0051100607961416245, "rewards/format_reward": 0.6527777910232544, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 2402.7222595214844, "epoch": 0.7557840616966581, "grad_norm": 5.96156644821167, "kl": 1.34375, "learning_rate": 1.3763677169699217e-07, "loss": 0.0634, "reward": 0.5063027180731297, "reward_std": 0.6581330522894859, "rewards/cosine_scaled_reward": -0.08018200099468231, "rewards/format_reward": 0.6666666716337204, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 2082.263916015625, "epoch": 0.7574978577549272, "grad_norm": 3.405839443206787, "kl": 1.6640625, "learning_rate": 1.3638909733514452e-07, "loss": 0.2784, "reward": 0.5193299576640129, "reward_std": 0.5714153945446014, "rewards/cosine_scaled_reward": -0.05977945402264595, "rewards/format_reward": 0.6388888880610466, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 1665.7638854980469, "epoch": 0.7592116538131962, "grad_norm": 5.792540550231934, "kl": 1.0849609375, "learning_rate": 1.351615817851748e-07, "loss": 0.1343, "reward": 0.6929136589169502, "reward_std": 0.636933371424675, "rewards/cosine_scaled_reward": -0.056320954114198685, "rewards/format_reward": 0.8055555671453476, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 2291.27783203125, "epoch": 0.7609254498714653, "grad_norm": 5.360567569732666, "kl": 1.2060546875, "learning_rate": 1.3395428487445914e-07, "loss": 0.18, "reward": 0.36108400439843535, "reward_std": 0.5261719971895218, "rewards/cosine_scaled_reward": -0.15973576810210943, "rewards/format_reward": 0.6805555447936058, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 2755.4583740234375, "epoch": 0.7626392459297343, "grad_norm": 2.5989925861358643, "kl": 0.875, "learning_rate": 1.3276726544494571e-07, "loss": 0.1482, "reward": 0.14870610460639, "reward_std": 0.567838903516531, "rewards/cosine_scaled_reward": -0.18259140476584435, "rewards/format_reward": 0.5138888880610466, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 2247.9583435058594, "epoch": 0.7643530419880035, "grad_norm": 5.224709987640381, "kl": 1.005859375, "learning_rate": 1.316005813502869e-07, "loss": 0.321, "reward": 0.5112787692341954, "reward_std": 0.6983462646603584, "rewards/cosine_scaled_reward": -0.049916195683181286, "rewards/format_reward": 0.6111111119389534, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 2349.3055725097656, "epoch": 0.7660668380462725, "grad_norm": 3.252889633178711, "kl": 1.216796875, "learning_rate": 1.3045428945301953e-07, "loss": 0.1445, "reward": 0.32806872576475143, "reward_std": 0.7308538854122162, "rewards/cosine_scaled_reward": -0.1276322863996029, "rewards/format_reward": 0.5833333358168602, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 2661.5555725097656, "epoch": 0.7677806341045416, "grad_norm": 2.3366446495056152, "kl": 0.8486328125, "learning_rate": 1.2932844562179352e-07, "loss": 0.0872, "reward": 0.36861317604780197, "reward_std": 0.6462560296058655, "rewards/cosine_scaled_reward": -0.051804508082568645, "rewards/format_reward": 0.47222223225980997, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 2592.4444580078125, "epoch": 0.7694944301628106, "grad_norm": 4.123133182525635, "kl": 1.052734375, "learning_rate": 1.2822310472864885e-07, "loss": 0.2043, "reward": 0.2611931987339631, "reward_std": 0.7008328437805176, "rewards/cosine_scaled_reward": -0.147181186825037, "rewards/format_reward": 0.5555555522441864, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 1906.3333740234375, "epoch": 0.7712082262210797, "grad_norm": 3.090589761734009, "kl": 0.865234375, "learning_rate": 1.2713832064634125e-07, "loss": 0.1642, "reward": 0.5996736511588097, "reward_std": 0.5084411576390266, "rewards/cosine_scaled_reward": -0.06821873132139444, "rewards/format_reward": 0.736111119389534, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 2815.4166870117188, "epoch": 0.7729220222793488, "grad_norm": 2.466654062271118, "kl": 0.7978515625, "learning_rate": 1.260741462457165e-07, "loss": 0.1236, "reward": 0.07433861424215138, "reward_std": 0.5574841573834419, "rewards/cosine_scaled_reward": -0.21977514401078224, "rewards/format_reward": 0.5138888917863369, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 2436.8611450195312, "epoch": 0.7746358183376179, "grad_norm": 2.854764699935913, "kl": 0.8505859375, "learning_rate": 1.2503063339313356e-07, "loss": 0.1263, "reward": 0.462260864675045, "reward_std": 0.7514103129506111, "rewards/cosine_scaled_reward": -0.09525846503674984, "rewards/format_reward": 0.6527777761220932, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 2337.7361755371094, "epoch": 0.7763496143958869, "grad_norm": 3.1975936889648438, "kl": 0.97265625, "learning_rate": 1.2400783294793668e-07, "loss": 0.1955, "reward": 0.39777151867747307, "reward_std": 0.6588628813624382, "rewards/cosine_scaled_reward": -0.11361423693597317, "rewards/format_reward": 0.6250000074505806, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 2078.0972595214844, "epoch": 0.778063410454156, "grad_norm": 1.6080825328826904, "kl": 1.15625, "learning_rate": 1.2300579475997657e-07, "loss": 0.2023, "reward": 0.7216087523847818, "reward_std": 0.7977120280265808, "rewards/cosine_scaled_reward": 0.006637714395765215, "rewards/format_reward": 0.7083333283662796, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 2211.7916259765625, "epoch": 0.779777206512425, "grad_norm": 3.7410457134246826, "kl": 1.00390625, "learning_rate": 1.220245676671809e-07, "loss": 0.2082, "reward": 0.4414830207824707, "reward_std": 0.6565307825803757, "rewards/cosine_scaled_reward": -0.10564738605171442, "rewards/format_reward": 0.6527777761220932, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 2365.52783203125, "epoch": 0.781491002570694, "grad_norm": 6.645061492919922, "kl": 0.64013671875, "learning_rate": 1.2106419949317388e-07, "loss": 0.2263, "reward": 0.3324281768873334, "reward_std": 0.5864489898085594, "rewards/cosine_scaled_reward": -0.1046192436479032, "rewards/format_reward": 0.5416666641831398, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 2122.5833435058594, "epoch": 0.7832047986289632, "grad_norm": 2.3880536556243896, "kl": 1.04736328125, "learning_rate": 1.2012473704494537e-07, "loss": 0.1423, "reward": 0.8056632168591022, "reward_std": 0.6164202988147736, "rewards/cosine_scaled_reward": 0.020887171383947134, "rewards/format_reward": 0.7638888955116272, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 1959.8056030273438, "epoch": 0.7849185946872322, "grad_norm": 4.885958671569824, "kl": 0.94287109375, "learning_rate": 1.1920622611056974e-07, "loss": 0.2544, "reward": 0.4146232455968857, "reward_std": 0.5990116819739342, "rewards/cosine_scaled_reward": -0.12602169532328844, "rewards/format_reward": 0.6666666865348816, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 2254.0694274902344, "epoch": 0.7866323907455013, "grad_norm": 2.2345101833343506, "kl": 0.53564453125, "learning_rate": 1.1830871145697412e-07, "loss": 0.0455, "reward": 0.4500209465622902, "reward_std": 0.5013090819120407, "rewards/cosine_scaled_reward": -0.10137841757386923, "rewards/format_reward": 0.652777798473835, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 2215.1666564941406, "epoch": 0.7883461868037703, "grad_norm": 4.272637367248535, "kl": 0.92236328125, "learning_rate": 1.1743223682775649e-07, "loss": 0.0789, "reward": 0.44543247297406197, "reward_std": 0.5984909385442734, "rewards/cosine_scaled_reward": -0.10367265064269304, "rewards/format_reward": 0.6527777910232544, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 1930.5000305175781, "epoch": 0.7900599828620394, "grad_norm": 2.8687753677368164, "kl": 1.2333984375, "learning_rate": 1.1657684494105386e-07, "loss": 0.1984, "reward": 0.4672253951430321, "reward_std": 0.6156143024563789, "rewards/cosine_scaled_reward": -0.13444286305457354, "rewards/format_reward": 0.736111119389534, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 1981.9583740234375, "epoch": 0.7917737789203085, "grad_norm": 3.0882349014282227, "kl": 0.7373046875, "learning_rate": 1.1574257748745986e-07, "loss": 0.0448, "reward": 0.5570826064795256, "reward_std": 0.6341868117451668, "rewards/cosine_scaled_reward": -0.08951424108818173, "rewards/format_reward": 0.7361111119389534, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 1936.4306030273438, "epoch": 0.7934875749785776, "grad_norm": 2.4561548233032227, "kl": 0.7119140625, "learning_rate": 1.1492947512799328e-07, "loss": 0.2065, "reward": 0.5815738141536713, "reward_std": 0.7455588281154633, "rewards/cosine_scaled_reward": -0.08421308733522892, "rewards/format_reward": 0.7500000149011612, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 2648.277801513672, "epoch": 0.7952013710368466, "grad_norm": 1.9183648824691772, "kl": 1.189453125, "learning_rate": 1.1413757749211602e-07, "loss": 0.1831, "reward": 0.4944647327065468, "reward_std": 0.5960628166794777, "rewards/cosine_scaled_reward": -0.04443428758531809, "rewards/format_reward": 0.5833333358168602, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 3089.4722290039062, "epoch": 0.7969151670951157, "grad_norm": 1.3800582885742188, "kl": 0.88671875, "learning_rate": 1.1336692317580158e-07, "loss": 0.1132, "reward": 0.17487204633653164, "reward_std": 0.6750592887401581, "rewards/cosine_scaled_reward": -0.1278417520225048, "rewards/format_reward": 0.430555559694767, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 2615.013916015625, "epoch": 0.7986289631533847, "grad_norm": 2.8072264194488525, "kl": 0.93115234375, "learning_rate": 1.1261754973965422e-07, "loss": 0.15, "reward": 0.17807744164019823, "reward_std": 0.6022924780845642, "rewards/cosine_scaled_reward": -0.11929461418185383, "rewards/format_reward": 0.4166666679084301, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 2457.3750610351562, "epoch": 0.8003427592116538, "grad_norm": 4.940661430358887, "kl": 0.7890625, "learning_rate": 1.1188949370707787e-07, "loss": 0.1464, "reward": 0.24202457256615162, "reward_std": 0.42437436431646347, "rewards/cosine_scaled_reward": -0.19148772559128702, "rewards/format_reward": 0.6249999925494194, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 2192.986114501953, "epoch": 0.8020565552699229, "grad_norm": 3.136319637298584, "kl": 0.767578125, "learning_rate": 1.1118279056249653e-07, "loss": 0.1721, "reward": 0.6284131053835154, "reward_std": 0.5748142190277576, "rewards/cosine_scaled_reward": -0.012182342819869518, "rewards/format_reward": 0.6527777761220932, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 2432.8333435058594, "epoch": 0.803770351328192, "grad_norm": 1.9713729619979858, "kl": 0.8603515625, "learning_rate": 1.1049747474962444e-07, "loss": 0.1523, "reward": 0.5457211770117283, "reward_std": 0.729132629930973, "rewards/cosine_scaled_reward": -0.0535283163189888, "rewards/format_reward": 0.6527777910232544, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 2276.4305725097656, "epoch": 0.805484147386461, "grad_norm": 3.8467977046966553, "kl": 1.216796875, "learning_rate": 1.0983357966978745e-07, "loss": 0.1232, "reward": 0.5122000686824322, "reward_std": 0.7733886539936066, "rewards/cosine_scaled_reward": -0.09112219791859388, "rewards/format_reward": 0.6944444552063942, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 2643.7083129882812, "epoch": 0.8071979434447301, "grad_norm": 1.1509345769882202, "kl": 0.7216796875, "learning_rate": 1.0919113768029517e-07, "loss": 0.1381, "reward": 0.10054661217145622, "reward_std": 0.6373118087649345, "rewards/cosine_scaled_reward": -0.16500448435544968, "rewards/format_reward": 0.4305555745959282, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 2733.8056030273438, "epoch": 0.8089117395029991, "grad_norm": 1.7471221685409546, "kl": 0.60986328125, "learning_rate": 1.0857018009286381e-07, "loss": 0.1037, "reward": 0.3268199451267719, "reward_std": 0.7872605472803116, "rewards/cosine_scaled_reward": -0.06575669860467315, "rewards/format_reward": 0.4583333507180214, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 2760.416717529297, "epoch": 0.8106255355612683, "grad_norm": 2.182706832885742, "kl": 0.7177734375, "learning_rate": 1.0797073717209013e-07, "loss": 0.103, "reward": 0.3022213885560632, "reward_std": 0.5640696436166763, "rewards/cosine_scaled_reward": -0.15444485377520323, "rewards/format_reward": 0.611111119389534, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 2262.4166564941406, "epoch": 0.8123393316195373, "grad_norm": 2.2662978172302246, "kl": 1.310546875, "learning_rate": 1.0739283813397639e-07, "loss": 0.2095, "reward": 0.817143252119422, "reward_std": 0.5297734513878822, "rewards/cosine_scaled_reward": 0.0960716437548399, "rewards/format_reward": 0.6250000149011612, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 2711.9722595214844, "epoch": 0.8140531276778064, "grad_norm": 5.152209758758545, "kl": 0.900390625, "learning_rate": 1.068365111445064e-07, "loss": 0.0199, "reward": 0.1260463148355484, "reward_std": 0.5338724106550217, "rewards/cosine_scaled_reward": -0.1939212940633297, "rewards/format_reward": 0.5138888917863369, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 2675.7361755371094, "epoch": 0.8157669237360754, "grad_norm": 2.230329990386963, "kl": 0.79931640625, "learning_rate": 1.063017833182728e-07, "loss": 0.1478, "reward": 0.14864197466522455, "reward_std": 0.6397556141018867, "rewards/cosine_scaled_reward": -0.1756790205836296, "rewards/format_reward": 0.5000000037252903, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 2487.75, "epoch": 0.8174807197943444, "grad_norm": 4.63166618347168, "kl": 0.8720703125, "learning_rate": 1.0578868071715544e-07, "loss": 0.3369, "reward": 0.37176867201924324, "reward_std": 0.6089313849806786, "rewards/cosine_scaled_reward": -0.07106010848656297, "rewards/format_reward": 0.5138888992369175, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 2181.4722595214844, "epoch": 0.8191945158526135, "grad_norm": 3.272205114364624, "kl": 1.212890625, "learning_rate": 1.0529722834905125e-07, "loss": 0.1721, "reward": 0.2916110037913313, "reward_std": 0.49708379805088043, "rewards/cosine_scaled_reward": -0.11113895289599895, "rewards/format_reward": 0.5138889029622078, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 2471.8194732666016, "epoch": 0.8209083119108826, "grad_norm": 3.132082462310791, "kl": 1.228515625, "learning_rate": 1.0482745016665526e-07, "loss": 0.208, "reward": 0.7535388497635722, "reward_std": 0.695548452436924, "rewards/cosine_scaled_reward": 0.06426943093538284, "rewards/format_reward": 0.6250000074505806, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 2416.0693969726562, "epoch": 0.8226221079691517, "grad_norm": 3.6008918285369873, "kl": 0.6015625, "learning_rate": 1.0437936906629334e-07, "loss": 0.1762, "reward": 0.3882830161601305, "reward_std": 0.6291572600603104, "rewards/cosine_scaled_reward": -0.07669184263795614, "rewards/format_reward": 0.5416666641831398, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 1688.6388854980469, "epoch": 0.8243359040274207, "grad_norm": 3.3292489051818848, "kl": 0.94482421875, "learning_rate": 1.0395300688680625e-07, "loss": 0.1756, "reward": 0.6976406946778297, "reward_std": 0.7118247449398041, "rewards/cosine_scaled_reward": -0.040068539790809155, "rewards/format_reward": 0.7777777835726738, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 2275.9723205566406, "epoch": 0.8260497000856898, "grad_norm": 5.62246036529541, "kl": 0.806640625, "learning_rate": 1.0354838440848501e-07, "loss": 0.3047, "reward": 0.3320089429616928, "reward_std": 0.5019624754786491, "rewards/cosine_scaled_reward": -0.13955109613016248, "rewards/format_reward": 0.6111111268401146, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 2215.8055725097656, "epoch": 0.8277634961439588, "grad_norm": 1.8363783359527588, "kl": 0.98046875, "learning_rate": 1.0316552135205837e-07, "loss": 0.205, "reward": 0.4018698123982176, "reward_std": 0.5796016827225685, "rewards/cosine_scaled_reward": -0.13239844236522913, "rewards/format_reward": 0.6666666716337204, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 1844.236099243164, "epoch": 0.829477292202228, "grad_norm": 4.732890605926514, "kl": 0.64990234375, "learning_rate": 1.0280443637773163e-07, "loss": 0.229, "reward": 0.37843877635896206, "reward_std": 0.6878086104989052, "rewards/cosine_scaled_reward": -0.1441139355301857, "rewards/format_reward": 0.6666666716337204, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 2126.527801513672, "epoch": 0.831191088260497, "grad_norm": 3.030064821243286, "kl": 1.2275390625, "learning_rate": 1.0246514708427701e-07, "loss": 0.2121, "reward": 0.5017230249941349, "reward_std": 0.8949761241674423, "rewards/cosine_scaled_reward": -0.0616384893655777, "rewards/format_reward": 0.625, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 2167.52783203125, "epoch": 0.8329048843187661, "grad_norm": 2.106167793273926, "kl": 0.8662109375, "learning_rate": 1.0214767000817596e-07, "loss": 0.0938, "reward": 0.5535758845508099, "reward_std": 0.5298986956477165, "rewards/cosine_scaled_reward": -0.021823172457516193, "rewards/format_reward": 0.597222238779068, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 2419.6111450195312, "epoch": 0.8346186803770351, "grad_norm": 2.8747453689575195, "kl": 0.6708984375, "learning_rate": 1.0185202062281336e-07, "loss": 0.1754, "reward": 0.2720159562304616, "reward_std": 0.545224204659462, "rewards/cosine_scaled_reward": -0.1487142387777567, "rewards/format_reward": 0.5694444552063942, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 2196.486114501953, "epoch": 0.8363324764353042, "grad_norm": 2.825509786605835, "kl": 0.55517578125, "learning_rate": 1.0157821333772304e-07, "loss": 0.1679, "reward": 0.6998728811740875, "reward_std": 0.6955326199531555, "rewards/cosine_scaled_reward": 0.030491996556520462, "rewards/format_reward": 0.6388888955116272, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 2617.722198486328, "epoch": 0.8380462724935732, "grad_norm": 1.9763245582580566, "kl": 0.8466796875, "learning_rate": 1.013262614978859e-07, "loss": 0.1616, "reward": 0.28653959557414055, "reward_std": 0.6969783715903759, "rewards/cosine_scaled_reward": -0.12061909190379083, "rewards/format_reward": 0.5277777835726738, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 2630.0972290039062, "epoch": 0.8397600685518424, "grad_norm": 1.7776055335998535, "kl": 0.73388671875, "learning_rate": 1.0109617738307911e-07, "loss": 0.1053, "reward": 0.3464082106947899, "reward_std": 0.7413296326994896, "rewards/cosine_scaled_reward": -0.08374034571170341, "rewards/format_reward": 0.5138888880610466, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 2610.1666870117188, "epoch": 0.8414738646101114, "grad_norm": 2.362657308578491, "kl": 1.03369140625, "learning_rate": 1.0088797220727779e-07, "loss": 0.1536, "reward": 0.054581154661718756, "reward_std": 0.5118880867958069, "rewards/cosine_scaled_reward": -0.27132053300738335, "rewards/format_reward": 0.5972222238779068, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 2591.611114501953, "epoch": 0.8431876606683805, "grad_norm": 1.4310436248779297, "kl": 0.748046875, "learning_rate": 1.0070165611810855e-07, "loss": 0.1227, "reward": 0.2780334800481796, "reward_std": 0.5931698530912399, "rewards/cosine_scaled_reward": -0.16653881408274174, "rewards/format_reward": 0.611111119389534, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 1928.0278015136719, "epoch": 0.8449014567266495, "grad_norm": 4.139144420623779, "kl": 1.39892578125, "learning_rate": 1.005372381963547e-07, "loss": 0.2949, "reward": 0.36576576717197895, "reward_std": 0.4379217103123665, "rewards/cosine_scaled_reward": -0.15045045968145132, "rewards/format_reward": 0.6666666716337204, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 2274.680633544922, "epoch": 0.8466152527849186, "grad_norm": 1.5368496179580688, "kl": 0.958984375, "learning_rate": 1.0039472645551372e-07, "loss": 0.1968, "reward": 0.3456185795366764, "reward_std": 0.5900547206401825, "rewards/cosine_scaled_reward": -0.11885737907141447, "rewards/format_reward": 0.5833333358168602, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 1835.8333740234375, "epoch": 0.8483290488431876, "grad_norm": 4.2471394538879395, "kl": 1.61279296875, "learning_rate": 1.002741278414069e-07, "loss": 0.1987, "reward": 0.9312632232904434, "reward_std": 0.586229220032692, "rewards/cosine_scaled_reward": 0.06979827064787969, "rewards/format_reward": 0.7916666567325592, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 2454.902801513672, "epoch": 0.8500428449014568, "grad_norm": 2.069298505783081, "kl": 0.9306640625, "learning_rate": 1.0017544823184055e-07, "loss": 0.176, "reward": 0.6016820748336613, "reward_std": 0.7270394861698151, "rewards/cosine_scaled_reward": 0.0161188212223351, "rewards/format_reward": 0.5694444477558136, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 2282.250030517578, "epoch": 0.8517566409597258, "grad_norm": 2.224278688430786, "kl": 0.7265625, "learning_rate": 1.0009869243631952e-07, "loss": 0.1715, "reward": 0.43558146245777607, "reward_std": 0.6017558500170708, "rewards/cosine_scaled_reward": -0.07387594413012266, "rewards/format_reward": 0.5833333432674408, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 2533.7500610351562, "epoch": 0.8534704370179949, "grad_norm": 5.092855930328369, "kl": 1.08935546875, "learning_rate": 1.000438641958131e-07, "loss": 0.1009, "reward": 0.1837000446394086, "reward_std": 0.7107623964548111, "rewards/cosine_scaled_reward": -0.19287220388650894, "rewards/format_reward": 0.569444440305233, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 2394.777801513672, "epoch": 0.8551842330762639, "grad_norm": 2.348245620727539, "kl": 0.81884765625, "learning_rate": 1.0001096618257236e-07, "loss": 0.1019, "reward": 0.8653097227215767, "reward_std": 0.7131348252296448, "rewards/cosine_scaled_reward": 0.13404375594109297, "rewards/format_reward": 0.5972222238779068, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 1616.4583282470703, "epoch": 0.856898029134533, "grad_norm": 2.849949598312378, "kl": 0.951171875, "learning_rate": 1e-07, "loss": 0.12, "reward": 1.1779827252030373, "reward_std": 0.6799286007881165, "rewards/cosine_scaled_reward": 0.20010241214185953, "rewards/format_reward": 0.7777777761220932, "step": 500 }, { "epoch": 0.856898029134533, "step": 500, "total_flos": 0.0, "train_loss": 0.12157149085606943, "train_runtime": 48026.4516, "train_samples_per_second": 0.75, "train_steps_per_second": 0.01 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }