| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.856898029134533, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2862.5695190429688, | |
| "epoch": 0.001713796058269066, | |
| "grad_norm": 0.16925157606601715, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": 0.0467, | |
| "reward": 0.12026740610599518, | |
| "reward_std": 0.47210293635725975, | |
| "rewards/cosine_scaled_reward": -0.1343107339926064, | |
| "rewards/format_reward": 0.3888888917863369, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2739.5, | |
| "epoch": 0.003427592116538132, | |
| "grad_norm": 0.18508067727088928, | |
| "kl": 0.0, | |
| "learning_rate": 4e-08, | |
| "loss": 0.0391, | |
| "reward": -0.05314926430583, | |
| "reward_std": 0.36226021870970726, | |
| "rewards/cosine_scaled_reward": -0.21407463820651174, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2816.1944580078125, | |
| "epoch": 0.005141388174807198, | |
| "grad_norm": 0.15574845671653748, | |
| "kl": 4.06801700592041e-05, | |
| "learning_rate": 6e-08, | |
| "loss": 0.024, | |
| "reward": -0.0735303945839405, | |
| "reward_std": 0.4152667075395584, | |
| "rewards/cosine_scaled_reward": -0.21037630829960108, | |
| "rewards/format_reward": 0.34722223225980997, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2746.875, | |
| "epoch": 0.006855184233076264, | |
| "grad_norm": 0.18099600076675415, | |
| "kl": 3.692507743835449e-05, | |
| "learning_rate": 8e-08, | |
| "loss": 0.0516, | |
| "reward": 0.2664791904389858, | |
| "reward_std": 0.8305703550577164, | |
| "rewards/cosine_scaled_reward": -0.07509375014342368, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2557.513916015625, | |
| "epoch": 0.00856898029134533, | |
| "grad_norm": 0.173630490899086, | |
| "kl": 2.3245811462402344e-05, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0579, | |
| "reward": 0.4870211333036423, | |
| "reward_std": 0.6806018278002739, | |
| "rewards/cosine_scaled_reward": -0.006489435210824013, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3163.8333129882812, | |
| "epoch": 0.010282776349614395, | |
| "grad_norm": 0.1903219074010849, | |
| "kl": 4.1365623474121094e-05, | |
| "learning_rate": 1.2e-07, | |
| "loss": 0.0699, | |
| "reward": 0.22140773385763168, | |
| "reward_std": 0.614318884909153, | |
| "rewards/cosine_scaled_reward": -0.07679613586515188, | |
| "rewards/format_reward": 0.37500001303851604, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2238.3055725097656, | |
| "epoch": 0.011996572407883462, | |
| "grad_norm": 0.2037331461906433, | |
| "kl": 3.427267074584961e-05, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.0507, | |
| "reward": 0.39292821660637856, | |
| "reward_std": 0.6100749522447586, | |
| "rewards/cosine_scaled_reward": -0.08825810719281435, | |
| "rewards/format_reward": 0.5694444552063942, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2888.4166870117188, | |
| "epoch": 0.013710368466152529, | |
| "grad_norm": 0.1671508252620697, | |
| "kl": 2.8967857360839844e-05, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0888, | |
| "reward": 0.5700129643082619, | |
| "reward_std": 1.0805757492780685, | |
| "rewards/cosine_scaled_reward": 0.04195092432200909, | |
| "rewards/format_reward": 0.486111119389534, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2740.638916015625, | |
| "epoch": 0.015424164524421594, | |
| "grad_norm": 0.2825331389904022, | |
| "kl": 3.212690353393555e-05, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.1025, | |
| "reward": 0.3288399577140808, | |
| "reward_std": 0.6967436075210571, | |
| "rewards/cosine_scaled_reward": -0.03696890315040946, | |
| "rewards/format_reward": 0.4027777733281255, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3010.7916870117188, | |
| "epoch": 0.01713796058269066, | |
| "grad_norm": 0.17822624742984772, | |
| "kl": 4.1991472244262695e-05, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0471, | |
| "reward": 0.09832120686769485, | |
| "reward_std": 0.6553668975830078, | |
| "rewards/cosine_scaled_reward": -0.1036171680316329, | |
| "rewards/format_reward": 0.3055555522441864, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2748.486114501953, | |
| "epoch": 0.018851756640959727, | |
| "grad_norm": 0.2476479411125183, | |
| "kl": 3.9696693420410156e-05, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": 0.0491, | |
| "reward": 0.015873797237873077, | |
| "reward_std": 0.553259089589119, | |
| "rewards/cosine_scaled_reward": -0.16567421704530716, | |
| "rewards/format_reward": 0.3472222238779068, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2954.3472290039062, | |
| "epoch": 0.02056555269922879, | |
| "grad_norm": 0.28294840455055237, | |
| "kl": 3.898143768310547e-05, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.1311, | |
| "reward": -0.11908636894077063, | |
| "reward_std": 0.6466177105903625, | |
| "rewards/cosine_scaled_reward": -0.22620984725654125, | |
| "rewards/format_reward": 0.3333333367481828, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2818.986114501953, | |
| "epoch": 0.022279348757497857, | |
| "grad_norm": 0.18577341735363007, | |
| "kl": 4.303455352783203e-05, | |
| "learning_rate": 2.6e-07, | |
| "loss": 0.0007, | |
| "reward": 0.3697042800486088, | |
| "reward_std": 0.7059066146612167, | |
| "rewards/cosine_scaled_reward": -0.03042563726194203, | |
| "rewards/format_reward": 0.4305555559694767, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2905.3333740234375, | |
| "epoch": 0.023993144815766924, | |
| "grad_norm": 0.226650208234787, | |
| "kl": 3.2275915145874023e-05, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0212, | |
| "reward": 0.04198750853538513, | |
| "reward_std": 0.5741659551858902, | |
| "rewards/cosine_scaled_reward": -0.14567292109131813, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3468.2222290039062, | |
| "epoch": 0.02570694087403599, | |
| "grad_norm": 0.1521635353565216, | |
| "kl": 4.279613494873047e-05, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0233, | |
| "reward": -0.17704490013420582, | |
| "reward_std": 0.6536840051412582, | |
| "rewards/cosine_scaled_reward": -0.1996335554867983, | |
| "rewards/format_reward": 0.22222222574055195, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2405.263916015625, | |
| "epoch": 0.027420736932305057, | |
| "grad_norm": 0.23728908598423004, | |
| "kl": 2.495013177394867e-05, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0632, | |
| "reward": 0.7499620914459229, | |
| "reward_std": 0.9962631165981293, | |
| "rewards/cosine_scaled_reward": 0.07636993401683867, | |
| "rewards/format_reward": 0.5972222238779068, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2764.875030517578, | |
| "epoch": 0.02913453299057412, | |
| "grad_norm": 0.21387562155723572, | |
| "kl": 2.6166439056396484e-05, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": 0.0416, | |
| "reward": 0.27334376238286495, | |
| "reward_std": 0.4753483533859253, | |
| "rewards/cosine_scaled_reward": -0.05082811089232564, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3252.486083984375, | |
| "epoch": 0.030848329048843187, | |
| "grad_norm": 0.209347203373909, | |
| "kl": 4.25875186920166e-05, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0587, | |
| "reward": -0.18576696328818798, | |
| "reward_std": 0.5022815316915512, | |
| "rewards/cosine_scaled_reward": -0.19010569993406534, | |
| "rewards/format_reward": 0.1944444514811039, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3157.4166870117188, | |
| "epoch": 0.032562125107112254, | |
| "grad_norm": 0.22900572419166565, | |
| "kl": 3.084540367126465e-05, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.0687, | |
| "reward": 0.03116392099764198, | |
| "reward_std": 0.7267041057348251, | |
| "rewards/cosine_scaled_reward": -0.14414026169106364, | |
| "rewards/format_reward": 0.3194444486871362, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3228.5972290039062, | |
| "epoch": 0.03427592116538132, | |
| "grad_norm": 0.24043872952461243, | |
| "kl": 2.6807188987731934e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.1293, | |
| "reward": -0.1261596381664276, | |
| "reward_std": 0.7229140102863312, | |
| "rewards/cosine_scaled_reward": -0.20196872018277645, | |
| "rewards/format_reward": 0.2777777835726738, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2856.6805419921875, | |
| "epoch": 0.03598971722365039, | |
| "grad_norm": 0.19779175519943237, | |
| "kl": 3.987550735473633e-05, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.0069, | |
| "reward": 0.11652377434074879, | |
| "reward_std": 0.8210525661706924, | |
| "rewards/cosine_scaled_reward": -0.12229366600513458, | |
| "rewards/format_reward": 0.3611111165955663, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3298.3472290039062, | |
| "epoch": 0.037703513281919454, | |
| "grad_norm": 0.13437196612358093, | |
| "kl": 2.828240394592285e-05, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0123, | |
| "reward": 0.1601133793592453, | |
| "reward_std": 0.6881751976907253, | |
| "rewards/cosine_scaled_reward": -0.06577664241194725, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3107.4583129882812, | |
| "epoch": 0.03941730934018852, | |
| "grad_norm": 0.1506253182888031, | |
| "kl": 2.2932887077331543e-05, | |
| "learning_rate": 4.6e-07, | |
| "loss": 0.0149, | |
| "reward": -0.13085854798555374, | |
| "reward_std": 0.5464130863547325, | |
| "rewards/cosine_scaled_reward": -0.20431815274059772, | |
| "rewards/format_reward": 0.2777777807787061, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2710.6806030273438, | |
| "epoch": 0.04113110539845758, | |
| "grad_norm": 0.24692188203334808, | |
| "kl": 2.8967857360839844e-05, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.1012, | |
| "reward": 0.24628422083333135, | |
| "reward_std": 0.4773574620485306, | |
| "rewards/cosine_scaled_reward": -0.057413444737903774, | |
| "rewards/format_reward": 0.3611111268401146, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2784.7361450195312, | |
| "epoch": 0.04284490145672665, | |
| "grad_norm": 0.25797340273857117, | |
| "kl": 2.6673078536987305e-05, | |
| "learning_rate": 5e-07, | |
| "loss": 0.106, | |
| "reward": 0.46540534496307373, | |
| "reward_std": 0.8211657330393791, | |
| "rewards/cosine_scaled_reward": -0.01729731634259224, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3136.52783203125, | |
| "epoch": 0.044558697514995714, | |
| "grad_norm": 0.14968131482601166, | |
| "kl": 3.291666507720947e-05, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.0512, | |
| "reward": -0.09118526801466942, | |
| "reward_std": 0.5860454589128494, | |
| "rewards/cosine_scaled_reward": -0.21225931122899055, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3040.0000610351562, | |
| "epoch": 0.04627249357326478, | |
| "grad_norm": 0.17181935906410217, | |
| "kl": 1.5079975128173828e-05, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0738, | |
| "reward": 0.34727448783814907, | |
| "reward_std": 0.6153330877423286, | |
| "rewards/cosine_scaled_reward": -0.027751651592552662, | |
| "rewards/format_reward": 0.4027777947485447, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2343.1111755371094, | |
| "epoch": 0.04798628963153385, | |
| "grad_norm": 0.2077193260192871, | |
| "kl": 2.5130808353424072e-05, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0598, | |
| "reward": 0.6073902919888496, | |
| "reward_std": 0.6849471032619476, | |
| "rewards/cosine_scaled_reward": 0.018972909078001976, | |
| "rewards/format_reward": 0.5694444477558136, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3073.7222290039062, | |
| "epoch": 0.049700085689802914, | |
| "grad_norm": 0.21480253338813782, | |
| "kl": 2.290681004524231e-05, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.0747, | |
| "reward": 0.17731062695384026, | |
| "reward_std": 0.8807300254702568, | |
| "rewards/cosine_scaled_reward": -0.07801135815680027, | |
| "rewards/format_reward": 0.3333333320915699, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2768.02783203125, | |
| "epoch": 0.05141388174807198, | |
| "grad_norm": 0.25759172439575195, | |
| "kl": 2.993270754814148e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0674, | |
| "reward": 0.5063075462821871, | |
| "reward_std": 0.771463930606842, | |
| "rewards/cosine_scaled_reward": -0.010735094547271729, | |
| "rewards/format_reward": 0.5277777910232544, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2695.6944580078125, | |
| "epoch": 0.05312767780634105, | |
| "grad_norm": 0.2701717019081116, | |
| "kl": 1.3127923011779785e-05, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.0971, | |
| "reward": 0.2706103939563036, | |
| "reward_std": 0.49449611082673073, | |
| "rewards/cosine_scaled_reward": -0.045250357885379344, | |
| "rewards/format_reward": 0.361111119389534, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3051.52783203125, | |
| "epoch": 0.054841473864610114, | |
| "grad_norm": 0.17947925627231598, | |
| "kl": 2.6337802410125732e-05, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.057, | |
| "reward": 0.45089754834771156, | |
| "reward_std": 1.1203400194644928, | |
| "rewards/cosine_scaled_reward": -0.02455122536048293, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2306.8750610351562, | |
| "epoch": 0.056555269922879174, | |
| "grad_norm": 0.21536274254322052, | |
| "kl": 5.3569674491882324e-05, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0764, | |
| "reward": 0.8166992478072643, | |
| "reward_std": 0.8387185409665108, | |
| "rewards/cosine_scaled_reward": 0.12362739443778992, | |
| "rewards/format_reward": 0.5694444626569748, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2698.7083740234375, | |
| "epoch": 0.05826906598114824, | |
| "grad_norm": 0.29884466528892517, | |
| "kl": 0.00017189979553222656, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.1617, | |
| "reward": 0.057983118342235684, | |
| "reward_std": 0.7621737122535706, | |
| "rewards/cosine_scaled_reward": -0.1585084507241845, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3312.3055419921875, | |
| "epoch": 0.05998286203941731, | |
| "grad_norm": 0.1554093211889267, | |
| "kl": 9.316205978393555e-05, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0273, | |
| "reward": -0.2900172360241413, | |
| "reward_std": 0.5383428931236267, | |
| "rewards/cosine_scaled_reward": -0.2700086124241352, | |
| "rewards/format_reward": 0.2500000074505806, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2631.8055725097656, | |
| "epoch": 0.061696658097686374, | |
| "grad_norm": 0.19274435937404633, | |
| "kl": 0.0002084970474243164, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0306, | |
| "reward": 0.006275304593145847, | |
| "reward_std": 0.46724043786525726, | |
| "rewards/cosine_scaled_reward": -0.18436234444379807, | |
| "rewards/format_reward": 0.37500000931322575, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3124.5277709960938, | |
| "epoch": 0.06341045415595545, | |
| "grad_norm": 0.15709905326366425, | |
| "kl": 7.59810209274292e-05, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0561, | |
| "reward": -0.008991474285721779, | |
| "reward_std": 0.5808551460504532, | |
| "rewards/cosine_scaled_reward": -0.1294957408681512, | |
| "rewards/format_reward": 0.25000000558793545, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3045.90283203125, | |
| "epoch": 0.06512425021422451, | |
| "grad_norm": 0.2423790842294693, | |
| "kl": 0.00022971630096435547, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.1263, | |
| "reward": 0.1536001469939947, | |
| "reward_std": 0.7093052342534065, | |
| "rewards/cosine_scaled_reward": -0.07597769796848297, | |
| "rewards/format_reward": 0.305555559694767, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3150.0833740234375, | |
| "epoch": 0.06683804627249357, | |
| "grad_norm": 0.13335144519805908, | |
| "kl": 0.0003066062927246094, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.0187, | |
| "reward": -0.01171512296423316, | |
| "reward_std": 0.48150157928466797, | |
| "rewards/cosine_scaled_reward": -0.1586353350430727, | |
| "rewards/format_reward": 0.3055555559694767, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2782.27783203125, | |
| "epoch": 0.06855184233076264, | |
| "grad_norm": 0.1773526668548584, | |
| "kl": 0.0007457435131072998, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0236, | |
| "reward": 0.19545890390872955, | |
| "reward_std": 0.5221360512077808, | |
| "rewards/cosine_scaled_reward": -0.08282610075548291, | |
| "rewards/format_reward": 0.361111112870276, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2729.9722290039062, | |
| "epoch": 0.0702656383890317, | |
| "grad_norm": 0.2603820860385895, | |
| "kl": 0.0002143383026123047, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.1308, | |
| "reward": 0.5641986541450024, | |
| "reward_std": 0.7014989629387856, | |
| "rewards/cosine_scaled_reward": 0.05293265450745821, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2622.0555419921875, | |
| "epoch": 0.07197943444730077, | |
| "grad_norm": 0.19547662138938904, | |
| "kl": 0.0008759498596191406, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.0788, | |
| "reward": 0.3987229084596038, | |
| "reward_std": 0.6764711476862431, | |
| "rewards/cosine_scaled_reward": -0.05063853319734335, | |
| "rewards/format_reward": 0.5, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2757.3611450195312, | |
| "epoch": 0.07369323050556983, | |
| "grad_norm": 0.133390411734581, | |
| "kl": 0.00021369755268096924, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.0354, | |
| "reward": 0.5515957027673721, | |
| "reward_std": 0.6986619718372822, | |
| "rewards/cosine_scaled_reward": 0.04663117043673992, | |
| "rewards/format_reward": 0.4583333283662796, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2743.763916015625, | |
| "epoch": 0.07540702656383891, | |
| "grad_norm": 0.17805209755897522, | |
| "kl": 0.0008558034896850586, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.1039, | |
| "reward": 0.06273656419944018, | |
| "reward_std": 0.7254525497555733, | |
| "rewards/cosine_scaled_reward": -0.18390950025059283, | |
| "rewards/format_reward": 0.4305555671453476, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3018.1805419921875, | |
| "epoch": 0.07712082262210797, | |
| "grad_norm": 0.23340974748134613, | |
| "kl": 0.0007225275039672852, | |
| "learning_rate": 9e-07, | |
| "loss": 0.047, | |
| "reward": 0.12753370963037014, | |
| "reward_std": 0.5756559893488884, | |
| "rewards/cosine_scaled_reward": -0.09595536440610886, | |
| "rewards/format_reward": 0.31944444589316845, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2453.77783203125, | |
| "epoch": 0.07883461868037704, | |
| "grad_norm": 0.25216469168663025, | |
| "kl": 0.0028772354125976562, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0976, | |
| "reward": 0.4031712617725134, | |
| "reward_std": 0.5689256861805916, | |
| "rewards/cosine_scaled_reward": -0.05535881780087948, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3180.0972290039062, | |
| "epoch": 0.0805484147386461, | |
| "grad_norm": 0.17415259778499603, | |
| "kl": 0.0014755725860595703, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.0718, | |
| "reward": -0.026270870119333267, | |
| "reward_std": 0.641656719148159, | |
| "rewards/cosine_scaled_reward": -0.15202434547245502, | |
| "rewards/format_reward": 0.27777778171002865, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2680.7639770507812, | |
| "epoch": 0.08226221079691516, | |
| "grad_norm": 0.20438066124916077, | |
| "kl": 0.001586318016052246, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0807, | |
| "reward": 0.6057721227407455, | |
| "reward_std": 0.7416700124740601, | |
| "rewards/cosine_scaled_reward": 0.05288607440888882, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2346.055633544922, | |
| "epoch": 0.08397600685518423, | |
| "grad_norm": 0.35583311319351196, | |
| "kl": 0.018939971923828125, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.1404, | |
| "reward": 0.7048290632665157, | |
| "reward_std": 0.6792610064148903, | |
| "rewards/cosine_scaled_reward": 0.06074785813689232, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2833.5833740234375, | |
| "epoch": 0.0856898029134533, | |
| "grad_norm": 0.2027311623096466, | |
| "kl": 0.0032949447631835938, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0416, | |
| "reward": 0.07023209612816572, | |
| "reward_std": 0.6861855462193489, | |
| "rewards/cosine_scaled_reward": -0.16627284698188305, | |
| "rewards/format_reward": 0.4027777872979641, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3051.2777709960938, | |
| "epoch": 0.08740359897172237, | |
| "grad_norm": 0.16748514771461487, | |
| "kl": 0.001615285873413086, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": 0.069, | |
| "reward": 0.1449947228829842, | |
| "reward_std": 0.7090619504451752, | |
| "rewards/cosine_scaled_reward": -0.10111376643180847, | |
| "rewards/format_reward": 0.34722223225980997, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3181.9583740234375, | |
| "epoch": 0.08911739502999143, | |
| "grad_norm": 0.16281543672084808, | |
| "kl": 0.0019249916076660156, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0803, | |
| "reward": -0.03632636368274689, | |
| "reward_std": 0.5028033927083015, | |
| "rewards/cosine_scaled_reward": -0.12927428726106882, | |
| "rewards/format_reward": 0.22222222946584225, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3303.1805419921875, | |
| "epoch": 0.0908311910882605, | |
| "grad_norm": 0.14455804228782654, | |
| "kl": 0.0005393028259277344, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.0318, | |
| "reward": -0.10013403557240963, | |
| "reward_std": 0.4606664590537548, | |
| "rewards/cosine_scaled_reward": -0.17506700940430164, | |
| "rewards/format_reward": 0.2500000046566129, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3313.1944580078125, | |
| "epoch": 0.09254498714652956, | |
| "grad_norm": 0.13308647274971008, | |
| "kl": 0.0011081695556640625, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.0055, | |
| "reward": 0.10159287042915821, | |
| "reward_std": 0.6204735822975636, | |
| "rewards/cosine_scaled_reward": -0.060314678063150495, | |
| "rewards/format_reward": 0.2222222276031971, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3481.15283203125, | |
| "epoch": 0.09425878320479864, | |
| "grad_norm": 0.13649359345436096, | |
| "kl": 0.0008268356323242188, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0328, | |
| "reward": -0.12874329963233322, | |
| "reward_std": 0.5648706145584583, | |
| "rewards/cosine_scaled_reward": -0.1754827625118196, | |
| "rewards/format_reward": 0.22222222480922937, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3232.7222900390625, | |
| "epoch": 0.0959725792630677, | |
| "grad_norm": 0.19132941961288452, | |
| "kl": 0.0013275146484375, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": 0.1077, | |
| "reward": -0.17376804118975997, | |
| "reward_std": 0.749246733263135, | |
| "rewards/cosine_scaled_reward": -0.20493957586586475, | |
| "rewards/format_reward": 0.2361111156642437, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3279.4584350585938, | |
| "epoch": 0.09768637532133675, | |
| "grad_norm": 0.15241067111492157, | |
| "kl": 0.000919342041015625, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": 0.0282, | |
| "reward": 0.31643399875611067, | |
| "reward_std": 0.6422489807009697, | |
| "rewards/cosine_scaled_reward": 0.005439223721623421, | |
| "rewards/format_reward": 0.30555556155741215, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3047.2916870117188, | |
| "epoch": 0.09940017137960583, | |
| "grad_norm": 0.22829630970954895, | |
| "kl": 0.0054931640625, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0909, | |
| "reward": -0.17570834839716554, | |
| "reward_std": 0.4780988022685051, | |
| "rewards/cosine_scaled_reward": -0.23368750512599945, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2918.5555419921875, | |
| "epoch": 0.10111396743787489, | |
| "grad_norm": 0.17409604787826538, | |
| "kl": 0.010187149047851562, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": -0.0001, | |
| "reward": 0.6838416904211044, | |
| "reward_std": 0.7215724363923073, | |
| "rewards/cosine_scaled_reward": 0.1196986111899605, | |
| "rewards/format_reward": 0.4444444477558136, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3240.90283203125, | |
| "epoch": 0.10282776349614396, | |
| "grad_norm": 0.21398130059242249, | |
| "kl": 0.0015239715576171875, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0841, | |
| "reward": -0.013310029171407223, | |
| "reward_std": 0.6487029865384102, | |
| "rewards/cosine_scaled_reward": -0.13859945815056562, | |
| "rewards/format_reward": 0.2638888992369175, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3323.3889770507812, | |
| "epoch": 0.10454155955441302, | |
| "grad_norm": 0.25011396408081055, | |
| "kl": 0.0015153884887695312, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.0677, | |
| "reward": -0.37927111238241196, | |
| "reward_std": 0.43354837596416473, | |
| "rewards/cosine_scaled_reward": -0.2799133397638798, | |
| "rewards/format_reward": 0.18055556155741215, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2683.6250610351562, | |
| "epoch": 0.1062553556126821, | |
| "grad_norm": 0.17982754111289978, | |
| "kl": 0.00201416015625, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.0008, | |
| "reward": 0.40144167095422745, | |
| "reward_std": 0.5826155617833138, | |
| "rewards/cosine_scaled_reward": -0.02844582637771964, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3426.2361450195312, | |
| "epoch": 0.10796915167095116, | |
| "grad_norm": 0.182517409324646, | |
| "kl": 0.00151824951171875, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": 0.0294, | |
| "reward": -0.09498679265379906, | |
| "reward_std": 0.7008046992123127, | |
| "rewards/cosine_scaled_reward": -0.13777116686105728, | |
| "rewards/format_reward": 0.18055555690079927, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2847.5972290039062, | |
| "epoch": 0.10968294772922023, | |
| "grad_norm": 0.31501731276512146, | |
| "kl": 0.0022530555725097656, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": 0.1548, | |
| "reward": 0.009381972253322601, | |
| "reward_std": 0.36741600558161736, | |
| "rewards/cosine_scaled_reward": -0.16197567898780107, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3167.236083984375, | |
| "epoch": 0.11139674378748929, | |
| "grad_norm": 0.4229466915130615, | |
| "kl": 0.0364532470703125, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0659, | |
| "reward": -0.029949136078357697, | |
| "reward_std": 0.5782980695366859, | |
| "rewards/cosine_scaled_reward": -0.13997458899393678, | |
| "rewards/format_reward": 0.25000000838190317, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2846.8334350585938, | |
| "epoch": 0.11311053984575835, | |
| "grad_norm": 0.1699674278497696, | |
| "kl": 0.0013065338134765625, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.0667, | |
| "reward": 0.2395001295953989, | |
| "reward_std": 0.3902180567383766, | |
| "rewards/cosine_scaled_reward": -0.053861052729189396, | |
| "rewards/format_reward": 0.3472222313284874, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3227.4445190429688, | |
| "epoch": 0.11482433590402742, | |
| "grad_norm": 0.15845970809459686, | |
| "kl": 0.0022869110107421875, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.0416, | |
| "reward": 0.06229268200695515, | |
| "reward_std": 0.5577914118766785, | |
| "rewards/cosine_scaled_reward": -0.1285758875310421, | |
| "rewards/format_reward": 0.3194444552063942, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2906.3472290039062, | |
| "epoch": 0.11653813196229648, | |
| "grad_norm": 0.17754817008972168, | |
| "kl": 0.0027103424072265625, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": -0.0006, | |
| "reward": 0.000796053558588028, | |
| "reward_std": 0.5399865545332432, | |
| "rewards/cosine_scaled_reward": -0.15932418778538704, | |
| "rewards/format_reward": 0.3194444449618459, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3079.4583740234375, | |
| "epoch": 0.11825192802056556, | |
| "grad_norm": 0.16689395904541016, | |
| "kl": 0.00244140625, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": 0.0012, | |
| "reward": 0.40755608677864075, | |
| "reward_std": 0.592438168823719, | |
| "rewards/cosine_scaled_reward": 0.009333595633506775, | |
| "rewards/format_reward": 0.3888889029622078, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2852.6388549804688, | |
| "epoch": 0.11996572407883462, | |
| "grad_norm": 0.14442802965641022, | |
| "kl": 0.0042266845703125, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.023, | |
| "reward": 0.44340329244732857, | |
| "reward_std": 0.43735441006720066, | |
| "rewards/cosine_scaled_reward": 0.00642385333776474, | |
| "rewards/format_reward": 0.4305555559694767, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3119.8195190429688, | |
| "epoch": 0.12167952013710369, | |
| "grad_norm": 0.1541452407836914, | |
| "kl": 0.003391265869140625, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.0155, | |
| "reward": 0.49696624279022217, | |
| "reward_std": 0.9607885628938675, | |
| "rewards/cosine_scaled_reward": 0.07487202249467373, | |
| "rewards/format_reward": 0.3472222238779068, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2584.513885498047, | |
| "epoch": 0.12339331619537275, | |
| "grad_norm": 0.16282722353935242, | |
| "kl": 0.007266998291015625, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.0302, | |
| "reward": 0.4334046132862568, | |
| "reward_std": 0.42579157277941704, | |
| "rewards/cosine_scaled_reward": -0.04024216299876571, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3057.8611450195312, | |
| "epoch": 0.12510711225364182, | |
| "grad_norm": 0.19297440350055695, | |
| "kl": 0.004047393798828125, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": -0.0268, | |
| "reward": 0.2504111938178539, | |
| "reward_std": 0.6320941485464573, | |
| "rewards/cosine_scaled_reward": -0.05534995626658201, | |
| "rewards/format_reward": 0.3611111268401146, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2350.5000610351562, | |
| "epoch": 0.1268209083119109, | |
| "grad_norm": 0.25634145736694336, | |
| "kl": 0.004367828369140625, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.1227, | |
| "reward": 0.7754522487521172, | |
| "reward_std": 0.8430259823799133, | |
| "rewards/cosine_scaled_reward": 0.07522611878812313, | |
| "rewards/format_reward": 0.625, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3078.013916015625, | |
| "epoch": 0.12853470437017994, | |
| "grad_norm": 0.15847010910511017, | |
| "kl": 0.004947662353515625, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0447, | |
| "reward": 0.27387892454862595, | |
| "reward_std": 0.5773990303277969, | |
| "rewards/cosine_scaled_reward": -0.03667165897786617, | |
| "rewards/format_reward": 0.3472222276031971, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2247.8194427490234, | |
| "epoch": 0.13024850042844902, | |
| "grad_norm": 0.28341227769851685, | |
| "kl": 0.014591217041015625, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.0403, | |
| "reward": 0.7070811688899994, | |
| "reward_std": 0.7020798400044441, | |
| "rewards/cosine_scaled_reward": 0.06881837674882263, | |
| "rewards/format_reward": 0.5694444440305233, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3159.75, | |
| "epoch": 0.1319622964867181, | |
| "grad_norm": 0.13436463475227356, | |
| "kl": 0.0049896240234375, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": 0.023, | |
| "reward": 0.35647532157599926, | |
| "reward_std": 0.7988947406411171, | |
| "rewards/cosine_scaled_reward": 0.011570994276553392, | |
| "rewards/format_reward": 0.33333334513008595, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3150.0139770507812, | |
| "epoch": 0.13367609254498714, | |
| "grad_norm": 0.176174134016037, | |
| "kl": 0.004405975341796875, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.0747, | |
| "reward": -0.14098340552300215, | |
| "reward_std": 0.5686891078948975, | |
| "rewards/cosine_scaled_reward": -0.18854726571589708, | |
| "rewards/format_reward": 0.23611112032085657, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2571.0694580078125, | |
| "epoch": 0.1353898886032562, | |
| "grad_norm": 0.1847277730703354, | |
| "kl": 0.008609771728515625, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": -0.0106, | |
| "reward": 0.3892364539206028, | |
| "reward_std": 0.7569635957479477, | |
| "rewards/cosine_scaled_reward": -0.06927067344076931, | |
| "rewards/format_reward": 0.5277777835726738, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3138.5555419921875, | |
| "epoch": 0.13710368466152528, | |
| "grad_norm": 0.21640530228614807, | |
| "kl": 0.005603790283203125, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.1324, | |
| "reward": -0.1231984393671155, | |
| "reward_std": 0.778315082192421, | |
| "rewards/cosine_scaled_reward": -0.2074325531721115, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3191.916748046875, | |
| "epoch": 0.13881748071979436, | |
| "grad_norm": 0.1524638533592224, | |
| "kl": 0.014739990234375, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": 0.0255, | |
| "reward": -0.14118600636720657, | |
| "reward_std": 0.3157992772758007, | |
| "rewards/cosine_scaled_reward": -0.17475967481732368, | |
| "rewards/format_reward": 0.20833334047347307, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2925.013916015625, | |
| "epoch": 0.1405312767780634, | |
| "grad_norm": 0.21411970257759094, | |
| "kl": 0.00635528564453125, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.0735, | |
| "reward": -0.06351233087480068, | |
| "reward_std": 0.5284828841686249, | |
| "rewards/cosine_scaled_reward": -0.18453393690288067, | |
| "rewards/format_reward": 0.30555555783212185, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2801.2638549804688, | |
| "epoch": 0.14224507283633248, | |
| "grad_norm": 0.18929333984851837, | |
| "kl": 0.0023746490478515625, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": 0.0434, | |
| "reward": 0.5797148197889328, | |
| "reward_std": 0.8048742488026619, | |
| "rewards/cosine_scaled_reward": 0.03985740663483739, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2881.999969482422, | |
| "epoch": 0.14395886889460155, | |
| "grad_norm": 0.16995370388031006, | |
| "kl": 0.00823211669921875, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0417, | |
| "reward": 0.1579499295912683, | |
| "reward_std": 0.6737323254346848, | |
| "rewards/cosine_scaled_reward": -0.12935838662087917, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2817.888916015625, | |
| "epoch": 0.1456726649528706, | |
| "grad_norm": 0.17163607478141785, | |
| "kl": 0.004947662353515625, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0843, | |
| "reward": 0.14664312824606895, | |
| "reward_std": 0.6406831294298172, | |
| "rewards/cosine_scaled_reward": -0.10028954246081412, | |
| "rewards/format_reward": 0.3472222248092294, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2666.0972595214844, | |
| "epoch": 0.14738646101113967, | |
| "grad_norm": 0.23853930830955505, | |
| "kl": 0.0075836181640625, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.184, | |
| "reward": 0.15615743398666382, | |
| "reward_std": 0.6508499458432198, | |
| "rewards/cosine_scaled_reward": -0.14414352551102638, | |
| "rewards/format_reward": 0.4444444440305233, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3479.0000610351562, | |
| "epoch": 0.14910025706940874, | |
| "grad_norm": 0.13812494277954102, | |
| "kl": 0.003131866455078125, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0273, | |
| "reward": -0.3952238578349352, | |
| "reward_std": 0.4180161654949188, | |
| "rewards/cosine_scaled_reward": -0.24622303992509842, | |
| "rewards/format_reward": 0.0972222238779068, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3411.0556030273438, | |
| "epoch": 0.15081405312767782, | |
| "grad_norm": 0.14131076633930206, | |
| "kl": 0.00627899169921875, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.0242, | |
| "reward": -0.18497492372989655, | |
| "reward_std": 0.3112034276127815, | |
| "rewards/cosine_scaled_reward": -0.15498745813965797, | |
| "rewards/format_reward": 0.12500000186264515, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2821.4305725097656, | |
| "epoch": 0.15252784918594686, | |
| "grad_norm": 0.23381026089191437, | |
| "kl": 0.00811767578125, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.1044, | |
| "reward": -0.15477947797626257, | |
| "reward_std": 0.3880116418004036, | |
| "rewards/cosine_scaled_reward": -0.257945304736495, | |
| "rewards/format_reward": 0.3611111082136631, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2741.013885498047, | |
| "epoch": 0.15424164524421594, | |
| "grad_norm": 0.3015286326408386, | |
| "kl": 0.005706787109375, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.146, | |
| "reward": 0.32925539929419756, | |
| "reward_std": 0.5706463847309351, | |
| "rewards/cosine_scaled_reward": -0.01592785632237792, | |
| "rewards/format_reward": 0.3611111082136631, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3004.1805419921875, | |
| "epoch": 0.155955441302485, | |
| "grad_norm": 0.2821044325828552, | |
| "kl": 0.013214111328125, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.1457, | |
| "reward": -0.23375913500785828, | |
| "reward_std": 0.6937631815671921, | |
| "rewards/cosine_scaled_reward": -0.25576844066381454, | |
| "rewards/format_reward": 0.2777777872979641, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2889.15283203125, | |
| "epoch": 0.15766923736075408, | |
| "grad_norm": 0.19952206313610077, | |
| "kl": 0.01056671142578125, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": -0.0327, | |
| "reward": 0.10378427803516388, | |
| "reward_std": 0.6779353246092796, | |
| "rewards/cosine_scaled_reward": -0.13560786750167608, | |
| "rewards/format_reward": 0.37500000186264515, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2751.8055725097656, | |
| "epoch": 0.15938303341902313, | |
| "grad_norm": 0.18763676285743713, | |
| "kl": 0.00572967529296875, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.0734, | |
| "reward": 0.5665245279669762, | |
| "reward_std": 0.7802244201302528, | |
| "rewards/cosine_scaled_reward": 0.012428927002474666, | |
| "rewards/format_reward": 0.5416666604578495, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2943.9722290039062, | |
| "epoch": 0.1610968294772922, | |
| "grad_norm": 0.17491032183170319, | |
| "kl": 0.005878448486328125, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.0466, | |
| "reward": 0.36631612479686737, | |
| "reward_std": 0.5951685793697834, | |
| "rewards/cosine_scaled_reward": -0.011286390479654074, | |
| "rewards/format_reward": 0.38888889737427235, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2803.4862060546875, | |
| "epoch": 0.16281062553556128, | |
| "grad_norm": 0.2179604023694992, | |
| "kl": 0.0073699951171875, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.1111, | |
| "reward": 0.20993795804679394, | |
| "reward_std": 0.5628918968141079, | |
| "rewards/cosine_scaled_reward": -0.08253101143054664, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3062.25, | |
| "epoch": 0.16452442159383032, | |
| "grad_norm": 0.1575266271829605, | |
| "kl": 0.005573272705078125, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0192, | |
| "reward": 0.5143513884395361, | |
| "reward_std": 0.9291824996471405, | |
| "rewards/cosine_scaled_reward": 0.021064545959234238, | |
| "rewards/format_reward": 0.4722222350537777, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3426.0555419921875, | |
| "epoch": 0.1662382176520994, | |
| "grad_norm": 0.152592271566391, | |
| "kl": 0.0096588134765625, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.0595, | |
| "reward": -0.316804476082325, | |
| "reward_std": 0.5735431797802448, | |
| "rewards/cosine_scaled_reward": -0.2209022343158722, | |
| "rewards/format_reward": 0.12500000186264515, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2718.1806030273438, | |
| "epoch": 0.16795201371036847, | |
| "grad_norm": 0.19641156494617462, | |
| "kl": 0.00783538818359375, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.0871, | |
| "reward": 0.34513735864311457, | |
| "reward_std": 0.7377712428569794, | |
| "rewards/cosine_scaled_reward": -0.09826467745006084, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3183.8611450195312, | |
| "epoch": 0.16966580976863754, | |
| "grad_norm": 0.13990604877471924, | |
| "kl": 0.00958251953125, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0408, | |
| "reward": 0.10082972631789744, | |
| "reward_std": 0.4568670317530632, | |
| "rewards/cosine_scaled_reward": -0.09541848301887512, | |
| "rewards/format_reward": 0.2916666669771075, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2988.263916015625, | |
| "epoch": 0.1713796058269066, | |
| "grad_norm": 0.1574762910604477, | |
| "kl": 0.01104736328125, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.02, | |
| "reward": 0.05844925343990326, | |
| "reward_std": 0.4471042864024639, | |
| "rewards/cosine_scaled_reward": -0.13744205003604293, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2955.6388549804688, | |
| "epoch": 0.17309340188517566, | |
| "grad_norm": 0.15706215798854828, | |
| "kl": 0.006420135498046875, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.0337, | |
| "reward": 0.032026506960392, | |
| "reward_std": 0.35832666605710983, | |
| "rewards/cosine_scaled_reward": -0.1298200935125351, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2912.0972290039062, | |
| "epoch": 0.17480719794344474, | |
| "grad_norm": 0.1945251077413559, | |
| "kl": 0.0088043212890625, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.072, | |
| "reward": 0.22132272832095623, | |
| "reward_std": 0.4281787723302841, | |
| "rewards/cosine_scaled_reward": -0.09072753041982651, | |
| "rewards/format_reward": 0.40277779288589954, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2903.9444580078125, | |
| "epoch": 0.17652099400171378, | |
| "grad_norm": 0.1475774347782135, | |
| "kl": 0.00759124755859375, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0409, | |
| "reward": 0.16637181863188744, | |
| "reward_std": 0.6222990080714226, | |
| "rewards/cosine_scaled_reward": -0.10431409068405628, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3320.1805419921875, | |
| "epoch": 0.17823479005998286, | |
| "grad_norm": 0.16452452540397644, | |
| "kl": 0.006443023681640625, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.073, | |
| "reward": -0.04724724031984806, | |
| "reward_std": 0.5820007584989071, | |
| "rewards/cosine_scaled_reward": -0.13473473582416773, | |
| "rewards/format_reward": 0.2222222276031971, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2745.27783203125, | |
| "epoch": 0.17994858611825193, | |
| "grad_norm": 0.23044738173484802, | |
| "kl": 0.0102996826171875, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.0909, | |
| "reward": 0.48719315230846405, | |
| "reward_std": 0.9213617816567421, | |
| "rewards/cosine_scaled_reward": -0.01334787905216217, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3009.0694580078125, | |
| "epoch": 0.181662382176521, | |
| "grad_norm": 0.25429767370224, | |
| "kl": 0.0078125, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.127, | |
| "reward": 0.27888505905866623, | |
| "reward_std": 0.7037396281957626, | |
| "rewards/cosine_scaled_reward": -0.048057474195957184, | |
| "rewards/format_reward": 0.3750000009313226, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3172.5694580078125, | |
| "epoch": 0.18337617823479005, | |
| "grad_norm": 0.17300733923912048, | |
| "kl": 0.008148193359375, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.0446, | |
| "reward": 0.21187454462051392, | |
| "reward_std": 0.549411840736866, | |
| "rewards/cosine_scaled_reward": -0.06767383548867656, | |
| "rewards/format_reward": 0.34722222574055195, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3347.52783203125, | |
| "epoch": 0.18508997429305912, | |
| "grad_norm": 0.17588993906974792, | |
| "kl": 0.00628662109375, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.06, | |
| "reward": -0.038673363626003265, | |
| "reward_std": 0.728736087679863, | |
| "rewards/cosine_scaled_reward": -0.15822557546198368, | |
| "rewards/format_reward": 0.2777777835726738, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2627.263946533203, | |
| "epoch": 0.1868037703513282, | |
| "grad_norm": 0.29850271344184875, | |
| "kl": 0.01171875, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": 0.0662, | |
| "reward": 0.19531617127358913, | |
| "reward_std": 0.4965377002954483, | |
| "rewards/cosine_scaled_reward": -0.09678636118769646, | |
| "rewards/format_reward": 0.38888889737427235, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2958.625, | |
| "epoch": 0.18851756640959727, | |
| "grad_norm": 0.46270403265953064, | |
| "kl": 0.0089263916015625, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.1711, | |
| "reward": 0.08645874005742371, | |
| "reward_std": 0.9684502333402634, | |
| "rewards/cosine_scaled_reward": -0.1512150838971138, | |
| "rewards/format_reward": 0.3888888955116272, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2400.3472900390625, | |
| "epoch": 0.19023136246786632, | |
| "grad_norm": 0.18343479931354523, | |
| "kl": 0.00701904296875, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.1083, | |
| "reward": 0.22095186542719603, | |
| "reward_std": 0.5088437423110008, | |
| "rewards/cosine_scaled_reward": -0.15341296698898077, | |
| "rewards/format_reward": 0.5277777835726738, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2780.916748046875, | |
| "epoch": 0.1919451585261354, | |
| "grad_norm": 0.16234862804412842, | |
| "kl": 0.007965087890625, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0115, | |
| "reward": 0.19807963073253632, | |
| "reward_std": 0.5584643110632896, | |
| "rewards/cosine_scaled_reward": -0.14401574060320854, | |
| "rewards/format_reward": 0.48611112777143717, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2751.7361450195312, | |
| "epoch": 0.19365895458440446, | |
| "grad_norm": 0.20913416147232056, | |
| "kl": 0.00861358642578125, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.0851, | |
| "reward": 0.7618176154792309, | |
| "reward_std": 1.0328082591295242, | |
| "rewards/cosine_scaled_reward": 0.11007547879125923, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2235.9166564941406, | |
| "epoch": 0.1953727506426735, | |
| "grad_norm": 0.20926620066165924, | |
| "kl": 0.008697509765625, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0327, | |
| "reward": 0.2064858078956604, | |
| "reward_std": 0.4848344102501869, | |
| "rewards/cosine_scaled_reward": -0.1675904355943203, | |
| "rewards/format_reward": 0.5416666753590107, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2317.6250610351562, | |
| "epoch": 0.19708654670094258, | |
| "grad_norm": 0.4515492916107178, | |
| "kl": 0.0092010498046875, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.2531, | |
| "reward": 0.45756053365767, | |
| "reward_std": 0.7848574221134186, | |
| "rewards/cosine_scaled_reward": -0.04899751394987106, | |
| "rewards/format_reward": 0.5555555745959282, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3182.0694580078125, | |
| "epoch": 0.19880034275921166, | |
| "grad_norm": 0.17537973821163177, | |
| "kl": 0.0128631591796875, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.0165, | |
| "reward": -0.06121325120329857, | |
| "reward_std": 0.4434010796248913, | |
| "rewards/cosine_scaled_reward": -0.1694955169223249, | |
| "rewards/format_reward": 0.2777777798473835, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3030.7083129882812, | |
| "epoch": 0.20051413881748073, | |
| "grad_norm": 0.18003451824188232, | |
| "kl": 0.017120361328125, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.1071, | |
| "reward": 0.42929551005363464, | |
| "reward_std": 0.9132848009467125, | |
| "rewards/cosine_scaled_reward": 0.006314422586001456, | |
| "rewards/format_reward": 0.4166666651144624, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2533.9443969726562, | |
| "epoch": 0.20222793487574978, | |
| "grad_norm": 0.2703484296798706, | |
| "kl": 0.011077880859375, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0946, | |
| "reward": 0.4179135374724865, | |
| "reward_std": 0.8737296983599663, | |
| "rewards/cosine_scaled_reward": -0.04798768740147352, | |
| "rewards/format_reward": 0.5138889029622078, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2823.5000610351562, | |
| "epoch": 0.20394173093401885, | |
| "grad_norm": 0.19636695086956024, | |
| "kl": 0.01312255859375, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.0303, | |
| "reward": 0.31334975361824036, | |
| "reward_std": 0.30826447159051895, | |
| "rewards/cosine_scaled_reward": -0.058602908393368125, | |
| "rewards/format_reward": 0.430555553175509, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2826.5694580078125, | |
| "epoch": 0.20565552699228792, | |
| "grad_norm": 0.2075241059064865, | |
| "kl": 0.016265869140625, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.094, | |
| "reward": 0.4697803445160389, | |
| "reward_std": 0.7031994387507439, | |
| "rewards/cosine_scaled_reward": 0.005723495967686176, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2887.3056030273438, | |
| "epoch": 0.207369323050557, | |
| "grad_norm": 0.19230371713638306, | |
| "kl": 0.0111083984375, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.086, | |
| "reward": 0.3282506223767996, | |
| "reward_std": 0.7738695293664932, | |
| "rewards/cosine_scaled_reward": -0.05809690523892641, | |
| "rewards/format_reward": 0.4444444589316845, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3083.3611450195312, | |
| "epoch": 0.20908311910882604, | |
| "grad_norm": 0.17026208341121674, | |
| "kl": 0.01568603515625, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.0301, | |
| "reward": -0.03662687446922064, | |
| "reward_std": 0.5345718339085579, | |
| "rewards/cosine_scaled_reward": -0.19886899180710316, | |
| "rewards/format_reward": 0.361111119389534, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2682.6805725097656, | |
| "epoch": 0.21079691516709512, | |
| "grad_norm": 0.19728592038154602, | |
| "kl": 0.012115478515625, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.0481, | |
| "reward": 0.3675118573009968, | |
| "reward_std": 1.058239296078682, | |
| "rewards/cosine_scaled_reward": -0.052355190739035606, | |
| "rewards/format_reward": 0.4722222248092294, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3100.1666870117188, | |
| "epoch": 0.2125107112253642, | |
| "grad_norm": 0.19675055146217346, | |
| "kl": 0.013641357421875, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.0775, | |
| "reward": 0.28848724998533726, | |
| "reward_std": 0.5403149202466011, | |
| "rewards/cosine_scaled_reward": -0.04325637500733137, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2862.0277709960938, | |
| "epoch": 0.21422450728363324, | |
| "grad_norm": 0.1939004808664322, | |
| "kl": 0.017242431640625, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.0453, | |
| "reward": 0.3707499373704195, | |
| "reward_std": 0.7198375910520554, | |
| "rewards/cosine_scaled_reward": -0.016013892367482185, | |
| "rewards/format_reward": 0.4027777807787061, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2753.486083984375, | |
| "epoch": 0.2159383033419023, | |
| "grad_norm": 0.25714027881622314, | |
| "kl": 0.0194091796875, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0648, | |
| "reward": 0.3369361013174057, | |
| "reward_std": 0.5913353934884071, | |
| "rewards/cosine_scaled_reward": -0.08847637102007866, | |
| "rewards/format_reward": 0.5138888889923692, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2923.125, | |
| "epoch": 0.21765209940017138, | |
| "grad_norm": 0.2240990549325943, | |
| "kl": 0.016571044921875, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.076, | |
| "reward": -0.2184343640692532, | |
| "reward_std": 0.5479928515851498, | |
| "rewards/cosine_scaled_reward": -0.27588383853435516, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3276.6111450195312, | |
| "epoch": 0.21936589545844046, | |
| "grad_norm": 0.15262338519096375, | |
| "kl": 0.0207061767578125, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": -0.0001, | |
| "reward": -0.12921499274671078, | |
| "reward_std": 0.5691854059696198, | |
| "rewards/cosine_scaled_reward": -0.1757186003960669, | |
| "rewards/format_reward": 0.22222223225980997, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2278.4305725097656, | |
| "epoch": 0.2210796915167095, | |
| "grad_norm": 0.3608929216861725, | |
| "kl": 0.019287109375, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.0927, | |
| "reward": 0.706303309649229, | |
| "reward_std": 0.7875337153673172, | |
| "rewards/cosine_scaled_reward": 0.04759608302265406, | |
| "rewards/format_reward": 0.6111111119389534, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1965.999984741211, | |
| "epoch": 0.22279348757497858, | |
| "grad_norm": 0.18217293918132782, | |
| "kl": 0.018463134765625, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0064, | |
| "reward": 1.0708431326784194, | |
| "reward_std": 0.7828814685344696, | |
| "rewards/cosine_scaled_reward": 0.17431045067496598, | |
| "rewards/format_reward": 0.722222238779068, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3232.7222290039062, | |
| "epoch": 0.22450728363324765, | |
| "grad_norm": 0.1822432279586792, | |
| "kl": 0.0173797607421875, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.0286, | |
| "reward": -0.31177592277526855, | |
| "reward_std": 0.350917749106884, | |
| "rewards/cosine_scaled_reward": -0.27394353225827217, | |
| "rewards/format_reward": 0.23611112032085657, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2963.8055419921875, | |
| "epoch": 0.2262210796915167, | |
| "grad_norm": 0.22750675678253174, | |
| "kl": 0.016204833984375, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.0406, | |
| "reward": 0.32277560234069824, | |
| "reward_std": 0.8804080411791801, | |
| "rewards/cosine_scaled_reward": -0.07472331821918488, | |
| "rewards/format_reward": 0.4722222238779068, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3082.263916015625, | |
| "epoch": 0.22793487574978577, | |
| "grad_norm": 0.2046993225812912, | |
| "kl": 0.021820068359375, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.0353, | |
| "reward": 0.3385091759264469, | |
| "reward_std": 0.7099575102329254, | |
| "rewards/cosine_scaled_reward": -0.011300940066576004, | |
| "rewards/format_reward": 0.3611111156642437, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2206.375030517578, | |
| "epoch": 0.22964867180805484, | |
| "grad_norm": 0.19563263654708862, | |
| "kl": 0.017303466796875, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.0577, | |
| "reward": 0.640228021889925, | |
| "reward_std": 0.7054692879319191, | |
| "rewards/cosine_scaled_reward": 0.0006695720367133617, | |
| "rewards/format_reward": 0.6388888955116272, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2421.611114501953, | |
| "epoch": 0.23136246786632392, | |
| "grad_norm": 0.338701069355011, | |
| "kl": 0.0213623046875, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.1507, | |
| "reward": 0.6078107673674822, | |
| "reward_std": 0.8746988773345947, | |
| "rewards/cosine_scaled_reward": 0.0469609391366248, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2651.125030517578, | |
| "epoch": 0.23307626392459296, | |
| "grad_norm": 0.28927695751190186, | |
| "kl": 0.0211334228515625, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.1053, | |
| "reward": 0.35874155908823013, | |
| "reward_std": 0.7097110822796822, | |
| "rewards/cosine_scaled_reward": -0.015073666349053383, | |
| "rewards/format_reward": 0.3888888992369175, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3202.0972900390625, | |
| "epoch": 0.23479005998286204, | |
| "grad_norm": 0.17811518907546997, | |
| "kl": 0.02239990234375, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.0492, | |
| "reward": 0.5012375935912132, | |
| "reward_std": 0.9828417152166367, | |
| "rewards/cosine_scaled_reward": 0.03534099366515875, | |
| "rewards/format_reward": 0.43055555410683155, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2946.4166870117188, | |
| "epoch": 0.2365038560411311, | |
| "grad_norm": 0.23094090819358826, | |
| "kl": 0.0191650390625, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.0193, | |
| "reward": 0.09741606749594212, | |
| "reward_std": 0.5724444687366486, | |
| "rewards/cosine_scaled_reward": -0.13184750825166702, | |
| "rewards/format_reward": 0.36111111007630825, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2927.0972290039062, | |
| "epoch": 0.23821765209940018, | |
| "grad_norm": 0.19129879772663116, | |
| "kl": 0.024505615234375, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.0374, | |
| "reward": 0.1535217664204538, | |
| "reward_std": 0.4049301743507385, | |
| "rewards/cosine_scaled_reward": -0.08296133577823639, | |
| "rewards/format_reward": 0.3194444449618459, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2202.6944580078125, | |
| "epoch": 0.23993144815766923, | |
| "grad_norm": 0.4724877178668976, | |
| "kl": 0.0255584716796875, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.1836, | |
| "reward": 0.3395635038614273, | |
| "reward_std": 0.6675402373075485, | |
| "rewards/cosine_scaled_reward": -0.11494047567248344, | |
| "rewards/format_reward": 0.5694444552063942, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2910.916748046875, | |
| "epoch": 0.2416452442159383, | |
| "grad_norm": 0.18322300910949707, | |
| "kl": 0.02935791015625, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.0365, | |
| "reward": 0.045268273912370205, | |
| "reward_std": 0.6290135830640793, | |
| "rewards/cosine_scaled_reward": -0.1440325528383255, | |
| "rewards/format_reward": 0.3333333367481828, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3141.638916015625, | |
| "epoch": 0.24335904027420738, | |
| "grad_norm": 0.1756112426519394, | |
| "kl": 0.031341552734375, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.0029, | |
| "reward": -0.12469126284122467, | |
| "reward_std": 0.39061762765049934, | |
| "rewards/cosine_scaled_reward": -0.18734563700854778, | |
| "rewards/format_reward": 0.2500000009313226, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2654.4166259765625, | |
| "epoch": 0.24507283633247642, | |
| "grad_norm": 0.29079416394233704, | |
| "kl": 0.020843505859375, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.0697, | |
| "reward": 0.4159288965165615, | |
| "reward_std": 0.7245111912488937, | |
| "rewards/cosine_scaled_reward": -0.06981334753800184, | |
| "rewards/format_reward": 0.555555559694767, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3201.9306030273438, | |
| "epoch": 0.2467866323907455, | |
| "grad_norm": 0.197592630982399, | |
| "kl": 0.0269775390625, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.0373, | |
| "reward": -0.11726564727723598, | |
| "reward_std": 0.6086189821362495, | |
| "rewards/cosine_scaled_reward": -0.21141060069203377, | |
| "rewards/format_reward": 0.30555556155741215, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2885.9722900390625, | |
| "epoch": 0.24850042844901457, | |
| "grad_norm": 0.29763004183769226, | |
| "kl": 0.028472900390625, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.0677, | |
| "reward": 0.5742630921304226, | |
| "reward_std": 0.37366680055856705, | |
| "rewards/cosine_scaled_reward": 0.0649093296378851, | |
| "rewards/format_reward": 0.4444444440305233, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2155.9444885253906, | |
| "epoch": 0.25021422450728364, | |
| "grad_norm": 0.3611903190612793, | |
| "kl": 0.024993896484375, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.1546, | |
| "reward": 0.5257812030613422, | |
| "reward_std": 0.9518508315086365, | |
| "rewards/cosine_scaled_reward": -0.049609407782554626, | |
| "rewards/format_reward": 0.625, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2810.9583740234375, | |
| "epoch": 0.2519280205655527, | |
| "grad_norm": 0.2670803964138031, | |
| "kl": 0.031829833984375, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": 0.0389, | |
| "reward": 0.11524944752454758, | |
| "reward_std": 0.6441401988267899, | |
| "rewards/cosine_scaled_reward": -0.12293083127588034, | |
| "rewards/format_reward": 0.36111112032085657, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3006.0416259765625, | |
| "epoch": 0.2536418166238218, | |
| "grad_norm": 0.230261892080307, | |
| "kl": 0.0283203125, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0866, | |
| "reward": -0.06906389445066452, | |
| "reward_std": 0.41436275094747543, | |
| "rewards/cosine_scaled_reward": -0.21508748084306717, | |
| "rewards/format_reward": 0.3611111156642437, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2527.4305572509766, | |
| "epoch": 0.25535561268209084, | |
| "grad_norm": 0.2313620001077652, | |
| "kl": 0.028106689453125, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.05, | |
| "reward": 0.3055970072746277, | |
| "reward_std": 0.8265255615115166, | |
| "rewards/cosine_scaled_reward": -0.0972014885628596, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2374.0416870117188, | |
| "epoch": 0.2570694087403599, | |
| "grad_norm": 0.7321764826774597, | |
| "kl": 0.028839111328125, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.1817, | |
| "reward": 0.8978928253054619, | |
| "reward_std": 0.7169746980071068, | |
| "rewards/cosine_scaled_reward": 0.16422418132424355, | |
| "rewards/format_reward": 0.5694444477558136, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2437.0694580078125, | |
| "epoch": 0.258783204798629, | |
| "grad_norm": 0.7071412801742554, | |
| "kl": 0.041168212890625, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.2136, | |
| "reward": 0.06301388889551163, | |
| "reward_std": 0.4757090378552675, | |
| "rewards/cosine_scaled_reward": -0.21849306486546993, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2845.4583129882812, | |
| "epoch": 0.26049700085689803, | |
| "grad_norm": 0.5604143738746643, | |
| "kl": 0.046142578125, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.1265, | |
| "reward": 0.16903822124004364, | |
| "reward_std": 0.5248951427638531, | |
| "rewards/cosine_scaled_reward": -0.09603646397590637, | |
| "rewards/format_reward": 0.36111111380159855, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2768.4027709960938, | |
| "epoch": 0.2622107969151671, | |
| "grad_norm": 0.23171323537826538, | |
| "kl": 0.0499267578125, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": 0.0185, | |
| "reward": 0.15082042291760445, | |
| "reward_std": 0.7368991822004318, | |
| "rewards/cosine_scaled_reward": -0.12597868964076042, | |
| "rewards/format_reward": 0.4027777798473835, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2804.9861450195312, | |
| "epoch": 0.2639245929734362, | |
| "grad_norm": 0.40300193428993225, | |
| "kl": 0.05609130859375, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.0954, | |
| "reward": 0.4552767127752304, | |
| "reward_std": 0.7285914719104767, | |
| "rewards/cosine_scaled_reward": -0.00847275834530592, | |
| "rewards/format_reward": 0.472222238779068, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3049.0416870117188, | |
| "epoch": 0.2656383890317052, | |
| "grad_norm": 0.23651528358459473, | |
| "kl": 0.0596923828125, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": -0.0013, | |
| "reward": 0.12507159425877035, | |
| "reward_std": 0.8443149924278259, | |
| "rewards/cosine_scaled_reward": -0.10413086414337158, | |
| "rewards/format_reward": 0.3333333469927311, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2847.4584350585938, | |
| "epoch": 0.26735218508997427, | |
| "grad_norm": 0.3277675211429596, | |
| "kl": 0.05987548828125, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.032, | |
| "reward": 0.31620367243885994, | |
| "reward_std": 0.7322921454906464, | |
| "rewards/cosine_scaled_reward": -0.10578705929219723, | |
| "rewards/format_reward": 0.5277777835726738, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2717.8750610351562, | |
| "epoch": 0.26906598114824337, | |
| "grad_norm": 0.39394786953926086, | |
| "kl": 0.0775146484375, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.0925, | |
| "reward": 0.10540201608091593, | |
| "reward_std": 0.6488600596785545, | |
| "rewards/cosine_scaled_reward": -0.13479896634817123, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2818.125, | |
| "epoch": 0.2707797772065124, | |
| "grad_norm": 0.40347573161125183, | |
| "kl": 0.0806884765625, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.0484, | |
| "reward": 0.07575460057705641, | |
| "reward_std": 0.6178670972585678, | |
| "rewards/cosine_scaled_reward": -0.15656715538352728, | |
| "rewards/format_reward": 0.3888888889923692, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3166.1666259765625, | |
| "epoch": 0.27249357326478146, | |
| "grad_norm": 0.3342011868953705, | |
| "kl": 0.1055908203125, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.0491, | |
| "reward": -0.02998074982315302, | |
| "reward_std": 0.6097311675548553, | |
| "rewards/cosine_scaled_reward": -0.19554592855274677, | |
| "rewards/format_reward": 0.36111111380159855, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3013.7222290039062, | |
| "epoch": 0.27420736932305056, | |
| "grad_norm": 0.4173794388771057, | |
| "kl": 0.106689453125, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0339, | |
| "reward": 0.3507204055786133, | |
| "reward_std": 0.6021532118320465, | |
| "rewards/cosine_scaled_reward": 0.0017490852624177933, | |
| "rewards/format_reward": 0.34722222201526165, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2572.8194274902344, | |
| "epoch": 0.2759211653813196, | |
| "grad_norm": 0.4282573163509369, | |
| "kl": 0.123046875, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.0645, | |
| "reward": 0.010059013031423092, | |
| "reward_std": 0.5160095170140266, | |
| "rewards/cosine_scaled_reward": -0.15469271643087268, | |
| "rewards/format_reward": 0.3194444514811039, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2242.9444274902344, | |
| "epoch": 0.2776349614395887, | |
| "grad_norm": 0.39615368843078613, | |
| "kl": 0.1090087890625, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.0772, | |
| "reward": 0.16390804119873792, | |
| "reward_std": 0.5712290816009045, | |
| "rewards/cosine_scaled_reward": -0.18887930922210217, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2832.9861450195312, | |
| "epoch": 0.27934875749785776, | |
| "grad_norm": 0.6331592798233032, | |
| "kl": 0.1380615234375, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.1022, | |
| "reward": 0.20613746903836727, | |
| "reward_std": 0.7383135333657265, | |
| "rewards/cosine_scaled_reward": -0.04276460176333785, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2359.250030517578, | |
| "epoch": 0.2810625535561268, | |
| "grad_norm": 0.8423472046852112, | |
| "kl": 0.1591796875, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.0087, | |
| "reward": 0.1561935730278492, | |
| "reward_std": 0.8059368506073952, | |
| "rewards/cosine_scaled_reward": -0.10245877737179399, | |
| "rewards/format_reward": 0.3611111156642437, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2781.041717529297, | |
| "epoch": 0.2827763496143959, | |
| "grad_norm": 0.5075474977493286, | |
| "kl": 0.17626953125, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0061, | |
| "reward": 0.35288394801318645, | |
| "reward_std": 0.7819623723626137, | |
| "rewards/cosine_scaled_reward": -0.038835824467241764, | |
| "rewards/format_reward": 0.4305555634200573, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2699.3611450195312, | |
| "epoch": 0.28449014567266495, | |
| "grad_norm": 0.41815418004989624, | |
| "kl": 0.1614990234375, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.0354, | |
| "reward": 0.2616021269932389, | |
| "reward_std": 0.8704780116677284, | |
| "rewards/cosine_scaled_reward": -0.07753227837383747, | |
| "rewards/format_reward": 0.416666672565043, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2883.5972900390625, | |
| "epoch": 0.286203941730934, | |
| "grad_norm": 0.6100507378578186, | |
| "kl": 0.192626953125, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.1117, | |
| "reward": 0.6217167973518372, | |
| "reward_std": 1.1077049523591995, | |
| "rewards/cosine_scaled_reward": 0.08863616734743118, | |
| "rewards/format_reward": 0.4444444477558136, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2844.77783203125, | |
| "epoch": 0.2879177377892031, | |
| "grad_norm": 1.0341858863830566, | |
| "kl": 0.193359375, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.1403, | |
| "reward": 0.44696745090186596, | |
| "reward_std": 0.8215643167495728, | |
| "rewards/cosine_scaled_reward": 0.008205945428926498, | |
| "rewards/format_reward": 0.4305555671453476, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2149.9306030273438, | |
| "epoch": 0.28963153384747214, | |
| "grad_norm": 0.8718350529670715, | |
| "kl": 0.213134765625, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.0265, | |
| "reward": 0.24263115064240992, | |
| "reward_std": 0.7163522839546204, | |
| "rewards/cosine_scaled_reward": -0.08701775036752224, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2499.9722900390625, | |
| "epoch": 0.2913453299057412, | |
| "grad_norm": 0.7302869558334351, | |
| "kl": 0.233154296875, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.1123, | |
| "reward": 0.1794309187680483, | |
| "reward_std": 0.7098504453897476, | |
| "rewards/cosine_scaled_reward": -0.056117892265319824, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2474.6250610351562, | |
| "epoch": 0.2930591259640103, | |
| "grad_norm": 0.5687596797943115, | |
| "kl": 0.2568359375, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.0829, | |
| "reward": 0.3411689009517431, | |
| "reward_std": 0.6550407111644745, | |
| "rewards/cosine_scaled_reward": -0.1280266623944044, | |
| "rewards/format_reward": 0.597222238779068, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2846.763916015625, | |
| "epoch": 0.29477292202227934, | |
| "grad_norm": 0.48782670497894287, | |
| "kl": 0.252197265625, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.0672, | |
| "reward": 0.22427130304276943, | |
| "reward_std": 0.6338695511221886, | |
| "rewards/cosine_scaled_reward": -0.10314211621880531, | |
| "rewards/format_reward": 0.4305555559694767, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2281.2638549804688, | |
| "epoch": 0.29648671808054844, | |
| "grad_norm": 1.23881196975708, | |
| "kl": 0.234375, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.1363, | |
| "reward": 0.24636091478168964, | |
| "reward_std": 0.725439690053463, | |
| "rewards/cosine_scaled_reward": -0.13376398687250912, | |
| "rewards/format_reward": 0.5138888899236917, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2596.7500610351562, | |
| "epoch": 0.2982005141388175, | |
| "grad_norm": 0.9256901741027832, | |
| "kl": 0.32763671875, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.0993, | |
| "reward": 0.03723787656053901, | |
| "reward_std": 0.669374942779541, | |
| "rewards/cosine_scaled_reward": -0.18276994861662388, | |
| "rewards/format_reward": 0.4027777835726738, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2802.7916870117188, | |
| "epoch": 0.29991431019708653, | |
| "grad_norm": 1.6108390092849731, | |
| "kl": 0.41748046875, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": -0.0161, | |
| "reward": -0.026769233867526054, | |
| "reward_std": 0.7613073363900185, | |
| "rewards/cosine_scaled_reward": -0.1939401812851429, | |
| "rewards/format_reward": 0.3611111231148243, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2250.7916564941406, | |
| "epoch": 0.30162810625535563, | |
| "grad_norm": 1.3027092218399048, | |
| "kl": 0.29345703125, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.1672, | |
| "reward": 0.34848211891949177, | |
| "reward_std": 0.8886565566062927, | |
| "rewards/cosine_scaled_reward": -0.07575894566252828, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3187.25, | |
| "epoch": 0.3033419023136247, | |
| "grad_norm": 0.7333221435546875, | |
| "kl": 0.390625, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.0057, | |
| "reward": -0.06705992296338081, | |
| "reward_std": 0.5766744017601013, | |
| "rewards/cosine_scaled_reward": -0.16547441016882658, | |
| "rewards/format_reward": 0.26388889364898205, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2658.9583129882812, | |
| "epoch": 0.3050556983718937, | |
| "grad_norm": 2.110689878463745, | |
| "kl": 0.40185546875, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.1018, | |
| "reward": 0.15619678050279617, | |
| "reward_std": 0.5456085540354252, | |
| "rewards/cosine_scaled_reward": -0.15106826776172966, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3046.361083984375, | |
| "epoch": 0.3067694944301628, | |
| "grad_norm": 1.409805417060852, | |
| "kl": 0.44482421875, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.1019, | |
| "reward": -0.08309876918792725, | |
| "reward_std": 0.6837619245052338, | |
| "rewards/cosine_scaled_reward": -0.1665493929758668, | |
| "rewards/format_reward": 0.2500000027939677, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2090.263916015625, | |
| "epoch": 0.30848329048843187, | |
| "grad_norm": 1.259092926979065, | |
| "kl": 0.4365234375, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.1312, | |
| "reward": 0.5355786010622978, | |
| "reward_std": 0.9339739978313446, | |
| "rewards/cosine_scaled_reward": -0.07248848024755716, | |
| "rewards/format_reward": 0.6805555671453476, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2998.388916015625, | |
| "epoch": 0.3101970865467009, | |
| "grad_norm": 0.7514684796333313, | |
| "kl": 0.5283203125, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.0645, | |
| "reward": 0.0823521837592125, | |
| "reward_std": 0.6557292975485325, | |
| "rewards/cosine_scaled_reward": -0.1602128129452467, | |
| "rewards/format_reward": 0.4027777835726738, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3348.6806030273438, | |
| "epoch": 0.31191088260497, | |
| "grad_norm": 1.1279796361923218, | |
| "kl": 0.6435546875, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.0509, | |
| "reward": -0.29229177720844746, | |
| "reward_std": 0.44720375537872314, | |
| "rewards/cosine_scaled_reward": -0.27809032425284386, | |
| "rewards/format_reward": 0.26388889364898205, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2909.6806030273438, | |
| "epoch": 0.31362467866323906, | |
| "grad_norm": 0.8539410829544067, | |
| "kl": 0.5654296875, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.1078, | |
| "reward": 0.20359659614041448, | |
| "reward_std": 0.7151020988821983, | |
| "rewards/cosine_scaled_reward": -0.07875726278871298, | |
| "rewards/format_reward": 0.361111112870276, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2720.6666870117188, | |
| "epoch": 0.31533847472150817, | |
| "grad_norm": 1.0726344585418701, | |
| "kl": 0.6005859375, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.0571, | |
| "reward": 0.45398143492639065, | |
| "reward_std": 0.8964811712503433, | |
| "rewards/cosine_scaled_reward": -0.04384262952953577, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2828.9305419921875, | |
| "epoch": 0.3170522707797772, | |
| "grad_norm": 0.9460340142250061, | |
| "kl": 0.6220703125, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.1297, | |
| "reward": -0.03706150595098734, | |
| "reward_std": 0.7321052774786949, | |
| "rewards/cosine_scaled_reward": -0.1990863112732768, | |
| "rewards/format_reward": 0.3611111231148243, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2725.8750610351562, | |
| "epoch": 0.31876606683804626, | |
| "grad_norm": 1.0472413301467896, | |
| "kl": 0.5556640625, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.0507, | |
| "reward": -0.008732129819691181, | |
| "reward_std": 0.43902990967035294, | |
| "rewards/cosine_scaled_reward": -0.2057549599558115, | |
| "rewards/format_reward": 0.4027777835726738, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2761.0556030273438, | |
| "epoch": 0.32047986289631536, | |
| "grad_norm": 0.9237687587738037, | |
| "kl": 0.55126953125, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.1122, | |
| "reward": 0.34623236872721463, | |
| "reward_std": 0.8785705417394638, | |
| "rewards/cosine_scaled_reward": -0.07688381336629391, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3188.4027709960938, | |
| "epoch": 0.3221936589545844, | |
| "grad_norm": 1.4287723302841187, | |
| "kl": 0.607421875, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.0457, | |
| "reward": -0.10439129918813705, | |
| "reward_std": 0.6522045210003853, | |
| "rewards/cosine_scaled_reward": -0.20497343130409718, | |
| "rewards/format_reward": 0.30555556807667017, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2965.75, | |
| "epoch": 0.32390745501285345, | |
| "grad_norm": 1.0540153980255127, | |
| "kl": 0.6044921875, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.0926, | |
| "reward": 0.00805249996483326, | |
| "reward_std": 0.5005255490541458, | |
| "rewards/cosine_scaled_reward": -0.1904182005673647, | |
| "rewards/format_reward": 0.38888888992369175, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2995.3611450195312, | |
| "epoch": 0.32562125107112255, | |
| "grad_norm": 2.005993604660034, | |
| "kl": 0.60546875, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.1571, | |
| "reward": -0.2390465196222067, | |
| "reward_std": 0.5108147040009499, | |
| "rewards/cosine_scaled_reward": -0.2792454734444618, | |
| "rewards/format_reward": 0.3194444449618459, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2995.02783203125, | |
| "epoch": 0.3273350471293916, | |
| "grad_norm": 0.914374828338623, | |
| "kl": 0.537109375, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.0791, | |
| "reward": -0.008268387988209724, | |
| "reward_std": 0.7869899272918701, | |
| "rewards/cosine_scaled_reward": -0.19163418684911449, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2657.3055725097656, | |
| "epoch": 0.32904884318766064, | |
| "grad_norm": 0.9198621511459351, | |
| "kl": 0.6298828125, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.1016, | |
| "reward": 0.14560853224247694, | |
| "reward_std": 0.44526704400777817, | |
| "rewards/cosine_scaled_reward": -0.177195742726326, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2256.6666564941406, | |
| "epoch": 0.33076263924592975, | |
| "grad_norm": 0.9307562708854675, | |
| "kl": 0.5947265625, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.082, | |
| "reward": 0.184324630536139, | |
| "reward_std": 0.5673187747597694, | |
| "rewards/cosine_scaled_reward": -0.17867101542651653, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2654.013916015625, | |
| "epoch": 0.3324764353041988, | |
| "grad_norm": 1.2104908227920532, | |
| "kl": 0.6787109375, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.0927, | |
| "reward": 0.5049788989126682, | |
| "reward_std": 0.6255298256874084, | |
| "rewards/cosine_scaled_reward": 0.0024894457310438156, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3088.2222900390625, | |
| "epoch": 0.3341902313624679, | |
| "grad_norm": 2.0733349323272705, | |
| "kl": 0.787109375, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0256, | |
| "reward": -0.1914132796227932, | |
| "reward_std": 0.39547703973948956, | |
| "rewards/cosine_scaled_reward": -0.23459553346037865, | |
| "rewards/format_reward": 0.2777777761220932, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2572.5416259765625, | |
| "epoch": 0.33590402742073694, | |
| "grad_norm": 0.9238296151161194, | |
| "kl": 0.6484375, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0799, | |
| "reward": 0.4742476176470518, | |
| "reward_std": 0.8941326662898064, | |
| "rewards/cosine_scaled_reward": -0.04759840480983257, | |
| "rewards/format_reward": 0.5694444477558136, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2907.4583129882812, | |
| "epoch": 0.337617823479006, | |
| "grad_norm": 0.9024485945701599, | |
| "kl": 0.693359375, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.0713, | |
| "reward": 0.0948091521859169, | |
| "reward_std": 0.4578506797552109, | |
| "rewards/cosine_scaled_reward": -0.11926210392266512, | |
| "rewards/format_reward": 0.33333334513008595, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2943.4305419921875, | |
| "epoch": 0.3393316195372751, | |
| "grad_norm": 1.3114806413650513, | |
| "kl": 0.7470703125, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.0493, | |
| "reward": 0.04198681065463461, | |
| "reward_std": 0.5121570453047752, | |
| "rewards/cosine_scaled_reward": -0.1595621556043625, | |
| "rewards/format_reward": 0.3611111156642437, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3010.9166870117188, | |
| "epoch": 0.34104541559554413, | |
| "grad_norm": 0.6777936816215515, | |
| "kl": 0.697265625, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.0892, | |
| "reward": 0.12530913203954697, | |
| "reward_std": 0.5297227501869202, | |
| "rewards/cosine_scaled_reward": -0.145678770262748, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2658.4722595214844, | |
| "epoch": 0.3427592116538132, | |
| "grad_norm": 1.0869694948196411, | |
| "kl": 0.5361328125, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0285, | |
| "reward": 0.42103337205480784, | |
| "reward_std": 0.5303617715835571, | |
| "rewards/cosine_scaled_reward": -0.04642775317188352, | |
| "rewards/format_reward": 0.5138888992369175, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2742.0833435058594, | |
| "epoch": 0.3444730077120823, | |
| "grad_norm": 0.6390620470046997, | |
| "kl": 0.56005859375, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.078, | |
| "reward": 0.13805552199482918, | |
| "reward_std": 0.5941917151212692, | |
| "rewards/cosine_scaled_reward": -0.18097224179655313, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2622.611083984375, | |
| "epoch": 0.3461868037703513, | |
| "grad_norm": 1.5139989852905273, | |
| "kl": 0.51416015625, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.1322, | |
| "reward": 0.2749571923632175, | |
| "reward_std": 0.6380000561475754, | |
| "rewards/cosine_scaled_reward": -0.09863251959905028, | |
| "rewards/format_reward": 0.4722222313284874, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2710.7361450195312, | |
| "epoch": 0.34790059982862037, | |
| "grad_norm": 1.517341136932373, | |
| "kl": 0.51171875, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.1124, | |
| "reward": 0.5119861587882042, | |
| "reward_std": 0.9760274440050125, | |
| "rewards/cosine_scaled_reward": -0.014840253628790379, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2822.6112060546875, | |
| "epoch": 0.3496143958868895, | |
| "grad_norm": 1.1272459030151367, | |
| "kl": 0.548828125, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.0668, | |
| "reward": 0.03917721984907985, | |
| "reward_std": 0.7430369108915329, | |
| "rewards/cosine_scaled_reward": -0.17485582828521729, | |
| "rewards/format_reward": 0.38888888992369175, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2645.5694580078125, | |
| "epoch": 0.3513281919451585, | |
| "grad_norm": 1.7582755088806152, | |
| "kl": 0.5556640625, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.1055, | |
| "reward": 0.509862631559372, | |
| "reward_std": 0.7304475903511047, | |
| "rewards/cosine_scaled_reward": 0.004931296221911907, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2748.02783203125, | |
| "epoch": 0.35304198800342756, | |
| "grad_norm": 13.779873847961426, | |
| "kl": 1.0166015625, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.0928, | |
| "reward": 0.13606557785533369, | |
| "reward_std": 0.5326481983065605, | |
| "rewards/cosine_scaled_reward": -0.2236338797956705, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2727.9722900390625, | |
| "epoch": 0.35475578406169667, | |
| "grad_norm": 4.678215503692627, | |
| "kl": 0.8974609375, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.1124, | |
| "reward": 0.3662101551890373, | |
| "reward_std": 0.5158084109425545, | |
| "rewards/cosine_scaled_reward": -0.05995047930628061, | |
| "rewards/format_reward": 0.4861111268401146, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2660.9722900390625, | |
| "epoch": 0.3564695801199657, | |
| "grad_norm": 2.2143702507019043, | |
| "kl": 0.7880859375, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.067, | |
| "reward": 0.29765829257667065, | |
| "reward_std": 0.7447296231985092, | |
| "rewards/cosine_scaled_reward": -0.1428375095129013, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2533.4444580078125, | |
| "epoch": 0.3581833761782348, | |
| "grad_norm": 1.057923436164856, | |
| "kl": 0.626953125, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.1388, | |
| "reward": 0.22743514459580183, | |
| "reward_std": 0.8155356049537659, | |
| "rewards/cosine_scaled_reward": -0.15017131343483925, | |
| "rewards/format_reward": 0.5277777835726738, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2513.763916015625, | |
| "epoch": 0.35989717223650386, | |
| "grad_norm": 3.4706244468688965, | |
| "kl": 0.6640625, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.2307, | |
| "reward": 0.32537855207920074, | |
| "reward_std": 0.6403735391795635, | |
| "rewards/cosine_scaled_reward": -0.11508850922109559, | |
| "rewards/format_reward": 0.555555559694767, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3072.1806030273438, | |
| "epoch": 0.3616109682947729, | |
| "grad_norm": 0.867877721786499, | |
| "kl": 0.85009765625, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.1175, | |
| "reward": 0.021036310121417046, | |
| "reward_std": 0.5472413003444672, | |
| "rewards/cosine_scaled_reward": -0.170037392526865, | |
| "rewards/format_reward": 0.36111112125217915, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2580.791717529297, | |
| "epoch": 0.363324764353042, | |
| "grad_norm": 1.1602129936218262, | |
| "kl": 0.8037109375, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.1343, | |
| "reward": 0.6223988421261311, | |
| "reward_std": 0.851245753467083, | |
| "rewards/cosine_scaled_reward": -0.0013005826622247696, | |
| "rewards/format_reward": 0.625, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2920.2916870117188, | |
| "epoch": 0.36503856041131105, | |
| "grad_norm": 1.2226418256759644, | |
| "kl": 1.0029296875, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.1063, | |
| "reward": 0.10683083906769753, | |
| "reward_std": 0.6580070406198502, | |
| "rewards/cosine_scaled_reward": -0.21741791814565659, | |
| "rewards/format_reward": 0.5416666641831398, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2951.166748046875, | |
| "epoch": 0.3667523564695801, | |
| "grad_norm": 1.302587628364563, | |
| "kl": 1.1484375, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.1102, | |
| "reward": 0.16499032359570265, | |
| "reward_std": 0.5117045789957047, | |
| "rewards/cosine_scaled_reward": -0.13278261446976103, | |
| "rewards/format_reward": 0.4305555671453476, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2642.3056030273438, | |
| "epoch": 0.3684661525278492, | |
| "grad_norm": 1.028397560119629, | |
| "kl": 0.8671875, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.1035, | |
| "reward": 0.3645508070476353, | |
| "reward_std": 0.9228581190109253, | |
| "rewards/cosine_scaled_reward": -0.10939126997254789, | |
| "rewards/format_reward": 0.5833333283662796, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2541.0833129882812, | |
| "epoch": 0.37017994858611825, | |
| "grad_norm": 1.578083872795105, | |
| "kl": 0.86865234375, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.1489, | |
| "reward": 0.26378826051950455, | |
| "reward_std": 0.6202561929821968, | |
| "rewards/cosine_scaled_reward": -0.13199475780129433, | |
| "rewards/format_reward": 0.5277777910232544, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2184.9445190429688, | |
| "epoch": 0.3718937446443873, | |
| "grad_norm": 1.103194236755371, | |
| "kl": 0.6845703125, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": 0.0844, | |
| "reward": 0.4086096244864166, | |
| "reward_std": 0.7625616788864136, | |
| "rewards/cosine_scaled_reward": -0.11513962969183922, | |
| "rewards/format_reward": 0.6388889029622078, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2404.6805725097656, | |
| "epoch": 0.3736075407026564, | |
| "grad_norm": 2.3009181022644043, | |
| "kl": 0.8935546875, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.1892, | |
| "reward": 0.4591095373034477, | |
| "reward_std": 0.5642153918743134, | |
| "rewards/cosine_scaled_reward": -0.08294522017240524, | |
| "rewards/format_reward": 0.625, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3044.9862060546875, | |
| "epoch": 0.37532133676092544, | |
| "grad_norm": 1.2761178016662598, | |
| "kl": 1.19921875, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.1195, | |
| "reward": 0.21450293064117432, | |
| "reward_std": 0.7603526711463928, | |
| "rewards/cosine_scaled_reward": -0.12191520072519779, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2793.90283203125, | |
| "epoch": 0.37703513281919454, | |
| "grad_norm": 1.6576476097106934, | |
| "kl": 1.166015625, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0977, | |
| "reward": 0.22772593423724174, | |
| "reward_std": 0.5124068222939968, | |
| "rewards/cosine_scaled_reward": -0.18474812898784876, | |
| "rewards/format_reward": 0.5972222313284874, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2835.0, | |
| "epoch": 0.3787489288774636, | |
| "grad_norm": 3.882580280303955, | |
| "kl": 1.0927734375, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.0245, | |
| "reward": 0.10485807061195374, | |
| "reward_std": 0.5114092901349068, | |
| "rewards/cosine_scaled_reward": -0.20451541244983673, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2310.9305725097656, | |
| "epoch": 0.38046272493573263, | |
| "grad_norm": 1.3096808195114136, | |
| "kl": 0.87890625, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.1143, | |
| "reward": 0.4324228148907423, | |
| "reward_std": 0.5727507174015045, | |
| "rewards/cosine_scaled_reward": -0.13101080805063248, | |
| "rewards/format_reward": 0.6944444477558136, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2826.2222290039062, | |
| "epoch": 0.38217652099400173, | |
| "grad_norm": 1.182824730873108, | |
| "kl": 0.931640625, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.1159, | |
| "reward": 0.17814365401864052, | |
| "reward_std": 0.5051928982138634, | |
| "rewards/cosine_scaled_reward": -0.14703928492963314, | |
| "rewards/format_reward": 0.4722222238779068, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2644.8611450195312, | |
| "epoch": 0.3838903170522708, | |
| "grad_norm": 1.271640658378601, | |
| "kl": 0.7939453125, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.0771, | |
| "reward": 0.21772570302709937, | |
| "reward_std": 0.5406957715749741, | |
| "rewards/cosine_scaled_reward": -0.17585936933755875, | |
| "rewards/format_reward": 0.5694444477558136, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2389.749969482422, | |
| "epoch": 0.3856041131105398, | |
| "grad_norm": 2.586735486984253, | |
| "kl": 0.6689453125, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": -0.0027, | |
| "reward": 0.5408617407083511, | |
| "reward_std": 0.7554269433021545, | |
| "rewards/cosine_scaled_reward": -0.03512469958513975, | |
| "rewards/format_reward": 0.6111111119389534, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2435.250030517578, | |
| "epoch": 0.3873179091688089, | |
| "grad_norm": 1.4329172372817993, | |
| "kl": 0.59521484375, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.0425, | |
| "reward": 0.21151528507471085, | |
| "reward_std": 0.6967541426420212, | |
| "rewards/cosine_scaled_reward": -0.14424235187470913, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2509.1945190429688, | |
| "epoch": 0.389031705227078, | |
| "grad_norm": 1.7964338064193726, | |
| "kl": 0.51220703125, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.0631, | |
| "reward": 0.3222038522362709, | |
| "reward_std": 0.7920150905847549, | |
| "rewards/cosine_scaled_reward": -0.0958425235003233, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2343.888885498047, | |
| "epoch": 0.390745501285347, | |
| "grad_norm": 1.312915563583374, | |
| "kl": 0.48828125, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.0327, | |
| "reward": 0.72439269348979, | |
| "reward_std": 0.6716032773256302, | |
| "rewards/cosine_scaled_reward": 0.028863003477454185, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2668.8333740234375, | |
| "epoch": 0.3924592973436161, | |
| "grad_norm": 1.1794544458389282, | |
| "kl": 0.591796875, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.0617, | |
| "reward": 0.4572554435580969, | |
| "reward_std": 0.6365808099508286, | |
| "rewards/cosine_scaled_reward": -0.028316727373749018, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2405.111114501953, | |
| "epoch": 0.39417309340188517, | |
| "grad_norm": 2.7993645668029785, | |
| "kl": 0.52197265625, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.193, | |
| "reward": 0.6450787968933582, | |
| "reward_std": 0.6886177062988281, | |
| "rewards/cosine_scaled_reward": -0.017738381633535028, | |
| "rewards/format_reward": 0.6805555671453476, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2889.763916015625, | |
| "epoch": 0.39588688946015427, | |
| "grad_norm": 0.9008044600486755, | |
| "kl": 0.59228515625, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.0832, | |
| "reward": 0.17632517218589783, | |
| "reward_std": 0.7136962860822678, | |
| "rewards/cosine_scaled_reward": -0.11322630103677511, | |
| "rewards/format_reward": 0.4027777835726738, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2096.763885498047, | |
| "epoch": 0.3976006855184233, | |
| "grad_norm": 2.9937281608581543, | |
| "kl": 0.44482421875, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.0198, | |
| "reward": 0.45767842745408416, | |
| "reward_std": 0.6805157586932182, | |
| "rewards/cosine_scaled_reward": -0.0697719173040241, | |
| "rewards/format_reward": 0.5972222313284874, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2095.625030517578, | |
| "epoch": 0.39931448157669236, | |
| "grad_norm": 3.1695449352264404, | |
| "kl": 0.52392578125, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.124, | |
| "reward": 0.8969383761286736, | |
| "reward_std": 0.8693148195743561, | |
| "rewards/cosine_scaled_reward": 0.10124696930870414, | |
| "rewards/format_reward": 0.6944444477558136, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2397.9722290039062, | |
| "epoch": 0.40102827763496146, | |
| "grad_norm": 2.038714647293091, | |
| "kl": 0.60302734375, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.0803, | |
| "reward": 0.3801136128604412, | |
| "reward_std": 0.6368846967816353, | |
| "rewards/cosine_scaled_reward": -0.11549876257777214, | |
| "rewards/format_reward": 0.6111111044883728, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2803.1805419921875, | |
| "epoch": 0.4027420736932305, | |
| "grad_norm": 1.1210250854492188, | |
| "kl": 0.76171875, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.0749, | |
| "reward": 0.051421504467725754, | |
| "reward_std": 0.46992237120866776, | |
| "rewards/cosine_scaled_reward": -0.21734481677412987, | |
| "rewards/format_reward": 0.4861111231148243, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2735.9583129882812, | |
| "epoch": 0.40445586975149955, | |
| "grad_norm": 1.5609543323516846, | |
| "kl": 0.6220703125, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.1051, | |
| "reward": 0.33594064973294735, | |
| "reward_std": 0.5969594717025757, | |
| "rewards/cosine_scaled_reward": -0.1306407954543829, | |
| "rewards/format_reward": 0.597222238779068, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2207.9444580078125, | |
| "epoch": 0.40616966580976865, | |
| "grad_norm": 3.293438673019409, | |
| "kl": 0.619140625, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.1014, | |
| "reward": 0.6933649554848671, | |
| "reward_std": 0.4978405013680458, | |
| "rewards/cosine_scaled_reward": -0.02831752598285675, | |
| "rewards/format_reward": 0.75, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2911.0972900390625, | |
| "epoch": 0.4078834618680377, | |
| "grad_norm": 1.396133303642273, | |
| "kl": 0.689453125, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.1308, | |
| "reward": 0.23781822435557842, | |
| "reward_std": 0.5772198215126991, | |
| "rewards/cosine_scaled_reward": -0.1449797886889428, | |
| "rewards/format_reward": 0.5277777835726738, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2865.1944580078125, | |
| "epoch": 0.40959725792630675, | |
| "grad_norm": 1.02251398563385, | |
| "kl": 0.775390625, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.1421, | |
| "reward": 0.38452258985489607, | |
| "reward_std": 0.7435072809457779, | |
| "rewards/cosine_scaled_reward": -0.0646831514313817, | |
| "rewards/format_reward": 0.5138888880610466, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2345.4444274902344, | |
| "epoch": 0.41131105398457585, | |
| "grad_norm": 4.698256492614746, | |
| "kl": 0.6904296875, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.1594, | |
| "reward": 0.7729744166135788, | |
| "reward_std": 0.8151284381747246, | |
| "rewards/cosine_scaled_reward": 0.10176499933004379, | |
| "rewards/format_reward": 0.5694444477558136, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2070.4166259765625, | |
| "epoch": 0.4130248500428449, | |
| "grad_norm": 8.216842651367188, | |
| "kl": 0.52490234375, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.1321, | |
| "reward": 0.48884235695004463, | |
| "reward_std": 0.5597383752465248, | |
| "rewards/cosine_scaled_reward": -0.06113438308238983, | |
| "rewards/format_reward": 0.6111111119389534, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2937.0972290039062, | |
| "epoch": 0.414738646101114, | |
| "grad_norm": 1.0033913850784302, | |
| "kl": 0.5947265625, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.1027, | |
| "reward": 0.3329106804449111, | |
| "reward_std": 0.626296728849411, | |
| "rewards/cosine_scaled_reward": -0.09048910066485405, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2632.3055419921875, | |
| "epoch": 0.41645244215938304, | |
| "grad_norm": 3.2918546199798584, | |
| "kl": 0.62109375, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.151, | |
| "reward": 0.423097662627697, | |
| "reward_std": 0.7703854739665985, | |
| "rewards/cosine_scaled_reward": -0.05234006140381098, | |
| "rewards/format_reward": 0.527777798473835, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2488.8055725097656, | |
| "epoch": 0.4181662382176521, | |
| "grad_norm": 1.382688283920288, | |
| "kl": 0.51220703125, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.1239, | |
| "reward": 0.5373616181313992, | |
| "reward_std": 0.642534889280796, | |
| "rewards/cosine_scaled_reward": -0.036874750861898065, | |
| "rewards/format_reward": 0.6111111119389534, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1840.0555877685547, | |
| "epoch": 0.4198800342759212, | |
| "grad_norm": 5.2921977043151855, | |
| "kl": 0.435546875, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.1668, | |
| "reward": 1.0207914784550667, | |
| "reward_std": 0.6237036064267159, | |
| "rewards/cosine_scaled_reward": 0.08678461611270905, | |
| "rewards/format_reward": 0.8472222536802292, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2378.77783203125, | |
| "epoch": 0.42159383033419023, | |
| "grad_norm": 4.525283336639404, | |
| "kl": 0.7333984375, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.2341, | |
| "reward": 0.8125267028808594, | |
| "reward_std": 0.7737091481685638, | |
| "rewards/cosine_scaled_reward": 0.05209667468443513, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2746.7916870117188, | |
| "epoch": 0.4233076263924593, | |
| "grad_norm": 1.215826392173767, | |
| "kl": 0.7509765625, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": 0.124, | |
| "reward": 0.16153091937303543, | |
| "reward_std": 0.7042593955993652, | |
| "rewards/cosine_scaled_reward": -0.12756787613034248, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2420.125, | |
| "epoch": 0.4250214224507284, | |
| "grad_norm": 2.161705732345581, | |
| "kl": 0.86328125, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.0674, | |
| "reward": 0.6162599250674248, | |
| "reward_std": 0.7196609973907471, | |
| "rewards/cosine_scaled_reward": -0.05298116838093847, | |
| "rewards/format_reward": 0.7222222238779068, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2277.500030517578, | |
| "epoch": 0.4267352185089974, | |
| "grad_norm": 3.1015782356262207, | |
| "kl": 0.732421875, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.0179, | |
| "reward": 0.6064739339053631, | |
| "reward_std": 0.6056996583938599, | |
| "rewards/cosine_scaled_reward": -0.03704079985618591, | |
| "rewards/format_reward": 0.6805555671453476, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2416.9583740234375, | |
| "epoch": 0.4284490145672665, | |
| "grad_norm": 8.199381828308105, | |
| "kl": 0.904296875, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0707, | |
| "reward": 0.5718964412808418, | |
| "reward_std": 0.7699461728334427, | |
| "rewards/cosine_scaled_reward": -0.06127400905825198, | |
| "rewards/format_reward": 0.6944444477558136, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2411.2916259765625, | |
| "epoch": 0.4301628106255356, | |
| "grad_norm": 7.757229328155518, | |
| "kl": 0.7177734375, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.0104, | |
| "reward": 0.6452328599989414, | |
| "reward_std": 0.8850838840007782, | |
| "rewards/cosine_scaled_reward": -0.010716899763792753, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2250.1805725097656, | |
| "epoch": 0.4318766066838046, | |
| "grad_norm": 50.71971893310547, | |
| "kl": 1.673828125, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.2537, | |
| "reward": 0.41996366158127785, | |
| "reward_std": 0.630496121942997, | |
| "rewards/cosine_scaled_reward": -0.16501817479729652, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2389.4861450195312, | |
| "epoch": 0.43359040274207367, | |
| "grad_norm": 76.95365142822266, | |
| "kl": 1.5126953125, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": 0.1964, | |
| "reward": 0.5589244738221169, | |
| "reward_std": 0.8758179396390915, | |
| "rewards/cosine_scaled_reward": -0.019148872102960013, | |
| "rewards/format_reward": 0.597222238779068, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2433.6944580078125, | |
| "epoch": 0.43530419880034277, | |
| "grad_norm": 89.30572509765625, | |
| "kl": 1.7294921875, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.1411, | |
| "reward": 0.3653869954869151, | |
| "reward_std": 0.6425384879112244, | |
| "rewards/cosine_scaled_reward": -0.11591762490570545, | |
| "rewards/format_reward": 0.5972222313284874, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2666.02783203125, | |
| "epoch": 0.4370179948586118, | |
| "grad_norm": 9.923705101013184, | |
| "kl": 0.7724609375, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.1243, | |
| "reward": 0.3781815767288208, | |
| "reward_std": 0.6919418126344681, | |
| "rewards/cosine_scaled_reward": -0.06785366125404835, | |
| "rewards/format_reward": 0.5138888880610466, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2432.8056030273438, | |
| "epoch": 0.4387317909168809, | |
| "grad_norm": 9.747496604919434, | |
| "kl": 0.7314453125, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.083, | |
| "reward": 0.2219482958316803, | |
| "reward_std": 0.47816336899995804, | |
| "rewards/cosine_scaled_reward": -0.20152585953474045, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2205.027801513672, | |
| "epoch": 0.44044558697514996, | |
| "grad_norm": 3.7311699390411377, | |
| "kl": 0.53857421875, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.1269, | |
| "reward": 0.5375950075685978, | |
| "reward_std": 0.6251346915960312, | |
| "rewards/cosine_scaled_reward": -0.0923136118799448, | |
| "rewards/format_reward": 0.7222222238779068, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2471.4584045410156, | |
| "epoch": 0.442159383033419, | |
| "grad_norm": 2.448763608932495, | |
| "kl": 0.91796875, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.1343, | |
| "reward": 0.33694631792604923, | |
| "reward_std": 0.4810459837317467, | |
| "rewards/cosine_scaled_reward": -0.1440268289297819, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2130.763885498047, | |
| "epoch": 0.4438731790916881, | |
| "grad_norm": 1.8066775798797607, | |
| "kl": 0.68505859375, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.1045, | |
| "reward": 0.7608818560838699, | |
| "reward_std": 0.697891928255558, | |
| "rewards/cosine_scaled_reward": -0.008447982007055543, | |
| "rewards/format_reward": 0.7777777910232544, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2359.7638549804688, | |
| "epoch": 0.44558697514995715, | |
| "grad_norm": 1.9958624839782715, | |
| "kl": 1.009765625, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.1355, | |
| "reward": 0.7400075197219849, | |
| "reward_std": 0.5138791352510452, | |
| "rewards/cosine_scaled_reward": 0.00889264652505517, | |
| "rewards/format_reward": 0.7222222313284874, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2002.7361145019531, | |
| "epoch": 0.4473007712082262, | |
| "grad_norm": 5.525115966796875, | |
| "kl": 0.7490234375, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.1647, | |
| "reward": 0.7085682898759842, | |
| "reward_std": 0.6736738979816437, | |
| "rewards/cosine_scaled_reward": -0.041549197398126125, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2599.9028930664062, | |
| "epoch": 0.4490145672664953, | |
| "grad_norm": 4.714324474334717, | |
| "kl": 0.9013671875, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.1464, | |
| "reward": 0.7315462306141853, | |
| "reward_std": 0.8794215172529221, | |
| "rewards/cosine_scaled_reward": 0.05327310296706855, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2593.236083984375, | |
| "epoch": 0.45072836332476435, | |
| "grad_norm": 1.4935065507888794, | |
| "kl": 0.8330078125, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.1333, | |
| "reward": 0.26326372660696507, | |
| "reward_std": 0.3958895206451416, | |
| "rewards/cosine_scaled_reward": -0.11836813762784004, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2746.1111450195312, | |
| "epoch": 0.4524421593830334, | |
| "grad_norm": 2.71354341506958, | |
| "kl": 0.833984375, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.0506, | |
| "reward": 0.34650287590920925, | |
| "reward_std": 0.6603603884577751, | |
| "rewards/cosine_scaled_reward": -0.11841523088514805, | |
| "rewards/format_reward": 0.5833333283662796, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2810.6111450195312, | |
| "epoch": 0.4541559554413025, | |
| "grad_norm": 3.0615949630737305, | |
| "kl": 0.8046875, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.0807, | |
| "reward": 0.6215685978531837, | |
| "reward_std": 0.6345800720155239, | |
| "rewards/cosine_scaled_reward": 0.06078430451452732, | |
| "rewards/format_reward": 0.5000000018626451, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2483.513916015625, | |
| "epoch": 0.45586975149957154, | |
| "grad_norm": 2.4526662826538086, | |
| "kl": 0.689453125, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": 0.1832, | |
| "reward": 0.4189574085175991, | |
| "reward_std": 0.4973677098751068, | |
| "rewards/cosine_scaled_reward": -0.0544101782143116, | |
| "rewards/format_reward": 0.5277777835726738, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2401.1805725097656, | |
| "epoch": 0.45758354755784064, | |
| "grad_norm": 1.3402016162872314, | |
| "kl": 0.7294921875, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.1157, | |
| "reward": 0.36370813054963946, | |
| "reward_std": 0.4258965626358986, | |
| "rewards/cosine_scaled_reward": -0.14453481137752533, | |
| "rewards/format_reward": 0.6527777910232544, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2708.3194580078125, | |
| "epoch": 0.4592973436161097, | |
| "grad_norm": 3.217470407485962, | |
| "kl": 0.599609375, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.0051, | |
| "reward": 0.26998334005475044, | |
| "reward_std": 0.49185192957520485, | |
| "rewards/cosine_scaled_reward": -0.15667499974370003, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2829.4583740234375, | |
| "epoch": 0.46101113967437873, | |
| "grad_norm": 1.457294225692749, | |
| "kl": 0.591796875, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.1149, | |
| "reward": 0.2393805852625519, | |
| "reward_std": 0.7379022389650345, | |
| "rewards/cosine_scaled_reward": -0.11642082477919757, | |
| "rewards/format_reward": 0.4722222313284874, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2878.2916259765625, | |
| "epoch": 0.46272493573264784, | |
| "grad_norm": 1.101474642753601, | |
| "kl": 0.69921875, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0498, | |
| "reward": 0.37158428877592087, | |
| "reward_std": 0.5783374309539795, | |
| "rewards/cosine_scaled_reward": -0.029485642910003662, | |
| "rewards/format_reward": 0.430555559694767, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2793.6806030273438, | |
| "epoch": 0.4644387317909169, | |
| "grad_norm": 0.8390009999275208, | |
| "kl": 0.63916015625, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.0834, | |
| "reward": 0.38940694369375706, | |
| "reward_std": 0.6547586917877197, | |
| "rewards/cosine_scaled_reward": -0.06918542925268412, | |
| "rewards/format_reward": 0.5277777723968029, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2366.638885498047, | |
| "epoch": 0.4661525278491859, | |
| "grad_norm": 4.141148090362549, | |
| "kl": 0.4765625, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.2224, | |
| "reward": 0.4617117829620838, | |
| "reward_std": 0.6572139859199524, | |
| "rewards/cosine_scaled_reward": -0.060810765251517296, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2721.6805419921875, | |
| "epoch": 0.46786632390745503, | |
| "grad_norm": 1.906948208808899, | |
| "kl": 0.599609375, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.1188, | |
| "reward": 0.26052477210760117, | |
| "reward_std": 0.6570783406496048, | |
| "rewards/cosine_scaled_reward": -0.06418205983936787, | |
| "rewards/format_reward": 0.3888888992369175, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2921.3750610351562, | |
| "epoch": 0.4695801199657241, | |
| "grad_norm": 1.8937734365463257, | |
| "kl": 0.52685546875, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": 0.0185, | |
| "reward": 0.22513618250377476, | |
| "reward_std": 0.6058431342244148, | |
| "rewards/cosine_scaled_reward": -0.16520969779230654, | |
| "rewards/format_reward": 0.555555559694767, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2988.15283203125, | |
| "epoch": 0.4712939160239931, | |
| "grad_norm": 1.208433985710144, | |
| "kl": 0.54931640625, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0485, | |
| "reward": 0.11821263573256147, | |
| "reward_std": 0.391703762114048, | |
| "rewards/cosine_scaled_reward": -0.15617146715521812, | |
| "rewards/format_reward": 0.4305555494502187, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2643.541748046875, | |
| "epoch": 0.4730077120822622, | |
| "grad_norm": 0.9594613313674927, | |
| "kl": 0.4326171875, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.0807, | |
| "reward": 0.32308289408683777, | |
| "reward_std": 0.7159284129738808, | |
| "rewards/cosine_scaled_reward": -0.08151410473510623, | |
| "rewards/format_reward": 0.4861111156642437, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2926.013916015625, | |
| "epoch": 0.47472150814053127, | |
| "grad_norm": 1.3466960191726685, | |
| "kl": 0.5048828125, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.1283, | |
| "reward": 0.06485692039132118, | |
| "reward_std": 0.542218990623951, | |
| "rewards/cosine_scaled_reward": -0.17590487515553832, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2993.1945190429688, | |
| "epoch": 0.47643530419880037, | |
| "grad_norm": 3.963604688644409, | |
| "kl": 0.5546875, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.2168, | |
| "reward": 0.03267951123416424, | |
| "reward_std": 0.7076856940984726, | |
| "rewards/cosine_scaled_reward": -0.15727136190980673, | |
| "rewards/format_reward": 0.3472222276031971, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3138.6805419921875, | |
| "epoch": 0.4781491002570694, | |
| "grad_norm": 0.5958003997802734, | |
| "kl": 0.56201171875, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.0759, | |
| "reward": -0.051162030547857285, | |
| "reward_std": 0.5351358503103256, | |
| "rewards/cosine_scaled_reward": -0.2478032372891903, | |
| "rewards/format_reward": 0.4444444477558136, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2737.1944885253906, | |
| "epoch": 0.47986289631533846, | |
| "grad_norm": 1.6760090589523315, | |
| "kl": 0.59228515625, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.1096, | |
| "reward": 0.45366813987493515, | |
| "reward_std": 0.7812162339687347, | |
| "rewards/cosine_scaled_reward": -0.0023325812071561813, | |
| "rewards/format_reward": 0.4583333367481828, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2423.2222290039062, | |
| "epoch": 0.48157669237360756, | |
| "grad_norm": 2.0177345275878906, | |
| "kl": 0.461669921875, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.1262, | |
| "reward": 0.38226850144565105, | |
| "reward_std": 0.6752881184220314, | |
| "rewards/cosine_scaled_reward": -0.07969908323138952, | |
| "rewards/format_reward": 0.5416666585952044, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2740.3611450195312, | |
| "epoch": 0.4832904884318766, | |
| "grad_norm": 1.8314719200134277, | |
| "kl": 0.6337890625, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.1455, | |
| "reward": 0.21622517937794328, | |
| "reward_std": 0.6346057131886482, | |
| "rewards/cosine_scaled_reward": -0.15577631071209908, | |
| "rewards/format_reward": 0.5277777910232544, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2889.2500610351562, | |
| "epoch": 0.48500428449014565, | |
| "grad_norm": 0.8971331119537354, | |
| "kl": 0.6259765625, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.1266, | |
| "reward": 0.03935375134460628, | |
| "reward_std": 0.6441294327378273, | |
| "rewards/cosine_scaled_reward": -0.20254534482955933, | |
| "rewards/format_reward": 0.4444444477558136, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2276.9584045410156, | |
| "epoch": 0.48671808054841476, | |
| "grad_norm": 6.356135368347168, | |
| "kl": 0.4833984375, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.3013, | |
| "reward": 0.32465188996866345, | |
| "reward_std": 0.6560942605137825, | |
| "rewards/cosine_scaled_reward": -0.09461849741637707, | |
| "rewards/format_reward": 0.5138889029622078, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2624.9306640625, | |
| "epoch": 0.4884318766066838, | |
| "grad_norm": 1.9299744367599487, | |
| "kl": 0.5576171875, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.1493, | |
| "reward": 0.21753913909196854, | |
| "reward_std": 0.683107927441597, | |
| "rewards/cosine_scaled_reward": -0.16206377279013395, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2555.7361450195312, | |
| "epoch": 0.49014567266495285, | |
| "grad_norm": 2.2400879859924316, | |
| "kl": 0.62890625, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.1583, | |
| "reward": 0.4158199355006218, | |
| "reward_std": 0.6385739594697952, | |
| "rewards/cosine_scaled_reward": -0.021256705978885293, | |
| "rewards/format_reward": 0.4583333320915699, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2526.52783203125, | |
| "epoch": 0.49185946872322195, | |
| "grad_norm": 3.190502405166626, | |
| "kl": 0.916015625, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 0.2192, | |
| "reward": 0.4090676587074995, | |
| "reward_std": 0.7619837448000908, | |
| "rewards/cosine_scaled_reward": -0.0662995120510459, | |
| "rewards/format_reward": 0.5416666641831398, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2134.65283203125, | |
| "epoch": 0.493573264781491, | |
| "grad_norm": 6.452578544616699, | |
| "kl": 1.103515625, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.368, | |
| "reward": 0.44275959208607674, | |
| "reward_std": 0.6151050329208374, | |
| "rewards/cosine_scaled_reward": -0.09806465543806553, | |
| "rewards/format_reward": 0.6388888955116272, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2290.013885498047, | |
| "epoch": 0.4952870608397601, | |
| "grad_norm": 1.6855307817459106, | |
| "kl": 0.82275390625, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.1464, | |
| "reward": 0.4334092391654849, | |
| "reward_std": 0.6985170915722847, | |
| "rewards/cosine_scaled_reward": -0.0819065012037754, | |
| "rewards/format_reward": 0.5972222164273262, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1840.6388549804688, | |
| "epoch": 0.49700085689802914, | |
| "grad_norm": 2.9146785736083984, | |
| "kl": 0.79296875, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.0283, | |
| "reward": 0.7537698708474636, | |
| "reward_std": 0.5962013602256775, | |
| "rewards/cosine_scaled_reward": 0.008829381316900253, | |
| "rewards/format_reward": 0.736111119389534, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2200.4444274902344, | |
| "epoch": 0.4987146529562982, | |
| "grad_norm": 1.7983882427215576, | |
| "kl": 0.927734375, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.2134, | |
| "reward": 0.614590086042881, | |
| "reward_std": 0.9522670358419418, | |
| "rewards/cosine_scaled_reward": -0.012149423826485872, | |
| "rewards/format_reward": 0.638888880610466, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1986.0694885253906, | |
| "epoch": 0.5004284490145673, | |
| "grad_norm": 2.5195839405059814, | |
| "kl": 0.9873046875, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.1778, | |
| "reward": 0.7197987511754036, | |
| "reward_std": 0.8348591700196266, | |
| "rewards/cosine_scaled_reward": 0.01267714286223054, | |
| "rewards/format_reward": 0.6944444477558136, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1970.9999694824219, | |
| "epoch": 0.5021422450728363, | |
| "grad_norm": 3.093557596206665, | |
| "kl": 1.0625, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": 0.1538, | |
| "reward": 0.29343970119953156, | |
| "reward_std": 0.49155813455581665, | |
| "rewards/cosine_scaled_reward": -0.13105794228613377, | |
| "rewards/format_reward": 0.555555559694767, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1924.7916870117188, | |
| "epoch": 0.5038560411311054, | |
| "grad_norm": 6.1597208976745605, | |
| "kl": 0.87158203125, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.3554, | |
| "reward": 0.4494058433920145, | |
| "reward_std": 0.747850589454174, | |
| "rewards/cosine_scaled_reward": -0.08085263520479202, | |
| "rewards/format_reward": 0.6111111119389534, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2000.6388854980469, | |
| "epoch": 0.5055698371893744, | |
| "grad_norm": 2.868744134902954, | |
| "kl": 0.8369140625, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.1929, | |
| "reward": 0.23984116781502962, | |
| "reward_std": 0.4878830164670944, | |
| "rewards/cosine_scaled_reward": -0.22730162646621466, | |
| "rewards/format_reward": 0.6944444477558136, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2322.5556030273438, | |
| "epoch": 0.5072836332476436, | |
| "grad_norm": 3.0267629623413086, | |
| "kl": 1.2763671875, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.2229, | |
| "reward": 0.4730116240680218, | |
| "reward_std": 0.672569528222084, | |
| "rewards/cosine_scaled_reward": 0.0003947049845010042, | |
| "rewards/format_reward": 0.4722222276031971, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1968.2222595214844, | |
| "epoch": 0.5089974293059126, | |
| "grad_norm": 3.3123202323913574, | |
| "kl": 1.0166015625, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.2608, | |
| "reward": 0.4829604886472225, | |
| "reward_std": 0.6525571122765541, | |
| "rewards/cosine_scaled_reward": -0.08490864699706435, | |
| "rewards/format_reward": 0.6527777686715126, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2157.8055725097656, | |
| "epoch": 0.5107112253641817, | |
| "grad_norm": 4.5067524909973145, | |
| "kl": 1.609375, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.379, | |
| "reward": 0.673854373395443, | |
| "reward_std": 0.728430263698101, | |
| "rewards/cosine_scaled_reward": 0.024427177384495735, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1613.5000305175781, | |
| "epoch": 0.5124250214224507, | |
| "grad_norm": 3.3076987266540527, | |
| "kl": 1.0537109375, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.2441, | |
| "reward": 0.8705739304423332, | |
| "reward_std": 0.7832073271274567, | |
| "rewards/cosine_scaled_reward": 0.053342508152127266, | |
| "rewards/format_reward": 0.7638888955116272, | |
| "step": 299 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2283.0000610351562, | |
| "epoch": 0.5141388174807198, | |
| "grad_norm": 4.202121734619141, | |
| "kl": 1.794921875, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.2233, | |
| "reward": 0.41079268511384726, | |
| "reward_std": 0.49303294718265533, | |
| "rewards/cosine_scaled_reward": -0.07238144427537918, | |
| "rewards/format_reward": 0.5555555522441864, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1734.0139465332031, | |
| "epoch": 0.5158526135389888, | |
| "grad_norm": 2.733738660812378, | |
| "kl": 1.271484375, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.3623, | |
| "reward": 0.5950284972786903, | |
| "reward_std": 0.5468220561742783, | |
| "rewards/cosine_scaled_reward": -0.042763520032167435, | |
| "rewards/format_reward": 0.6805555671453476, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2734.5138549804688, | |
| "epoch": 0.517566409597258, | |
| "grad_norm": 5.247328281402588, | |
| "kl": 1.837890625, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": 0.1992, | |
| "reward": 0.1533558116061613, | |
| "reward_std": 0.5701670944690704, | |
| "rewards/cosine_scaled_reward": -0.10387765569612384, | |
| "rewards/format_reward": 0.36111111380159855, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1955.3333435058594, | |
| "epoch": 0.519280205655527, | |
| "grad_norm": 2.1991233825683594, | |
| "kl": 1.4111328125, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": 0.2534, | |
| "reward": 0.4565839725546539, | |
| "reward_std": 0.5375222712755203, | |
| "rewards/cosine_scaled_reward": -0.13281912542879581, | |
| "rewards/format_reward": 0.7222222164273262, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1990.6805419921875, | |
| "epoch": 0.5209940017137961, | |
| "grad_norm": 3.0468897819519043, | |
| "kl": 1.685546875, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.2866, | |
| "reward": 0.589899554848671, | |
| "reward_std": 0.6861986592411995, | |
| "rewards/cosine_scaled_reward": -0.03143910859944299, | |
| "rewards/format_reward": 0.6527777910232544, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1996.5416870117188, | |
| "epoch": 0.5227077977720651, | |
| "grad_norm": 4.011932849884033, | |
| "kl": 1.462890625, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.2531, | |
| "reward": 0.3674123687669635, | |
| "reward_std": 0.6027099043130875, | |
| "rewards/cosine_scaled_reward": -0.10796047560870647, | |
| "rewards/format_reward": 0.5833333283662796, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1896.9166870117188, | |
| "epoch": 0.5244215938303342, | |
| "grad_norm": 2.977555751800537, | |
| "kl": 1.01171875, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.1578, | |
| "reward": 0.43058543652296066, | |
| "reward_std": 0.5935798361897469, | |
| "rewards/cosine_scaled_reward": -0.12498506158590317, | |
| "rewards/format_reward": 0.6805555671453476, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2022.4584045410156, | |
| "epoch": 0.5261353898886033, | |
| "grad_norm": 2.7941606044769287, | |
| "kl": 1.24609375, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": 0.2441, | |
| "reward": 0.7062125951051712, | |
| "reward_std": 0.8090076595544815, | |
| "rewards/cosine_scaled_reward": -0.0010603656992316246, | |
| "rewards/format_reward": 0.7083333283662796, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2287.125030517578, | |
| "epoch": 0.5278491859468724, | |
| "grad_norm": 1.6866544485092163, | |
| "kl": 1.287109375, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.2517, | |
| "reward": 0.650560175999999, | |
| "reward_std": 0.742147371172905, | |
| "rewards/cosine_scaled_reward": 0.033613420091569424, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2066.5694580078125, | |
| "epoch": 0.5295629820051414, | |
| "grad_norm": 2.5993359088897705, | |
| "kl": 1.2255859375, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.1916, | |
| "reward": 0.6525638625025749, | |
| "reward_std": 0.6665498167276382, | |
| "rewards/cosine_scaled_reward": 0.02767082443460822, | |
| "rewards/format_reward": 0.5972222313284874, | |
| "step": 309 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2370.763885498047, | |
| "epoch": 0.5312767780634104, | |
| "grad_norm": 3.1745495796203613, | |
| "kl": 1.205078125, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.167, | |
| "reward": 0.4759225994348526, | |
| "reward_std": 0.8299422115087509, | |
| "rewards/cosine_scaled_reward": -0.04676092881709337, | |
| "rewards/format_reward": 0.569444440305233, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2503.5972290039062, | |
| "epoch": 0.5329905741216795, | |
| "grad_norm": 2.2356581687927246, | |
| "kl": 1.376953125, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.2558, | |
| "reward": 0.2966647706925869, | |
| "reward_std": 0.5752375796437263, | |
| "rewards/cosine_scaled_reward": -0.15722317062318325, | |
| "rewards/format_reward": 0.6111111119389534, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2278.0277404785156, | |
| "epoch": 0.5347043701799485, | |
| "grad_norm": 1.4038455486297607, | |
| "kl": 1.203125, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.241, | |
| "reward": 0.8164278883486986, | |
| "reward_std": 0.7621838673949242, | |
| "rewards/cosine_scaled_reward": 0.08876948896795511, | |
| "rewards/format_reward": 0.638888880610466, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1964.8333435058594, | |
| "epoch": 0.5364181662382177, | |
| "grad_norm": 5.611269950866699, | |
| "kl": 1.6904296875, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.1802, | |
| "reward": 0.5501147694885731, | |
| "reward_std": 0.48398981615900993, | |
| "rewards/cosine_scaled_reward": -0.07910929806530476, | |
| "rewards/format_reward": 0.7083333283662796, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2596.0139770507812, | |
| "epoch": 0.5381319622964867, | |
| "grad_norm": 2.7361557483673096, | |
| "kl": 0.904296875, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.1967, | |
| "reward": 0.43212154414504766, | |
| "reward_std": 0.8072051256895065, | |
| "rewards/cosine_scaled_reward": -0.04088366776704788, | |
| "rewards/format_reward": 0.5138888880610466, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2164.986083984375, | |
| "epoch": 0.5398457583547558, | |
| "grad_norm": 2.377624988555908, | |
| "kl": 1.009765625, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.2612, | |
| "reward": 0.345002256333828, | |
| "reward_std": 0.6840208172798157, | |
| "rewards/cosine_scaled_reward": -0.13305442477576435, | |
| "rewards/format_reward": 0.611111119389534, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2112.861114501953, | |
| "epoch": 0.5415595544130248, | |
| "grad_norm": 2.4749066829681396, | |
| "kl": 1.1328125, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.2521, | |
| "reward": 0.3993903249502182, | |
| "reward_std": 0.7594646960496902, | |
| "rewards/cosine_scaled_reward": -0.09891596343368292, | |
| "rewards/format_reward": 0.5972222238779068, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2763.791717529297, | |
| "epoch": 0.5432733504712939, | |
| "grad_norm": 3.959906816482544, | |
| "kl": 1.5537109375, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.1382, | |
| "reward": 0.3366717994213104, | |
| "reward_std": 0.5981347486376762, | |
| "rewards/cosine_scaled_reward": -0.10944187548011541, | |
| "rewards/format_reward": 0.5555555559694767, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1693.9583129882812, | |
| "epoch": 0.5449871465295629, | |
| "grad_norm": 7.122504711151123, | |
| "kl": 1.0234375, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.3755, | |
| "reward": 1.1441613137722015, | |
| "reward_std": 0.5525609478354454, | |
| "rewards/cosine_scaled_reward": 0.21791397035121918, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2302.7222595214844, | |
| "epoch": 0.5467009425878321, | |
| "grad_norm": 1.7870283126831055, | |
| "kl": 1.1513671875, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.2157, | |
| "reward": 0.44585637911222875, | |
| "reward_std": 0.5990823060274124, | |
| "rewards/cosine_scaled_reward": -0.09651626879349351, | |
| "rewards/format_reward": 0.638888880610466, | |
| "step": 319 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2418.6112060546875, | |
| "epoch": 0.5484147386461011, | |
| "grad_norm": 4.37020206451416, | |
| "kl": 1.3271484375, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.1373, | |
| "reward": 0.7411398887634277, | |
| "reward_std": 0.8346492722630501, | |
| "rewards/cosine_scaled_reward": 0.030292170122265816, | |
| "rewards/format_reward": 0.6805555522441864, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2477.4305419921875, | |
| "epoch": 0.5501285347043702, | |
| "grad_norm": 2.8605029582977295, | |
| "kl": 0.9931640625, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.1566, | |
| "reward": 0.3635707888752222, | |
| "reward_std": 0.6505779251456261, | |
| "rewards/cosine_scaled_reward": -0.14460349176079035, | |
| "rewards/format_reward": 0.6527777835726738, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1934.0694885253906, | |
| "epoch": 0.5518423307626392, | |
| "grad_norm": 2.2090539932250977, | |
| "kl": 1.033935546875, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.1413, | |
| "reward": 0.43895523250102997, | |
| "reward_std": 0.6110691279172897, | |
| "rewards/cosine_scaled_reward": -0.14857794775161892, | |
| "rewards/format_reward": 0.7361111342906952, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2659.2083129882812, | |
| "epoch": 0.5535561268209083, | |
| "grad_norm": 1.6186331510543823, | |
| "kl": 0.92578125, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.1545, | |
| "reward": 0.43137288000434637, | |
| "reward_std": 0.5587008334696293, | |
| "rewards/cosine_scaled_reward": -0.041258019395172596, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2133.0, | |
| "epoch": 0.5552699228791774, | |
| "grad_norm": 2.309271812438965, | |
| "kl": 0.646484375, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.2071, | |
| "reward": 0.8334337323904037, | |
| "reward_std": 0.7556089013814926, | |
| "rewards/cosine_scaled_reward": 0.0486613066168502, | |
| "rewards/format_reward": 0.7361111044883728, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2852.5416259765625, | |
| "epoch": 0.5569837189374465, | |
| "grad_norm": 1.367803692817688, | |
| "kl": 1.1455078125, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.2217, | |
| "reward": 0.30423190630972385, | |
| "reward_std": 0.6387892812490463, | |
| "rewards/cosine_scaled_reward": -0.13955070948577486, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2254.2638549804688, | |
| "epoch": 0.5586975149957155, | |
| "grad_norm": 2.456948757171631, | |
| "kl": 0.90966796875, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.2717, | |
| "reward": 0.7499970353674144, | |
| "reward_std": 0.5177437886595726, | |
| "rewards/cosine_scaled_reward": 0.06249852292239666, | |
| "rewards/format_reward": 0.625, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2643.263916015625, | |
| "epoch": 0.5604113110539846, | |
| "grad_norm": 2.975080728530884, | |
| "kl": 0.96875, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.2016, | |
| "reward": 0.37286074459552765, | |
| "reward_std": 0.6334929168224335, | |
| "rewards/cosine_scaled_reward": -0.06356962397694588, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2430.6666564941406, | |
| "epoch": 0.5621251071122536, | |
| "grad_norm": 2.2276480197906494, | |
| "kl": 1.1484375, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.2054, | |
| "reward": 0.45167311653494835, | |
| "reward_std": 0.6782046630978584, | |
| "rewards/cosine_scaled_reward": -0.07971900515258312, | |
| "rewards/format_reward": 0.6111111268401146, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2118.3194885253906, | |
| "epoch": 0.5638389031705227, | |
| "grad_norm": 1.439645767211914, | |
| "kl": 0.9873046875, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.1558, | |
| "reward": 0.3222229927778244, | |
| "reward_std": 0.5387292131781578, | |
| "rewards/cosine_scaled_reward": -0.186110720038414, | |
| "rewards/format_reward": 0.6944444477558136, | |
| "step": 329 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2615.3750610351562, | |
| "epoch": 0.5655526992287918, | |
| "grad_norm": 1.3368608951568604, | |
| "kl": 0.822265625, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0814, | |
| "reward": 0.19894374161958694, | |
| "reward_std": 0.4315878227353096, | |
| "rewards/cosine_scaled_reward": -0.17136146454140544, | |
| "rewards/format_reward": 0.5416666641831398, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2637.2084045410156, | |
| "epoch": 0.5672664952870609, | |
| "grad_norm": 1.4436960220336914, | |
| "kl": 0.78125, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.1763, | |
| "reward": 0.555847343057394, | |
| "reward_std": 0.7657169997692108, | |
| "rewards/cosine_scaled_reward": 0.020979220047593117, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2674.9444580078125, | |
| "epoch": 0.5689802913453299, | |
| "grad_norm": 3.6536855697631836, | |
| "kl": 1.30078125, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.1134, | |
| "reward": 0.44342901557683945, | |
| "reward_std": 0.6001273989677429, | |
| "rewards/cosine_scaled_reward": -0.09772994555532932, | |
| "rewards/format_reward": 0.6388888880610466, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2896.0277709960938, | |
| "epoch": 0.570694087403599, | |
| "grad_norm": 1.9911452531814575, | |
| "kl": 1.0078125, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.1034, | |
| "reward": -0.17525828257203102, | |
| "reward_std": 0.4928872212767601, | |
| "rewards/cosine_scaled_reward": -0.24040691554546356, | |
| "rewards/format_reward": 0.30555556528270245, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2810.861083984375, | |
| "epoch": 0.572407883461868, | |
| "grad_norm": 1.8073278665542603, | |
| "kl": 1.068359375, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.1821, | |
| "reward": -0.006253276020288467, | |
| "reward_std": 0.5080604404211044, | |
| "rewards/cosine_scaled_reward": -0.2114599784836173, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2462.65283203125, | |
| "epoch": 0.5741216795201372, | |
| "grad_norm": 1.5005158185958862, | |
| "kl": 0.771484375, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.1649, | |
| "reward": 0.45248544216156006, | |
| "reward_std": 0.5002452582120895, | |
| "rewards/cosine_scaled_reward": -0.07931282371282578, | |
| "rewards/format_reward": 0.611111119389534, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2160.125030517578, | |
| "epoch": 0.5758354755784062, | |
| "grad_norm": 3.1605849266052246, | |
| "kl": 0.548828125, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.1752, | |
| "reward": 0.24482397455722094, | |
| "reward_std": 0.515994019806385, | |
| "rewards/cosine_scaled_reward": -0.17619912140071392, | |
| "rewards/format_reward": 0.597222238779068, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2403.3194580078125, | |
| "epoch": 0.5775492716366752, | |
| "grad_norm": 2.8763837814331055, | |
| "kl": 0.736328125, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.1929, | |
| "reward": 0.5475254282355309, | |
| "reward_std": 0.6548926681280136, | |
| "rewards/cosine_scaled_reward": -0.038737302646040916, | |
| "rewards/format_reward": 0.6249999925494194, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2418.013946533203, | |
| "epoch": 0.5792630676949443, | |
| "grad_norm": 2.0177650451660156, | |
| "kl": 0.6171875, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.2226, | |
| "reward": 0.5356792770326138, | |
| "reward_std": 0.9891562312841415, | |
| "rewards/cosine_scaled_reward": -0.01688259281218052, | |
| "rewards/format_reward": 0.5694444552063942, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2717.888916015625, | |
| "epoch": 0.5809768637532133, | |
| "grad_norm": 2.011136293411255, | |
| "kl": 0.8603515625, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.0711, | |
| "reward": 0.31583554670214653, | |
| "reward_std": 0.6509723365306854, | |
| "rewards/cosine_scaled_reward": -0.14069335255771875, | |
| "rewards/format_reward": 0.5972222313284874, | |
| "step": 339 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2609.1805114746094, | |
| "epoch": 0.5826906598114824, | |
| "grad_norm": 2.1439146995544434, | |
| "kl": 0.60546875, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.1085, | |
| "reward": 0.26384788006544113, | |
| "reward_std": 0.5596405640244484, | |
| "rewards/cosine_scaled_reward": -0.12502050958573818, | |
| "rewards/format_reward": 0.5138888917863369, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2366.8055725097656, | |
| "epoch": 0.5844044558697515, | |
| "grad_norm": 3.087810516357422, | |
| "kl": 1.16796875, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.2266, | |
| "reward": 0.19622072577476501, | |
| "reward_std": 0.43179403990507126, | |
| "rewards/cosine_scaled_reward": -0.20050075091421604, | |
| "rewards/format_reward": 0.5972222313284874, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2665.3055419921875, | |
| "epoch": 0.5861182519280206, | |
| "grad_norm": 2.5300474166870117, | |
| "kl": 0.7939453125, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.1537, | |
| "reward": 0.24374699965119362, | |
| "reward_std": 0.7871415168046951, | |
| "rewards/cosine_scaled_reward": -0.1350709474645555, | |
| "rewards/format_reward": 0.5138889029622078, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2231.2222900390625, | |
| "epoch": 0.5878320479862896, | |
| "grad_norm": 6.685680389404297, | |
| "kl": 0.60302734375, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.3184, | |
| "reward": 0.4801894012489356, | |
| "reward_std": 0.4852745458483696, | |
| "rewards/cosine_scaled_reward": -0.02379419095814228, | |
| "rewards/format_reward": 0.5277777835726738, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2502.8472290039062, | |
| "epoch": 0.5895458440445587, | |
| "grad_norm": 2.49977970123291, | |
| "kl": 1.0634765625, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.2075, | |
| "reward": 0.22286849096417427, | |
| "reward_std": 0.51853808760643, | |
| "rewards/cosine_scaled_reward": -0.18023241311311722, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2553.8472290039062, | |
| "epoch": 0.5912596401028277, | |
| "grad_norm": 1.4922689199447632, | |
| "kl": 0.7724609375, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.11, | |
| "reward": 0.40987285412847996, | |
| "reward_std": 0.6866099908947945, | |
| "rewards/cosine_scaled_reward": -0.03117468417622149, | |
| "rewards/format_reward": 0.4722222313284874, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2434.250030517578, | |
| "epoch": 0.5929734361610969, | |
| "grad_norm": 2.287896156311035, | |
| "kl": 0.73828125, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.1042, | |
| "reward": 0.566213920712471, | |
| "reward_std": 0.6637867465615273, | |
| "rewards/cosine_scaled_reward": -0.05022636614739895, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2606.3056030273438, | |
| "epoch": 0.5946872322193659, | |
| "grad_norm": 5.195223808288574, | |
| "kl": 0.8203125, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.0878, | |
| "reward": 0.30292151868343353, | |
| "reward_std": 0.6232884004712105, | |
| "rewards/cosine_scaled_reward": -0.09853924717754126, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2294.3194580078125, | |
| "epoch": 0.596401028277635, | |
| "grad_norm": 2.7749757766723633, | |
| "kl": 1.12744140625, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.113, | |
| "reward": 0.10566018056124449, | |
| "reward_std": 0.4547986686229706, | |
| "rewards/cosine_scaled_reward": -0.23883657529950142, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2798.375, | |
| "epoch": 0.598114824335904, | |
| "grad_norm": 4.372980117797852, | |
| "kl": 0.76708984375, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.0724, | |
| "reward": 0.19539665430784225, | |
| "reward_std": 0.547118715941906, | |
| "rewards/cosine_scaled_reward": -0.0898016735445708, | |
| "rewards/format_reward": 0.37500000186264515, | |
| "step": 349 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2323.1806030273438, | |
| "epoch": 0.5998286203941731, | |
| "grad_norm": 0.9666920900344849, | |
| "kl": 0.42724609375, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.1139, | |
| "reward": 0.3154673893004656, | |
| "reward_std": 0.6931511759757996, | |
| "rewards/cosine_scaled_reward": -0.1408774359151721, | |
| "rewards/format_reward": 0.5972222238779068, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2554.791717529297, | |
| "epoch": 0.6015424164524421, | |
| "grad_norm": 1.4160966873168945, | |
| "kl": 0.673828125, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.1226, | |
| "reward": 0.2959921658039093, | |
| "reward_std": 0.6351921036839485, | |
| "rewards/cosine_scaled_reward": -0.1922817062586546, | |
| "rewards/format_reward": 0.6805555522441864, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2883.0972900390625, | |
| "epoch": 0.6032562125107113, | |
| "grad_norm": 1.3862693309783936, | |
| "kl": 0.57470703125, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.0944, | |
| "reward": -0.15852557588368654, | |
| "reward_std": 0.3618531711399555, | |
| "rewards/cosine_scaled_reward": -0.22509612515568733, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2766.3611755371094, | |
| "epoch": 0.6049700085689803, | |
| "grad_norm": 2.0312767028808594, | |
| "kl": 0.47900390625, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.1735, | |
| "reward": 0.0013820715248584747, | |
| "reward_std": 0.4379913955926895, | |
| "rewards/cosine_scaled_reward": -0.2284756200388074, | |
| "rewards/format_reward": 0.4583333283662796, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2303.4444885253906, | |
| "epoch": 0.6066838046272494, | |
| "grad_norm": 1.9564874172210693, | |
| "kl": 0.583984375, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.1414, | |
| "reward": 0.8931210651062429, | |
| "reward_std": 0.7761038094758987, | |
| "rewards/cosine_scaled_reward": 0.12017163541167974, | |
| "rewards/format_reward": 0.6527777649462223, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2350.3333435058594, | |
| "epoch": 0.6083976006855184, | |
| "grad_norm": 3.0156519412994385, | |
| "kl": 0.501953125, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0948, | |
| "reward": 0.21542136371135712, | |
| "reward_std": 0.4440060332417488, | |
| "rewards/cosine_scaled_reward": -0.15617820341140032, | |
| "rewards/format_reward": 0.5277777761220932, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2587.5694580078125, | |
| "epoch": 0.6101113967437874, | |
| "grad_norm": 1.8885806798934937, | |
| "kl": 0.7080078125, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.1358, | |
| "reward": 0.2837059774901718, | |
| "reward_std": 0.5896440669894218, | |
| "rewards/cosine_scaled_reward": -0.1637025810778141, | |
| "rewards/format_reward": 0.6111111119389534, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2377.9722290039062, | |
| "epoch": 0.6118251928020566, | |
| "grad_norm": 2.1884989738464355, | |
| "kl": 0.7119140625, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.2069, | |
| "reward": 0.5028799092397094, | |
| "reward_std": 0.6587233692407608, | |
| "rewards/cosine_scaled_reward": -0.04022672958672047, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2724.02783203125, | |
| "epoch": 0.6135389888603257, | |
| "grad_norm": 1.1445249319076538, | |
| "kl": 0.5654296875, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.1018, | |
| "reward": 0.2546988914255053, | |
| "reward_std": 0.7167258933186531, | |
| "rewards/cosine_scaled_reward": -0.10876166447997093, | |
| "rewards/format_reward": 0.4722222238779068, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2532.2638549804688, | |
| "epoch": 0.6152527849185947, | |
| "grad_norm": 2.5419840812683105, | |
| "kl": 0.68505859375, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.0873, | |
| "reward": 0.21593652665615082, | |
| "reward_std": 0.6653575152158737, | |
| "rewards/cosine_scaled_reward": -0.11425395932747051, | |
| "rewards/format_reward": 0.4444444477558136, | |
| "step": 359 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2583.513916015625, | |
| "epoch": 0.6169665809768637, | |
| "grad_norm": 3.3858025074005127, | |
| "kl": 0.7587890625, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0324, | |
| "reward": 0.317771688933135, | |
| "reward_std": 0.6041048616170883, | |
| "rewards/cosine_scaled_reward": -0.09111416153609753, | |
| "rewards/format_reward": 0.4999999962747097, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2334.1944274902344, | |
| "epoch": 0.6186803770351328, | |
| "grad_norm": 0.9652746915817261, | |
| "kl": 0.50732421875, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.0994, | |
| "reward": 0.6510396376252174, | |
| "reward_std": 0.8507343530654907, | |
| "rewards/cosine_scaled_reward": 0.019964261911809444, | |
| "rewards/format_reward": 0.6111111119389534, | |
| "step": 361 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3043.9445190429688, | |
| "epoch": 0.6203941730934018, | |
| "grad_norm": 0.7930195927619934, | |
| "kl": 0.67626953125, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.0974, | |
| "reward": 0.13819648325443268, | |
| "reward_std": 0.5064943730831146, | |
| "rewards/cosine_scaled_reward": -0.1531239915639162, | |
| "rewards/format_reward": 0.4444444552063942, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2825.7916259765625, | |
| "epoch": 0.622107969151671, | |
| "grad_norm": 0.6896006464958191, | |
| "kl": 0.64794921875, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.1145, | |
| "reward": 0.08600431494414806, | |
| "reward_std": 0.5438744425773621, | |
| "rewards/cosine_scaled_reward": -0.17227561306208372, | |
| "rewards/format_reward": 0.4305555559694767, | |
| "step": 363 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2422.3334045410156, | |
| "epoch": 0.62382176520994, | |
| "grad_norm": 1.5994491577148438, | |
| "kl": 0.5615234375, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.0825, | |
| "reward": 0.5049359295517206, | |
| "reward_std": 0.7525297850370407, | |
| "rewards/cosine_scaled_reward": -0.06697649694979191, | |
| "rewards/format_reward": 0.6388888955116272, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2465.0556030273438, | |
| "epoch": 0.6255355612682091, | |
| "grad_norm": 7.009267330169678, | |
| "kl": 0.66259765625, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.2849, | |
| "reward": 0.15274390950798988, | |
| "reward_std": 0.48094654455780983, | |
| "rewards/cosine_scaled_reward": -0.18057249579578638, | |
| "rewards/format_reward": 0.5138888880610466, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2835.9861450195312, | |
| "epoch": 0.6272493573264781, | |
| "grad_norm": 1.8245147466659546, | |
| "kl": 0.6435546875, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.0872, | |
| "reward": 0.43825584976002574, | |
| "reward_std": 0.4411094859242439, | |
| "rewards/cosine_scaled_reward": 0.031627919524908066, | |
| "rewards/format_reward": 0.37500001210719347, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2408.2500610351562, | |
| "epoch": 0.6289631533847472, | |
| "grad_norm": 1.2879042625427246, | |
| "kl": 0.64208984375, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.1245, | |
| "reward": 0.5816475376486778, | |
| "reward_std": 0.8480053022503853, | |
| "rewards/cosine_scaled_reward": 0.026934866793453693, | |
| "rewards/format_reward": 0.5277777761220932, | |
| "step": 367 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3011.8472290039062, | |
| "epoch": 0.6306769494430163, | |
| "grad_norm": 1.5342601537704468, | |
| "kl": 0.9501953125, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": 0.1117, | |
| "reward": -0.08722967363428324, | |
| "reward_std": 0.4681037962436676, | |
| "rewards/cosine_scaled_reward": -0.20333705097436905, | |
| "rewards/format_reward": 0.31944445334374905, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2088.1111450195312, | |
| "epoch": 0.6323907455012854, | |
| "grad_norm": 5.395845413208008, | |
| "kl": 0.52392578125, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.278, | |
| "reward": 0.8531668335199356, | |
| "reward_std": 0.7198526412248611, | |
| "rewards/cosine_scaled_reward": 0.07936117798089981, | |
| "rewards/format_reward": 0.6944444477558136, | |
| "step": 369 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2144.4583587646484, | |
| "epoch": 0.6341045415595544, | |
| "grad_norm": 1.9326905012130737, | |
| "kl": 0.5986328125, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.165, | |
| "reward": 0.7016473673284054, | |
| "reward_std": 0.35017503798007965, | |
| "rewards/cosine_scaled_reward": 0.04526812210679054, | |
| "rewards/format_reward": 0.6111111268401146, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2226.8333740234375, | |
| "epoch": 0.6358183376178235, | |
| "grad_norm": 6.064547538757324, | |
| "kl": 0.47265625, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.2747, | |
| "reward": 0.36747913248836994, | |
| "reward_std": 0.47022923082113266, | |
| "rewards/cosine_scaled_reward": -0.12876042909920216, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 371 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2646.166717529297, | |
| "epoch": 0.6375321336760925, | |
| "grad_norm": 1.8105882406234741, | |
| "kl": 0.66259765625, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.1523, | |
| "reward": 0.2788702640682459, | |
| "reward_std": 0.7272945195436478, | |
| "rewards/cosine_scaled_reward": -0.082787093706429, | |
| "rewards/format_reward": 0.4444444440305233, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2658.513916015625, | |
| "epoch": 0.6392459297343616, | |
| "grad_norm": 0.861741304397583, | |
| "kl": 0.56201171875, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.127, | |
| "reward": 0.25536923203617334, | |
| "reward_std": 0.549317829310894, | |
| "rewards/cosine_scaled_reward": -0.16398204606957734, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 373 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2475.7916564941406, | |
| "epoch": 0.6409597257926307, | |
| "grad_norm": 5.120214462280273, | |
| "kl": 0.66943359375, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": -0.0041, | |
| "reward": 0.4401531554758549, | |
| "reward_std": 0.5939441919326782, | |
| "rewards/cosine_scaled_reward": -0.10631232312880456, | |
| "rewards/format_reward": 0.6527777686715126, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2241.4166564941406, | |
| "epoch": 0.6426735218508998, | |
| "grad_norm": 4.6052021980285645, | |
| "kl": 0.68896484375, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.258, | |
| "reward": 0.40211474522948265, | |
| "reward_std": 0.6810158491134644, | |
| "rewards/cosine_scaled_reward": -0.13922041468322277, | |
| "rewards/format_reward": 0.6805555522441864, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2345.541717529297, | |
| "epoch": 0.6443873179091688, | |
| "grad_norm": 1.593520164489746, | |
| "kl": 0.65869140625, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.1983, | |
| "reward": 0.4966873601078987, | |
| "reward_std": 0.6450872495770454, | |
| "rewards/cosine_scaled_reward": -0.0363785345107317, | |
| "rewards/format_reward": 0.569444440305233, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2599.9583129882812, | |
| "epoch": 0.6461011139674379, | |
| "grad_norm": 1.0820523500442505, | |
| "kl": 0.54345703125, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.1269, | |
| "reward": 0.3041490036994219, | |
| "reward_std": 0.5556300804018974, | |
| "rewards/cosine_scaled_reward": -0.11181438341736794, | |
| "rewards/format_reward": 0.5277777910232544, | |
| "step": 377 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2423.1944274902344, | |
| "epoch": 0.6478149100257069, | |
| "grad_norm": 3.9577648639678955, | |
| "kl": 0.46435546875, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.2539, | |
| "reward": 0.3343656752258539, | |
| "reward_std": 0.6136218756437302, | |
| "rewards/cosine_scaled_reward": -0.14531716238707304, | |
| "rewards/format_reward": 0.625, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1991.0555419921875, | |
| "epoch": 0.6495287060839761, | |
| "grad_norm": 6.228683948516846, | |
| "kl": 0.7626953125, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.2113, | |
| "reward": 0.7852881997823715, | |
| "reward_std": 0.7995356619358063, | |
| "rewards/cosine_scaled_reward": 0.031532974913716316, | |
| "rewards/format_reward": 0.722222238779068, | |
| "step": 379 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2142.0972595214844, | |
| "epoch": 0.6512425021422451, | |
| "grad_norm": 4.392513751983643, | |
| "kl": 0.8505859375, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.166, | |
| "reward": 0.774210050702095, | |
| "reward_std": 0.9235591739416122, | |
| "rewards/cosine_scaled_reward": 0.01904946379363537, | |
| "rewards/format_reward": 0.736111119389534, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2721.166748046875, | |
| "epoch": 0.6529562982005142, | |
| "grad_norm": 0.7555143237113953, | |
| "kl": 0.54931640625, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.1157, | |
| "reward": 0.06699353083968163, | |
| "reward_std": 0.6024204641580582, | |
| "rewards/cosine_scaled_reward": -0.18872546032071114, | |
| "rewards/format_reward": 0.4444444477558136, | |
| "step": 381 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2592.3472595214844, | |
| "epoch": 0.6546700942587832, | |
| "grad_norm": 1.4892374277114868, | |
| "kl": 0.5986328125, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.164, | |
| "reward": 0.4388514533638954, | |
| "reward_std": 0.7740809172391891, | |
| "rewards/cosine_scaled_reward": -0.07918539177626371, | |
| "rewards/format_reward": 0.597222238779068, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2303.125, | |
| "epoch": 0.6563838903170522, | |
| "grad_norm": 1.8696836233139038, | |
| "kl": 0.63427734375, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.0839, | |
| "reward": 0.2414467092603445, | |
| "reward_std": 0.5401086919009686, | |
| "rewards/cosine_scaled_reward": -0.14316555112600327, | |
| "rewards/format_reward": 0.5277777835726738, | |
| "step": 383 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2894.138916015625, | |
| "epoch": 0.6580976863753213, | |
| "grad_norm": 2.512624740600586, | |
| "kl": 0.7236328125, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.188, | |
| "reward": -0.111133978003636, | |
| "reward_std": 0.4146636873483658, | |
| "rewards/cosine_scaled_reward": -0.256955873221159, | |
| "rewards/format_reward": 0.4027777798473835, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2577.1805725097656, | |
| "epoch": 0.6598114824335904, | |
| "grad_norm": 1.5134508609771729, | |
| "kl": 0.75341796875, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.1359, | |
| "reward": 0.12209473713301122, | |
| "reward_std": 0.42869339138269424, | |
| "rewards/cosine_scaled_reward": -0.18200820498168468, | |
| "rewards/format_reward": 0.4861111156642437, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2836.388916015625, | |
| "epoch": 0.6615252784918595, | |
| "grad_norm": 1.6320090293884277, | |
| "kl": 0.71435546875, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.1314, | |
| "reward": 0.027245239354670048, | |
| "reward_std": 0.5338631048798561, | |
| "rewards/cosine_scaled_reward": -0.21554404497146606, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2370.5416870117188, | |
| "epoch": 0.6632390745501285, | |
| "grad_norm": 2.790175437927246, | |
| "kl": 0.69384765625, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.1291, | |
| "reward": 0.45274626836180687, | |
| "reward_std": 0.5044268742203712, | |
| "rewards/cosine_scaled_reward": -0.07223799102939665, | |
| "rewards/format_reward": 0.5972222313284874, | |
| "step": 387 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2043.15283203125, | |
| "epoch": 0.6649528706083976, | |
| "grad_norm": 2.196779251098633, | |
| "kl": 0.91259765625, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.1933, | |
| "reward": 0.6870926842093468, | |
| "reward_std": 0.7499307841062546, | |
| "rewards/cosine_scaled_reward": -0.017564778798259795, | |
| "rewards/format_reward": 0.722222238779068, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2507.999969482422, | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 4.833599090576172, | |
| "kl": 0.7890625, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.1485, | |
| "reward": 0.48738833516836166, | |
| "reward_std": 0.3942640535533428, | |
| "rewards/cosine_scaled_reward": -0.047972507774829865, | |
| "rewards/format_reward": 0.5833333283662796, | |
| "step": 389 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2180.5833740234375, | |
| "epoch": 0.6683804627249358, | |
| "grad_norm": 4.208037853240967, | |
| "kl": 0.57275390625, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.2068, | |
| "reward": 0.6339845806360245, | |
| "reward_std": 0.8561032116413116, | |
| "rewards/cosine_scaled_reward": -0.009396598441526294, | |
| "rewards/format_reward": 0.6527777761220932, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2664.0, | |
| "epoch": 0.6700942587832048, | |
| "grad_norm": 1.711565375328064, | |
| "kl": 0.57421875, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.2027, | |
| "reward": 0.2511326225940138, | |
| "reward_std": 0.7724436074495316, | |
| "rewards/cosine_scaled_reward": -0.13137813284993172, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 391 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2048.1945190429688, | |
| "epoch": 0.6718080548414739, | |
| "grad_norm": 7.40539026260376, | |
| "kl": 0.576171875, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.2932, | |
| "reward": 0.390616811811924, | |
| "reward_std": 0.46938444674015045, | |
| "rewards/cosine_scaled_reward": -0.1519138067960739, | |
| "rewards/format_reward": 0.6944444552063942, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2850.15283203125, | |
| "epoch": 0.6735218508997429, | |
| "grad_norm": 2.314105272293091, | |
| "kl": 0.712890625, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.1382, | |
| "reward": 0.12740344926714897, | |
| "reward_std": 0.5854331143200397, | |
| "rewards/cosine_scaled_reward": -0.1307427268475294, | |
| "rewards/format_reward": 0.3888888992369175, | |
| "step": 393 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2054.3334045410156, | |
| "epoch": 0.675235646958012, | |
| "grad_norm": 3.0562775135040283, | |
| "kl": 0.958984375, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.1762, | |
| "reward": 0.33383211493492126, | |
| "reward_std": 0.6097967401146889, | |
| "rewards/cosine_scaled_reward": -0.15947283059358597, | |
| "rewards/format_reward": 0.6527777835726738, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2001.4027709960938, | |
| "epoch": 0.676949443016281, | |
| "grad_norm": 2.749018907546997, | |
| "kl": 1.3720703125, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.2474, | |
| "reward": 0.46511383540928364, | |
| "reward_std": 0.5483391135931015, | |
| "rewards/cosine_scaled_reward": -0.11466531874611974, | |
| "rewards/format_reward": 0.6944444626569748, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2663.9722900390625, | |
| "epoch": 0.6786632390745502, | |
| "grad_norm": 1.3055802583694458, | |
| "kl": 0.8837890625, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.1475, | |
| "reward": 0.21666064485907555, | |
| "reward_std": 0.8081866502761841, | |
| "rewards/cosine_scaled_reward": -0.14861411787569523, | |
| "rewards/format_reward": 0.5138889029622078, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2148.027801513672, | |
| "epoch": 0.6803770351328192, | |
| "grad_norm": 2.2016310691833496, | |
| "kl": 1.193359375, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.2683, | |
| "reward": 0.3172401809133589, | |
| "reward_std": 0.5794945135712624, | |
| "rewards/cosine_scaled_reward": -0.13999101985245943, | |
| "rewards/format_reward": 0.5972222164273262, | |
| "step": 397 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2772.2778930664062, | |
| "epoch": 0.6820908311910883, | |
| "grad_norm": 5.671627044677734, | |
| "kl": 0.994140625, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.1316, | |
| "reward": 0.22605895064771175, | |
| "reward_std": 0.528959184885025, | |
| "rewards/cosine_scaled_reward": -0.12308163847774267, | |
| "rewards/format_reward": 0.4722222313284874, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1907.2083740234375, | |
| "epoch": 0.6838046272493573, | |
| "grad_norm": 4.919534206390381, | |
| "kl": 1.0986328125, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.1523, | |
| "reward": 0.8099863529205322, | |
| "reward_std": 0.7783814370632172, | |
| "rewards/cosine_scaled_reward": 0.03693760558962822, | |
| "rewards/format_reward": 0.7361111119389534, | |
| "step": 399 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2289.3195190429688, | |
| "epoch": 0.6855184233076264, | |
| "grad_norm": 4.336697578430176, | |
| "kl": 0.9921875, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.125, | |
| "reward": 0.08474167913664132, | |
| "reward_std": 0.4911258965730667, | |
| "rewards/cosine_scaled_reward": -0.21457360684871674, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2537.0833740234375, | |
| "epoch": 0.6872322193658955, | |
| "grad_norm": 1.9845013618469238, | |
| "kl": 0.7470703125, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.1521, | |
| "reward": 0.20629934733733535, | |
| "reward_std": 0.5084620639681816, | |
| "rewards/cosine_scaled_reward": -0.17462810222059488, | |
| "rewards/format_reward": 0.5555555671453476, | |
| "step": 401 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2364.2222595214844, | |
| "epoch": 0.6889460154241646, | |
| "grad_norm": 4.490449905395508, | |
| "kl": 0.85595703125, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.0678, | |
| "reward": 0.2729727178812027, | |
| "reward_std": 0.40766991674900055, | |
| "rewards/cosine_scaled_reward": -0.16906920075416565, | |
| "rewards/format_reward": 0.6111111044883728, | |
| "step": 402 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2377.500030517578, | |
| "epoch": 0.6906598114824336, | |
| "grad_norm": 2.0314667224884033, | |
| "kl": 0.9287109375, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.1243, | |
| "reward": 0.7173348069190979, | |
| "reward_std": 0.6178643703460693, | |
| "rewards/cosine_scaled_reward": 0.03922295683878474, | |
| "rewards/format_reward": 0.6388888955116272, | |
| "step": 403 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2395.3472595214844, | |
| "epoch": 0.6923736075407027, | |
| "grad_norm": 3.6266534328460693, | |
| "kl": 1.1142578125, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.1186, | |
| "reward": 0.37652647122740746, | |
| "reward_std": 0.6333749815821648, | |
| "rewards/cosine_scaled_reward": -0.11034788191318512, | |
| "rewards/format_reward": 0.5972222238779068, | |
| "step": 404 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2634.7222595214844, | |
| "epoch": 0.6940874035989717, | |
| "grad_norm": 1.629310131072998, | |
| "kl": 0.87353515625, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.1284, | |
| "reward": 0.30899196676909924, | |
| "reward_std": 0.5874167829751968, | |
| "rewards/cosine_scaled_reward": -0.11633734963834286, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 405 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1720.5416870117188, | |
| "epoch": 0.6958011996572407, | |
| "grad_norm": 3.2341248989105225, | |
| "kl": 1.0625, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.1949, | |
| "reward": 0.5183681361377239, | |
| "reward_std": 0.5259700566530228, | |
| "rewards/cosine_scaled_reward": -0.08109369967132807, | |
| "rewards/format_reward": 0.680555559694767, | |
| "step": 406 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2396.875, | |
| "epoch": 0.6975149957155099, | |
| "grad_norm": 2.575775146484375, | |
| "kl": 0.56005859375, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.1577, | |
| "reward": 0.16498053632676601, | |
| "reward_std": 0.6976238563656807, | |
| "rewards/cosine_scaled_reward": -0.17445417866110802, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 407 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2211.986114501953, | |
| "epoch": 0.699228791773779, | |
| "grad_norm": 5.147465229034424, | |
| "kl": 0.939453125, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.264, | |
| "reward": 0.8435009941458702, | |
| "reward_std": 0.8539558947086334, | |
| "rewards/cosine_scaled_reward": 0.1370282769203186, | |
| "rewards/format_reward": 0.569444440305233, | |
| "step": 408 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2221.791717529297, | |
| "epoch": 0.700942587832048, | |
| "grad_norm": 3.616407632827759, | |
| "kl": 1.2978515625, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.2031, | |
| "reward": 0.5767598450183868, | |
| "reward_std": 0.6021636947989464, | |
| "rewards/cosine_scaled_reward": -0.01717562135308981, | |
| "rewards/format_reward": 0.611111119389534, | |
| "step": 409 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2195.5833129882812, | |
| "epoch": 0.702656383890317, | |
| "grad_norm": 4.223770618438721, | |
| "kl": 0.9541015625, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0719, | |
| "reward": 0.4589345343410969, | |
| "reward_std": 0.5643011257052422, | |
| "rewards/cosine_scaled_reward": -0.09692162275314331, | |
| "rewards/format_reward": 0.6527777835726738, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2020.15283203125, | |
| "epoch": 0.7043701799485861, | |
| "grad_norm": 4.778375148773193, | |
| "kl": 0.9111328125, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.2462, | |
| "reward": 0.4322133334353566, | |
| "reward_std": 0.5240239724516869, | |
| "rewards/cosine_scaled_reward": -0.11722666956484318, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 411 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2159.5694580078125, | |
| "epoch": 0.7060839760068551, | |
| "grad_norm": 5.010425090789795, | |
| "kl": 1.271484375, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.1504, | |
| "reward": 0.27721285074949265, | |
| "reward_std": 0.3799732178449631, | |
| "rewards/cosine_scaled_reward": -0.19472691789269447, | |
| "rewards/format_reward": 0.6666666567325592, | |
| "step": 412 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1994.1805419921875, | |
| "epoch": 0.7077977720651243, | |
| "grad_norm": 3.7414398193359375, | |
| "kl": 0.79296875, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.2209, | |
| "reward": 0.4823665115982294, | |
| "reward_std": 0.8085788935422897, | |
| "rewards/cosine_scaled_reward": -0.12687229178845882, | |
| "rewards/format_reward": 0.736111119389534, | |
| "step": 413 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2582.513885498047, | |
| "epoch": 0.7095115681233933, | |
| "grad_norm": 2.3787803649902344, | |
| "kl": 0.955078125, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.234, | |
| "reward": 0.35093772783875465, | |
| "reward_std": 0.7004451155662537, | |
| "rewards/cosine_scaled_reward": -0.06758668273687363, | |
| "rewards/format_reward": 0.4861111119389534, | |
| "step": 414 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2286.027801513672, | |
| "epoch": 0.7112253641816624, | |
| "grad_norm": 2.242143154144287, | |
| "kl": 1.1669921875, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.2089, | |
| "reward": 0.0793907418847084, | |
| "reward_std": 0.4775719493627548, | |
| "rewards/cosine_scaled_reward": -0.2519712895154953, | |
| "rewards/format_reward": 0.5833333320915699, | |
| "step": 415 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2716.8194885253906, | |
| "epoch": 0.7129391602399314, | |
| "grad_norm": 1.0189129114151, | |
| "kl": 1.0341796875, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.1319, | |
| "reward": 0.3344786912202835, | |
| "reward_std": 0.6283555030822754, | |
| "rewards/cosine_scaled_reward": -0.09664955246262252, | |
| "rewards/format_reward": 0.5277777835726738, | |
| "step": 416 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2145.777786254883, | |
| "epoch": 0.7146529562982005, | |
| "grad_norm": 3.3594307899475098, | |
| "kl": 0.630859375, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.1719, | |
| "reward": 0.29561759158968925, | |
| "reward_std": 0.45837917923927307, | |
| "rewards/cosine_scaled_reward": -0.16469121165573597, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 417 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2316.4722595214844, | |
| "epoch": 0.7163667523564696, | |
| "grad_norm": 3.205843448638916, | |
| "kl": 0.9482421875, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.1905, | |
| "reward": 0.1763996873050928, | |
| "reward_std": 0.35552147775888443, | |
| "rewards/cosine_scaled_reward": -0.16874459758400917, | |
| "rewards/format_reward": 0.5138888880610466, | |
| "step": 418 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1993.9166564941406, | |
| "epoch": 0.7180805484147387, | |
| "grad_norm": 5.31653356552124, | |
| "kl": 1.2333984375, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.1316, | |
| "reward": 0.38526383973658085, | |
| "reward_std": 0.3629095181822777, | |
| "rewards/cosine_scaled_reward": -0.16153474483871832, | |
| "rewards/format_reward": 0.7083333283662796, | |
| "step": 419 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2585.3472290039062, | |
| "epoch": 0.7197943444730077, | |
| "grad_norm": 3.0050301551818848, | |
| "kl": 1.2138671875, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.2609, | |
| "reward": 0.03709686268121004, | |
| "reward_std": 0.45924656093120575, | |
| "rewards/cosine_scaled_reward": -0.1967293554916978, | |
| "rewards/format_reward": 0.430555559694767, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2077.4166870117188, | |
| "epoch": 0.7215081405312768, | |
| "grad_norm": 2.9638571739196777, | |
| "kl": 1.1162109375, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.1967, | |
| "reward": 0.4120505638420582, | |
| "reward_std": 0.7001288831233978, | |
| "rewards/cosine_scaled_reward": -0.09258583001792431, | |
| "rewards/format_reward": 0.5972222313284874, | |
| "step": 421 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1782.4722290039062, | |
| "epoch": 0.7232219365895458, | |
| "grad_norm": 2.496225595474243, | |
| "kl": 1.1123046875, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.3012, | |
| "reward": 0.3580199657008052, | |
| "reward_std": 0.5790654197335243, | |
| "rewards/cosine_scaled_reward": -0.18904556892812252, | |
| "rewards/format_reward": 0.7361111044883728, | |
| "step": 422 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2290.388916015625, | |
| "epoch": 0.7249357326478149, | |
| "grad_norm": 2.5555100440979004, | |
| "kl": 0.8837890625, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.1268, | |
| "reward": 0.4261997193098068, | |
| "reward_std": 0.6714624091982841, | |
| "rewards/cosine_scaled_reward": -0.1341223642230034, | |
| "rewards/format_reward": 0.6944444552063942, | |
| "step": 423 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2174.3055419921875, | |
| "epoch": 0.726649528706084, | |
| "grad_norm": 4.850281715393066, | |
| "kl": 0.9345703125, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.2657, | |
| "reward": 0.31390602327883244, | |
| "reward_std": 0.5223901495337486, | |
| "rewards/cosine_scaled_reward": -0.1833247635513544, | |
| "rewards/format_reward": 0.6805555820465088, | |
| "step": 424 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2507.8334045410156, | |
| "epoch": 0.7283633247643531, | |
| "grad_norm": 3.5151827335357666, | |
| "kl": 1.26953125, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.1979, | |
| "reward": 0.6333100497722626, | |
| "reward_std": 0.7416208535432816, | |
| "rewards/cosine_scaled_reward": 0.031932787562254816, | |
| "rewards/format_reward": 0.569444440305233, | |
| "step": 425 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1761.2083740234375, | |
| "epoch": 0.7300771208226221, | |
| "grad_norm": 3.2045891284942627, | |
| "kl": 1.076171875, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.1263, | |
| "reward": 0.9971873387694359, | |
| "reward_std": 0.7048115953803062, | |
| "rewards/cosine_scaled_reward": 0.0749825444072485, | |
| "rewards/format_reward": 0.8472222238779068, | |
| "step": 426 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2577.9861450195312, | |
| "epoch": 0.7317909168808912, | |
| "grad_norm": 1.8627033233642578, | |
| "kl": 1.14453125, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.2595, | |
| "reward": 0.2531158346682787, | |
| "reward_std": 0.6184235513210297, | |
| "rewards/cosine_scaled_reward": -0.10955319553613663, | |
| "rewards/format_reward": 0.4722222276031971, | |
| "step": 427 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1856.7222595214844, | |
| "epoch": 0.7335047129391602, | |
| "grad_norm": 4.033189296722412, | |
| "kl": 1.1201171875, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.263, | |
| "reward": 0.7039023488759995, | |
| "reward_std": 0.8175256699323654, | |
| "rewards/cosine_scaled_reward": -0.009159944485872984, | |
| "rewards/format_reward": 0.7222222238779068, | |
| "step": 428 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2663.4861450195312, | |
| "epoch": 0.7352185089974294, | |
| "grad_norm": 3.801396369934082, | |
| "kl": 1.130859375, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.15, | |
| "reward": 0.34896907582879066, | |
| "reward_std": 0.5518276765942574, | |
| "rewards/cosine_scaled_reward": -0.12412657774984837, | |
| "rewards/format_reward": 0.597222238779068, | |
| "step": 429 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2153.555618286133, | |
| "epoch": 0.7369323050556984, | |
| "grad_norm": 2.9870073795318604, | |
| "kl": 0.9130859375, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.1497, | |
| "reward": 0.44994640722870827, | |
| "reward_std": 0.3946686089038849, | |
| "rewards/cosine_scaled_reward": -0.08752679079771042, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2261.75, | |
| "epoch": 0.7386461011139674, | |
| "grad_norm": 6.578658580780029, | |
| "kl": 1.78515625, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.1464, | |
| "reward": 0.35643661208450794, | |
| "reward_std": 0.5088120512664318, | |
| "rewards/cosine_scaled_reward": -0.12039280403405428, | |
| "rewards/format_reward": 0.5972222238779068, | |
| "step": 431 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2386.652801513672, | |
| "epoch": 0.7403598971722365, | |
| "grad_norm": 4.385483741760254, | |
| "kl": 1.3447265625, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.3579, | |
| "reward": 0.4007231565192342, | |
| "reward_std": 0.6231922283768654, | |
| "rewards/cosine_scaled_reward": -0.09824953693896532, | |
| "rewards/format_reward": 0.5972222238779068, | |
| "step": 432 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2001.3611297607422, | |
| "epoch": 0.7420736932305055, | |
| "grad_norm": 4.371149063110352, | |
| "kl": 1.0439453125, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.0871, | |
| "reward": 0.6510265804827213, | |
| "reward_std": 0.4398561269044876, | |
| "rewards/cosine_scaled_reward": -0.04254225082695484, | |
| "rewards/format_reward": 0.736111119389534, | |
| "step": 433 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1622.6666870117188, | |
| "epoch": 0.7437874892887746, | |
| "grad_norm": 6.787911891937256, | |
| "kl": 1.326171875, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.3842, | |
| "reward": 0.46582701057195663, | |
| "reward_std": 0.5145231448113918, | |
| "rewards/cosine_scaled_reward": -0.14208650775253773, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 434 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2658.7500610351562, | |
| "epoch": 0.7455012853470437, | |
| "grad_norm": 2.6709697246551514, | |
| "kl": 1.232421875, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.0821, | |
| "reward": 0.31399114802479744, | |
| "reward_std": 0.5854284539818764, | |
| "rewards/cosine_scaled_reward": -0.10689331218600273, | |
| "rewards/format_reward": 0.5277777835726738, | |
| "step": 435 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1860.9583435058594, | |
| "epoch": 0.7472150814053128, | |
| "grad_norm": 3.8863277435302734, | |
| "kl": 0.91796875, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.2564, | |
| "reward": 0.5377090591937304, | |
| "reward_std": 0.5195211619138718, | |
| "rewards/cosine_scaled_reward": -0.0853121317923069, | |
| "rewards/format_reward": 0.7083333283662796, | |
| "step": 436 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2466.2361755371094, | |
| "epoch": 0.7489288774635818, | |
| "grad_norm": 2.9903695583343506, | |
| "kl": 1.16015625, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.1351, | |
| "reward": 0.4385749250650406, | |
| "reward_std": 0.6242729276418686, | |
| "rewards/cosine_scaled_reward": -0.10015699185896665, | |
| "rewards/format_reward": 0.6388889029622078, | |
| "step": 437 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2362.8611450195312, | |
| "epoch": 0.7506426735218509, | |
| "grad_norm": 1.599947214126587, | |
| "kl": 1.189453125, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.1863, | |
| "reward": 0.6738657765090466, | |
| "reward_std": 0.6156510934233665, | |
| "rewards/cosine_scaled_reward": 0.03137733961921185, | |
| "rewards/format_reward": 0.611111119389534, | |
| "step": 438 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2748.2916870117188, | |
| "epoch": 0.7523564695801199, | |
| "grad_norm": 2.867025136947632, | |
| "kl": 1.033203125, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.1123, | |
| "reward": 0.22596902353689075, | |
| "reward_std": 0.5135553628206253, | |
| "rewards/cosine_scaled_reward": -0.13701549544930458, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 439 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2215.277801513672, | |
| "epoch": 0.7540702656383891, | |
| "grad_norm": 5.796390533447266, | |
| "kl": 1.396484375, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.1245, | |
| "reward": 0.6629978334531188, | |
| "reward_std": 0.5948286652565002, | |
| "rewards/cosine_scaled_reward": 0.0051100607961416245, | |
| "rewards/format_reward": 0.6527777910232544, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2402.7222595214844, | |
| "epoch": 0.7557840616966581, | |
| "grad_norm": 5.96156644821167, | |
| "kl": 1.34375, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.0634, | |
| "reward": 0.5063027180731297, | |
| "reward_std": 0.6581330522894859, | |
| "rewards/cosine_scaled_reward": -0.08018200099468231, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 441 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2082.263916015625, | |
| "epoch": 0.7574978577549272, | |
| "grad_norm": 3.405839443206787, | |
| "kl": 1.6640625, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.2784, | |
| "reward": 0.5193299576640129, | |
| "reward_std": 0.5714153945446014, | |
| "rewards/cosine_scaled_reward": -0.05977945402264595, | |
| "rewards/format_reward": 0.6388888880610466, | |
| "step": 442 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1665.7638854980469, | |
| "epoch": 0.7592116538131962, | |
| "grad_norm": 5.792540550231934, | |
| "kl": 1.0849609375, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.1343, | |
| "reward": 0.6929136589169502, | |
| "reward_std": 0.636933371424675, | |
| "rewards/cosine_scaled_reward": -0.056320954114198685, | |
| "rewards/format_reward": 0.8055555671453476, | |
| "step": 443 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2291.27783203125, | |
| "epoch": 0.7609254498714653, | |
| "grad_norm": 5.360567569732666, | |
| "kl": 1.2060546875, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.18, | |
| "reward": 0.36108400439843535, | |
| "reward_std": 0.5261719971895218, | |
| "rewards/cosine_scaled_reward": -0.15973576810210943, | |
| "rewards/format_reward": 0.6805555447936058, | |
| "step": 444 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2755.4583740234375, | |
| "epoch": 0.7626392459297343, | |
| "grad_norm": 2.5989925861358643, | |
| "kl": 0.875, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.1482, | |
| "reward": 0.14870610460639, | |
| "reward_std": 0.567838903516531, | |
| "rewards/cosine_scaled_reward": -0.18259140476584435, | |
| "rewards/format_reward": 0.5138888880610466, | |
| "step": 445 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2247.9583435058594, | |
| "epoch": 0.7643530419880035, | |
| "grad_norm": 5.224709987640381, | |
| "kl": 1.005859375, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.321, | |
| "reward": 0.5112787692341954, | |
| "reward_std": 0.6983462646603584, | |
| "rewards/cosine_scaled_reward": -0.049916195683181286, | |
| "rewards/format_reward": 0.6111111119389534, | |
| "step": 446 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2349.3055725097656, | |
| "epoch": 0.7660668380462725, | |
| "grad_norm": 3.252889633178711, | |
| "kl": 1.216796875, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.1445, | |
| "reward": 0.32806872576475143, | |
| "reward_std": 0.7308538854122162, | |
| "rewards/cosine_scaled_reward": -0.1276322863996029, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 447 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2661.5555725097656, | |
| "epoch": 0.7677806341045416, | |
| "grad_norm": 2.3366446495056152, | |
| "kl": 0.8486328125, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.0872, | |
| "reward": 0.36861317604780197, | |
| "reward_std": 0.6462560296058655, | |
| "rewards/cosine_scaled_reward": -0.051804508082568645, | |
| "rewards/format_reward": 0.47222223225980997, | |
| "step": 448 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2592.4444580078125, | |
| "epoch": 0.7694944301628106, | |
| "grad_norm": 4.123133182525635, | |
| "kl": 1.052734375, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.2043, | |
| "reward": 0.2611931987339631, | |
| "reward_std": 0.7008328437805176, | |
| "rewards/cosine_scaled_reward": -0.147181186825037, | |
| "rewards/format_reward": 0.5555555522441864, | |
| "step": 449 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1906.3333740234375, | |
| "epoch": 0.7712082262210797, | |
| "grad_norm": 3.090589761734009, | |
| "kl": 0.865234375, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.1642, | |
| "reward": 0.5996736511588097, | |
| "reward_std": 0.5084411576390266, | |
| "rewards/cosine_scaled_reward": -0.06821873132139444, | |
| "rewards/format_reward": 0.736111119389534, | |
| "step": 450 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2815.4166870117188, | |
| "epoch": 0.7729220222793488, | |
| "grad_norm": 2.466654062271118, | |
| "kl": 0.7978515625, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.1236, | |
| "reward": 0.07433861424215138, | |
| "reward_std": 0.5574841573834419, | |
| "rewards/cosine_scaled_reward": -0.21977514401078224, | |
| "rewards/format_reward": 0.5138888917863369, | |
| "step": 451 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2436.8611450195312, | |
| "epoch": 0.7746358183376179, | |
| "grad_norm": 2.854764699935913, | |
| "kl": 0.8505859375, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.1263, | |
| "reward": 0.462260864675045, | |
| "reward_std": 0.7514103129506111, | |
| "rewards/cosine_scaled_reward": -0.09525846503674984, | |
| "rewards/format_reward": 0.6527777761220932, | |
| "step": 452 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2337.7361755371094, | |
| "epoch": 0.7763496143958869, | |
| "grad_norm": 3.1975936889648438, | |
| "kl": 0.97265625, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.1955, | |
| "reward": 0.39777151867747307, | |
| "reward_std": 0.6588628813624382, | |
| "rewards/cosine_scaled_reward": -0.11361423693597317, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 453 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2078.0972595214844, | |
| "epoch": 0.778063410454156, | |
| "grad_norm": 1.6080825328826904, | |
| "kl": 1.15625, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.2023, | |
| "reward": 0.7216087523847818, | |
| "reward_std": 0.7977120280265808, | |
| "rewards/cosine_scaled_reward": 0.006637714395765215, | |
| "rewards/format_reward": 0.7083333283662796, | |
| "step": 454 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2211.7916259765625, | |
| "epoch": 0.779777206512425, | |
| "grad_norm": 3.7410457134246826, | |
| "kl": 1.00390625, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.2082, | |
| "reward": 0.4414830207824707, | |
| "reward_std": 0.6565307825803757, | |
| "rewards/cosine_scaled_reward": -0.10564738605171442, | |
| "rewards/format_reward": 0.6527777761220932, | |
| "step": 455 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2365.52783203125, | |
| "epoch": 0.781491002570694, | |
| "grad_norm": 6.645061492919922, | |
| "kl": 0.64013671875, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.2263, | |
| "reward": 0.3324281768873334, | |
| "reward_std": 0.5864489898085594, | |
| "rewards/cosine_scaled_reward": -0.1046192436479032, | |
| "rewards/format_reward": 0.5416666641831398, | |
| "step": 456 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2122.5833435058594, | |
| "epoch": 0.7832047986289632, | |
| "grad_norm": 2.3880536556243896, | |
| "kl": 1.04736328125, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.1423, | |
| "reward": 0.8056632168591022, | |
| "reward_std": 0.6164202988147736, | |
| "rewards/cosine_scaled_reward": 0.020887171383947134, | |
| "rewards/format_reward": 0.7638888955116272, | |
| "step": 457 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1959.8056030273438, | |
| "epoch": 0.7849185946872322, | |
| "grad_norm": 4.885958671569824, | |
| "kl": 0.94287109375, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.2544, | |
| "reward": 0.4146232455968857, | |
| "reward_std": 0.5990116819739342, | |
| "rewards/cosine_scaled_reward": -0.12602169532328844, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 458 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2254.0694274902344, | |
| "epoch": 0.7866323907455013, | |
| "grad_norm": 2.2345101833343506, | |
| "kl": 0.53564453125, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.0455, | |
| "reward": 0.4500209465622902, | |
| "reward_std": 0.5013090819120407, | |
| "rewards/cosine_scaled_reward": -0.10137841757386923, | |
| "rewards/format_reward": 0.652777798473835, | |
| "step": 459 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2215.1666564941406, | |
| "epoch": 0.7883461868037703, | |
| "grad_norm": 4.272637367248535, | |
| "kl": 0.92236328125, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.0789, | |
| "reward": 0.44543247297406197, | |
| "reward_std": 0.5984909385442734, | |
| "rewards/cosine_scaled_reward": -0.10367265064269304, | |
| "rewards/format_reward": 0.6527777910232544, | |
| "step": 460 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1930.5000305175781, | |
| "epoch": 0.7900599828620394, | |
| "grad_norm": 2.8687753677368164, | |
| "kl": 1.2333984375, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.1984, | |
| "reward": 0.4672253951430321, | |
| "reward_std": 0.6156143024563789, | |
| "rewards/cosine_scaled_reward": -0.13444286305457354, | |
| "rewards/format_reward": 0.736111119389534, | |
| "step": 461 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1981.9583740234375, | |
| "epoch": 0.7917737789203085, | |
| "grad_norm": 3.0882349014282227, | |
| "kl": 0.7373046875, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.0448, | |
| "reward": 0.5570826064795256, | |
| "reward_std": 0.6341868117451668, | |
| "rewards/cosine_scaled_reward": -0.08951424108818173, | |
| "rewards/format_reward": 0.7361111119389534, | |
| "step": 462 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1936.4306030273438, | |
| "epoch": 0.7934875749785776, | |
| "grad_norm": 2.4561548233032227, | |
| "kl": 0.7119140625, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.2065, | |
| "reward": 0.5815738141536713, | |
| "reward_std": 0.7455588281154633, | |
| "rewards/cosine_scaled_reward": -0.08421308733522892, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 463 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2648.277801513672, | |
| "epoch": 0.7952013710368466, | |
| "grad_norm": 1.9183648824691772, | |
| "kl": 1.189453125, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.1831, | |
| "reward": 0.4944647327065468, | |
| "reward_std": 0.5960628166794777, | |
| "rewards/cosine_scaled_reward": -0.04443428758531809, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 464 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3089.4722290039062, | |
| "epoch": 0.7969151670951157, | |
| "grad_norm": 1.3800582885742188, | |
| "kl": 0.88671875, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.1132, | |
| "reward": 0.17487204633653164, | |
| "reward_std": 0.6750592887401581, | |
| "rewards/cosine_scaled_reward": -0.1278417520225048, | |
| "rewards/format_reward": 0.430555559694767, | |
| "step": 465 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2615.013916015625, | |
| "epoch": 0.7986289631533847, | |
| "grad_norm": 2.8072264194488525, | |
| "kl": 0.93115234375, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.15, | |
| "reward": 0.17807744164019823, | |
| "reward_std": 0.6022924780845642, | |
| "rewards/cosine_scaled_reward": -0.11929461418185383, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 466 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2457.3750610351562, | |
| "epoch": 0.8003427592116538, | |
| "grad_norm": 4.940661430358887, | |
| "kl": 0.7890625, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.1464, | |
| "reward": 0.24202457256615162, | |
| "reward_std": 0.42437436431646347, | |
| "rewards/cosine_scaled_reward": -0.19148772559128702, | |
| "rewards/format_reward": 0.6249999925494194, | |
| "step": 467 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2192.986114501953, | |
| "epoch": 0.8020565552699229, | |
| "grad_norm": 3.136319637298584, | |
| "kl": 0.767578125, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.1721, | |
| "reward": 0.6284131053835154, | |
| "reward_std": 0.5748142190277576, | |
| "rewards/cosine_scaled_reward": -0.012182342819869518, | |
| "rewards/format_reward": 0.6527777761220932, | |
| "step": 468 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2432.8333435058594, | |
| "epoch": 0.803770351328192, | |
| "grad_norm": 1.9713729619979858, | |
| "kl": 0.8603515625, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.1523, | |
| "reward": 0.5457211770117283, | |
| "reward_std": 0.729132629930973, | |
| "rewards/cosine_scaled_reward": -0.0535283163189888, | |
| "rewards/format_reward": 0.6527777910232544, | |
| "step": 469 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2276.4305725097656, | |
| "epoch": 0.805484147386461, | |
| "grad_norm": 3.8467977046966553, | |
| "kl": 1.216796875, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.1232, | |
| "reward": 0.5122000686824322, | |
| "reward_std": 0.7733886539936066, | |
| "rewards/cosine_scaled_reward": -0.09112219791859388, | |
| "rewards/format_reward": 0.6944444552063942, | |
| "step": 470 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2643.7083129882812, | |
| "epoch": 0.8071979434447301, | |
| "grad_norm": 1.1509345769882202, | |
| "kl": 0.7216796875, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.1381, | |
| "reward": 0.10054661217145622, | |
| "reward_std": 0.6373118087649345, | |
| "rewards/cosine_scaled_reward": -0.16500448435544968, | |
| "rewards/format_reward": 0.4305555745959282, | |
| "step": 471 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2733.8056030273438, | |
| "epoch": 0.8089117395029991, | |
| "grad_norm": 1.7471221685409546, | |
| "kl": 0.60986328125, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.1037, | |
| "reward": 0.3268199451267719, | |
| "reward_std": 0.7872605472803116, | |
| "rewards/cosine_scaled_reward": -0.06575669860467315, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 472 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2760.416717529297, | |
| "epoch": 0.8106255355612683, | |
| "grad_norm": 2.182706832885742, | |
| "kl": 0.7177734375, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.103, | |
| "reward": 0.3022213885560632, | |
| "reward_std": 0.5640696436166763, | |
| "rewards/cosine_scaled_reward": -0.15444485377520323, | |
| "rewards/format_reward": 0.611111119389534, | |
| "step": 473 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2262.4166564941406, | |
| "epoch": 0.8123393316195373, | |
| "grad_norm": 2.2662978172302246, | |
| "kl": 1.310546875, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.2095, | |
| "reward": 0.817143252119422, | |
| "reward_std": 0.5297734513878822, | |
| "rewards/cosine_scaled_reward": 0.0960716437548399, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 474 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2711.9722595214844, | |
| "epoch": 0.8140531276778064, | |
| "grad_norm": 5.152209758758545, | |
| "kl": 0.900390625, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.0199, | |
| "reward": 0.1260463148355484, | |
| "reward_std": 0.5338724106550217, | |
| "rewards/cosine_scaled_reward": -0.1939212940633297, | |
| "rewards/format_reward": 0.5138888917863369, | |
| "step": 475 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2675.7361755371094, | |
| "epoch": 0.8157669237360754, | |
| "grad_norm": 2.230329990386963, | |
| "kl": 0.79931640625, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.1478, | |
| "reward": 0.14864197466522455, | |
| "reward_std": 0.6397556141018867, | |
| "rewards/cosine_scaled_reward": -0.1756790205836296, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 476 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2487.75, | |
| "epoch": 0.8174807197943444, | |
| "grad_norm": 4.63166618347168, | |
| "kl": 0.8720703125, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.3369, | |
| "reward": 0.37176867201924324, | |
| "reward_std": 0.6089313849806786, | |
| "rewards/cosine_scaled_reward": -0.07106010848656297, | |
| "rewards/format_reward": 0.5138888992369175, | |
| "step": 477 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2181.4722595214844, | |
| "epoch": 0.8191945158526135, | |
| "grad_norm": 3.272205114364624, | |
| "kl": 1.212890625, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.1721, | |
| "reward": 0.2916110037913313, | |
| "reward_std": 0.49708379805088043, | |
| "rewards/cosine_scaled_reward": -0.11113895289599895, | |
| "rewards/format_reward": 0.5138889029622078, | |
| "step": 478 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2471.8194732666016, | |
| "epoch": 0.8209083119108826, | |
| "grad_norm": 3.132082462310791, | |
| "kl": 1.228515625, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.208, | |
| "reward": 0.7535388497635722, | |
| "reward_std": 0.695548452436924, | |
| "rewards/cosine_scaled_reward": 0.06426943093538284, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 479 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2416.0693969726562, | |
| "epoch": 0.8226221079691517, | |
| "grad_norm": 3.6008918285369873, | |
| "kl": 0.6015625, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.1762, | |
| "reward": 0.3882830161601305, | |
| "reward_std": 0.6291572600603104, | |
| "rewards/cosine_scaled_reward": -0.07669184263795614, | |
| "rewards/format_reward": 0.5416666641831398, | |
| "step": 480 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1688.6388854980469, | |
| "epoch": 0.8243359040274207, | |
| "grad_norm": 3.3292489051818848, | |
| "kl": 0.94482421875, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.1756, | |
| "reward": 0.6976406946778297, | |
| "reward_std": 0.7118247449398041, | |
| "rewards/cosine_scaled_reward": -0.040068539790809155, | |
| "rewards/format_reward": 0.7777777835726738, | |
| "step": 481 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2275.9723205566406, | |
| "epoch": 0.8260497000856898, | |
| "grad_norm": 5.62246036529541, | |
| "kl": 0.806640625, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.3047, | |
| "reward": 0.3320089429616928, | |
| "reward_std": 0.5019624754786491, | |
| "rewards/cosine_scaled_reward": -0.13955109613016248, | |
| "rewards/format_reward": 0.6111111268401146, | |
| "step": 482 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2215.8055725097656, | |
| "epoch": 0.8277634961439588, | |
| "grad_norm": 1.8363783359527588, | |
| "kl": 0.98046875, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.205, | |
| "reward": 0.4018698123982176, | |
| "reward_std": 0.5796016827225685, | |
| "rewards/cosine_scaled_reward": -0.13239844236522913, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 483 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1844.236099243164, | |
| "epoch": 0.829477292202228, | |
| "grad_norm": 4.732890605926514, | |
| "kl": 0.64990234375, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.229, | |
| "reward": 0.37843877635896206, | |
| "reward_std": 0.6878086104989052, | |
| "rewards/cosine_scaled_reward": -0.1441139355301857, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 484 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2126.527801513672, | |
| "epoch": 0.831191088260497, | |
| "grad_norm": 3.030064821243286, | |
| "kl": 1.2275390625, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.2121, | |
| "reward": 0.5017230249941349, | |
| "reward_std": 0.8949761241674423, | |
| "rewards/cosine_scaled_reward": -0.0616384893655777, | |
| "rewards/format_reward": 0.625, | |
| "step": 485 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2167.52783203125, | |
| "epoch": 0.8329048843187661, | |
| "grad_norm": 2.106167793273926, | |
| "kl": 0.8662109375, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.0938, | |
| "reward": 0.5535758845508099, | |
| "reward_std": 0.5298986956477165, | |
| "rewards/cosine_scaled_reward": -0.021823172457516193, | |
| "rewards/format_reward": 0.597222238779068, | |
| "step": 486 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2419.6111450195312, | |
| "epoch": 0.8346186803770351, | |
| "grad_norm": 2.8747453689575195, | |
| "kl": 0.6708984375, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.1754, | |
| "reward": 0.2720159562304616, | |
| "reward_std": 0.545224204659462, | |
| "rewards/cosine_scaled_reward": -0.1487142387777567, | |
| "rewards/format_reward": 0.5694444552063942, | |
| "step": 487 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2196.486114501953, | |
| "epoch": 0.8363324764353042, | |
| "grad_norm": 2.825509786605835, | |
| "kl": 0.55517578125, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.1679, | |
| "reward": 0.6998728811740875, | |
| "reward_std": 0.6955326199531555, | |
| "rewards/cosine_scaled_reward": 0.030491996556520462, | |
| "rewards/format_reward": 0.6388888955116272, | |
| "step": 488 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2617.722198486328, | |
| "epoch": 0.8380462724935732, | |
| "grad_norm": 1.9763245582580566, | |
| "kl": 0.8466796875, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.1616, | |
| "reward": 0.28653959557414055, | |
| "reward_std": 0.6969783715903759, | |
| "rewards/cosine_scaled_reward": -0.12061909190379083, | |
| "rewards/format_reward": 0.5277777835726738, | |
| "step": 489 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2630.0972290039062, | |
| "epoch": 0.8397600685518424, | |
| "grad_norm": 1.7776055335998535, | |
| "kl": 0.73388671875, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.1053, | |
| "reward": 0.3464082106947899, | |
| "reward_std": 0.7413296326994896, | |
| "rewards/cosine_scaled_reward": -0.08374034571170341, | |
| "rewards/format_reward": 0.5138888880610466, | |
| "step": 490 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2610.1666870117188, | |
| "epoch": 0.8414738646101114, | |
| "grad_norm": 2.362657308578491, | |
| "kl": 1.03369140625, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.1536, | |
| "reward": 0.054581154661718756, | |
| "reward_std": 0.5118880867958069, | |
| "rewards/cosine_scaled_reward": -0.27132053300738335, | |
| "rewards/format_reward": 0.5972222238779068, | |
| "step": 491 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2591.611114501953, | |
| "epoch": 0.8431876606683805, | |
| "grad_norm": 1.4310436248779297, | |
| "kl": 0.748046875, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.1227, | |
| "reward": 0.2780334800481796, | |
| "reward_std": 0.5931698530912399, | |
| "rewards/cosine_scaled_reward": -0.16653881408274174, | |
| "rewards/format_reward": 0.611111119389534, | |
| "step": 492 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1928.0278015136719, | |
| "epoch": 0.8449014567266495, | |
| "grad_norm": 4.139144420623779, | |
| "kl": 1.39892578125, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.2949, | |
| "reward": 0.36576576717197895, | |
| "reward_std": 0.4379217103123665, | |
| "rewards/cosine_scaled_reward": -0.15045045968145132, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 493 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2274.680633544922, | |
| "epoch": 0.8466152527849186, | |
| "grad_norm": 1.5368496179580688, | |
| "kl": 0.958984375, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.1968, | |
| "reward": 0.3456185795366764, | |
| "reward_std": 0.5900547206401825, | |
| "rewards/cosine_scaled_reward": -0.11885737907141447, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 494 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1835.8333740234375, | |
| "epoch": 0.8483290488431876, | |
| "grad_norm": 4.2471394538879395, | |
| "kl": 1.61279296875, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.1987, | |
| "reward": 0.9312632232904434, | |
| "reward_std": 0.586229220032692, | |
| "rewards/cosine_scaled_reward": 0.06979827064787969, | |
| "rewards/format_reward": 0.7916666567325592, | |
| "step": 495 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2454.902801513672, | |
| "epoch": 0.8500428449014568, | |
| "grad_norm": 2.069298505783081, | |
| "kl": 0.9306640625, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.176, | |
| "reward": 0.6016820748336613, | |
| "reward_std": 0.7270394861698151, | |
| "rewards/cosine_scaled_reward": 0.0161188212223351, | |
| "rewards/format_reward": 0.5694444477558136, | |
| "step": 496 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2282.250030517578, | |
| "epoch": 0.8517566409597258, | |
| "grad_norm": 2.224278688430786, | |
| "kl": 0.7265625, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.1715, | |
| "reward": 0.43558146245777607, | |
| "reward_std": 0.6017558500170708, | |
| "rewards/cosine_scaled_reward": -0.07387594413012266, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 497 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2533.7500610351562, | |
| "epoch": 0.8534704370179949, | |
| "grad_norm": 5.092855930328369, | |
| "kl": 1.08935546875, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.1009, | |
| "reward": 0.1837000446394086, | |
| "reward_std": 0.7107623964548111, | |
| "rewards/cosine_scaled_reward": -0.19287220388650894, | |
| "rewards/format_reward": 0.569444440305233, | |
| "step": 498 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2394.777801513672, | |
| "epoch": 0.8551842330762639, | |
| "grad_norm": 2.348245620727539, | |
| "kl": 0.81884765625, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.1019, | |
| "reward": 0.8653097227215767, | |
| "reward_std": 0.7131348252296448, | |
| "rewards/cosine_scaled_reward": 0.13404375594109297, | |
| "rewards/format_reward": 0.5972222238779068, | |
| "step": 499 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1616.4583282470703, | |
| "epoch": 0.856898029134533, | |
| "grad_norm": 2.849949598312378, | |
| "kl": 0.951171875, | |
| "learning_rate": 1e-07, | |
| "loss": 0.12, | |
| "reward": 1.1779827252030373, | |
| "reward_std": 0.6799286007881165, | |
| "rewards/cosine_scaled_reward": 0.20010241214185953, | |
| "rewards/format_reward": 0.7777777761220932, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.856898029134533, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.12157149085606943, | |
| "train_runtime": 48026.4516, | |
| "train_samples_per_second": 0.75, | |
| "train_steps_per_second": 0.01 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |