OpenRS-GRPO / trainer_state.json
jmkim89's picture
Model save
9360627 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.856898029134533,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 2862.5695190429688,
"epoch": 0.001713796058269066,
"grad_norm": 0.16925157606601715,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": 0.0467,
"reward": 0.12026740610599518,
"reward_std": 0.47210293635725975,
"rewards/cosine_scaled_reward": -0.1343107339926064,
"rewards/format_reward": 0.3888888917863369,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 2739.5,
"epoch": 0.003427592116538132,
"grad_norm": 0.18508067727088928,
"kl": 0.0,
"learning_rate": 4e-08,
"loss": 0.0391,
"reward": -0.05314926430583,
"reward_std": 0.36226021870970726,
"rewards/cosine_scaled_reward": -0.21407463820651174,
"rewards/format_reward": 0.3750000111758709,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 2816.1944580078125,
"epoch": 0.005141388174807198,
"grad_norm": 0.15574845671653748,
"kl": 4.06801700592041e-05,
"learning_rate": 6e-08,
"loss": 0.024,
"reward": -0.0735303945839405,
"reward_std": 0.4152667075395584,
"rewards/cosine_scaled_reward": -0.21037630829960108,
"rewards/format_reward": 0.34722223225980997,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 2746.875,
"epoch": 0.006855184233076264,
"grad_norm": 0.18099600076675415,
"kl": 3.692507743835449e-05,
"learning_rate": 8e-08,
"loss": 0.0516,
"reward": 0.2664791904389858,
"reward_std": 0.8305703550577164,
"rewards/cosine_scaled_reward": -0.07509375014342368,
"rewards/format_reward": 0.4166666716337204,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 2557.513916015625,
"epoch": 0.00856898029134533,
"grad_norm": 0.173630490899086,
"kl": 2.3245811462402344e-05,
"learning_rate": 1e-07,
"loss": 0.0579,
"reward": 0.4870211333036423,
"reward_std": 0.6806018278002739,
"rewards/cosine_scaled_reward": -0.006489435210824013,
"rewards/format_reward": 0.5000000074505806,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 3163.8333129882812,
"epoch": 0.010282776349614395,
"grad_norm": 0.1903219074010849,
"kl": 4.1365623474121094e-05,
"learning_rate": 1.2e-07,
"loss": 0.0699,
"reward": 0.22140773385763168,
"reward_std": 0.614318884909153,
"rewards/cosine_scaled_reward": -0.07679613586515188,
"rewards/format_reward": 0.37500001303851604,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 2238.3055725097656,
"epoch": 0.011996572407883462,
"grad_norm": 0.2037331461906433,
"kl": 3.427267074584961e-05,
"learning_rate": 1.4e-07,
"loss": 0.0507,
"reward": 0.39292821660637856,
"reward_std": 0.6100749522447586,
"rewards/cosine_scaled_reward": -0.08825810719281435,
"rewards/format_reward": 0.5694444552063942,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 2888.4166870117188,
"epoch": 0.013710368466152529,
"grad_norm": 0.1671508252620697,
"kl": 2.8967857360839844e-05,
"learning_rate": 1.6e-07,
"loss": 0.0888,
"reward": 0.5700129643082619,
"reward_std": 1.0805757492780685,
"rewards/cosine_scaled_reward": 0.04195092432200909,
"rewards/format_reward": 0.486111119389534,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 2740.638916015625,
"epoch": 0.015424164524421594,
"grad_norm": 0.2825331389904022,
"kl": 3.212690353393555e-05,
"learning_rate": 1.8e-07,
"loss": 0.1025,
"reward": 0.3288399577140808,
"reward_std": 0.6967436075210571,
"rewards/cosine_scaled_reward": -0.03696890315040946,
"rewards/format_reward": 0.4027777733281255,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 3010.7916870117188,
"epoch": 0.01713796058269066,
"grad_norm": 0.17822624742984772,
"kl": 4.1991472244262695e-05,
"learning_rate": 2e-07,
"loss": 0.0471,
"reward": 0.09832120686769485,
"reward_std": 0.6553668975830078,
"rewards/cosine_scaled_reward": -0.1036171680316329,
"rewards/format_reward": 0.3055555522441864,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 2748.486114501953,
"epoch": 0.018851756640959727,
"grad_norm": 0.2476479411125183,
"kl": 3.9696693420410156e-05,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0491,
"reward": 0.015873797237873077,
"reward_std": 0.553259089589119,
"rewards/cosine_scaled_reward": -0.16567421704530716,
"rewards/format_reward": 0.3472222238779068,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 2954.3472290039062,
"epoch": 0.02056555269922879,
"grad_norm": 0.28294840455055237,
"kl": 3.898143768310547e-05,
"learning_rate": 2.4e-07,
"loss": 0.1311,
"reward": -0.11908636894077063,
"reward_std": 0.6466177105903625,
"rewards/cosine_scaled_reward": -0.22620984725654125,
"rewards/format_reward": 0.3333333367481828,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 2818.986114501953,
"epoch": 0.022279348757497857,
"grad_norm": 0.18577341735363007,
"kl": 4.303455352783203e-05,
"learning_rate": 2.6e-07,
"loss": 0.0007,
"reward": 0.3697042800486088,
"reward_std": 0.7059066146612167,
"rewards/cosine_scaled_reward": -0.03042563726194203,
"rewards/format_reward": 0.4305555559694767,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 2905.3333740234375,
"epoch": 0.023993144815766924,
"grad_norm": 0.226650208234787,
"kl": 3.2275915145874023e-05,
"learning_rate": 2.8e-07,
"loss": 0.0212,
"reward": 0.04198750853538513,
"reward_std": 0.5741659551858902,
"rewards/cosine_scaled_reward": -0.14567292109131813,
"rewards/format_reward": 0.33333333395421505,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 3468.2222290039062,
"epoch": 0.02570694087403599,
"grad_norm": 0.1521635353565216,
"kl": 4.279613494873047e-05,
"learning_rate": 3e-07,
"loss": 0.0233,
"reward": -0.17704490013420582,
"reward_std": 0.6536840051412582,
"rewards/cosine_scaled_reward": -0.1996335554867983,
"rewards/format_reward": 0.22222222574055195,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 2405.263916015625,
"epoch": 0.027420736932305057,
"grad_norm": 0.23728908598423004,
"kl": 2.495013177394867e-05,
"learning_rate": 3.2e-07,
"loss": 0.0632,
"reward": 0.7499620914459229,
"reward_std": 0.9962631165981293,
"rewards/cosine_scaled_reward": 0.07636993401683867,
"rewards/format_reward": 0.5972222238779068,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 2764.875030517578,
"epoch": 0.02913453299057412,
"grad_norm": 0.21387562155723572,
"kl": 2.6166439056396484e-05,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.0416,
"reward": 0.27334376238286495,
"reward_std": 0.4753483533859253,
"rewards/cosine_scaled_reward": -0.05082811089232564,
"rewards/format_reward": 0.3750000111758709,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 3252.486083984375,
"epoch": 0.030848329048843187,
"grad_norm": 0.209347203373909,
"kl": 4.25875186920166e-05,
"learning_rate": 3.6e-07,
"loss": 0.0587,
"reward": -0.18576696328818798,
"reward_std": 0.5022815316915512,
"rewards/cosine_scaled_reward": -0.19010569993406534,
"rewards/format_reward": 0.1944444514811039,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 3157.4166870117188,
"epoch": 0.032562125107112254,
"grad_norm": 0.22900572419166565,
"kl": 3.084540367126465e-05,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0687,
"reward": 0.03116392099764198,
"reward_std": 0.7267041057348251,
"rewards/cosine_scaled_reward": -0.14414026169106364,
"rewards/format_reward": 0.3194444486871362,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 3228.5972290039062,
"epoch": 0.03427592116538132,
"grad_norm": 0.24043872952461243,
"kl": 2.6807188987731934e-05,
"learning_rate": 4e-07,
"loss": 0.1293,
"reward": -0.1261596381664276,
"reward_std": 0.7229140102863312,
"rewards/cosine_scaled_reward": -0.20196872018277645,
"rewards/format_reward": 0.2777777835726738,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 2856.6805419921875,
"epoch": 0.03598971722365039,
"grad_norm": 0.19779175519943237,
"kl": 3.987550735473633e-05,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0069,
"reward": 0.11652377434074879,
"reward_std": 0.8210525661706924,
"rewards/cosine_scaled_reward": -0.12229366600513458,
"rewards/format_reward": 0.3611111165955663,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 3298.3472290039062,
"epoch": 0.037703513281919454,
"grad_norm": 0.13437196612358093,
"kl": 2.828240394592285e-05,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0123,
"reward": 0.1601133793592453,
"reward_std": 0.6881751976907253,
"rewards/cosine_scaled_reward": -0.06577664241194725,
"rewards/format_reward": 0.2916666753590107,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 3107.4583129882812,
"epoch": 0.03941730934018852,
"grad_norm": 0.1506253182888031,
"kl": 2.2932887077331543e-05,
"learning_rate": 4.6e-07,
"loss": 0.0149,
"reward": -0.13085854798555374,
"reward_std": 0.5464130863547325,
"rewards/cosine_scaled_reward": -0.20431815274059772,
"rewards/format_reward": 0.2777777807787061,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 2710.6806030273438,
"epoch": 0.04113110539845758,
"grad_norm": 0.24692188203334808,
"kl": 2.8967857360839844e-05,
"learning_rate": 4.8e-07,
"loss": 0.1012,
"reward": 0.24628422083333135,
"reward_std": 0.4773574620485306,
"rewards/cosine_scaled_reward": -0.057413444737903774,
"rewards/format_reward": 0.3611111268401146,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 2784.7361450195312,
"epoch": 0.04284490145672665,
"grad_norm": 0.25797340273857117,
"kl": 2.6673078536987305e-05,
"learning_rate": 5e-07,
"loss": 0.106,
"reward": 0.46540534496307373,
"reward_std": 0.8211657330393791,
"rewards/cosine_scaled_reward": -0.01729731634259224,
"rewards/format_reward": 0.5000000037252903,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 3136.52783203125,
"epoch": 0.044558697514995714,
"grad_norm": 0.14968131482601166,
"kl": 3.291666507720947e-05,
"learning_rate": 5.2e-07,
"loss": 0.0512,
"reward": -0.09118526801466942,
"reward_std": 0.5860454589128494,
"rewards/cosine_scaled_reward": -0.21225931122899055,
"rewards/format_reward": 0.33333334140479565,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 3040.0000610351562,
"epoch": 0.04627249357326478,
"grad_norm": 0.17181935906410217,
"kl": 1.5079975128173828e-05,
"learning_rate": 5.4e-07,
"loss": 0.0738,
"reward": 0.34727448783814907,
"reward_std": 0.6153330877423286,
"rewards/cosine_scaled_reward": -0.027751651592552662,
"rewards/format_reward": 0.4027777947485447,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 2343.1111755371094,
"epoch": 0.04798628963153385,
"grad_norm": 0.2077193260192871,
"kl": 2.5130808353424072e-05,
"learning_rate": 5.6e-07,
"loss": 0.0598,
"reward": 0.6073902919888496,
"reward_std": 0.6849471032619476,
"rewards/cosine_scaled_reward": 0.018972909078001976,
"rewards/format_reward": 0.5694444477558136,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 3073.7222290039062,
"epoch": 0.049700085689802914,
"grad_norm": 0.21480253338813782,
"kl": 2.290681004524231e-05,
"learning_rate": 5.8e-07,
"loss": 0.0747,
"reward": 0.17731062695384026,
"reward_std": 0.8807300254702568,
"rewards/cosine_scaled_reward": -0.07801135815680027,
"rewards/format_reward": 0.3333333320915699,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 2768.02783203125,
"epoch": 0.05141388174807198,
"grad_norm": 0.25759172439575195,
"kl": 2.993270754814148e-05,
"learning_rate": 6e-07,
"loss": 0.0674,
"reward": 0.5063075462821871,
"reward_std": 0.771463930606842,
"rewards/cosine_scaled_reward": -0.010735094547271729,
"rewards/format_reward": 0.5277777910232544,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 2695.6944580078125,
"epoch": 0.05312767780634105,
"grad_norm": 0.2701717019081116,
"kl": 1.3127923011779785e-05,
"learning_rate": 6.2e-07,
"loss": 0.0971,
"reward": 0.2706103939563036,
"reward_std": 0.49449611082673073,
"rewards/cosine_scaled_reward": -0.045250357885379344,
"rewards/format_reward": 0.361111119389534,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 3051.52783203125,
"epoch": 0.054841473864610114,
"grad_norm": 0.17947925627231598,
"kl": 2.6337802410125732e-05,
"learning_rate": 6.4e-07,
"loss": 0.057,
"reward": 0.45089754834771156,
"reward_std": 1.1203400194644928,
"rewards/cosine_scaled_reward": -0.02455122536048293,
"rewards/format_reward": 0.5000000074505806,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 2306.8750610351562,
"epoch": 0.056555269922879174,
"grad_norm": 0.21536274254322052,
"kl": 5.3569674491882324e-05,
"learning_rate": 6.6e-07,
"loss": 0.0764,
"reward": 0.8166992478072643,
"reward_std": 0.8387185409665108,
"rewards/cosine_scaled_reward": 0.12362739443778992,
"rewards/format_reward": 0.5694444626569748,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 2698.7083740234375,
"epoch": 0.05826906598114824,
"grad_norm": 0.29884466528892517,
"kl": 0.00017189979553222656,
"learning_rate": 6.800000000000001e-07,
"loss": 0.1617,
"reward": 0.057983118342235684,
"reward_std": 0.7621737122535706,
"rewards/cosine_scaled_reward": -0.1585084507241845,
"rewards/format_reward": 0.3750000074505806,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 3312.3055419921875,
"epoch": 0.05998286203941731,
"grad_norm": 0.1554093211889267,
"kl": 9.316205978393555e-05,
"learning_rate": 7e-07,
"loss": 0.0273,
"reward": -0.2900172360241413,
"reward_std": 0.5383428931236267,
"rewards/cosine_scaled_reward": -0.2700086124241352,
"rewards/format_reward": 0.2500000074505806,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 2631.8055725097656,
"epoch": 0.061696658097686374,
"grad_norm": 0.19274435937404633,
"kl": 0.0002084970474243164,
"learning_rate": 7.2e-07,
"loss": 0.0306,
"reward": 0.006275304593145847,
"reward_std": 0.46724043786525726,
"rewards/cosine_scaled_reward": -0.18436234444379807,
"rewards/format_reward": 0.37500000931322575,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 3124.5277709960938,
"epoch": 0.06341045415595545,
"grad_norm": 0.15709905326366425,
"kl": 7.59810209274292e-05,
"learning_rate": 7.4e-07,
"loss": 0.0561,
"reward": -0.008991474285721779,
"reward_std": 0.5808551460504532,
"rewards/cosine_scaled_reward": -0.1294957408681512,
"rewards/format_reward": 0.25000000558793545,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 3045.90283203125,
"epoch": 0.06512425021422451,
"grad_norm": 0.2423790842294693,
"kl": 0.00022971630096435547,
"learning_rate": 7.599999999999999e-07,
"loss": 0.1263,
"reward": 0.1536001469939947,
"reward_std": 0.7093052342534065,
"rewards/cosine_scaled_reward": -0.07597769796848297,
"rewards/format_reward": 0.305555559694767,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 3150.0833740234375,
"epoch": 0.06683804627249357,
"grad_norm": 0.13335144519805908,
"kl": 0.0003066062927246094,
"learning_rate": 7.799999999999999e-07,
"loss": 0.0187,
"reward": -0.01171512296423316,
"reward_std": 0.48150157928466797,
"rewards/cosine_scaled_reward": -0.1586353350430727,
"rewards/format_reward": 0.3055555559694767,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 2782.27783203125,
"epoch": 0.06855184233076264,
"grad_norm": 0.1773526668548584,
"kl": 0.0007457435131072998,
"learning_rate": 8e-07,
"loss": 0.0236,
"reward": 0.19545890390872955,
"reward_std": 0.5221360512077808,
"rewards/cosine_scaled_reward": -0.08282610075548291,
"rewards/format_reward": 0.361111112870276,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 2729.9722290039062,
"epoch": 0.0702656383890317,
"grad_norm": 0.2603820860385895,
"kl": 0.0002143383026123047,
"learning_rate": 8.199999999999999e-07,
"loss": 0.1308,
"reward": 0.5641986541450024,
"reward_std": 0.7014989629387856,
"rewards/cosine_scaled_reward": 0.05293265450745821,
"rewards/format_reward": 0.4583333432674408,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 2622.0555419921875,
"epoch": 0.07197943444730077,
"grad_norm": 0.19547662138938904,
"kl": 0.0008759498596191406,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0788,
"reward": 0.3987229084596038,
"reward_std": 0.6764711476862431,
"rewards/cosine_scaled_reward": -0.05063853319734335,
"rewards/format_reward": 0.5,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 2757.3611450195312,
"epoch": 0.07369323050556983,
"grad_norm": 0.133390411734581,
"kl": 0.00021369755268096924,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0354,
"reward": 0.5515957027673721,
"reward_std": 0.6986619718372822,
"rewards/cosine_scaled_reward": 0.04663117043673992,
"rewards/format_reward": 0.4583333283662796,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 2743.763916015625,
"epoch": 0.07540702656383891,
"grad_norm": 0.17805209755897522,
"kl": 0.0008558034896850586,
"learning_rate": 8.799999999999999e-07,
"loss": 0.1039,
"reward": 0.06273656419944018,
"reward_std": 0.7254525497555733,
"rewards/cosine_scaled_reward": -0.18390950025059283,
"rewards/format_reward": 0.4305555671453476,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 3018.1805419921875,
"epoch": 0.07712082262210797,
"grad_norm": 0.23340974748134613,
"kl": 0.0007225275039672852,
"learning_rate": 9e-07,
"loss": 0.047,
"reward": 0.12753370963037014,
"reward_std": 0.5756559893488884,
"rewards/cosine_scaled_reward": -0.09595536440610886,
"rewards/format_reward": 0.31944444589316845,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 2453.77783203125,
"epoch": 0.07883461868037704,
"grad_norm": 0.25216469168663025,
"kl": 0.0028772354125976562,
"learning_rate": 9.2e-07,
"loss": 0.0976,
"reward": 0.4031712617725134,
"reward_std": 0.5689256861805916,
"rewards/cosine_scaled_reward": -0.05535881780087948,
"rewards/format_reward": 0.5138888955116272,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 3180.0972290039062,
"epoch": 0.0805484147386461,
"grad_norm": 0.17415259778499603,
"kl": 0.0014755725860595703,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0718,
"reward": -0.026270870119333267,
"reward_std": 0.641656719148159,
"rewards/cosine_scaled_reward": -0.15202434547245502,
"rewards/format_reward": 0.27777778171002865,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 2680.7639770507812,
"epoch": 0.08226221079691516,
"grad_norm": 0.20438066124916077,
"kl": 0.001586318016052246,
"learning_rate": 9.6e-07,
"loss": 0.0807,
"reward": 0.6057721227407455,
"reward_std": 0.7416700124740601,
"rewards/cosine_scaled_reward": 0.05288607440888882,
"rewards/format_reward": 0.5000000074505806,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 2346.055633544922,
"epoch": 0.08397600685518423,
"grad_norm": 0.35583311319351196,
"kl": 0.018939971923828125,
"learning_rate": 9.8e-07,
"loss": 0.1404,
"reward": 0.7048290632665157,
"reward_std": 0.6792610064148903,
"rewards/cosine_scaled_reward": 0.06074785813689232,
"rewards/format_reward": 0.5833333432674408,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 2833.5833740234375,
"epoch": 0.0856898029134533,
"grad_norm": 0.2027311623096466,
"kl": 0.0032949447631835938,
"learning_rate": 1e-06,
"loss": 0.0416,
"reward": 0.07023209612816572,
"reward_std": 0.6861855462193489,
"rewards/cosine_scaled_reward": -0.16627284698188305,
"rewards/format_reward": 0.4027777872979641,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 3051.2777709960938,
"epoch": 0.08740359897172237,
"grad_norm": 0.16748514771461487,
"kl": 0.001615285873413086,
"learning_rate": 9.999890338174275e-07,
"loss": 0.069,
"reward": 0.1449947228829842,
"reward_std": 0.7090619504451752,
"rewards/cosine_scaled_reward": -0.10111376643180847,
"rewards/format_reward": 0.34722223225980997,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 3181.9583740234375,
"epoch": 0.08911739502999143,
"grad_norm": 0.16281543672084808,
"kl": 0.0019249916076660156,
"learning_rate": 9.999561358041868e-07,
"loss": 0.0803,
"reward": -0.03632636368274689,
"reward_std": 0.5028033927083015,
"rewards/cosine_scaled_reward": -0.12927428726106882,
"rewards/format_reward": 0.22222222946584225,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 3303.1805419921875,
"epoch": 0.0908311910882605,
"grad_norm": 0.14455804228782654,
"kl": 0.0005393028259277344,
"learning_rate": 9.999013075636804e-07,
"loss": 0.0318,
"reward": -0.10013403557240963,
"reward_std": 0.4606664590537548,
"rewards/cosine_scaled_reward": -0.17506700940430164,
"rewards/format_reward": 0.2500000046566129,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 3313.1944580078125,
"epoch": 0.09254498714652956,
"grad_norm": 0.13308647274971008,
"kl": 0.0011081695556640625,
"learning_rate": 9.998245517681593e-07,
"loss": 0.0055,
"reward": 0.10159287042915821,
"reward_std": 0.6204735822975636,
"rewards/cosine_scaled_reward": -0.060314678063150495,
"rewards/format_reward": 0.2222222276031971,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 3481.15283203125,
"epoch": 0.09425878320479864,
"grad_norm": 0.13649359345436096,
"kl": 0.0008268356323242188,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0328,
"reward": -0.12874329963233322,
"reward_std": 0.5648706145584583,
"rewards/cosine_scaled_reward": -0.1754827625118196,
"rewards/format_reward": 0.22222222480922937,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 3232.7222900390625,
"epoch": 0.0959725792630677,
"grad_norm": 0.19132941961288452,
"kl": 0.0013275146484375,
"learning_rate": 9.996052735444862e-07,
"loss": 0.1077,
"reward": -0.17376804118975997,
"reward_std": 0.749246733263135,
"rewards/cosine_scaled_reward": -0.20493957586586475,
"rewards/format_reward": 0.2361111156642437,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 3279.4584350585938,
"epoch": 0.09768637532133675,
"grad_norm": 0.15241067111492157,
"kl": 0.000919342041015625,
"learning_rate": 9.994627618036452e-07,
"loss": 0.0282,
"reward": 0.31643399875611067,
"reward_std": 0.6422489807009697,
"rewards/cosine_scaled_reward": 0.005439223721623421,
"rewards/format_reward": 0.30555556155741215,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 3047.2916870117188,
"epoch": 0.09940017137960583,
"grad_norm": 0.22829630970954895,
"kl": 0.0054931640625,
"learning_rate": 9.992983438818915e-07,
"loss": 0.0909,
"reward": -0.17570834839716554,
"reward_std": 0.4780988022685051,
"rewards/cosine_scaled_reward": -0.23368750512599945,
"rewards/format_reward": 0.2916666679084301,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 2918.5555419921875,
"epoch": 0.10111396743787489,
"grad_norm": 0.17409604787826538,
"kl": 0.010187149047851562,
"learning_rate": 9.991120277927223e-07,
"loss": -0.0001,
"reward": 0.6838416904211044,
"reward_std": 0.7215724363923073,
"rewards/cosine_scaled_reward": 0.1196986111899605,
"rewards/format_reward": 0.4444444477558136,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 3240.90283203125,
"epoch": 0.10282776349614396,
"grad_norm": 0.21398130059242249,
"kl": 0.0015239715576171875,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0841,
"reward": -0.013310029171407223,
"reward_std": 0.6487029865384102,
"rewards/cosine_scaled_reward": -0.13859945815056562,
"rewards/format_reward": 0.2638888992369175,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 3323.3889770507812,
"epoch": 0.10454155955441302,
"grad_norm": 0.25011396408081055,
"kl": 0.0015153884887695312,
"learning_rate": 9.98673738502114e-07,
"loss": 0.0677,
"reward": -0.37927111238241196,
"reward_std": 0.43354837596416473,
"rewards/cosine_scaled_reward": -0.2799133397638798,
"rewards/format_reward": 0.18055556155741215,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 2683.6250610351562,
"epoch": 0.1062553556126821,
"grad_norm": 0.17982754111289978,
"kl": 0.00201416015625,
"learning_rate": 9.98421786662277e-07,
"loss": 0.0008,
"reward": 0.40144167095422745,
"reward_std": 0.5826155617833138,
"rewards/cosine_scaled_reward": -0.02844582637771964,
"rewards/format_reward": 0.4583333432674408,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 3426.2361450195312,
"epoch": 0.10796915167095116,
"grad_norm": 0.182517409324646,
"kl": 0.00151824951171875,
"learning_rate": 9.981479793771866e-07,
"loss": 0.0294,
"reward": -0.09498679265379906,
"reward_std": 0.7008046992123127,
"rewards/cosine_scaled_reward": -0.13777116686105728,
"rewards/format_reward": 0.18055555690079927,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 2847.5972290039062,
"epoch": 0.10968294772922023,
"grad_norm": 0.31501731276512146,
"kl": 0.0022530555725097656,
"learning_rate": 9.97852329991824e-07,
"loss": 0.1548,
"reward": 0.009381972253322601,
"reward_std": 0.36741600558161736,
"rewards/cosine_scaled_reward": -0.16197567898780107,
"rewards/format_reward": 0.3333333432674408,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 3167.236083984375,
"epoch": 0.11139674378748929,
"grad_norm": 0.4229466915130615,
"kl": 0.0364532470703125,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0659,
"reward": -0.029949136078357697,
"reward_std": 0.5782980695366859,
"rewards/cosine_scaled_reward": -0.13997458899393678,
"rewards/format_reward": 0.25000000838190317,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 2846.8334350585938,
"epoch": 0.11311053984575835,
"grad_norm": 0.1699674278497696,
"kl": 0.0013065338134765625,
"learning_rate": 9.971955636222684e-07,
"loss": 0.0667,
"reward": 0.2395001295953989,
"reward_std": 0.3902180567383766,
"rewards/cosine_scaled_reward": -0.053861052729189396,
"rewards/format_reward": 0.3472222313284874,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 3227.4445190429688,
"epoch": 0.11482433590402742,
"grad_norm": 0.15845970809459686,
"kl": 0.0022869110107421875,
"learning_rate": 9.968344786479415e-07,
"loss": 0.0416,
"reward": 0.06229268200695515,
"reward_std": 0.5577914118766785,
"rewards/cosine_scaled_reward": -0.1285758875310421,
"rewards/format_reward": 0.3194444552063942,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 2906.3472290039062,
"epoch": 0.11653813196229648,
"grad_norm": 0.17754817008972168,
"kl": 0.0027103424072265625,
"learning_rate": 9.964516155915151e-07,
"loss": -0.0006,
"reward": 0.000796053558588028,
"reward_std": 0.5399865545332432,
"rewards/cosine_scaled_reward": -0.15932418778538704,
"rewards/format_reward": 0.3194444449618459,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 3079.4583740234375,
"epoch": 0.11825192802056556,
"grad_norm": 0.16689395904541016,
"kl": 0.00244140625,
"learning_rate": 9.960469931131936e-07,
"loss": 0.0012,
"reward": 0.40755608677864075,
"reward_std": 0.592438168823719,
"rewards/cosine_scaled_reward": 0.009333595633506775,
"rewards/format_reward": 0.3888889029622078,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 2852.6388549804688,
"epoch": 0.11996572407883462,
"grad_norm": 0.14442802965641022,
"kl": 0.0042266845703125,
"learning_rate": 9.956206309337066e-07,
"loss": 0.023,
"reward": 0.44340329244732857,
"reward_std": 0.43735441006720066,
"rewards/cosine_scaled_reward": 0.00642385333776474,
"rewards/format_reward": 0.4305555559694767,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 3119.8195190429688,
"epoch": 0.12167952013710369,
"grad_norm": 0.1541452407836914,
"kl": 0.003391265869140625,
"learning_rate": 9.951725498333448e-07,
"loss": 0.0155,
"reward": 0.49696624279022217,
"reward_std": 0.9607885628938675,
"rewards/cosine_scaled_reward": 0.07487202249467373,
"rewards/format_reward": 0.3472222238779068,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 2584.513885498047,
"epoch": 0.12339331619537275,
"grad_norm": 0.16282722353935242,
"kl": 0.007266998291015625,
"learning_rate": 9.947027716509488e-07,
"loss": 0.0302,
"reward": 0.4334046132862568,
"reward_std": 0.42579157277941704,
"rewards/cosine_scaled_reward": -0.04024216299876571,
"rewards/format_reward": 0.5138888955116272,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 3057.8611450195312,
"epoch": 0.12510711225364182,
"grad_norm": 0.19297440350055695,
"kl": 0.004047393798828125,
"learning_rate": 9.942113192828444e-07,
"loss": -0.0268,
"reward": 0.2504111938178539,
"reward_std": 0.6320941485464573,
"rewards/cosine_scaled_reward": -0.05534995626658201,
"rewards/format_reward": 0.3611111268401146,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 2350.5000610351562,
"epoch": 0.1268209083119109,
"grad_norm": 0.25634145736694336,
"kl": 0.004367828369140625,
"learning_rate": 9.93698216681727e-07,
"loss": 0.1227,
"reward": 0.7754522487521172,
"reward_std": 0.8430259823799133,
"rewards/cosine_scaled_reward": 0.07522611878812313,
"rewards/format_reward": 0.625,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 3078.013916015625,
"epoch": 0.12853470437017994,
"grad_norm": 0.15847010910511017,
"kl": 0.004947662353515625,
"learning_rate": 9.931634888554935e-07,
"loss": 0.0447,
"reward": 0.27387892454862595,
"reward_std": 0.5773990303277969,
"rewards/cosine_scaled_reward": -0.03667165897786617,
"rewards/format_reward": 0.3472222276031971,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 2247.8194427490234,
"epoch": 0.13024850042844902,
"grad_norm": 0.28341227769851685,
"kl": 0.014591217041015625,
"learning_rate": 9.926071618660237e-07,
"loss": 0.0403,
"reward": 0.7070811688899994,
"reward_std": 0.7020798400044441,
"rewards/cosine_scaled_reward": 0.06881837674882263,
"rewards/format_reward": 0.5694444440305233,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 3159.75,
"epoch": 0.1319622964867181,
"grad_norm": 0.13436463475227356,
"kl": 0.0049896240234375,
"learning_rate": 9.9202926282791e-07,
"loss": 0.023,
"reward": 0.35647532157599926,
"reward_std": 0.7988947406411171,
"rewards/cosine_scaled_reward": 0.011570994276553392,
"rewards/format_reward": 0.33333334513008595,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 3150.0139770507812,
"epoch": 0.13367609254498714,
"grad_norm": 0.176174134016037,
"kl": 0.004405975341796875,
"learning_rate": 9.91429819907136e-07,
"loss": 0.0747,
"reward": -0.14098340552300215,
"reward_std": 0.5686891078948975,
"rewards/cosine_scaled_reward": -0.18854726571589708,
"rewards/format_reward": 0.23611112032085657,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 2571.0694580078125,
"epoch": 0.1353898886032562,
"grad_norm": 0.1847277730703354,
"kl": 0.008609771728515625,
"learning_rate": 9.908088623197048e-07,
"loss": -0.0106,
"reward": 0.3892364539206028,
"reward_std": 0.7569635957479477,
"rewards/cosine_scaled_reward": -0.06927067344076931,
"rewards/format_reward": 0.5277777835726738,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 3138.5555419921875,
"epoch": 0.13710368466152528,
"grad_norm": 0.21640530228614807,
"kl": 0.005603790283203125,
"learning_rate": 9.901664203302124e-07,
"loss": 0.1324,
"reward": -0.1231984393671155,
"reward_std": 0.778315082192421,
"rewards/cosine_scaled_reward": -0.2074325531721115,
"rewards/format_reward": 0.2916666716337204,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 3191.916748046875,
"epoch": 0.13881748071979436,
"grad_norm": 0.1524638533592224,
"kl": 0.014739990234375,
"learning_rate": 9.895025252503755e-07,
"loss": 0.0255,
"reward": -0.14118600636720657,
"reward_std": 0.3157992772758007,
"rewards/cosine_scaled_reward": -0.17475967481732368,
"rewards/format_reward": 0.20833334047347307,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 2925.013916015625,
"epoch": 0.1405312767780634,
"grad_norm": 0.21411970257759094,
"kl": 0.00635528564453125,
"learning_rate": 9.888172094375033e-07,
"loss": 0.0735,
"reward": -0.06351233087480068,
"reward_std": 0.5284828841686249,
"rewards/cosine_scaled_reward": -0.18453393690288067,
"rewards/format_reward": 0.30555555783212185,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 2801.2638549804688,
"epoch": 0.14224507283633248,
"grad_norm": 0.18929333984851837,
"kl": 0.0023746490478515625,
"learning_rate": 9.881105062929221e-07,
"loss": 0.0434,
"reward": 0.5797148197889328,
"reward_std": 0.8048742488026619,
"rewards/cosine_scaled_reward": 0.03985740663483739,
"rewards/format_reward": 0.5000000074505806,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 2881.999969482422,
"epoch": 0.14395886889460155,
"grad_norm": 0.16995370388031006,
"kl": 0.00823211669921875,
"learning_rate": 9.873824502603459e-07,
"loss": 0.0417,
"reward": 0.1579499295912683,
"reward_std": 0.6737323254346848,
"rewards/cosine_scaled_reward": -0.12935838662087917,
"rewards/format_reward": 0.4166666716337204,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 2817.888916015625,
"epoch": 0.1456726649528706,
"grad_norm": 0.17163607478141785,
"kl": 0.004947662353515625,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0843,
"reward": 0.14664312824606895,
"reward_std": 0.6406831294298172,
"rewards/cosine_scaled_reward": -0.10028954246081412,
"rewards/format_reward": 0.3472222248092294,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 2666.0972595214844,
"epoch": 0.14738646101113967,
"grad_norm": 0.23853930830955505,
"kl": 0.0075836181640625,
"learning_rate": 9.85862422507884e-07,
"loss": 0.184,
"reward": 0.15615743398666382,
"reward_std": 0.6508499458432198,
"rewards/cosine_scaled_reward": -0.14414352551102638,
"rewards/format_reward": 0.4444444440305233,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 3479.0000610351562,
"epoch": 0.14910025706940874,
"grad_norm": 0.13812494277954102,
"kl": 0.003131866455078125,
"learning_rate": 9.850705248720068e-07,
"loss": 0.0273,
"reward": -0.3952238578349352,
"reward_std": 0.4180161654949188,
"rewards/cosine_scaled_reward": -0.24622303992509842,
"rewards/format_reward": 0.0972222238779068,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 3411.0556030273438,
"epoch": 0.15081405312767782,
"grad_norm": 0.14131076633930206,
"kl": 0.00627899169921875,
"learning_rate": 9.8425742251254e-07,
"loss": 0.0242,
"reward": -0.18497492372989655,
"reward_std": 0.3112034276127815,
"rewards/cosine_scaled_reward": -0.15498745813965797,
"rewards/format_reward": 0.12500000186264515,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 2821.4305725097656,
"epoch": 0.15252784918594686,
"grad_norm": 0.23381026089191437,
"kl": 0.00811767578125,
"learning_rate": 9.83423155058946e-07,
"loss": 0.1044,
"reward": -0.15477947797626257,
"reward_std": 0.3880116418004036,
"rewards/cosine_scaled_reward": -0.257945304736495,
"rewards/format_reward": 0.3611111082136631,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 2741.013885498047,
"epoch": 0.15424164524421594,
"grad_norm": 0.3015286326408386,
"kl": 0.005706787109375,
"learning_rate": 9.825677631722435e-07,
"loss": 0.146,
"reward": 0.32925539929419756,
"reward_std": 0.5706463847309351,
"rewards/cosine_scaled_reward": -0.01592785632237792,
"rewards/format_reward": 0.3611111082136631,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 3004.1805419921875,
"epoch": 0.155955441302485,
"grad_norm": 0.2821044325828552,
"kl": 0.013214111328125,
"learning_rate": 9.816912885430258e-07,
"loss": 0.1457,
"reward": -0.23375913500785828,
"reward_std": 0.6937631815671921,
"rewards/cosine_scaled_reward": -0.25576844066381454,
"rewards/format_reward": 0.2777777872979641,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 2889.15283203125,
"epoch": 0.15766923736075408,
"grad_norm": 0.19952206313610077,
"kl": 0.01056671142578125,
"learning_rate": 9.807937738894303e-07,
"loss": -0.0327,
"reward": 0.10378427803516388,
"reward_std": 0.6779353246092796,
"rewards/cosine_scaled_reward": -0.13560786750167608,
"rewards/format_reward": 0.37500000186264515,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 2751.8055725097656,
"epoch": 0.15938303341902313,
"grad_norm": 0.18763676285743713,
"kl": 0.00572967529296875,
"learning_rate": 9.798752629550546e-07,
"loss": 0.0734,
"reward": 0.5665245279669762,
"reward_std": 0.7802244201302528,
"rewards/cosine_scaled_reward": 0.012428927002474666,
"rewards/format_reward": 0.5416666604578495,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 2943.9722290039062,
"epoch": 0.1610968294772922,
"grad_norm": 0.17491032183170319,
"kl": 0.005878448486328125,
"learning_rate": 9.78935800506826e-07,
"loss": 0.0466,
"reward": 0.36631612479686737,
"reward_std": 0.5951685793697834,
"rewards/cosine_scaled_reward": -0.011286390479654074,
"rewards/format_reward": 0.38888889737427235,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 2803.4862060546875,
"epoch": 0.16281062553556128,
"grad_norm": 0.2179604023694992,
"kl": 0.0073699951171875,
"learning_rate": 9.779754323328192e-07,
"loss": 0.1111,
"reward": 0.20993795804679394,
"reward_std": 0.5628918968141079,
"rewards/cosine_scaled_reward": -0.08253101143054664,
"rewards/format_reward": 0.3750000074505806,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 3062.25,
"epoch": 0.16452442159383032,
"grad_norm": 0.1575266271829605,
"kl": 0.005573272705078125,
"learning_rate": 9.769942052400235e-07,
"loss": 0.0192,
"reward": 0.5143513884395361,
"reward_std": 0.9291824996471405,
"rewards/cosine_scaled_reward": 0.021064545959234238,
"rewards/format_reward": 0.4722222350537777,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 3426.0555419921875,
"epoch": 0.1662382176520994,
"grad_norm": 0.152592271566391,
"kl": 0.0096588134765625,
"learning_rate": 9.759921670520634e-07,
"loss": 0.0595,
"reward": -0.316804476082325,
"reward_std": 0.5735431797802448,
"rewards/cosine_scaled_reward": -0.2209022343158722,
"rewards/format_reward": 0.12500000186264515,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 2718.1806030273438,
"epoch": 0.16795201371036847,
"grad_norm": 0.19641156494617462,
"kl": 0.00783538818359375,
"learning_rate": 9.749693666068663e-07,
"loss": 0.0871,
"reward": 0.34513735864311457,
"reward_std": 0.7377712428569794,
"rewards/cosine_scaled_reward": -0.09826467745006084,
"rewards/format_reward": 0.541666679084301,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 3183.8611450195312,
"epoch": 0.16966580976863754,
"grad_norm": 0.13990604877471924,
"kl": 0.00958251953125,
"learning_rate": 9.739258537542835e-07,
"loss": 0.0408,
"reward": 0.10082972631789744,
"reward_std": 0.4568670317530632,
"rewards/cosine_scaled_reward": -0.09541848301887512,
"rewards/format_reward": 0.2916666669771075,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 2988.263916015625,
"epoch": 0.1713796058269066,
"grad_norm": 0.1574762910604477,
"kl": 0.01104736328125,
"learning_rate": 9.728616793536587e-07,
"loss": 0.02,
"reward": 0.05844925343990326,
"reward_std": 0.4471042864024639,
"rewards/cosine_scaled_reward": -0.13744205003604293,
"rewards/format_reward": 0.3333333358168602,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 2955.6388549804688,
"epoch": 0.17309340188517566,
"grad_norm": 0.15706215798854828,
"kl": 0.006420135498046875,
"learning_rate": 9.717768952713511e-07,
"loss": 0.0337,
"reward": 0.032026506960392,
"reward_std": 0.35832666605710983,
"rewards/cosine_scaled_reward": -0.1298200935125351,
"rewards/format_reward": 0.2916666753590107,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 2912.0972290039062,
"epoch": 0.17480719794344474,
"grad_norm": 0.1945251077413559,
"kl": 0.0088043212890625,
"learning_rate": 9.706715543782064e-07,
"loss": 0.072,
"reward": 0.22132272832095623,
"reward_std": 0.4281787723302841,
"rewards/cosine_scaled_reward": -0.09072753041982651,
"rewards/format_reward": 0.40277779288589954,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 2903.9444580078125,
"epoch": 0.17652099400171378,
"grad_norm": 0.1475774347782135,
"kl": 0.00759124755859375,
"learning_rate": 9.695457105469804e-07,
"loss": 0.0409,
"reward": 0.16637181863188744,
"reward_std": 0.6222990080714226,
"rewards/cosine_scaled_reward": -0.10431409068405628,
"rewards/format_reward": 0.3750000149011612,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 3320.1805419921875,
"epoch": 0.17823479005998286,
"grad_norm": 0.16452452540397644,
"kl": 0.006443023681640625,
"learning_rate": 9.683994186497132e-07,
"loss": 0.073,
"reward": -0.04724724031984806,
"reward_std": 0.5820007584989071,
"rewards/cosine_scaled_reward": -0.13473473582416773,
"rewards/format_reward": 0.2222222276031971,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 2745.27783203125,
"epoch": 0.17994858611825193,
"grad_norm": 0.23044738173484802,
"kl": 0.0102996826171875,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0909,
"reward": 0.48719315230846405,
"reward_std": 0.9213617816567421,
"rewards/cosine_scaled_reward": -0.01334787905216217,
"rewards/format_reward": 0.5138888955116272,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 3009.0694580078125,
"epoch": 0.181662382176521,
"grad_norm": 0.25429767370224,
"kl": 0.0078125,
"learning_rate": 9.66045715125541e-07,
"loss": 0.127,
"reward": 0.27888505905866623,
"reward_std": 0.7037396281957626,
"rewards/cosine_scaled_reward": -0.048057474195957184,
"rewards/format_reward": 0.3750000009313226,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 3172.5694580078125,
"epoch": 0.18337617823479005,
"grad_norm": 0.17300733923912048,
"kl": 0.008148193359375,
"learning_rate": 9.648384182148252e-07,
"loss": 0.0446,
"reward": 0.21187454462051392,
"reward_std": 0.549411840736866,
"rewards/cosine_scaled_reward": -0.06767383548867656,
"rewards/format_reward": 0.34722222574055195,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 3347.52783203125,
"epoch": 0.18508997429305912,
"grad_norm": 0.17588993906974792,
"kl": 0.00628662109375,
"learning_rate": 9.636109026648554e-07,
"loss": 0.06,
"reward": -0.038673363626003265,
"reward_std": 0.728736087679863,
"rewards/cosine_scaled_reward": -0.15822557546198368,
"rewards/format_reward": 0.2777777835726738,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 2627.263946533203,
"epoch": 0.1868037703513282,
"grad_norm": 0.29850271344184875,
"kl": 0.01171875,
"learning_rate": 9.623632283030077e-07,
"loss": 0.0662,
"reward": 0.19531617127358913,
"reward_std": 0.4965377002954483,
"rewards/cosine_scaled_reward": -0.09678636118769646,
"rewards/format_reward": 0.38888889737427235,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 2958.625,
"epoch": 0.18851756640959727,
"grad_norm": 0.46270403265953064,
"kl": 0.0089263916015625,
"learning_rate": 9.610954559391704e-07,
"loss": 0.1711,
"reward": 0.08645874005742371,
"reward_std": 0.9684502333402634,
"rewards/cosine_scaled_reward": -0.1512150838971138,
"rewards/format_reward": 0.3888888955116272,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 2400.3472900390625,
"epoch": 0.19023136246786632,
"grad_norm": 0.18343479931354523,
"kl": 0.00701904296875,
"learning_rate": 9.598076473627796e-07,
"loss": 0.1083,
"reward": 0.22095186542719603,
"reward_std": 0.5088437423110008,
"rewards/cosine_scaled_reward": -0.15341296698898077,
"rewards/format_reward": 0.5277777835726738,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 2780.916748046875,
"epoch": 0.1919451585261354,
"grad_norm": 0.16234862804412842,
"kl": 0.007965087890625,
"learning_rate": 9.58499865339809e-07,
"loss": 0.0115,
"reward": 0.19807963073253632,
"reward_std": 0.5584643110632896,
"rewards/cosine_scaled_reward": -0.14401574060320854,
"rewards/format_reward": 0.48611112777143717,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 2751.7361450195312,
"epoch": 0.19365895458440446,
"grad_norm": 0.20913416147232056,
"kl": 0.00861358642578125,
"learning_rate": 9.571721736097088e-07,
"loss": 0.0851,
"reward": 0.7618176154792309,
"reward_std": 1.0328082591295242,
"rewards/cosine_scaled_reward": 0.11007547879125923,
"rewards/format_reward": 0.541666679084301,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 2235.9166564941406,
"epoch": 0.1953727506426735,
"grad_norm": 0.20926620066165924,
"kl": 0.008697509765625,
"learning_rate": 9.55824636882301e-07,
"loss": 0.0327,
"reward": 0.2064858078956604,
"reward_std": 0.4848344102501869,
"rewards/cosine_scaled_reward": -0.1675904355943203,
"rewards/format_reward": 0.5416666753590107,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 2317.6250610351562,
"epoch": 0.19708654670094258,
"grad_norm": 0.4515492916107178,
"kl": 0.0092010498046875,
"learning_rate": 9.54457320834625e-07,
"loss": 0.2531,
"reward": 0.45756053365767,
"reward_std": 0.7848574221134186,
"rewards/cosine_scaled_reward": -0.04899751394987106,
"rewards/format_reward": 0.5555555745959282,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 3182.0694580078125,
"epoch": 0.19880034275921166,
"grad_norm": 0.17537973821163177,
"kl": 0.0128631591796875,
"learning_rate": 9.530702921077358e-07,
"loss": 0.0165,
"reward": -0.06121325120329857,
"reward_std": 0.4434010796248913,
"rewards/cosine_scaled_reward": -0.1694955169223249,
"rewards/format_reward": 0.2777777798473835,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 3030.7083129882812,
"epoch": 0.20051413881748073,
"grad_norm": 0.18003451824188232,
"kl": 0.017120361328125,
"learning_rate": 9.516636183034564e-07,
"loss": 0.1071,
"reward": 0.42929551005363464,
"reward_std": 0.9132848009467125,
"rewards/cosine_scaled_reward": 0.006314422586001456,
"rewards/format_reward": 0.4166666651144624,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 2533.9443969726562,
"epoch": 0.20222793487574978,
"grad_norm": 0.2703484296798706,
"kl": 0.011077880859375,
"learning_rate": 9.502373679810839e-07,
"loss": 0.0946,
"reward": 0.4179135374724865,
"reward_std": 0.8737296983599663,
"rewards/cosine_scaled_reward": -0.04798768740147352,
"rewards/format_reward": 0.5138889029622078,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 2823.5000610351562,
"epoch": 0.20394173093401885,
"grad_norm": 0.19636695086956024,
"kl": 0.01312255859375,
"learning_rate": 9.487916106540465e-07,
"loss": 0.0303,
"reward": 0.31334975361824036,
"reward_std": 0.30826447159051895,
"rewards/cosine_scaled_reward": -0.058602908393368125,
"rewards/format_reward": 0.430555553175509,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 2826.5694580078125,
"epoch": 0.20565552699228792,
"grad_norm": 0.2075241059064865,
"kl": 0.016265869140625,
"learning_rate": 9.473264167865171e-07,
"loss": 0.094,
"reward": 0.4697803445160389,
"reward_std": 0.7031994387507439,
"rewards/cosine_scaled_reward": 0.005723495967686176,
"rewards/format_reward": 0.4583333358168602,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 2887.3056030273438,
"epoch": 0.207369323050557,
"grad_norm": 0.19230371713638306,
"kl": 0.0111083984375,
"learning_rate": 9.458418577899774e-07,
"loss": 0.086,
"reward": 0.3282506223767996,
"reward_std": 0.7738695293664932,
"rewards/cosine_scaled_reward": -0.05809690523892641,
"rewards/format_reward": 0.4444444589316845,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 3083.3611450195312,
"epoch": 0.20908311910882604,
"grad_norm": 0.17026208341121674,
"kl": 0.01568603515625,
"learning_rate": 9.443380060197385e-07,
"loss": 0.0301,
"reward": -0.03662687446922064,
"reward_std": 0.5345718339085579,
"rewards/cosine_scaled_reward": -0.19886899180710316,
"rewards/format_reward": 0.361111119389534,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 2682.6805725097656,
"epoch": 0.21079691516709512,
"grad_norm": 0.19728592038154602,
"kl": 0.012115478515625,
"learning_rate": 9.428149347714143e-07,
"loss": 0.0481,
"reward": 0.3675118573009968,
"reward_std": 1.058239296078682,
"rewards/cosine_scaled_reward": -0.052355190739035606,
"rewards/format_reward": 0.4722222248092294,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 3100.1666870117188,
"epoch": 0.2125107112253642,
"grad_norm": 0.19675055146217346,
"kl": 0.013641357421875,
"learning_rate": 9.412727182773486e-07,
"loss": 0.0775,
"reward": 0.28848724998533726,
"reward_std": 0.5403149202466011,
"rewards/cosine_scaled_reward": -0.04325637500733137,
"rewards/format_reward": 0.3750000149011612,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 2862.0277709960938,
"epoch": 0.21422450728363324,
"grad_norm": 0.1939004808664322,
"kl": 0.017242431640625,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0453,
"reward": 0.3707499373704195,
"reward_std": 0.7198375910520554,
"rewards/cosine_scaled_reward": -0.016013892367482185,
"rewards/format_reward": 0.4027777807787061,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 2753.486083984375,
"epoch": 0.2159383033419023,
"grad_norm": 0.25714027881622314,
"kl": 0.0194091796875,
"learning_rate": 9.381311511432658e-07,
"loss": 0.0648,
"reward": 0.3369361013174057,
"reward_std": 0.5913353934884071,
"rewards/cosine_scaled_reward": -0.08847637102007866,
"rewards/format_reward": 0.5138888889923692,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 2923.125,
"epoch": 0.21765209940017138,
"grad_norm": 0.2240990549325943,
"kl": 0.016571044921875,
"learning_rate": 9.36531953618799e-07,
"loss": 0.076,
"reward": -0.2184343640692532,
"reward_std": 0.5479928515851498,
"rewards/cosine_scaled_reward": -0.27588383853435516,
"rewards/format_reward": 0.3333333395421505,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 3276.6111450195312,
"epoch": 0.21936589545844046,
"grad_norm": 0.15262338519096375,
"kl": 0.0207061767578125,
"learning_rate": 9.34913917072228e-07,
"loss": -0.0001,
"reward": -0.12921499274671078,
"reward_std": 0.5691854059696198,
"rewards/cosine_scaled_reward": -0.1757186003960669,
"rewards/format_reward": 0.22222223225980997,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 2278.4305725097656,
"epoch": 0.2210796915167095,
"grad_norm": 0.3608929216861725,
"kl": 0.019287109375,
"learning_rate": 9.332771203643714e-07,
"loss": 0.0927,
"reward": 0.706303309649229,
"reward_std": 0.7875337153673172,
"rewards/cosine_scaled_reward": 0.04759608302265406,
"rewards/format_reward": 0.6111111119389534,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 1965.999984741211,
"epoch": 0.22279348757497858,
"grad_norm": 0.18217293918132782,
"kl": 0.018463134765625,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0064,
"reward": 1.0708431326784194,
"reward_std": 0.7828814685344696,
"rewards/cosine_scaled_reward": 0.17431045067496598,
"rewards/format_reward": 0.722222238779068,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 3232.7222290039062,
"epoch": 0.22450728363324765,
"grad_norm": 0.1822432279586792,
"kl": 0.0173797607421875,
"learning_rate": 9.299475664759068e-07,
"loss": 0.0286,
"reward": -0.31177592277526855,
"reward_std": 0.350917749106884,
"rewards/cosine_scaled_reward": -0.27394353225827217,
"rewards/format_reward": 0.23611112032085657,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 2963.8055419921875,
"epoch": 0.2262210796915167,
"grad_norm": 0.22750675678253174,
"kl": 0.016204833984375,
"learning_rate": 9.282549715730579e-07,
"loss": 0.0406,
"reward": 0.32277560234069824,
"reward_std": 0.8804080411791801,
"rewards/cosine_scaled_reward": -0.07472331821918488,
"rewards/format_reward": 0.4722222238779068,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 3082.263916015625,
"epoch": 0.22793487574978577,
"grad_norm": 0.2046993225812912,
"kl": 0.021820068359375,
"learning_rate": 9.265439410565328e-07,
"loss": 0.0353,
"reward": 0.3385091759264469,
"reward_std": 0.7099575102329254,
"rewards/cosine_scaled_reward": -0.011300940066576004,
"rewards/format_reward": 0.3611111156642437,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 2206.375030517578,
"epoch": 0.22964867180805484,
"grad_norm": 0.19563263654708862,
"kl": 0.017303466796875,
"learning_rate": 9.248145583195447e-07,
"loss": 0.0577,
"reward": 0.640228021889925,
"reward_std": 0.7054692879319191,
"rewards/cosine_scaled_reward": 0.0006695720367133617,
"rewards/format_reward": 0.6388888955116272,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 2421.611114501953,
"epoch": 0.23136246786632392,
"grad_norm": 0.338701069355011,
"kl": 0.0213623046875,
"learning_rate": 9.230669076497687e-07,
"loss": 0.1507,
"reward": 0.6078107673674822,
"reward_std": 0.8746988773345947,
"rewards/cosine_scaled_reward": 0.0469609391366248,
"rewards/format_reward": 0.5138888955116272,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 2651.125030517578,
"epoch": 0.23307626392459296,
"grad_norm": 0.28927695751190186,
"kl": 0.0211334228515625,
"learning_rate": 9.213010742252327e-07,
"loss": 0.1053,
"reward": 0.35874155908823013,
"reward_std": 0.7097110822796822,
"rewards/cosine_scaled_reward": -0.015073666349053383,
"rewards/format_reward": 0.3888888992369175,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 3202.0972900390625,
"epoch": 0.23479005998286204,
"grad_norm": 0.17811518907546997,
"kl": 0.02239990234375,
"learning_rate": 9.195171441101668e-07,
"loss": 0.0492,
"reward": 0.5012375935912132,
"reward_std": 0.9828417152166367,
"rewards/cosine_scaled_reward": 0.03534099366515875,
"rewards/format_reward": 0.43055555410683155,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 2946.4166870117188,
"epoch": 0.2365038560411311,
"grad_norm": 0.23094090819358826,
"kl": 0.0191650390625,
"learning_rate": 9.177152042508077e-07,
"loss": 0.0193,
"reward": 0.09741606749594212,
"reward_std": 0.5724444687366486,
"rewards/cosine_scaled_reward": -0.13184750825166702,
"rewards/format_reward": 0.36111111007630825,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 2927.0972290039062,
"epoch": 0.23821765209940018,
"grad_norm": 0.19129879772663116,
"kl": 0.024505615234375,
"learning_rate": 9.158953424711624e-07,
"loss": 0.0374,
"reward": 0.1535217664204538,
"reward_std": 0.4049301743507385,
"rewards/cosine_scaled_reward": -0.08296133577823639,
"rewards/format_reward": 0.3194444449618459,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 2202.6944580078125,
"epoch": 0.23993144815766923,
"grad_norm": 0.4724877178668976,
"kl": 0.0255584716796875,
"learning_rate": 9.140576474687263e-07,
"loss": 0.1836,
"reward": 0.3395635038614273,
"reward_std": 0.6675402373075485,
"rewards/cosine_scaled_reward": -0.11494047567248344,
"rewards/format_reward": 0.5694444552063942,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 2910.916748046875,
"epoch": 0.2416452442159383,
"grad_norm": 0.18322300910949707,
"kl": 0.02935791015625,
"learning_rate": 9.122022088101613e-07,
"loss": 0.0365,
"reward": 0.045268273912370205,
"reward_std": 0.6290135830640793,
"rewards/cosine_scaled_reward": -0.1440325528383255,
"rewards/format_reward": 0.3333333367481828,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 3141.638916015625,
"epoch": 0.24335904027420738,
"grad_norm": 0.1756112426519394,
"kl": 0.031341552734375,
"learning_rate": 9.103291169269299e-07,
"loss": 0.0029,
"reward": -0.12469126284122467,
"reward_std": 0.39061762765049934,
"rewards/cosine_scaled_reward": -0.18734563700854778,
"rewards/format_reward": 0.2500000009313226,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 2654.4166259765625,
"epoch": 0.24507283633247642,
"grad_norm": 0.29079416394233704,
"kl": 0.020843505859375,
"learning_rate": 9.084384631108882e-07,
"loss": 0.0697,
"reward": 0.4159288965165615,
"reward_std": 0.7245111912488937,
"rewards/cosine_scaled_reward": -0.06981334753800184,
"rewards/format_reward": 0.555555559694767,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 3201.9306030273438,
"epoch": 0.2467866323907455,
"grad_norm": 0.197592630982399,
"kl": 0.0269775390625,
"learning_rate": 9.065303395098358e-07,
"loss": 0.0373,
"reward": -0.11726564727723598,
"reward_std": 0.6086189821362495,
"rewards/cosine_scaled_reward": -0.21141060069203377,
"rewards/format_reward": 0.30555556155741215,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 2885.9722900390625,
"epoch": 0.24850042844901457,
"grad_norm": 0.29763004183769226,
"kl": 0.028472900390625,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0677,
"reward": 0.5742630921304226,
"reward_std": 0.37366680055856705,
"rewards/cosine_scaled_reward": 0.0649093296378851,
"rewards/format_reward": 0.4444444440305233,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 2155.9444885253906,
"epoch": 0.25021422450728364,
"grad_norm": 0.3611903190612793,
"kl": 0.024993896484375,
"learning_rate": 9.026620557966279e-07,
"loss": 0.1546,
"reward": 0.5257812030613422,
"reward_std": 0.9518508315086365,
"rewards/cosine_scaled_reward": -0.049609407782554626,
"rewards/format_reward": 0.625,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 2810.9583740234375,
"epoch": 0.2519280205655527,
"grad_norm": 0.2670803964138031,
"kl": 0.031829833984375,
"learning_rate": 9.007020842191634e-07,
"loss": 0.0389,
"reward": 0.11524944752454758,
"reward_std": 0.6441401988267899,
"rewards/cosine_scaled_reward": -0.12293083127588034,
"rewards/format_reward": 0.36111112032085657,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 3006.0416259765625,
"epoch": 0.2536418166238218,
"grad_norm": 0.230261892080307,
"kl": 0.0283203125,
"learning_rate": 8.987250199168808e-07,
"loss": 0.0866,
"reward": -0.06906389445066452,
"reward_std": 0.41436275094747543,
"rewards/cosine_scaled_reward": -0.21508748084306717,
"rewards/format_reward": 0.3611111156642437,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 2527.4305572509766,
"epoch": 0.25535561268209084,
"grad_norm": 0.2313620001077652,
"kl": 0.028106689453125,
"learning_rate": 8.967309592491052e-07,
"loss": 0.05,
"reward": 0.3055970072746277,
"reward_std": 0.8265255615115166,
"rewards/cosine_scaled_reward": -0.0972014885628596,
"rewards/format_reward": 0.5000000074505806,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 2374.0416870117188,
"epoch": 0.2570694087403599,
"grad_norm": 0.7321764826774597,
"kl": 0.028839111328125,
"learning_rate": 8.9471999940354e-07,
"loss": 0.1817,
"reward": 0.8978928253054619,
"reward_std": 0.7169746980071068,
"rewards/cosine_scaled_reward": 0.16422418132424355,
"rewards/format_reward": 0.5694444477558136,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 2437.0694580078125,
"epoch": 0.258783204798629,
"grad_norm": 0.7071412801742554,
"kl": 0.041168212890625,
"learning_rate": 8.926922383915315e-07,
"loss": 0.2136,
"reward": 0.06301388889551163,
"reward_std": 0.4757090378552675,
"rewards/cosine_scaled_reward": -0.21849306486546993,
"rewards/format_reward": 0.5000000074505806,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 2845.4583129882812,
"epoch": 0.26049700085689803,
"grad_norm": 0.5604143738746643,
"kl": 0.046142578125,
"learning_rate": 8.906477750432903e-07,
"loss": 0.1265,
"reward": 0.16903822124004364,
"reward_std": 0.5248951427638531,
"rewards/cosine_scaled_reward": -0.09603646397590637,
"rewards/format_reward": 0.36111111380159855,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 2768.4027709960938,
"epoch": 0.2622107969151671,
"grad_norm": 0.23171323537826538,
"kl": 0.0499267578125,
"learning_rate": 8.88586709003076e-07,
"loss": 0.0185,
"reward": 0.15082042291760445,
"reward_std": 0.7368991822004318,
"rewards/cosine_scaled_reward": -0.12597868964076042,
"rewards/format_reward": 0.4027777798473835,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 2804.9861450195312,
"epoch": 0.2639245929734362,
"grad_norm": 0.40300193428993225,
"kl": 0.05609130859375,
"learning_rate": 8.865091407243394e-07,
"loss": 0.0954,
"reward": 0.4552767127752304,
"reward_std": 0.7285914719104767,
"rewards/cosine_scaled_reward": -0.00847275834530592,
"rewards/format_reward": 0.472222238779068,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 3049.0416870117188,
"epoch": 0.2656383890317052,
"grad_norm": 0.23651528358459473,
"kl": 0.0596923828125,
"learning_rate": 8.844151714648274e-07,
"loss": -0.0013,
"reward": 0.12507159425877035,
"reward_std": 0.8443149924278259,
"rewards/cosine_scaled_reward": -0.10413086414337158,
"rewards/format_reward": 0.3333333469927311,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 2847.4584350585938,
"epoch": 0.26735218508997427,
"grad_norm": 0.3277675211429596,
"kl": 0.05987548828125,
"learning_rate": 8.823049032816478e-07,
"loss": 0.032,
"reward": 0.31620367243885994,
"reward_std": 0.7322921454906464,
"rewards/cosine_scaled_reward": -0.10578705929219723,
"rewards/format_reward": 0.5277777835726738,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 2717.8750610351562,
"epoch": 0.26906598114824337,
"grad_norm": 0.39394786953926086,
"kl": 0.0775146484375,
"learning_rate": 8.801784390262943e-07,
"loss": 0.0925,
"reward": 0.10540201608091593,
"reward_std": 0.6488600596785545,
"rewards/cosine_scaled_reward": -0.13479896634817123,
"rewards/format_reward": 0.3750000037252903,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 2818.125,
"epoch": 0.2707797772065124,
"grad_norm": 0.40347573161125183,
"kl": 0.0806884765625,
"learning_rate": 8.780358823396352e-07,
"loss": 0.0484,
"reward": 0.07575460057705641,
"reward_std": 0.6178670972585678,
"rewards/cosine_scaled_reward": -0.15656715538352728,
"rewards/format_reward": 0.3888888889923692,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 3166.1666259765625,
"epoch": 0.27249357326478146,
"grad_norm": 0.3342011868953705,
"kl": 0.1055908203125,
"learning_rate": 8.758773376468604e-07,
"loss": 0.0491,
"reward": -0.02998074982315302,
"reward_std": 0.6097311675548553,
"rewards/cosine_scaled_reward": -0.19554592855274677,
"rewards/format_reward": 0.36111111380159855,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 3013.7222290039062,
"epoch": 0.27420736932305056,
"grad_norm": 0.4173794388771057,
"kl": 0.106689453125,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0339,
"reward": 0.3507204055786133,
"reward_std": 0.6021532118320465,
"rewards/cosine_scaled_reward": 0.0017490852624177933,
"rewards/format_reward": 0.34722222201526165,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 2572.8194274902344,
"epoch": 0.2759211653813196,
"grad_norm": 0.4282573163509369,
"kl": 0.123046875,
"learning_rate": 8.715127058347614e-07,
"loss": 0.0645,
"reward": 0.010059013031423092,
"reward_std": 0.5160095170140266,
"rewards/cosine_scaled_reward": -0.15469271643087268,
"rewards/format_reward": 0.3194444514811039,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 2242.9444274902344,
"epoch": 0.2776349614395887,
"grad_norm": 0.39615368843078613,
"kl": 0.1090087890625,
"learning_rate": 8.693068314414344e-07,
"loss": 0.0772,
"reward": 0.16390804119873792,
"reward_std": 0.5712290816009045,
"rewards/cosine_scaled_reward": -0.18887930922210217,
"rewards/format_reward": 0.5416666716337204,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 2832.9861450195312,
"epoch": 0.27934875749785776,
"grad_norm": 0.6331592798233032,
"kl": 0.1380615234375,
"learning_rate": 8.670853944836176e-07,
"loss": 0.1022,
"reward": 0.20613746903836727,
"reward_std": 0.7383135333657265,
"rewards/cosine_scaled_reward": -0.04276460176333785,
"rewards/format_reward": 0.2916666753590107,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 2359.250030517578,
"epoch": 0.2810625535561268,
"grad_norm": 0.8423472046852112,
"kl": 0.1591796875,
"learning_rate": 8.648485032310144e-07,
"loss": 0.0087,
"reward": 0.1561935730278492,
"reward_std": 0.8059368506073952,
"rewards/cosine_scaled_reward": -0.10245877737179399,
"rewards/format_reward": 0.3611111156642437,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 2781.041717529297,
"epoch": 0.2827763496143959,
"grad_norm": 0.5075474977493286,
"kl": 0.17626953125,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0061,
"reward": 0.35288394801318645,
"reward_std": 0.7819623723626137,
"rewards/cosine_scaled_reward": -0.038835824467241764,
"rewards/format_reward": 0.4305555634200573,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 2699.3611450195312,
"epoch": 0.28449014567266495,
"grad_norm": 0.41815418004989624,
"kl": 0.1614990234375,
"learning_rate": 8.603287946810513e-07,
"loss": 0.0354,
"reward": 0.2616021269932389,
"reward_std": 0.8704780116677284,
"rewards/cosine_scaled_reward": -0.07753227837383747,
"rewards/format_reward": 0.416666672565043,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 2883.5972900390625,
"epoch": 0.286203941730934,
"grad_norm": 0.6100507378578186,
"kl": 0.192626953125,
"learning_rate": 8.580461976679099e-07,
"loss": 0.1117,
"reward": 0.6217167973518372,
"reward_std": 1.1077049523591995,
"rewards/cosine_scaled_reward": 0.08863616734743118,
"rewards/format_reward": 0.4444444477558136,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 2844.77783203125,
"epoch": 0.2879177377892031,
"grad_norm": 1.0341858863830566,
"kl": 0.193359375,
"learning_rate": 8.557485869176825e-07,
"loss": 0.1403,
"reward": 0.44696745090186596,
"reward_std": 0.8215643167495728,
"rewards/cosine_scaled_reward": 0.008205945428926498,
"rewards/format_reward": 0.4305555671453476,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 2149.9306030273438,
"epoch": 0.28963153384747214,
"grad_norm": 0.8718350529670715,
"kl": 0.213134765625,
"learning_rate": 8.534360744126753e-07,
"loss": 0.0265,
"reward": 0.24263115064240992,
"reward_std": 0.7163522839546204,
"rewards/cosine_scaled_reward": -0.08701775036752224,
"rewards/format_reward": 0.416666679084301,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 2499.9722900390625,
"epoch": 0.2913453299057412,
"grad_norm": 0.7302869558334351,
"kl": 0.233154296875,
"learning_rate": 8.511087728614862e-07,
"loss": 0.1123,
"reward": 0.1794309187680483,
"reward_std": 0.7098504453897476,
"rewards/cosine_scaled_reward": -0.056117892265319824,
"rewards/format_reward": 0.2916666679084301,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 2474.6250610351562,
"epoch": 0.2930591259640103,
"grad_norm": 0.5687596797943115,
"kl": 0.2568359375,
"learning_rate": 8.487667956935087e-07,
"loss": 0.0829,
"reward": 0.3411689009517431,
"reward_std": 0.6550407111644745,
"rewards/cosine_scaled_reward": -0.1280266623944044,
"rewards/format_reward": 0.597222238779068,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 2846.763916015625,
"epoch": 0.29477292202227934,
"grad_norm": 0.48782670497894287,
"kl": 0.252197265625,
"learning_rate": 8.464102570534061e-07,
"loss": 0.0672,
"reward": 0.22427130304276943,
"reward_std": 0.6338695511221886,
"rewards/cosine_scaled_reward": -0.10314211621880531,
"rewards/format_reward": 0.4305555559694767,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 2281.2638549804688,
"epoch": 0.29648671808054844,
"grad_norm": 1.23881196975708,
"kl": 0.234375,
"learning_rate": 8.440392717955475e-07,
"loss": 0.1363,
"reward": 0.24636091478168964,
"reward_std": 0.725439690053463,
"rewards/cosine_scaled_reward": -0.13376398687250912,
"rewards/format_reward": 0.5138888899236917,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 2596.7500610351562,
"epoch": 0.2982005141388175,
"grad_norm": 0.9256901741027832,
"kl": 0.32763671875,
"learning_rate": 8.416539554784089e-07,
"loss": 0.0993,
"reward": 0.03723787656053901,
"reward_std": 0.669374942779541,
"rewards/cosine_scaled_reward": -0.18276994861662388,
"rewards/format_reward": 0.4027777835726738,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 2802.7916870117188,
"epoch": 0.29991431019708653,
"grad_norm": 1.6108390092849731,
"kl": 0.41748046875,
"learning_rate": 8.392544243589427e-07,
"loss": -0.0161,
"reward": -0.026769233867526054,
"reward_std": 0.7613073363900185,
"rewards/cosine_scaled_reward": -0.1939401812851429,
"rewards/format_reward": 0.3611111231148243,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 2250.7916564941406,
"epoch": 0.30162810625535563,
"grad_norm": 1.3027092218399048,
"kl": 0.29345703125,
"learning_rate": 8.368407953869103e-07,
"loss": 0.1672,
"reward": 0.34848211891949177,
"reward_std": 0.8886565566062927,
"rewards/cosine_scaled_reward": -0.07575894566252828,
"rewards/format_reward": 0.5000000074505806,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 3187.25,
"epoch": 0.3033419023136247,
"grad_norm": 0.7333221435546875,
"kl": 0.390625,
"learning_rate": 8.344131861991828e-07,
"loss": 0.0057,
"reward": -0.06705992296338081,
"reward_std": 0.5766744017601013,
"rewards/cosine_scaled_reward": -0.16547441016882658,
"rewards/format_reward": 0.26388889364898205,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 2658.9583129882812,
"epoch": 0.3050556983718937,
"grad_norm": 2.110689878463745,
"kl": 0.40185546875,
"learning_rate": 8.319717151140072e-07,
"loss": 0.1018,
"reward": 0.15619678050279617,
"reward_std": 0.5456085540354252,
"rewards/cosine_scaled_reward": -0.15106826776172966,
"rewards/format_reward": 0.4583333358168602,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 3046.361083984375,
"epoch": 0.3067694944301628,
"grad_norm": 1.409805417060852,
"kl": 0.44482421875,
"learning_rate": 8.295165011252396e-07,
"loss": 0.1019,
"reward": -0.08309876918792725,
"reward_std": 0.6837619245052338,
"rewards/cosine_scaled_reward": -0.1665493929758668,
"rewards/format_reward": 0.2500000027939677,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 2090.263916015625,
"epoch": 0.30848329048843187,
"grad_norm": 1.259092926979065,
"kl": 0.4365234375,
"learning_rate": 8.270476638965461e-07,
"loss": 0.1312,
"reward": 0.5355786010622978,
"reward_std": 0.9339739978313446,
"rewards/cosine_scaled_reward": -0.07248848024755716,
"rewards/format_reward": 0.6805555671453476,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 2998.388916015625,
"epoch": 0.3101970865467009,
"grad_norm": 0.7514684796333313,
"kl": 0.5283203125,
"learning_rate": 8.245653237555705e-07,
"loss": 0.0645,
"reward": 0.0823521837592125,
"reward_std": 0.6557292975485325,
"rewards/cosine_scaled_reward": -0.1602128129452467,
"rewards/format_reward": 0.4027777835726738,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 3348.6806030273438,
"epoch": 0.31191088260497,
"grad_norm": 1.1279796361923218,
"kl": 0.6435546875,
"learning_rate": 8.220696016880687e-07,
"loss": 0.0509,
"reward": -0.29229177720844746,
"reward_std": 0.44720375537872314,
"rewards/cosine_scaled_reward": -0.27809032425284386,
"rewards/format_reward": 0.26388889364898205,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 2909.6806030273438,
"epoch": 0.31362467866323906,
"grad_norm": 0.8539410829544067,
"kl": 0.5654296875,
"learning_rate": 8.195606193320136e-07,
"loss": 0.1078,
"reward": 0.20359659614041448,
"reward_std": 0.7151020988821983,
"rewards/cosine_scaled_reward": -0.07875726278871298,
"rewards/format_reward": 0.361111112870276,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 2720.6666870117188,
"epoch": 0.31533847472150817,
"grad_norm": 1.0726344585418701,
"kl": 0.6005859375,
"learning_rate": 8.170384989716657e-07,
"loss": 0.0571,
"reward": 0.45398143492639065,
"reward_std": 0.8964811712503433,
"rewards/cosine_scaled_reward": -0.04384262952953577,
"rewards/format_reward": 0.5416666716337204,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 2828.9305419921875,
"epoch": 0.3170522707797772,
"grad_norm": 0.9460340142250061,
"kl": 0.6220703125,
"learning_rate": 8.145033635316128e-07,
"loss": 0.1297,
"reward": -0.03706150595098734,
"reward_std": 0.7321052774786949,
"rewards/cosine_scaled_reward": -0.1990863112732768,
"rewards/format_reward": 0.3611111231148243,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 2725.8750610351562,
"epoch": 0.31876606683804626,
"grad_norm": 1.0472413301467896,
"kl": 0.5556640625,
"learning_rate": 8.119553365707802e-07,
"loss": 0.0507,
"reward": -0.008732129819691181,
"reward_std": 0.43902990967035294,
"rewards/cosine_scaled_reward": -0.2057549599558115,
"rewards/format_reward": 0.4027777835726738,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 2761.0556030273438,
"epoch": 0.32047986289631536,
"grad_norm": 0.9237687587738037,
"kl": 0.55126953125,
"learning_rate": 8.093945422764069e-07,
"loss": 0.1122,
"reward": 0.34623236872721463,
"reward_std": 0.8785705417394638,
"rewards/cosine_scaled_reward": -0.07688381336629391,
"rewards/format_reward": 0.5000000074505806,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 3188.4027709960938,
"epoch": 0.3221936589545844,
"grad_norm": 1.4287723302841187,
"kl": 0.607421875,
"learning_rate": 8.068211054579943e-07,
"loss": 0.0457,
"reward": -0.10439129918813705,
"reward_std": 0.6522045210003853,
"rewards/cosine_scaled_reward": -0.20497343130409718,
"rewards/format_reward": 0.30555556807667017,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 2965.75,
"epoch": 0.32390745501285345,
"grad_norm": 1.0540153980255127,
"kl": 0.6044921875,
"learning_rate": 8.04235151541222e-07,
"loss": 0.0926,
"reward": 0.00805249996483326,
"reward_std": 0.5005255490541458,
"rewards/cosine_scaled_reward": -0.1904182005673647,
"rewards/format_reward": 0.38888888992369175,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 2995.3611450195312,
"epoch": 0.32562125107112255,
"grad_norm": 2.005993604660034,
"kl": 0.60546875,
"learning_rate": 8.01636806561836e-07,
"loss": 0.1571,
"reward": -0.2390465196222067,
"reward_std": 0.5108147040009499,
"rewards/cosine_scaled_reward": -0.2792454734444618,
"rewards/format_reward": 0.3194444449618459,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 2995.02783203125,
"epoch": 0.3273350471293916,
"grad_norm": 0.914374828338623,
"kl": 0.537109375,
"learning_rate": 7.990261971595048e-07,
"loss": 0.0791,
"reward": -0.008268387988209724,
"reward_std": 0.7869899272918701,
"rewards/cosine_scaled_reward": -0.19163418684911449,
"rewards/format_reward": 0.3750000037252903,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 2657.3055725097656,
"epoch": 0.32904884318766064,
"grad_norm": 0.9198621511459351,
"kl": 0.6298828125,
"learning_rate": 7.964034505716476e-07,
"loss": 0.1016,
"reward": 0.14560853224247694,
"reward_std": 0.44526704400777817,
"rewards/cosine_scaled_reward": -0.177195742726326,
"rewards/format_reward": 0.5000000074505806,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 2256.6666564941406,
"epoch": 0.33076263924592975,
"grad_norm": 0.9307562708854675,
"kl": 0.5947265625,
"learning_rate": 7.93768694627233e-07,
"loss": 0.082,
"reward": 0.184324630536139,
"reward_std": 0.5673187747597694,
"rewards/cosine_scaled_reward": -0.17867101542651653,
"rewards/format_reward": 0.5416666679084301,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 2654.013916015625,
"epoch": 0.3324764353041988,
"grad_norm": 1.2104908227920532,
"kl": 0.6787109375,
"learning_rate": 7.911220577405484e-07,
"loss": 0.0927,
"reward": 0.5049788989126682,
"reward_std": 0.6255298256874084,
"rewards/cosine_scaled_reward": 0.0024894457310438156,
"rewards/format_reward": 0.5000000074505806,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 3088.2222900390625,
"epoch": 0.3341902313624679,
"grad_norm": 2.0733349323272705,
"kl": 0.787109375,
"learning_rate": 7.884636689049422e-07,
"loss": 0.0256,
"reward": -0.1914132796227932,
"reward_std": 0.39547703973948956,
"rewards/cosine_scaled_reward": -0.23459553346037865,
"rewards/format_reward": 0.2777777761220932,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 2572.5416259765625,
"epoch": 0.33590402742073694,
"grad_norm": 0.9238296151161194,
"kl": 0.6484375,
"learning_rate": 7.857936576865356e-07,
"loss": 0.0799,
"reward": 0.4742476176470518,
"reward_std": 0.8941326662898064,
"rewards/cosine_scaled_reward": -0.04759840480983257,
"rewards/format_reward": 0.5694444477558136,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 2907.4583129882812,
"epoch": 0.337617823479006,
"grad_norm": 0.9024485945701599,
"kl": 0.693359375,
"learning_rate": 7.831121542179086e-07,
"loss": 0.0713,
"reward": 0.0948091521859169,
"reward_std": 0.4578506797552109,
"rewards/cosine_scaled_reward": -0.11926210392266512,
"rewards/format_reward": 0.33333334513008595,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 2943.4305419921875,
"epoch": 0.3393316195372751,
"grad_norm": 1.3114806413650513,
"kl": 0.7470703125,
"learning_rate": 7.804192891917571e-07,
"loss": 0.0493,
"reward": 0.04198681065463461,
"reward_std": 0.5121570453047752,
"rewards/cosine_scaled_reward": -0.1595621556043625,
"rewards/format_reward": 0.3611111156642437,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 3010.9166870117188,
"epoch": 0.34104541559554413,
"grad_norm": 0.6777936816215515,
"kl": 0.697265625,
"learning_rate": 7.777151938545235e-07,
"loss": 0.0892,
"reward": 0.12530913203954697,
"reward_std": 0.5297227501869202,
"rewards/cosine_scaled_reward": -0.145678770262748,
"rewards/format_reward": 0.4166666716337204,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 2658.4722595214844,
"epoch": 0.3427592116538132,
"grad_norm": 1.0869694948196411,
"kl": 0.5361328125,
"learning_rate": 7.75e-07,
"loss": 0.0285,
"reward": 0.42103337205480784,
"reward_std": 0.5303617715835571,
"rewards/cosine_scaled_reward": -0.04642775317188352,
"rewards/format_reward": 0.5138888992369175,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 2742.0833435058594,
"epoch": 0.3444730077120823,
"grad_norm": 0.6390620470046997,
"kl": 0.56005859375,
"learning_rate": 7.72273839962904e-07,
"loss": 0.078,
"reward": 0.13805552199482918,
"reward_std": 0.5941917151212692,
"rewards/cosine_scaled_reward": -0.18097224179655313,
"rewards/format_reward": 0.5000000037252903,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 2622.611083984375,
"epoch": 0.3461868037703513,
"grad_norm": 1.5139989852905273,
"kl": 0.51416015625,
"learning_rate": 7.695368466124296e-07,
"loss": 0.1322,
"reward": 0.2749571923632175,
"reward_std": 0.6380000561475754,
"rewards/cosine_scaled_reward": -0.09863251959905028,
"rewards/format_reward": 0.4722222313284874,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 2710.7361450195312,
"epoch": 0.34790059982862037,
"grad_norm": 1.517341136932373,
"kl": 0.51171875,
"learning_rate": 7.667891533457718e-07,
"loss": 0.1124,
"reward": 0.5119861587882042,
"reward_std": 0.9760274440050125,
"rewards/cosine_scaled_reward": -0.014840253628790379,
"rewards/format_reward": 0.541666679084301,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 2822.6112060546875,
"epoch": 0.3496143958868895,
"grad_norm": 1.1272459030151367,
"kl": 0.548828125,
"learning_rate": 7.640308940816239e-07,
"loss": 0.0668,
"reward": 0.03917721984907985,
"reward_std": 0.7430369108915329,
"rewards/cosine_scaled_reward": -0.17485582828521729,
"rewards/format_reward": 0.38888888992369175,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 2645.5694580078125,
"epoch": 0.3513281919451585,
"grad_norm": 1.7582755088806152,
"kl": 0.5556640625,
"learning_rate": 7.612622032536507e-07,
"loss": 0.1055,
"reward": 0.509862631559372,
"reward_std": 0.7304475903511047,
"rewards/cosine_scaled_reward": 0.004931296221911907,
"rewards/format_reward": 0.5000000111758709,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 2748.02783203125,
"epoch": 0.35304198800342756,
"grad_norm": 13.779873847961426,
"kl": 1.0166015625,
"learning_rate": 7.584832158039378e-07,
"loss": 0.0928,
"reward": 0.13606557785533369,
"reward_std": 0.5326481983065605,
"rewards/cosine_scaled_reward": -0.2236338797956705,
"rewards/format_reward": 0.5833333507180214,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 2727.9722900390625,
"epoch": 0.35475578406169667,
"grad_norm": 4.678215503692627,
"kl": 0.8974609375,
"learning_rate": 7.556940671764124e-07,
"loss": 0.1124,
"reward": 0.3662101551890373,
"reward_std": 0.5158084109425545,
"rewards/cosine_scaled_reward": -0.05995047930628061,
"rewards/format_reward": 0.4861111268401146,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 2660.9722900390625,
"epoch": 0.3564695801199657,
"grad_norm": 2.2143702507019043,
"kl": 0.7880859375,
"learning_rate": 7.528948933102438e-07,
"loss": 0.067,
"reward": 0.29765829257667065,
"reward_std": 0.7447296231985092,
"rewards/cosine_scaled_reward": -0.1428375095129013,
"rewards/format_reward": 0.5833333432674408,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 2533.4444580078125,
"epoch": 0.3581833761782348,
"grad_norm": 1.057923436164856,
"kl": 0.626953125,
"learning_rate": 7.500858306332172e-07,
"loss": 0.1388,
"reward": 0.22743514459580183,
"reward_std": 0.8155356049537659,
"rewards/cosine_scaled_reward": -0.15017131343483925,
"rewards/format_reward": 0.5277777835726738,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 2513.763916015625,
"epoch": 0.35989717223650386,
"grad_norm": 3.4706244468688965,
"kl": 0.6640625,
"learning_rate": 7.472670160550848e-07,
"loss": 0.2307,
"reward": 0.32537855207920074,
"reward_std": 0.6403735391795635,
"rewards/cosine_scaled_reward": -0.11508850922109559,
"rewards/format_reward": 0.555555559694767,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 3072.1806030273438,
"epoch": 0.3616109682947729,
"grad_norm": 0.867877721786499,
"kl": 0.85009765625,
"learning_rate": 7.444385869608921e-07,
"loss": 0.1175,
"reward": 0.021036310121417046,
"reward_std": 0.5472413003444672,
"rewards/cosine_scaled_reward": -0.170037392526865,
"rewards/format_reward": 0.36111112125217915,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 2580.791717529297,
"epoch": 0.363324764353042,
"grad_norm": 1.1602129936218262,
"kl": 0.8037109375,
"learning_rate": 7.416006812042827e-07,
"loss": 0.1343,
"reward": 0.6223988421261311,
"reward_std": 0.851245753467083,
"rewards/cosine_scaled_reward": -0.0013005826622247696,
"rewards/format_reward": 0.625,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 2920.2916870117188,
"epoch": 0.36503856041131105,
"grad_norm": 1.2226418256759644,
"kl": 1.0029296875,
"learning_rate": 7.387534371007797e-07,
"loss": 0.1063,
"reward": 0.10683083906769753,
"reward_std": 0.6580070406198502,
"rewards/cosine_scaled_reward": -0.21741791814565659,
"rewards/format_reward": 0.5416666641831398,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 2951.166748046875,
"epoch": 0.3667523564695801,
"grad_norm": 1.302587628364563,
"kl": 1.1484375,
"learning_rate": 7.358969934210438e-07,
"loss": 0.1102,
"reward": 0.16499032359570265,
"reward_std": 0.5117045789957047,
"rewards/cosine_scaled_reward": -0.13278261446976103,
"rewards/format_reward": 0.4305555671453476,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 2642.3056030273438,
"epoch": 0.3684661525278492,
"grad_norm": 1.028397560119629,
"kl": 0.8671875,
"learning_rate": 7.330314893841101e-07,
"loss": 0.1035,
"reward": 0.3645508070476353,
"reward_std": 0.9228581190109253,
"rewards/cosine_scaled_reward": -0.10939126997254789,
"rewards/format_reward": 0.5833333283662796,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 2541.0833129882812,
"epoch": 0.37017994858611825,
"grad_norm": 1.578083872795105,
"kl": 0.86865234375,
"learning_rate": 7.301570646506027e-07,
"loss": 0.1489,
"reward": 0.26378826051950455,
"reward_std": 0.6202561929821968,
"rewards/cosine_scaled_reward": -0.13199475780129433,
"rewards/format_reward": 0.5277777910232544,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 2184.9445190429688,
"epoch": 0.3718937446443873,
"grad_norm": 1.103194236755371,
"kl": 0.6845703125,
"learning_rate": 7.27273859315928e-07,
"loss": 0.0844,
"reward": 0.4086096244864166,
"reward_std": 0.7625616788864136,
"rewards/cosine_scaled_reward": -0.11513962969183922,
"rewards/format_reward": 0.6388889029622078,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 2404.6805725097656,
"epoch": 0.3736075407026564,
"grad_norm": 2.3009181022644043,
"kl": 0.8935546875,
"learning_rate": 7.243820139034464e-07,
"loss": 0.1892,
"reward": 0.4591095373034477,
"reward_std": 0.5642153918743134,
"rewards/cosine_scaled_reward": -0.08294522017240524,
"rewards/format_reward": 0.625,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 3044.9862060546875,
"epoch": 0.37532133676092544,
"grad_norm": 1.2761178016662598,
"kl": 1.19921875,
"learning_rate": 7.214816693576234e-07,
"loss": 0.1195,
"reward": 0.21450293064117432,
"reward_std": 0.7603526711463928,
"rewards/cosine_scaled_reward": -0.12191520072519779,
"rewards/format_reward": 0.4583333358168602,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 2793.90283203125,
"epoch": 0.37703513281919454,
"grad_norm": 1.6576476097106934,
"kl": 1.166015625,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0977,
"reward": 0.22772593423724174,
"reward_std": 0.5124068222939968,
"rewards/cosine_scaled_reward": -0.18474812898784876,
"rewards/format_reward": 0.5972222313284874,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 2835.0,
"epoch": 0.3787489288774636,
"grad_norm": 3.882580280303955,
"kl": 1.0927734375,
"learning_rate": 7.156560487081051e-07,
"loss": 0.0245,
"reward": 0.10485807061195374,
"reward_std": 0.5114092901349068,
"rewards/cosine_scaled_reward": -0.20451541244983673,
"rewards/format_reward": 0.5138888955116272,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 2310.9305725097656,
"epoch": 0.38046272493573263,
"grad_norm": 1.3096808195114136,
"kl": 0.87890625,
"learning_rate": 7.127310565369415e-07,
"loss": 0.1143,
"reward": 0.4324228148907423,
"reward_std": 0.5727507174015045,
"rewards/cosine_scaled_reward": -0.13101080805063248,
"rewards/format_reward": 0.6944444477558136,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 2826.2222290039062,
"epoch": 0.38217652099400173,
"grad_norm": 1.182824730873108,
"kl": 0.931640625,
"learning_rate": 7.097981330836616e-07,
"loss": 0.1159,
"reward": 0.17814365401864052,
"reward_std": 0.5051928982138634,
"rewards/cosine_scaled_reward": -0.14703928492963314,
"rewards/format_reward": 0.4722222238779068,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 2644.8611450195312,
"epoch": 0.3838903170522708,
"grad_norm": 1.271640658378601,
"kl": 0.7939453125,
"learning_rate": 7.068574212948169e-07,
"loss": 0.0771,
"reward": 0.21772570302709937,
"reward_std": 0.5406957715749741,
"rewards/cosine_scaled_reward": -0.17585936933755875,
"rewards/format_reward": 0.5694444477558136,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 2389.749969482422,
"epoch": 0.3856041131105398,
"grad_norm": 2.586735486984253,
"kl": 0.6689453125,
"learning_rate": 7.039090644965509e-07,
"loss": -0.0027,
"reward": 0.5408617407083511,
"reward_std": 0.7554269433021545,
"rewards/cosine_scaled_reward": -0.03512469958513975,
"rewards/format_reward": 0.6111111119389534,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 2435.250030517578,
"epoch": 0.3873179091688089,
"grad_norm": 1.4329172372817993,
"kl": 0.59521484375,
"learning_rate": 7.009532063876148e-07,
"loss": 0.0425,
"reward": 0.21151528507471085,
"reward_std": 0.6967541426420212,
"rewards/cosine_scaled_reward": -0.14424235187470913,
"rewards/format_reward": 0.5000000149011612,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 2509.1945190429688,
"epoch": 0.389031705227078,
"grad_norm": 1.7964338064193726,
"kl": 0.51220703125,
"learning_rate": 6.979899910323624e-07,
"loss": 0.0631,
"reward": 0.3222038522362709,
"reward_std": 0.7920150905847549,
"rewards/cosine_scaled_reward": -0.0958425235003233,
"rewards/format_reward": 0.5138888955116272,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 2343.888885498047,
"epoch": 0.390745501285347,
"grad_norm": 1.312915563583374,
"kl": 0.48828125,
"learning_rate": 6.950195628537299e-07,
"loss": 0.0327,
"reward": 0.72439269348979,
"reward_std": 0.6716032773256302,
"rewards/cosine_scaled_reward": 0.028863003477454185,
"rewards/format_reward": 0.6666666865348816,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 2668.8333740234375,
"epoch": 0.3924592973436161,
"grad_norm": 1.1794544458389282,
"kl": 0.591796875,
"learning_rate": 6.920420666261961e-07,
"loss": 0.0617,
"reward": 0.4572554435580969,
"reward_std": 0.6365808099508286,
"rewards/cosine_scaled_reward": -0.028316727373749018,
"rewards/format_reward": 0.5138888955116272,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 2405.111114501953,
"epoch": 0.39417309340188517,
"grad_norm": 2.7993645668029785,
"kl": 0.52197265625,
"learning_rate": 6.890576474687263e-07,
"loss": 0.193,
"reward": 0.6450787968933582,
"reward_std": 0.6886177062988281,
"rewards/cosine_scaled_reward": -0.017738381633535028,
"rewards/format_reward": 0.6805555671453476,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 2889.763916015625,
"epoch": 0.39588688946015427,
"grad_norm": 0.9008044600486755,
"kl": 0.59228515625,
"learning_rate": 6.860664508377001e-07,
"loss": 0.0832,
"reward": 0.17632517218589783,
"reward_std": 0.7136962860822678,
"rewards/cosine_scaled_reward": -0.11322630103677511,
"rewards/format_reward": 0.4027777835726738,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 2096.763885498047,
"epoch": 0.3976006855184233,
"grad_norm": 2.9937281608581543,
"kl": 0.44482421875,
"learning_rate": 6.83068622519821e-07,
"loss": 0.0198,
"reward": 0.45767842745408416,
"reward_std": 0.6805157586932182,
"rewards/cosine_scaled_reward": -0.0697719173040241,
"rewards/format_reward": 0.5972222313284874,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 2095.625030517578,
"epoch": 0.39931448157669236,
"grad_norm": 3.1695449352264404,
"kl": 0.52392578125,
"learning_rate": 6.800643086250121e-07,
"loss": 0.124,
"reward": 0.8969383761286736,
"reward_std": 0.8693148195743561,
"rewards/cosine_scaled_reward": 0.10124696930870414,
"rewards/format_reward": 0.6944444477558136,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 2397.9722290039062,
"epoch": 0.40102827763496146,
"grad_norm": 2.038714647293091,
"kl": 0.60302734375,
"learning_rate": 6.770536555792944e-07,
"loss": 0.0803,
"reward": 0.3801136128604412,
"reward_std": 0.6368846967816353,
"rewards/cosine_scaled_reward": -0.11549876257777214,
"rewards/format_reward": 0.6111111044883728,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 2803.1805419921875,
"epoch": 0.4027420736932305,
"grad_norm": 1.1210250854492188,
"kl": 0.76171875,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0749,
"reward": 0.051421504467725754,
"reward_std": 0.46992237120866776,
"rewards/cosine_scaled_reward": -0.21734481677412987,
"rewards/format_reward": 0.4861111231148243,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 2735.9583129882812,
"epoch": 0.40445586975149955,
"grad_norm": 1.5609543323516846,
"kl": 0.6220703125,
"learning_rate": 6.710139192768694e-07,
"loss": 0.1051,
"reward": 0.33594064973294735,
"reward_std": 0.5969594717025757,
"rewards/cosine_scaled_reward": -0.1306407954543829,
"rewards/format_reward": 0.597222238779068,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 2207.9444580078125,
"epoch": 0.40616966580976865,
"grad_norm": 3.293438673019409,
"kl": 0.619140625,
"learning_rate": 6.679851303883891e-07,
"loss": 0.1014,
"reward": 0.6933649554848671,
"reward_std": 0.4978405013680458,
"rewards/cosine_scaled_reward": -0.02831752598285675,
"rewards/format_reward": 0.75,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 2911.0972900390625,
"epoch": 0.4078834618680377,
"grad_norm": 1.396133303642273,
"kl": 0.689453125,
"learning_rate": 6.649505910711058e-07,
"loss": 0.1308,
"reward": 0.23781822435557842,
"reward_std": 0.5772198215126991,
"rewards/cosine_scaled_reward": -0.1449797886889428,
"rewards/format_reward": 0.5277777835726738,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 2865.1944580078125,
"epoch": 0.40959725792630675,
"grad_norm": 1.02251398563385,
"kl": 0.775390625,
"learning_rate": 6.619104492241847e-07,
"loss": 0.1421,
"reward": 0.38452258985489607,
"reward_std": 0.7435072809457779,
"rewards/cosine_scaled_reward": -0.0646831514313817,
"rewards/format_reward": 0.5138888880610466,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 2345.4444274902344,
"epoch": 0.41131105398457585,
"grad_norm": 4.698256492614746,
"kl": 0.6904296875,
"learning_rate": 6.588648530198504e-07,
"loss": 0.1594,
"reward": 0.7729744166135788,
"reward_std": 0.8151284381747246,
"rewards/cosine_scaled_reward": 0.10176499933004379,
"rewards/format_reward": 0.5694444477558136,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 2070.4166259765625,
"epoch": 0.4130248500428449,
"grad_norm": 8.216842651367188,
"kl": 0.52490234375,
"learning_rate": 6.558139508961654e-07,
"loss": 0.1321,
"reward": 0.48884235695004463,
"reward_std": 0.5597383752465248,
"rewards/cosine_scaled_reward": -0.06113438308238983,
"rewards/format_reward": 0.6111111119389534,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 2937.0972290039062,
"epoch": 0.414738646101114,
"grad_norm": 1.0033913850784302,
"kl": 0.5947265625,
"learning_rate": 6.527578915497951e-07,
"loss": 0.1027,
"reward": 0.3329106804449111,
"reward_std": 0.626296728849411,
"rewards/cosine_scaled_reward": -0.09048910066485405,
"rewards/format_reward": 0.5138888955116272,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 2632.3055419921875,
"epoch": 0.41645244215938304,
"grad_norm": 3.2918546199798584,
"kl": 0.62109375,
"learning_rate": 6.496968239287603e-07,
"loss": 0.151,
"reward": 0.423097662627697,
"reward_std": 0.7703854739665985,
"rewards/cosine_scaled_reward": -0.05234006140381098,
"rewards/format_reward": 0.527777798473835,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 2488.8055725097656,
"epoch": 0.4181662382176521,
"grad_norm": 1.382688283920288,
"kl": 0.51220703125,
"learning_rate": 6.466308972251785e-07,
"loss": 0.1239,
"reward": 0.5373616181313992,
"reward_std": 0.642534889280796,
"rewards/cosine_scaled_reward": -0.036874750861898065,
"rewards/format_reward": 0.6111111119389534,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 1840.0555877685547,
"epoch": 0.4198800342759212,
"grad_norm": 5.2921977043151855,
"kl": 0.435546875,
"learning_rate": 6.435602608679916e-07,
"loss": 0.1668,
"reward": 1.0207914784550667,
"reward_std": 0.6237036064267159,
"rewards/cosine_scaled_reward": 0.08678461611270905,
"rewards/format_reward": 0.8472222536802292,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 2378.77783203125,
"epoch": 0.42159383033419023,
"grad_norm": 4.525283336639404,
"kl": 0.7333984375,
"learning_rate": 6.404850645156841e-07,
"loss": 0.2341,
"reward": 0.8125267028808594,
"reward_std": 0.7737091481685638,
"rewards/cosine_scaled_reward": 0.05209667468443513,
"rewards/format_reward": 0.7083333432674408,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 2746.7916870117188,
"epoch": 0.4233076263924593,
"grad_norm": 1.215826392173767,
"kl": 0.7509765625,
"learning_rate": 6.374054580489873e-07,
"loss": 0.124,
"reward": 0.16153091937303543,
"reward_std": 0.7042593955993652,
"rewards/cosine_scaled_reward": -0.12756787613034248,
"rewards/format_reward": 0.4166666716337204,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 2420.125,
"epoch": 0.4250214224507284,
"grad_norm": 2.161705732345581,
"kl": 0.86328125,
"learning_rate": 6.343215915635761e-07,
"loss": 0.0674,
"reward": 0.6162599250674248,
"reward_std": 0.7196609973907471,
"rewards/cosine_scaled_reward": -0.05298116838093847,
"rewards/format_reward": 0.7222222238779068,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 2277.500030517578,
"epoch": 0.4267352185089974,
"grad_norm": 3.1015782356262207,
"kl": 0.732421875,
"learning_rate": 6.31233615362752e-07,
"loss": 0.0179,
"reward": 0.6064739339053631,
"reward_std": 0.6056996583938599,
"rewards/cosine_scaled_reward": -0.03704079985618591,
"rewards/format_reward": 0.6805555671453476,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 2416.9583740234375,
"epoch": 0.4284490145672665,
"grad_norm": 8.199381828308105,
"kl": 0.904296875,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0707,
"reward": 0.5718964412808418,
"reward_std": 0.7699461728334427,
"rewards/cosine_scaled_reward": -0.06127400905825198,
"rewards/format_reward": 0.6944444477558136,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 2411.2916259765625,
"epoch": 0.4301628106255356,
"grad_norm": 7.757229328155518,
"kl": 0.7177734375,
"learning_rate": 6.25045936022246e-07,
"loss": 0.0104,
"reward": 0.6452328599989414,
"reward_std": 0.8850838840007782,
"rewards/cosine_scaled_reward": -0.010716899763792753,
"rewards/format_reward": 0.6666666716337204,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 2250.1805725097656,
"epoch": 0.4318766066838046,
"grad_norm": 50.71971893310547,
"kl": 1.673828125,
"learning_rate": 6.219465344613258e-07,
"loss": 0.2537,
"reward": 0.41996366158127785,
"reward_std": 0.630496121942997,
"rewards/cosine_scaled_reward": -0.16501817479729652,
"rewards/format_reward": 0.7500000149011612,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 2389.4861450195312,
"epoch": 0.43359040274207367,
"grad_norm": 76.95365142822266,
"kl": 1.5126953125,
"learning_rate": 6.188436263278172e-07,
"loss": 0.1964,
"reward": 0.5589244738221169,
"reward_std": 0.8758179396390915,
"rewards/cosine_scaled_reward": -0.019148872102960013,
"rewards/format_reward": 0.597222238779068,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 2433.6944580078125,
"epoch": 0.43530419880034277,
"grad_norm": 89.30572509765625,
"kl": 1.7294921875,
"learning_rate": 6.157373628530852e-07,
"loss": 0.1411,
"reward": 0.3653869954869151,
"reward_std": 0.6425384879112244,
"rewards/cosine_scaled_reward": -0.11591762490570545,
"rewards/format_reward": 0.5972222313284874,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 2666.02783203125,
"epoch": 0.4370179948586118,
"grad_norm": 9.923705101013184,
"kl": 0.7724609375,
"learning_rate": 6.126278954320294e-07,
"loss": 0.1243,
"reward": 0.3781815767288208,
"reward_std": 0.6919418126344681,
"rewards/cosine_scaled_reward": -0.06785366125404835,
"rewards/format_reward": 0.5138888880610466,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 2432.8056030273438,
"epoch": 0.4387317909168809,
"grad_norm": 9.747496604919434,
"kl": 0.7314453125,
"learning_rate": 6.095153756157051e-07,
"loss": 0.083,
"reward": 0.2219482958316803,
"reward_std": 0.47816336899995804,
"rewards/cosine_scaled_reward": -0.20152585953474045,
"rewards/format_reward": 0.6250000074505806,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 2205.027801513672,
"epoch": 0.44044558697514996,
"grad_norm": 3.7311699390411377,
"kl": 0.53857421875,
"learning_rate": 6.06399955103937e-07,
"loss": 0.1269,
"reward": 0.5375950075685978,
"reward_std": 0.6251346915960312,
"rewards/cosine_scaled_reward": -0.0923136118799448,
"rewards/format_reward": 0.7222222238779068,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 2471.4584045410156,
"epoch": 0.442159383033419,
"grad_norm": 2.448763608932495,
"kl": 0.91796875,
"learning_rate": 6.032817857379256e-07,
"loss": 0.1343,
"reward": 0.33694631792604923,
"reward_std": 0.4810459837317467,
"rewards/cosine_scaled_reward": -0.1440268289297819,
"rewards/format_reward": 0.6250000149011612,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 2130.763885498047,
"epoch": 0.4438731790916881,
"grad_norm": 1.8066775798797607,
"kl": 0.68505859375,
"learning_rate": 6.001610194928464e-07,
"loss": 0.1045,
"reward": 0.7608818560838699,
"reward_std": 0.697891928255558,
"rewards/cosine_scaled_reward": -0.008447982007055543,
"rewards/format_reward": 0.7777777910232544,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 2359.7638549804688,
"epoch": 0.44558697514995715,
"grad_norm": 1.9958624839782715,
"kl": 1.009765625,
"learning_rate": 5.97037808470444e-07,
"loss": 0.1355,
"reward": 0.7400075197219849,
"reward_std": 0.5138791352510452,
"rewards/cosine_scaled_reward": 0.00889264652505517,
"rewards/format_reward": 0.7222222313284874,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 2002.7361145019531,
"epoch": 0.4473007712082262,
"grad_norm": 5.525115966796875,
"kl": 0.7490234375,
"learning_rate": 5.939123048916173e-07,
"loss": 0.1647,
"reward": 0.7085682898759842,
"reward_std": 0.6736738979816437,
"rewards/cosine_scaled_reward": -0.041549197398126125,
"rewards/format_reward": 0.7916666716337204,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 2599.9028930664062,
"epoch": 0.4490145672664953,
"grad_norm": 4.714324474334717,
"kl": 0.9013671875,
"learning_rate": 5.907846610890011e-07,
"loss": 0.1464,
"reward": 0.7315462306141853,
"reward_std": 0.8794215172529221,
"rewards/cosine_scaled_reward": 0.05327310296706855,
"rewards/format_reward": 0.6250000149011612,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 2593.236083984375,
"epoch": 0.45072836332476435,
"grad_norm": 1.4935065507888794,
"kl": 0.8330078125,
"learning_rate": 5.87655029499542e-07,
"loss": 0.1333,
"reward": 0.26326372660696507,
"reward_std": 0.3958895206451416,
"rewards/cosine_scaled_reward": -0.11836813762784004,
"rewards/format_reward": 0.5000000074505806,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 2746.1111450195312,
"epoch": 0.4524421593830334,
"grad_norm": 2.71354341506958,
"kl": 0.833984375,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0506,
"reward": 0.34650287590920925,
"reward_std": 0.6603603884577751,
"rewards/cosine_scaled_reward": -0.11841523088514805,
"rewards/format_reward": 0.5833333283662796,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 2810.6111450195312,
"epoch": 0.4541559554413025,
"grad_norm": 3.0615949630737305,
"kl": 0.8046875,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0807,
"reward": 0.6215685978531837,
"reward_std": 0.6345800720155239,
"rewards/cosine_scaled_reward": 0.06078430451452732,
"rewards/format_reward": 0.5000000018626451,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 2483.513916015625,
"epoch": 0.45586975149957154,
"grad_norm": 2.4526662826538086,
"kl": 0.689453125,
"learning_rate": 5.78255733788191e-07,
"loss": 0.1832,
"reward": 0.4189574085175991,
"reward_std": 0.4973677098751068,
"rewards/cosine_scaled_reward": -0.0544101782143116,
"rewards/format_reward": 0.5277777835726738,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 2401.1805725097656,
"epoch": 0.45758354755784064,
"grad_norm": 1.3402016162872314,
"kl": 0.7294921875,
"learning_rate": 5.751196772469237e-07,
"loss": 0.1157,
"reward": 0.36370813054963946,
"reward_std": 0.4258965626358986,
"rewards/cosine_scaled_reward": -0.14453481137752533,
"rewards/format_reward": 0.6527777910232544,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 2708.3194580078125,
"epoch": 0.4592973436161097,
"grad_norm": 3.217470407485962,
"kl": 0.599609375,
"learning_rate": 5.71982396408026e-07,
"loss": 0.0051,
"reward": 0.26998334005475044,
"reward_std": 0.49185192957520485,
"rewards/cosine_scaled_reward": -0.15667499974370003,
"rewards/format_reward": 0.5833333432674408,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 2829.4583740234375,
"epoch": 0.46101113967437873,
"grad_norm": 1.457294225692749,
"kl": 0.591796875,
"learning_rate": 5.688440441781398e-07,
"loss": 0.1149,
"reward": 0.2393805852625519,
"reward_std": 0.7379022389650345,
"rewards/cosine_scaled_reward": -0.11642082477919757,
"rewards/format_reward": 0.4722222313284874,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 2878.2916259765625,
"epoch": 0.46272493573264784,
"grad_norm": 1.101474642753601,
"kl": 0.69921875,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0498,
"reward": 0.37158428877592087,
"reward_std": 0.5783374309539795,
"rewards/cosine_scaled_reward": -0.029485642910003662,
"rewards/format_reward": 0.430555559694767,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 2793.6806030273438,
"epoch": 0.4644387317909169,
"grad_norm": 0.8390009999275208,
"kl": 0.63916015625,
"learning_rate": 5.625647374256061e-07,
"loss": 0.0834,
"reward": 0.38940694369375706,
"reward_std": 0.6547586917877197,
"rewards/cosine_scaled_reward": -0.06918542925268412,
"rewards/format_reward": 0.5277777723968029,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 2366.638885498047,
"epoch": 0.4661525278491859,
"grad_norm": 4.141148090362549,
"kl": 0.4765625,
"learning_rate": 5.594240889475106e-07,
"loss": 0.2224,
"reward": 0.4617117829620838,
"reward_std": 0.6572139859199524,
"rewards/cosine_scaled_reward": -0.060810765251517296,
"rewards/format_reward": 0.5833333358168602,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 2721.6805419921875,
"epoch": 0.46786632390745503,
"grad_norm": 1.906948208808899,
"kl": 0.599609375,
"learning_rate": 5.562829811526154e-07,
"loss": 0.1188,
"reward": 0.26052477210760117,
"reward_std": 0.6570783406496048,
"rewards/cosine_scaled_reward": -0.06418205983936787,
"rewards/format_reward": 0.3888888992369175,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 2921.3750610351562,
"epoch": 0.4695801199657241,
"grad_norm": 1.8937734365463257,
"kl": 0.52685546875,
"learning_rate": 5.531415671340826e-07,
"loss": 0.0185,
"reward": 0.22513618250377476,
"reward_std": 0.6058431342244148,
"rewards/cosine_scaled_reward": -0.16520969779230654,
"rewards/format_reward": 0.555555559694767,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 2988.15283203125,
"epoch": 0.4712939160239931,
"grad_norm": 1.208433985710144,
"kl": 0.54931640625,
"learning_rate": 5.5e-07,
"loss": 0.0485,
"reward": 0.11821263573256147,
"reward_std": 0.391703762114048,
"rewards/cosine_scaled_reward": -0.15617146715521812,
"rewards/format_reward": 0.4305555494502187,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 2643.541748046875,
"epoch": 0.4730077120822622,
"grad_norm": 0.9594613313674927,
"kl": 0.4326171875,
"learning_rate": 5.468584328659172e-07,
"loss": 0.0807,
"reward": 0.32308289408683777,
"reward_std": 0.7159284129738808,
"rewards/cosine_scaled_reward": -0.08151410473510623,
"rewards/format_reward": 0.4861111156642437,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 2926.013916015625,
"epoch": 0.47472150814053127,
"grad_norm": 1.3466960191726685,
"kl": 0.5048828125,
"learning_rate": 5.437170188473847e-07,
"loss": 0.1283,
"reward": 0.06485692039132118,
"reward_std": 0.542218990623951,
"rewards/cosine_scaled_reward": -0.17590487515553832,
"rewards/format_reward": 0.4166666679084301,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 2993.1945190429688,
"epoch": 0.47643530419880037,
"grad_norm": 3.963604688644409,
"kl": 0.5546875,
"learning_rate": 5.405759110524894e-07,
"loss": 0.2168,
"reward": 0.03267951123416424,
"reward_std": 0.7076856940984726,
"rewards/cosine_scaled_reward": -0.15727136190980673,
"rewards/format_reward": 0.3472222276031971,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 3138.6805419921875,
"epoch": 0.4781491002570694,
"grad_norm": 0.5958003997802734,
"kl": 0.56201171875,
"learning_rate": 5.37435262574394e-07,
"loss": 0.0759,
"reward": -0.051162030547857285,
"reward_std": 0.5351358503103256,
"rewards/cosine_scaled_reward": -0.2478032372891903,
"rewards/format_reward": 0.4444444477558136,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 2737.1944885253906,
"epoch": 0.47986289631533846,
"grad_norm": 1.6760090589523315,
"kl": 0.59228515625,
"learning_rate": 5.342952264838747e-07,
"loss": 0.1096,
"reward": 0.45366813987493515,
"reward_std": 0.7812162339687347,
"rewards/cosine_scaled_reward": -0.0023325812071561813,
"rewards/format_reward": 0.4583333367481828,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 2423.2222290039062,
"epoch": 0.48157669237360756,
"grad_norm": 2.0177345275878906,
"kl": 0.461669921875,
"learning_rate": 5.311559558218603e-07,
"loss": 0.1262,
"reward": 0.38226850144565105,
"reward_std": 0.6752881184220314,
"rewards/cosine_scaled_reward": -0.07969908323138952,
"rewards/format_reward": 0.5416666585952044,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 2740.3611450195312,
"epoch": 0.4832904884318766,
"grad_norm": 1.8314719200134277,
"kl": 0.6337890625,
"learning_rate": 5.28017603591974e-07,
"loss": 0.1455,
"reward": 0.21622517937794328,
"reward_std": 0.6346057131886482,
"rewards/cosine_scaled_reward": -0.15577631071209908,
"rewards/format_reward": 0.5277777910232544,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 2889.2500610351562,
"epoch": 0.48500428449014565,
"grad_norm": 0.8971331119537354,
"kl": 0.6259765625,
"learning_rate": 5.248803227530763e-07,
"loss": 0.1266,
"reward": 0.03935375134460628,
"reward_std": 0.6441294327378273,
"rewards/cosine_scaled_reward": -0.20254534482955933,
"rewards/format_reward": 0.4444444477558136,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 2276.9584045410156,
"epoch": 0.48671808054841476,
"grad_norm": 6.356135368347168,
"kl": 0.4833984375,
"learning_rate": 5.21744266211809e-07,
"loss": 0.3013,
"reward": 0.32465188996866345,
"reward_std": 0.6560942605137825,
"rewards/cosine_scaled_reward": -0.09461849741637707,
"rewards/format_reward": 0.5138889029622078,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 2624.9306640625,
"epoch": 0.4884318766066838,
"grad_norm": 1.9299744367599487,
"kl": 0.5576171875,
"learning_rate": 5.186095868151436e-07,
"loss": 0.1493,
"reward": 0.21753913909196854,
"reward_std": 0.683107927441597,
"rewards/cosine_scaled_reward": -0.16206377279013395,
"rewards/format_reward": 0.5416666716337204,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 2555.7361450195312,
"epoch": 0.49014567266495285,
"grad_norm": 2.2400879859924316,
"kl": 0.62890625,
"learning_rate": 5.154764373429315e-07,
"loss": 0.1583,
"reward": 0.4158199355006218,
"reward_std": 0.6385739594697952,
"rewards/cosine_scaled_reward": -0.021256705978885293,
"rewards/format_reward": 0.4583333320915699,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 2526.52783203125,
"epoch": 0.49185946872322195,
"grad_norm": 3.190502405166626,
"kl": 0.916015625,
"learning_rate": 5.123449705004581e-07,
"loss": 0.2192,
"reward": 0.4090676587074995,
"reward_std": 0.7619837448000908,
"rewards/cosine_scaled_reward": -0.0662995120510459,
"rewards/format_reward": 0.5416666641831398,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 2134.65283203125,
"epoch": 0.493573264781491,
"grad_norm": 6.452578544616699,
"kl": 1.103515625,
"learning_rate": 5.09215338910999e-07,
"loss": 0.368,
"reward": 0.44275959208607674,
"reward_std": 0.6151050329208374,
"rewards/cosine_scaled_reward": -0.09806465543806553,
"rewards/format_reward": 0.6388888955116272,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 2290.013885498047,
"epoch": 0.4952870608397601,
"grad_norm": 1.6855307817459106,
"kl": 0.82275390625,
"learning_rate": 5.060876951083828e-07,
"loss": 0.1464,
"reward": 0.4334092391654849,
"reward_std": 0.6985170915722847,
"rewards/cosine_scaled_reward": -0.0819065012037754,
"rewards/format_reward": 0.5972222164273262,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 1840.6388549804688,
"epoch": 0.49700085689802914,
"grad_norm": 2.9146785736083984,
"kl": 0.79296875,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0283,
"reward": 0.7537698708474636,
"reward_std": 0.5962013602256775,
"rewards/cosine_scaled_reward": 0.008829381316900253,
"rewards/format_reward": 0.736111119389534,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 2200.4444274902344,
"epoch": 0.4987146529562982,
"grad_norm": 1.7983882427215576,
"kl": 0.927734375,
"learning_rate": 4.998389805071536e-07,
"loss": 0.2134,
"reward": 0.614590086042881,
"reward_std": 0.9522670358419418,
"rewards/cosine_scaled_reward": -0.012149423826485872,
"rewards/format_reward": 0.638888880610466,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 1986.0694885253906,
"epoch": 0.5004284490145673,
"grad_norm": 2.5195839405059814,
"kl": 0.9873046875,
"learning_rate": 4.967182142620745e-07,
"loss": 0.1778,
"reward": 0.7197987511754036,
"reward_std": 0.8348591700196266,
"rewards/cosine_scaled_reward": 0.01267714286223054,
"rewards/format_reward": 0.6944444477558136,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 1970.9999694824219,
"epoch": 0.5021422450728363,
"grad_norm": 3.093557596206665,
"kl": 1.0625,
"learning_rate": 4.93600044896063e-07,
"loss": 0.1538,
"reward": 0.29343970119953156,
"reward_std": 0.49155813455581665,
"rewards/cosine_scaled_reward": -0.13105794228613377,
"rewards/format_reward": 0.555555559694767,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 1924.7916870117188,
"epoch": 0.5038560411311054,
"grad_norm": 6.1597208976745605,
"kl": 0.87158203125,
"learning_rate": 4.904846243842949e-07,
"loss": 0.3554,
"reward": 0.4494058433920145,
"reward_std": 0.747850589454174,
"rewards/cosine_scaled_reward": -0.08085263520479202,
"rewards/format_reward": 0.6111111119389534,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 2000.6388854980469,
"epoch": 0.5055698371893744,
"grad_norm": 2.868744134902954,
"kl": 0.8369140625,
"learning_rate": 4.873721045679706e-07,
"loss": 0.1929,
"reward": 0.23984116781502962,
"reward_std": 0.4878830164670944,
"rewards/cosine_scaled_reward": -0.22730162646621466,
"rewards/format_reward": 0.6944444477558136,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 2322.5556030273438,
"epoch": 0.5072836332476436,
"grad_norm": 3.0267629623413086,
"kl": 1.2763671875,
"learning_rate": 4.842626371469149e-07,
"loss": 0.2229,
"reward": 0.4730116240680218,
"reward_std": 0.672569528222084,
"rewards/cosine_scaled_reward": 0.0003947049845010042,
"rewards/format_reward": 0.4722222276031971,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 1968.2222595214844,
"epoch": 0.5089974293059126,
"grad_norm": 3.3123202323913574,
"kl": 1.0166015625,
"learning_rate": 4.811563736721829e-07,
"loss": 0.2608,
"reward": 0.4829604886472225,
"reward_std": 0.6525571122765541,
"rewards/cosine_scaled_reward": -0.08490864699706435,
"rewards/format_reward": 0.6527777686715126,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 2157.8055725097656,
"epoch": 0.5107112253641817,
"grad_norm": 4.5067524909973145,
"kl": 1.609375,
"learning_rate": 4.780534655386743e-07,
"loss": 0.379,
"reward": 0.673854373395443,
"reward_std": 0.728430263698101,
"rewards/cosine_scaled_reward": 0.024427177384495735,
"rewards/format_reward": 0.6250000149011612,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 1613.5000305175781,
"epoch": 0.5124250214224507,
"grad_norm": 3.3076987266540527,
"kl": 1.0537109375,
"learning_rate": 4.749540639777539e-07,
"loss": 0.2441,
"reward": 0.8705739304423332,
"reward_std": 0.7832073271274567,
"rewards/cosine_scaled_reward": 0.053342508152127266,
"rewards/format_reward": 0.7638888955116272,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 2283.0000610351562,
"epoch": 0.5141388174807198,
"grad_norm": 4.202121734619141,
"kl": 1.794921875,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.2233,
"reward": 0.41079268511384726,
"reward_std": 0.49303294718265533,
"rewards/cosine_scaled_reward": -0.07238144427537918,
"rewards/format_reward": 0.5555555522441864,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 1734.0139465332031,
"epoch": 0.5158526135389888,
"grad_norm": 2.733738660812378,
"kl": 1.271484375,
"learning_rate": 4.68766384637248e-07,
"loss": 0.3623,
"reward": 0.5950284972786903,
"reward_std": 0.5468220561742783,
"rewards/cosine_scaled_reward": -0.042763520032167435,
"rewards/format_reward": 0.6805555671453476,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 2734.5138549804688,
"epoch": 0.517566409597258,
"grad_norm": 5.247328281402588,
"kl": 1.837890625,
"learning_rate": 4.656784084364238e-07,
"loss": 0.1992,
"reward": 0.1533558116061613,
"reward_std": 0.5701670944690704,
"rewards/cosine_scaled_reward": -0.10387765569612384,
"rewards/format_reward": 0.36111111380159855,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 1955.3333435058594,
"epoch": 0.519280205655527,
"grad_norm": 2.1991233825683594,
"kl": 1.4111328125,
"learning_rate": 4.6259454195101267e-07,
"loss": 0.2534,
"reward": 0.4565839725546539,
"reward_std": 0.5375222712755203,
"rewards/cosine_scaled_reward": -0.13281912542879581,
"rewards/format_reward": 0.7222222164273262,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 1990.6805419921875,
"epoch": 0.5209940017137961,
"grad_norm": 3.0468897819519043,
"kl": 1.685546875,
"learning_rate": 4.59514935484316e-07,
"loss": 0.2866,
"reward": 0.589899554848671,
"reward_std": 0.6861986592411995,
"rewards/cosine_scaled_reward": -0.03143910859944299,
"rewards/format_reward": 0.6527777910232544,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 1996.5416870117188,
"epoch": 0.5227077977720651,
"grad_norm": 4.011932849884033,
"kl": 1.462890625,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.2531,
"reward": 0.3674123687669635,
"reward_std": 0.6027099043130875,
"rewards/cosine_scaled_reward": -0.10796047560870647,
"rewards/format_reward": 0.5833333283662796,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 1896.9166870117188,
"epoch": 0.5244215938303342,
"grad_norm": 2.977555751800537,
"kl": 1.01171875,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.1578,
"reward": 0.43058543652296066,
"reward_std": 0.5935798361897469,
"rewards/cosine_scaled_reward": -0.12498506158590317,
"rewards/format_reward": 0.6805555671453476,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 2022.4584045410156,
"epoch": 0.5261353898886033,
"grad_norm": 2.7941606044769287,
"kl": 1.24609375,
"learning_rate": 4.503031760712397e-07,
"loss": 0.2441,
"reward": 0.7062125951051712,
"reward_std": 0.8090076595544815,
"rewards/cosine_scaled_reward": -0.0010603656992316246,
"rewards/format_reward": 0.7083333283662796,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 2287.125030517578,
"epoch": 0.5278491859468724,
"grad_norm": 1.6866544485092163,
"kl": 1.287109375,
"learning_rate": 4.4724210845020494e-07,
"loss": 0.2517,
"reward": 0.650560175999999,
"reward_std": 0.742147371172905,
"rewards/cosine_scaled_reward": 0.033613420091569424,
"rewards/format_reward": 0.5833333507180214,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 2066.5694580078125,
"epoch": 0.5295629820051414,
"grad_norm": 2.5993359088897705,
"kl": 1.2255859375,
"learning_rate": 4.441860491038345e-07,
"loss": 0.1916,
"reward": 0.6525638625025749,
"reward_std": 0.6665498167276382,
"rewards/cosine_scaled_reward": 0.02767082443460822,
"rewards/format_reward": 0.5972222313284874,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 2370.763885498047,
"epoch": 0.5312767780634104,
"grad_norm": 3.1745495796203613,
"kl": 1.205078125,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.167,
"reward": 0.4759225994348526,
"reward_std": 0.8299422115087509,
"rewards/cosine_scaled_reward": -0.04676092881709337,
"rewards/format_reward": 0.569444440305233,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 2503.5972290039062,
"epoch": 0.5329905741216795,
"grad_norm": 2.2356581687927246,
"kl": 1.376953125,
"learning_rate": 4.3808955077581546e-07,
"loss": 0.2558,
"reward": 0.2966647706925869,
"reward_std": 0.5752375796437263,
"rewards/cosine_scaled_reward": -0.15722317062318325,
"rewards/format_reward": 0.6111111119389534,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 2278.0277404785156,
"epoch": 0.5347043701799485,
"grad_norm": 1.4038455486297607,
"kl": 1.203125,
"learning_rate": 4.350494089288943e-07,
"loss": 0.241,
"reward": 0.8164278883486986,
"reward_std": 0.7621838673949242,
"rewards/cosine_scaled_reward": 0.08876948896795511,
"rewards/format_reward": 0.638888880610466,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 1964.8333435058594,
"epoch": 0.5364181662382177,
"grad_norm": 5.611269950866699,
"kl": 1.6904296875,
"learning_rate": 4.3201486961161093e-07,
"loss": 0.1802,
"reward": 0.5501147694885731,
"reward_std": 0.48398981615900993,
"rewards/cosine_scaled_reward": -0.07910929806530476,
"rewards/format_reward": 0.7083333283662796,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 2596.0139770507812,
"epoch": 0.5381319622964867,
"grad_norm": 2.7361557483673096,
"kl": 0.904296875,
"learning_rate": 4.2898608072313045e-07,
"loss": 0.1967,
"reward": 0.43212154414504766,
"reward_std": 0.8072051256895065,
"rewards/cosine_scaled_reward": -0.04088366776704788,
"rewards/format_reward": 0.5138888880610466,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 2164.986083984375,
"epoch": 0.5398457583547558,
"grad_norm": 2.377624988555908,
"kl": 1.009765625,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.2612,
"reward": 0.345002256333828,
"reward_std": 0.6840208172798157,
"rewards/cosine_scaled_reward": -0.13305442477576435,
"rewards/format_reward": 0.611111119389534,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 2112.861114501953,
"epoch": 0.5415595544130248,
"grad_norm": 2.4749066829681396,
"kl": 1.1328125,
"learning_rate": 4.2294634442070553e-07,
"loss": 0.2521,
"reward": 0.3993903249502182,
"reward_std": 0.7594646960496902,
"rewards/cosine_scaled_reward": -0.09891596343368292,
"rewards/format_reward": 0.5972222238779068,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 2763.791717529297,
"epoch": 0.5432733504712939,
"grad_norm": 3.959906816482544,
"kl": 1.5537109375,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.1382,
"reward": 0.3366717994213104,
"reward_std": 0.5981347486376762,
"rewards/cosine_scaled_reward": -0.10944187548011541,
"rewards/format_reward": 0.5555555559694767,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 1693.9583129882812,
"epoch": 0.5449871465295629,
"grad_norm": 7.122504711151123,
"kl": 1.0234375,
"learning_rate": 4.1693137748017915e-07,
"loss": 0.3755,
"reward": 1.1441613137722015,
"reward_std": 0.5525609478354454,
"rewards/cosine_scaled_reward": 0.21791397035121918,
"rewards/format_reward": 0.7083333432674408,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 2302.7222595214844,
"epoch": 0.5467009425878321,
"grad_norm": 1.7870283126831055,
"kl": 1.1513671875,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.2157,
"reward": 0.44585637911222875,
"reward_std": 0.5990823060274124,
"rewards/cosine_scaled_reward": -0.09651626879349351,
"rewards/format_reward": 0.638888880610466,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 2418.6112060546875,
"epoch": 0.5484147386461011,
"grad_norm": 4.37020206451416,
"kl": 1.3271484375,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.1373,
"reward": 0.7411398887634277,
"reward_std": 0.8346492722630501,
"rewards/cosine_scaled_reward": 0.030292170122265816,
"rewards/format_reward": 0.6805555522441864,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 2477.4305419921875,
"epoch": 0.5501285347043702,
"grad_norm": 2.8605029582977295,
"kl": 0.9931640625,
"learning_rate": 4.079579333738039e-07,
"loss": 0.1566,
"reward": 0.3635707888752222,
"reward_std": 0.6505779251456261,
"rewards/cosine_scaled_reward": -0.14460349176079035,
"rewards/format_reward": 0.6527777835726738,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 1934.0694885253906,
"epoch": 0.5518423307626392,
"grad_norm": 2.2090539932250977,
"kl": 1.033935546875,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.1413,
"reward": 0.43895523250102997,
"reward_std": 0.6110691279172897,
"rewards/cosine_scaled_reward": -0.14857794775161892,
"rewards/format_reward": 0.7361111342906952,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 2659.2083129882812,
"epoch": 0.5535561268209083,
"grad_norm": 1.6186331510543823,
"kl": 0.92578125,
"learning_rate": 4.020100089676376e-07,
"loss": 0.1545,
"reward": 0.43137288000434637,
"reward_std": 0.5587008334696293,
"rewards/cosine_scaled_reward": -0.041258019395172596,
"rewards/format_reward": 0.5138888955116272,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 2133.0,
"epoch": 0.5552699228791774,
"grad_norm": 2.309271812438965,
"kl": 0.646484375,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.2071,
"reward": 0.8334337323904037,
"reward_std": 0.7556089013814926,
"rewards/cosine_scaled_reward": 0.0486613066168502,
"rewards/format_reward": 0.7361111044883728,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 2852.5416259765625,
"epoch": 0.5569837189374465,
"grad_norm": 1.367803692817688,
"kl": 1.1455078125,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.2217,
"reward": 0.30423190630972385,
"reward_std": 0.6387892812490463,
"rewards/cosine_scaled_reward": -0.13955070948577486,
"rewards/format_reward": 0.5833333432674408,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 2254.2638549804688,
"epoch": 0.5586975149957155,
"grad_norm": 2.456948757171631,
"kl": 0.90966796875,
"learning_rate": 3.931425787051832e-07,
"loss": 0.2717,
"reward": 0.7499970353674144,
"reward_std": 0.5177437886595726,
"rewards/cosine_scaled_reward": 0.06249852292239666,
"rewards/format_reward": 0.625,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 2643.263916015625,
"epoch": 0.5604113110539846,
"grad_norm": 2.975080728530884,
"kl": 0.96875,
"learning_rate": 3.902018669163384e-07,
"loss": 0.2016,
"reward": 0.37286074459552765,
"reward_std": 0.6334929168224335,
"rewards/cosine_scaled_reward": -0.06356962397694588,
"rewards/format_reward": 0.5000000074505806,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 2430.6666564941406,
"epoch": 0.5621251071122536,
"grad_norm": 2.2276480197906494,
"kl": 1.1484375,
"learning_rate": 3.872689434630585e-07,
"loss": 0.2054,
"reward": 0.45167311653494835,
"reward_std": 0.6782046630978584,
"rewards/cosine_scaled_reward": -0.07971900515258312,
"rewards/format_reward": 0.6111111268401146,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 2118.3194885253906,
"epoch": 0.5638389031705227,
"grad_norm": 1.439645767211914,
"kl": 0.9873046875,
"learning_rate": 3.843439512918949e-07,
"loss": 0.1558,
"reward": 0.3222229927778244,
"reward_std": 0.5387292131781578,
"rewards/cosine_scaled_reward": -0.186110720038414,
"rewards/format_reward": 0.6944444477558136,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 2615.3750610351562,
"epoch": 0.5655526992287918,
"grad_norm": 1.3368608951568604,
"kl": 0.822265625,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0814,
"reward": 0.19894374161958694,
"reward_std": 0.4315878227353096,
"rewards/cosine_scaled_reward": -0.17136146454140544,
"rewards/format_reward": 0.5416666641831398,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 2637.2084045410156,
"epoch": 0.5672664952870609,
"grad_norm": 1.4436960220336914,
"kl": 0.78125,
"learning_rate": 3.785183306423767e-07,
"loss": 0.1763,
"reward": 0.555847343057394,
"reward_std": 0.7657169997692108,
"rewards/cosine_scaled_reward": 0.020979220047593117,
"rewards/format_reward": 0.5138888955116272,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 2674.9444580078125,
"epoch": 0.5689802913453299,
"grad_norm": 3.6536855697631836,
"kl": 1.30078125,
"learning_rate": 3.7561798609655373e-07,
"loss": 0.1134,
"reward": 0.44342901557683945,
"reward_std": 0.6001273989677429,
"rewards/cosine_scaled_reward": -0.09772994555532932,
"rewards/format_reward": 0.6388888880610466,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 2896.0277709960938,
"epoch": 0.570694087403599,
"grad_norm": 1.9911452531814575,
"kl": 1.0078125,
"learning_rate": 3.72726140684072e-07,
"loss": 0.1034,
"reward": -0.17525828257203102,
"reward_std": 0.4928872212767601,
"rewards/cosine_scaled_reward": -0.24040691554546356,
"rewards/format_reward": 0.30555556528270245,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 2810.861083984375,
"epoch": 0.572407883461868,
"grad_norm": 1.8073278665542603,
"kl": 1.068359375,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.1821,
"reward": -0.006253276020288467,
"reward_std": 0.5080604404211044,
"rewards/cosine_scaled_reward": -0.2114599784836173,
"rewards/format_reward": 0.4166666753590107,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 2462.65283203125,
"epoch": 0.5741216795201372,
"grad_norm": 1.5005158185958862,
"kl": 0.771484375,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.1649,
"reward": 0.45248544216156006,
"reward_std": 0.5002452582120895,
"rewards/cosine_scaled_reward": -0.07931282371282578,
"rewards/format_reward": 0.611111119389534,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 2160.125030517578,
"epoch": 0.5758354755784062,
"grad_norm": 3.1605849266052246,
"kl": 0.548828125,
"learning_rate": 3.641030065789562e-07,
"loss": 0.1752,
"reward": 0.24482397455722094,
"reward_std": 0.515994019806385,
"rewards/cosine_scaled_reward": -0.17619912140071392,
"rewards/format_reward": 0.597222238779068,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 2403.3194580078125,
"epoch": 0.5775492716366752,
"grad_norm": 2.8763837814331055,
"kl": 0.736328125,
"learning_rate": 3.612465628992203e-07,
"loss": 0.1929,
"reward": 0.5475254282355309,
"reward_std": 0.6548926681280136,
"rewards/cosine_scaled_reward": -0.038737302646040916,
"rewards/format_reward": 0.6249999925494194,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 2418.013946533203,
"epoch": 0.5792630676949443,
"grad_norm": 2.0177650451660156,
"kl": 0.6171875,
"learning_rate": 3.5839931879571725e-07,
"loss": 0.2226,
"reward": 0.5356792770326138,
"reward_std": 0.9891562312841415,
"rewards/cosine_scaled_reward": -0.01688259281218052,
"rewards/format_reward": 0.5694444552063942,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 2717.888916015625,
"epoch": 0.5809768637532133,
"grad_norm": 2.011136293411255,
"kl": 0.8603515625,
"learning_rate": 3.555614130391079e-07,
"loss": 0.0711,
"reward": 0.31583554670214653,
"reward_std": 0.6509723365306854,
"rewards/cosine_scaled_reward": -0.14069335255771875,
"rewards/format_reward": 0.5972222313284874,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 2609.1805114746094,
"epoch": 0.5826906598114824,
"grad_norm": 2.1439146995544434,
"kl": 0.60546875,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.1085,
"reward": 0.26384788006544113,
"reward_std": 0.5596405640244484,
"rewards/cosine_scaled_reward": -0.12502050958573818,
"rewards/format_reward": 0.5138888917863369,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 2366.8055725097656,
"epoch": 0.5844044558697515,
"grad_norm": 3.087810516357422,
"kl": 1.16796875,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.2266,
"reward": 0.19622072577476501,
"reward_std": 0.43179403990507126,
"rewards/cosine_scaled_reward": -0.20050075091421604,
"rewards/format_reward": 0.5972222313284874,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 2665.3055419921875,
"epoch": 0.5861182519280206,
"grad_norm": 2.5300474166870117,
"kl": 0.7939453125,
"learning_rate": 3.471051066897562e-07,
"loss": 0.1537,
"reward": 0.24374699965119362,
"reward_std": 0.7871415168046951,
"rewards/cosine_scaled_reward": -0.1350709474645555,
"rewards/format_reward": 0.5138889029622078,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 2231.2222900390625,
"epoch": 0.5878320479862896,
"grad_norm": 6.685680389404297,
"kl": 0.60302734375,
"learning_rate": 3.4430593282358777e-07,
"loss": 0.3184,
"reward": 0.4801894012489356,
"reward_std": 0.4852745458483696,
"rewards/cosine_scaled_reward": -0.02379419095814228,
"rewards/format_reward": 0.5277777835726738,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 2502.8472290039062,
"epoch": 0.5895458440445587,
"grad_norm": 2.49977970123291,
"kl": 1.0634765625,
"learning_rate": 3.4151678419606233e-07,
"loss": 0.2075,
"reward": 0.22286849096417427,
"reward_std": 0.51853808760643,
"rewards/cosine_scaled_reward": -0.18023241311311722,
"rewards/format_reward": 0.5833333432674408,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 2553.8472290039062,
"epoch": 0.5912596401028277,
"grad_norm": 1.4922689199447632,
"kl": 0.7724609375,
"learning_rate": 3.387377967463493e-07,
"loss": 0.11,
"reward": 0.40987285412847996,
"reward_std": 0.6866099908947945,
"rewards/cosine_scaled_reward": -0.03117468417622149,
"rewards/format_reward": 0.4722222313284874,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 2434.250030517578,
"epoch": 0.5929734361610969,
"grad_norm": 2.287896156311035,
"kl": 0.73828125,
"learning_rate": 3.359691059183761e-07,
"loss": 0.1042,
"reward": 0.566213920712471,
"reward_std": 0.6637867465615273,
"rewards/cosine_scaled_reward": -0.05022636614739895,
"rewards/format_reward": 0.6666666865348816,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 2606.3056030273438,
"epoch": 0.5946872322193659,
"grad_norm": 5.195223808288574,
"kl": 0.8203125,
"learning_rate": 3.3321084665422803e-07,
"loss": 0.0878,
"reward": 0.30292151868343353,
"reward_std": 0.6232884004712105,
"rewards/cosine_scaled_reward": -0.09853924717754126,
"rewards/format_reward": 0.5000000074505806,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 2294.3194580078125,
"epoch": 0.596401028277635,
"grad_norm": 2.7749757766723633,
"kl": 1.12744140625,
"learning_rate": 3.3046315338757026e-07,
"loss": 0.113,
"reward": 0.10566018056124449,
"reward_std": 0.4547986686229706,
"rewards/cosine_scaled_reward": -0.23883657529950142,
"rewards/format_reward": 0.5833333507180214,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 2798.375,
"epoch": 0.598114824335904,
"grad_norm": 4.372980117797852,
"kl": 0.76708984375,
"learning_rate": 3.2772616003709616e-07,
"loss": 0.0724,
"reward": 0.19539665430784225,
"reward_std": 0.547118715941906,
"rewards/cosine_scaled_reward": -0.0898016735445708,
"rewards/format_reward": 0.37500000186264515,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 2323.1806030273438,
"epoch": 0.5998286203941731,
"grad_norm": 0.9666920900344849,
"kl": 0.42724609375,
"learning_rate": 3.250000000000001e-07,
"loss": 0.1139,
"reward": 0.3154673893004656,
"reward_std": 0.6931511759757996,
"rewards/cosine_scaled_reward": -0.1408774359151721,
"rewards/format_reward": 0.5972222238779068,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 2554.791717529297,
"epoch": 0.6015424164524421,
"grad_norm": 1.4160966873168945,
"kl": 0.673828125,
"learning_rate": 3.222848061454764e-07,
"loss": 0.1226,
"reward": 0.2959921658039093,
"reward_std": 0.6351921036839485,
"rewards/cosine_scaled_reward": -0.1922817062586546,
"rewards/format_reward": 0.6805555522441864,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 2883.0972900390625,
"epoch": 0.6032562125107113,
"grad_norm": 1.3862693309783936,
"kl": 0.57470703125,
"learning_rate": 3.195807108082429e-07,
"loss": 0.0944,
"reward": -0.15852557588368654,
"reward_std": 0.3618531711399555,
"rewards/cosine_scaled_reward": -0.22509612515568733,
"rewards/format_reward": 0.29166666977107525,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 2766.3611755371094,
"epoch": 0.6049700085689803,
"grad_norm": 2.0312767028808594,
"kl": 0.47900390625,
"learning_rate": 3.168878457820915e-07,
"loss": 0.1735,
"reward": 0.0013820715248584747,
"reward_std": 0.4379913955926895,
"rewards/cosine_scaled_reward": -0.2284756200388074,
"rewards/format_reward": 0.4583333283662796,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 2303.4444885253906,
"epoch": 0.6066838046272494,
"grad_norm": 1.9564874172210693,
"kl": 0.583984375,
"learning_rate": 3.142063423134644e-07,
"loss": 0.1414,
"reward": 0.8931210651062429,
"reward_std": 0.7761038094758987,
"rewards/cosine_scaled_reward": 0.12017163541167974,
"rewards/format_reward": 0.6527777649462223,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 2350.3333435058594,
"epoch": 0.6083976006855184,
"grad_norm": 3.0156519412994385,
"kl": 0.501953125,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0948,
"reward": 0.21542136371135712,
"reward_std": 0.4440060332417488,
"rewards/cosine_scaled_reward": -0.15617820341140032,
"rewards/format_reward": 0.5277777761220932,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 2587.5694580078125,
"epoch": 0.6101113967437874,
"grad_norm": 1.8885806798934937,
"kl": 0.7080078125,
"learning_rate": 3.0887794225945143e-07,
"loss": 0.1358,
"reward": 0.2837059774901718,
"reward_std": 0.5896440669894218,
"rewards/cosine_scaled_reward": -0.1637025810778141,
"rewards/format_reward": 0.6111111119389534,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 2377.9722290039062,
"epoch": 0.6118251928020566,
"grad_norm": 2.1884989738464355,
"kl": 0.7119140625,
"learning_rate": 3.062313053727671e-07,
"loss": 0.2069,
"reward": 0.5028799092397094,
"reward_std": 0.6587233692407608,
"rewards/cosine_scaled_reward": -0.04022672958672047,
"rewards/format_reward": 0.5833333358168602,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 2724.02783203125,
"epoch": 0.6135389888603257,
"grad_norm": 1.1445249319076538,
"kl": 0.5654296875,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.1018,
"reward": 0.2546988914255053,
"reward_std": 0.7167258933186531,
"rewards/cosine_scaled_reward": -0.10876166447997093,
"rewards/format_reward": 0.4722222238779068,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 2532.2638549804688,
"epoch": 0.6152527849185947,
"grad_norm": 2.5419840812683105,
"kl": 0.68505859375,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.0873,
"reward": 0.21593652665615082,
"reward_std": 0.6653575152158737,
"rewards/cosine_scaled_reward": -0.11425395932747051,
"rewards/format_reward": 0.4444444477558136,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 2583.513916015625,
"epoch": 0.6169665809768637,
"grad_norm": 3.3858025074005127,
"kl": 0.7587890625,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0324,
"reward": 0.317771688933135,
"reward_std": 0.6041048616170883,
"rewards/cosine_scaled_reward": -0.09111416153609753,
"rewards/format_reward": 0.4999999962747097,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 2334.1944274902344,
"epoch": 0.6186803770351328,
"grad_norm": 0.9652746915817261,
"kl": 0.50732421875,
"learning_rate": 2.9576484845877793e-07,
"loss": 0.0994,
"reward": 0.6510396376252174,
"reward_std": 0.8507343530654907,
"rewards/cosine_scaled_reward": 0.019964261911809444,
"rewards/format_reward": 0.6111111119389534,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 3043.9445190429688,
"epoch": 0.6203941730934018,
"grad_norm": 0.7930195927619934,
"kl": 0.67626953125,
"learning_rate": 2.931788945420058e-07,
"loss": 0.0974,
"reward": 0.13819648325443268,
"reward_std": 0.5064943730831146,
"rewards/cosine_scaled_reward": -0.1531239915639162,
"rewards/format_reward": 0.4444444552063942,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 2825.7916259765625,
"epoch": 0.622107969151671,
"grad_norm": 0.6896006464958191,
"kl": 0.64794921875,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.1145,
"reward": 0.08600431494414806,
"reward_std": 0.5438744425773621,
"rewards/cosine_scaled_reward": -0.17227561306208372,
"rewards/format_reward": 0.4305555559694767,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 2422.3334045410156,
"epoch": 0.62382176520994,
"grad_norm": 1.5994491577148438,
"kl": 0.5615234375,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.0825,
"reward": 0.5049359295517206,
"reward_std": 0.7525297850370407,
"rewards/cosine_scaled_reward": -0.06697649694979191,
"rewards/format_reward": 0.6388888955116272,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 2465.0556030273438,
"epoch": 0.6255355612682091,
"grad_norm": 7.009267330169678,
"kl": 0.66259765625,
"learning_rate": 2.854966364683872e-07,
"loss": 0.2849,
"reward": 0.15274390950798988,
"reward_std": 0.48094654455780983,
"rewards/cosine_scaled_reward": -0.18057249579578638,
"rewards/format_reward": 0.5138888880610466,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 2835.9861450195312,
"epoch": 0.6272493573264781,
"grad_norm": 1.8245147466659546,
"kl": 0.6435546875,
"learning_rate": 2.829615010283344e-07,
"loss": 0.0872,
"reward": 0.43825584976002574,
"reward_std": 0.4411094859242439,
"rewards/cosine_scaled_reward": 0.031627919524908066,
"rewards/format_reward": 0.37500001210719347,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 2408.2500610351562,
"epoch": 0.6289631533847472,
"grad_norm": 1.2879042625427246,
"kl": 0.64208984375,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.1245,
"reward": 0.5816475376486778,
"reward_std": 0.8480053022503853,
"rewards/cosine_scaled_reward": 0.026934866793453693,
"rewards/format_reward": 0.5277777761220932,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 3011.8472290039062,
"epoch": 0.6306769494430163,
"grad_norm": 1.5342601537704468,
"kl": 0.9501953125,
"learning_rate": 2.7793039831193133e-07,
"loss": 0.1117,
"reward": -0.08722967363428324,
"reward_std": 0.4681037962436676,
"rewards/cosine_scaled_reward": -0.20333705097436905,
"rewards/format_reward": 0.31944445334374905,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 2088.1111450195312,
"epoch": 0.6323907455012854,
"grad_norm": 5.395845413208008,
"kl": 0.52392578125,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.278,
"reward": 0.8531668335199356,
"reward_std": 0.7198526412248611,
"rewards/cosine_scaled_reward": 0.07936117798089981,
"rewards/format_reward": 0.6944444477558136,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 2144.4583587646484,
"epoch": 0.6341045415595544,
"grad_norm": 1.9326905012130737,
"kl": 0.5986328125,
"learning_rate": 2.729523361034538e-07,
"loss": 0.165,
"reward": 0.7016473673284054,
"reward_std": 0.35017503798007965,
"rewards/cosine_scaled_reward": 0.04526812210679054,
"rewards/format_reward": 0.6111111268401146,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 2226.8333740234375,
"epoch": 0.6358183376178235,
"grad_norm": 6.064547538757324,
"kl": 0.47265625,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.2747,
"reward": 0.36747913248836994,
"reward_std": 0.47022923082113266,
"rewards/cosine_scaled_reward": -0.12876042909920216,
"rewards/format_reward": 0.6250000149011612,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 2646.166717529297,
"epoch": 0.6375321336760925,
"grad_norm": 1.8105882406234741,
"kl": 0.66259765625,
"learning_rate": 2.6802828488599294e-07,
"loss": 0.1523,
"reward": 0.2788702640682459,
"reward_std": 0.7272945195436478,
"rewards/cosine_scaled_reward": -0.082787093706429,
"rewards/format_reward": 0.4444444440305233,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 2658.513916015625,
"epoch": 0.6392459297343616,
"grad_norm": 0.861741304397583,
"kl": 0.56201171875,
"learning_rate": 2.655868138008171e-07,
"loss": 0.127,
"reward": 0.25536923203617334,
"reward_std": 0.549317829310894,
"rewards/cosine_scaled_reward": -0.16398204606957734,
"rewards/format_reward": 0.5833333358168602,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 2475.7916564941406,
"epoch": 0.6409597257926307,
"grad_norm": 5.120214462280273,
"kl": 0.66943359375,
"learning_rate": 2.631592046130896e-07,
"loss": -0.0041,
"reward": 0.4401531554758549,
"reward_std": 0.5939441919326782,
"rewards/cosine_scaled_reward": -0.10631232312880456,
"rewards/format_reward": 0.6527777686715126,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 2241.4166564941406,
"epoch": 0.6426735218508998,
"grad_norm": 4.6052021980285645,
"kl": 0.68896484375,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.258,
"reward": 0.40211474522948265,
"reward_std": 0.6810158491134644,
"rewards/cosine_scaled_reward": -0.13922041468322277,
"rewards/format_reward": 0.6805555522441864,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 2345.541717529297,
"epoch": 0.6443873179091688,
"grad_norm": 1.593520164489746,
"kl": 0.65869140625,
"learning_rate": 2.583460445215911e-07,
"loss": 0.1983,
"reward": 0.4966873601078987,
"reward_std": 0.6450872495770454,
"rewards/cosine_scaled_reward": -0.0363785345107317,
"rewards/format_reward": 0.569444440305233,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 2599.9583129882812,
"epoch": 0.6461011139674379,
"grad_norm": 1.0820523500442505,
"kl": 0.54345703125,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.1269,
"reward": 0.3041490036994219,
"reward_std": 0.5556300804018974,
"rewards/cosine_scaled_reward": -0.11181438341736794,
"rewards/format_reward": 0.5277777910232544,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 2423.1944274902344,
"epoch": 0.6478149100257069,
"grad_norm": 3.9577648639678955,
"kl": 0.46435546875,
"learning_rate": 2.5358974294659373e-07,
"loss": 0.2539,
"reward": 0.3343656752258539,
"reward_std": 0.6136218756437302,
"rewards/cosine_scaled_reward": -0.14531716238707304,
"rewards/format_reward": 0.625,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 1991.0555419921875,
"epoch": 0.6495287060839761,
"grad_norm": 6.228683948516846,
"kl": 0.7626953125,
"learning_rate": 2.512332043064913e-07,
"loss": 0.2113,
"reward": 0.7852881997823715,
"reward_std": 0.7995356619358063,
"rewards/cosine_scaled_reward": 0.031532974913716316,
"rewards/format_reward": 0.722222238779068,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 2142.0972595214844,
"epoch": 0.6512425021422451,
"grad_norm": 4.392513751983643,
"kl": 0.8505859375,
"learning_rate": 2.488912271385139e-07,
"loss": 0.166,
"reward": 0.774210050702095,
"reward_std": 0.9235591739416122,
"rewards/cosine_scaled_reward": 0.01904946379363537,
"rewards/format_reward": 0.736111119389534,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 2721.166748046875,
"epoch": 0.6529562982005142,
"grad_norm": 0.7555143237113953,
"kl": 0.54931640625,
"learning_rate": 2.465639255873246e-07,
"loss": 0.1157,
"reward": 0.06699353083968163,
"reward_std": 0.6024204641580582,
"rewards/cosine_scaled_reward": -0.18872546032071114,
"rewards/format_reward": 0.4444444477558136,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 2592.3472595214844,
"epoch": 0.6546700942587832,
"grad_norm": 1.4892374277114868,
"kl": 0.5986328125,
"learning_rate": 2.4425141308231765e-07,
"loss": 0.164,
"reward": 0.4388514533638954,
"reward_std": 0.7740809172391891,
"rewards/cosine_scaled_reward": -0.07918539177626371,
"rewards/format_reward": 0.597222238779068,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 2303.125,
"epoch": 0.6563838903170522,
"grad_norm": 1.8696836233139038,
"kl": 0.63427734375,
"learning_rate": 2.4195380233209006e-07,
"loss": 0.0839,
"reward": 0.2414467092603445,
"reward_std": 0.5401086919009686,
"rewards/cosine_scaled_reward": -0.14316555112600327,
"rewards/format_reward": 0.5277777835726738,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 2894.138916015625,
"epoch": 0.6580976863753213,
"grad_norm": 2.512624740600586,
"kl": 0.7236328125,
"learning_rate": 2.3967120531894857e-07,
"loss": 0.188,
"reward": -0.111133978003636,
"reward_std": 0.4146636873483658,
"rewards/cosine_scaled_reward": -0.256955873221159,
"rewards/format_reward": 0.4027777798473835,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 2577.1805725097656,
"epoch": 0.6598114824335904,
"grad_norm": 1.5134508609771729,
"kl": 0.75341796875,
"learning_rate": 2.374037332934512e-07,
"loss": 0.1359,
"reward": 0.12209473713301122,
"reward_std": 0.42869339138269424,
"rewards/cosine_scaled_reward": -0.18200820498168468,
"rewards/format_reward": 0.4861111156642437,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 2836.388916015625,
"epoch": 0.6615252784918595,
"grad_norm": 1.6320090293884277,
"kl": 0.71435546875,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.1314,
"reward": 0.027245239354670048,
"reward_std": 0.5338631048798561,
"rewards/cosine_scaled_reward": -0.21554404497146606,
"rewards/format_reward": 0.4583333432674408,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 2370.5416870117188,
"epoch": 0.6632390745501285,
"grad_norm": 2.790175437927246,
"kl": 0.69384765625,
"learning_rate": 2.3291460551638237e-07,
"loss": 0.1291,
"reward": 0.45274626836180687,
"reward_std": 0.5044268742203712,
"rewards/cosine_scaled_reward": -0.07223799102939665,
"rewards/format_reward": 0.5972222313284874,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 2043.15283203125,
"epoch": 0.6649528706083976,
"grad_norm": 2.196779251098633,
"kl": 0.91259765625,
"learning_rate": 2.306931685585657e-07,
"loss": 0.1933,
"reward": 0.6870926842093468,
"reward_std": 0.7499307841062546,
"rewards/cosine_scaled_reward": -0.017564778798259795,
"rewards/format_reward": 0.722222238779068,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 2507.999969482422,
"epoch": 0.6666666666666666,
"grad_norm": 4.833599090576172,
"kl": 0.7890625,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.1485,
"reward": 0.48738833516836166,
"reward_std": 0.3942640535533428,
"rewards/cosine_scaled_reward": -0.047972507774829865,
"rewards/format_reward": 0.5833333283662796,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 2180.5833740234375,
"epoch": 0.6683804627249358,
"grad_norm": 4.208037853240967,
"kl": 0.57275390625,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.2068,
"reward": 0.6339845806360245,
"reward_std": 0.8561032116413116,
"rewards/cosine_scaled_reward": -0.009396598441526294,
"rewards/format_reward": 0.6527777761220932,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 2664.0,
"epoch": 0.6700942587832048,
"grad_norm": 1.711565375328064,
"kl": 0.57421875,
"learning_rate": 2.2412266235313973e-07,
"loss": 0.2027,
"reward": 0.2511326225940138,
"reward_std": 0.7724436074495316,
"rewards/cosine_scaled_reward": -0.13137813284993172,
"rewards/format_reward": 0.5138888955116272,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 2048.1945190429688,
"epoch": 0.6718080548414739,
"grad_norm": 7.40539026260376,
"kl": 0.576171875,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.2932,
"reward": 0.390616811811924,
"reward_std": 0.46938444674015045,
"rewards/cosine_scaled_reward": -0.1519138067960739,
"rewards/format_reward": 0.6944444552063942,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 2850.15283203125,
"epoch": 0.6735218508997429,
"grad_norm": 2.314105272293091,
"kl": 0.712890625,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.1382,
"reward": 0.12740344926714897,
"reward_std": 0.5854331143200397,
"rewards/cosine_scaled_reward": -0.1307427268475294,
"rewards/format_reward": 0.3888888992369175,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 2054.3334045410156,
"epoch": 0.675235646958012,
"grad_norm": 3.0562775135040283,
"kl": 0.958984375,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.1762,
"reward": 0.33383211493492126,
"reward_std": 0.6097967401146889,
"rewards/cosine_scaled_reward": -0.15947283059358597,
"rewards/format_reward": 0.6527777835726738,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 2001.4027709960938,
"epoch": 0.676949443016281,
"grad_norm": 2.749018907546997,
"kl": 1.3720703125,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.2474,
"reward": 0.46511383540928364,
"reward_std": 0.5483391135931015,
"rewards/cosine_scaled_reward": -0.11466531874611974,
"rewards/format_reward": 0.6944444626569748,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 2663.9722900390625,
"epoch": 0.6786632390745502,
"grad_norm": 1.3055802583694458,
"kl": 0.8837890625,
"learning_rate": 2.134908592756607e-07,
"loss": 0.1475,
"reward": 0.21666064485907555,
"reward_std": 0.8081866502761841,
"rewards/cosine_scaled_reward": -0.14861411787569523,
"rewards/format_reward": 0.5138889029622078,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 2148.027801513672,
"epoch": 0.6803770351328192,
"grad_norm": 2.2016310691833496,
"kl": 1.193359375,
"learning_rate": 2.1141329099692406e-07,
"loss": 0.2683,
"reward": 0.3172401809133589,
"reward_std": 0.5794945135712624,
"rewards/cosine_scaled_reward": -0.13999101985245943,
"rewards/format_reward": 0.5972222164273262,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 2772.2778930664062,
"epoch": 0.6820908311910883,
"grad_norm": 5.671627044677734,
"kl": 0.994140625,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.1316,
"reward": 0.22605895064771175,
"reward_std": 0.528959184885025,
"rewards/cosine_scaled_reward": -0.12308163847774267,
"rewards/format_reward": 0.4722222313284874,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 1907.2083740234375,
"epoch": 0.6838046272493573,
"grad_norm": 4.919534206390381,
"kl": 1.0986328125,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.1523,
"reward": 0.8099863529205322,
"reward_std": 0.7783814370632172,
"rewards/cosine_scaled_reward": 0.03693760558962822,
"rewards/format_reward": 0.7361111119389534,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 2289.3195190429688,
"epoch": 0.6855184233076264,
"grad_norm": 4.336697578430176,
"kl": 0.9921875,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.125,
"reward": 0.08474167913664132,
"reward_std": 0.4911258965730667,
"rewards/cosine_scaled_reward": -0.21457360684871674,
"rewards/format_reward": 0.5138888955116272,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 2537.0833740234375,
"epoch": 0.6872322193658955,
"grad_norm": 1.9845013618469238,
"kl": 0.7470703125,
"learning_rate": 2.032690407508949e-07,
"loss": 0.1521,
"reward": 0.20629934733733535,
"reward_std": 0.5084620639681816,
"rewards/cosine_scaled_reward": -0.17462810222059488,
"rewards/format_reward": 0.5555555671453476,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 2364.2222595214844,
"epoch": 0.6889460154241646,
"grad_norm": 4.490449905395508,
"kl": 0.85595703125,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.0678,
"reward": 0.2729727178812027,
"reward_std": 0.40766991674900055,
"rewards/cosine_scaled_reward": -0.16906920075416565,
"rewards/format_reward": 0.6111111044883728,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 2377.500030517578,
"epoch": 0.6906598114824336,
"grad_norm": 2.0314667224884033,
"kl": 0.9287109375,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.1243,
"reward": 0.7173348069190979,
"reward_std": 0.6178643703460693,
"rewards/cosine_scaled_reward": 0.03922295683878474,
"rewards/format_reward": 0.6388888955116272,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 2395.3472595214844,
"epoch": 0.6923736075407027,
"grad_norm": 3.6266534328460693,
"kl": 1.1142578125,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.1186,
"reward": 0.37652647122740746,
"reward_std": 0.6333749815821648,
"rewards/cosine_scaled_reward": -0.11034788191318512,
"rewards/format_reward": 0.5972222238779068,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 2634.7222595214844,
"epoch": 0.6940874035989717,
"grad_norm": 1.629310131072998,
"kl": 0.87353515625,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.1284,
"reward": 0.30899196676909924,
"reward_std": 0.5874167829751968,
"rewards/cosine_scaled_reward": -0.11633734963834286,
"rewards/format_reward": 0.541666679084301,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 1720.5416870117188,
"epoch": 0.6958011996572407,
"grad_norm": 3.2341248989105225,
"kl": 1.0625,
"learning_rate": 1.934696604901642e-07,
"loss": 0.1949,
"reward": 0.5183681361377239,
"reward_std": 0.5259700566530228,
"rewards/cosine_scaled_reward": -0.08109369967132807,
"rewards/format_reward": 0.680555559694767,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 2396.875,
"epoch": 0.6975149957155099,
"grad_norm": 2.575775146484375,
"kl": 0.56005859375,
"learning_rate": 1.915615368891117e-07,
"loss": 0.1577,
"reward": 0.16498053632676601,
"reward_std": 0.6976238563656807,
"rewards/cosine_scaled_reward": -0.17445417866110802,
"rewards/format_reward": 0.5138888955116272,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 2211.986114501953,
"epoch": 0.699228791773779,
"grad_norm": 5.147465229034424,
"kl": 0.939453125,
"learning_rate": 1.8967088307307e-07,
"loss": 0.264,
"reward": 0.8435009941458702,
"reward_std": 0.8539558947086334,
"rewards/cosine_scaled_reward": 0.1370282769203186,
"rewards/format_reward": 0.569444440305233,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 2221.791717529297,
"epoch": 0.700942587832048,
"grad_norm": 3.616407632827759,
"kl": 1.2978515625,
"learning_rate": 1.8779779118983867e-07,
"loss": 0.2031,
"reward": 0.5767598450183868,
"reward_std": 0.6021636947989464,
"rewards/cosine_scaled_reward": -0.01717562135308981,
"rewards/format_reward": 0.611111119389534,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 2195.5833129882812,
"epoch": 0.702656383890317,
"grad_norm": 4.223770618438721,
"kl": 0.9541015625,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0719,
"reward": 0.4589345343410969,
"reward_std": 0.5643011257052422,
"rewards/cosine_scaled_reward": -0.09692162275314331,
"rewards/format_reward": 0.6527777835726738,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 2020.15283203125,
"epoch": 0.7043701799485861,
"grad_norm": 4.778375148773193,
"kl": 0.9111328125,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.2462,
"reward": 0.4322133334353566,
"reward_std": 0.5240239724516869,
"rewards/cosine_scaled_reward": -0.11722666956484318,
"rewards/format_reward": 0.666666679084301,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 2159.5694580078125,
"epoch": 0.7060839760068551,
"grad_norm": 5.010425090789795,
"kl": 1.271484375,
"learning_rate": 1.822847957491922e-07,
"loss": 0.1504,
"reward": 0.27721285074949265,
"reward_std": 0.3799732178449631,
"rewards/cosine_scaled_reward": -0.19472691789269447,
"rewards/format_reward": 0.6666666567325592,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 1994.1805419921875,
"epoch": 0.7077977720651243,
"grad_norm": 3.7414398193359375,
"kl": 0.79296875,
"learning_rate": 1.804828558898332e-07,
"loss": 0.2209,
"reward": 0.4823665115982294,
"reward_std": 0.8085788935422897,
"rewards/cosine_scaled_reward": -0.12687229178845882,
"rewards/format_reward": 0.736111119389534,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 2582.513885498047,
"epoch": 0.7095115681233933,
"grad_norm": 2.3787803649902344,
"kl": 0.955078125,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.234,
"reward": 0.35093772783875465,
"reward_std": 0.7004451155662537,
"rewards/cosine_scaled_reward": -0.06758668273687363,
"rewards/format_reward": 0.4861111119389534,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 2286.027801513672,
"epoch": 0.7112253641816624,
"grad_norm": 2.242143154144287,
"kl": 1.1669921875,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.2089,
"reward": 0.0793907418847084,
"reward_std": 0.4775719493627548,
"rewards/cosine_scaled_reward": -0.2519712895154953,
"rewards/format_reward": 0.5833333320915699,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 2716.8194885253906,
"epoch": 0.7129391602399314,
"grad_norm": 1.0189129114151,
"kl": 1.0341796875,
"learning_rate": 1.7518544168045524e-07,
"loss": 0.1319,
"reward": 0.3344786912202835,
"reward_std": 0.6283555030822754,
"rewards/cosine_scaled_reward": -0.09664955246262252,
"rewards/format_reward": 0.5277777835726738,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 2145.777786254883,
"epoch": 0.7146529562982005,
"grad_norm": 3.3594307899475098,
"kl": 0.630859375,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.1719,
"reward": 0.29561759158968925,
"reward_std": 0.45837917923927307,
"rewards/cosine_scaled_reward": -0.16469121165573597,
"rewards/format_reward": 0.6250000074505806,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 2316.4722595214844,
"epoch": 0.7163667523564696,
"grad_norm": 3.205843448638916,
"kl": 0.9482421875,
"learning_rate": 1.7174502842694212e-07,
"loss": 0.1905,
"reward": 0.1763996873050928,
"reward_std": 0.35552147775888443,
"rewards/cosine_scaled_reward": -0.16874459758400917,
"rewards/format_reward": 0.5138888880610466,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 1993.9166564941406,
"epoch": 0.7180805484147387,
"grad_norm": 5.31653356552124,
"kl": 1.2333984375,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.1316,
"reward": 0.38526383973658085,
"reward_std": 0.3629095181822777,
"rewards/cosine_scaled_reward": -0.16153474483871832,
"rewards/format_reward": 0.7083333283662796,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 2585.3472290039062,
"epoch": 0.7197943444730077,
"grad_norm": 3.0050301551818848,
"kl": 1.2138671875,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.2609,
"reward": 0.03709686268121004,
"reward_std": 0.45924656093120575,
"rewards/cosine_scaled_reward": -0.1967293554916978,
"rewards/format_reward": 0.430555559694767,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 2077.4166870117188,
"epoch": 0.7215081405312768,
"grad_norm": 2.9638571739196777,
"kl": 1.1162109375,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.1967,
"reward": 0.4120505638420582,
"reward_std": 0.7001288831233978,
"rewards/cosine_scaled_reward": -0.09258583001792431,
"rewards/format_reward": 0.5972222313284874,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 1782.4722290039062,
"epoch": 0.7232219365895458,
"grad_norm": 2.496225595474243,
"kl": 1.1123046875,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.3012,
"reward": 0.3580199657008052,
"reward_std": 0.5790654197335243,
"rewards/cosine_scaled_reward": -0.18904556892812252,
"rewards/format_reward": 0.7361111044883728,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 2290.388916015625,
"epoch": 0.7249357326478149,
"grad_norm": 2.5555100440979004,
"kl": 0.8837890625,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.1268,
"reward": 0.4261997193098068,
"reward_std": 0.6714624091982841,
"rewards/cosine_scaled_reward": -0.1341223642230034,
"rewards/format_reward": 0.6944444552063942,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 2174.3055419921875,
"epoch": 0.726649528706084,
"grad_norm": 4.850281715393066,
"kl": 0.9345703125,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.2657,
"reward": 0.31390602327883244,
"reward_std": 0.5223901495337486,
"rewards/cosine_scaled_reward": -0.1833247635513544,
"rewards/format_reward": 0.6805555820465088,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 2507.8334045410156,
"epoch": 0.7283633247643531,
"grad_norm": 3.5151827335357666,
"kl": 1.26953125,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.1979,
"reward": 0.6333100497722626,
"reward_std": 0.7416208535432816,
"rewards/cosine_scaled_reward": 0.031932787562254816,
"rewards/format_reward": 0.569444440305233,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 1761.2083740234375,
"epoch": 0.7300771208226221,
"grad_norm": 3.2045891284942627,
"kl": 1.076171875,
"learning_rate": 1.5872728172265146e-07,
"loss": 0.1263,
"reward": 0.9971873387694359,
"reward_std": 0.7048115953803062,
"rewards/cosine_scaled_reward": 0.0749825444072485,
"rewards/format_reward": 0.8472222238779068,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 2577.9861450195312,
"epoch": 0.7317909168808912,
"grad_norm": 1.8627033233642578,
"kl": 1.14453125,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.2595,
"reward": 0.2531158346682787,
"reward_std": 0.6184235513210297,
"rewards/cosine_scaled_reward": -0.10955319553613663,
"rewards/format_reward": 0.4722222276031971,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 1856.7222595214844,
"epoch": 0.7335047129391602,
"grad_norm": 4.033189296722412,
"kl": 1.1201171875,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.263,
"reward": 0.7039023488759995,
"reward_std": 0.8175256699323654,
"rewards/cosine_scaled_reward": -0.009159944485872984,
"rewards/format_reward": 0.7222222238779068,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 2663.4861450195312,
"epoch": 0.7352185089974294,
"grad_norm": 3.801396369934082,
"kl": 1.130859375,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.15,
"reward": 0.34896907582879066,
"reward_std": 0.5518276765942574,
"rewards/cosine_scaled_reward": -0.12412657774984837,
"rewards/format_reward": 0.597222238779068,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 2153.555618286133,
"epoch": 0.7369323050556984,
"grad_norm": 2.9870073795318604,
"kl": 0.9130859375,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.1497,
"reward": 0.44994640722870827,
"reward_std": 0.3946686089038849,
"rewards/cosine_scaled_reward": -0.08752679079771042,
"rewards/format_reward": 0.6250000074505806,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 2261.75,
"epoch": 0.7386461011139674,
"grad_norm": 6.578658580780029,
"kl": 1.78515625,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.1464,
"reward": 0.35643661208450794,
"reward_std": 0.5088120512664318,
"rewards/cosine_scaled_reward": -0.12039280403405428,
"rewards/format_reward": 0.5972222238779068,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 2386.652801513672,
"epoch": 0.7403598971722365,
"grad_norm": 4.385483741760254,
"kl": 1.3447265625,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.3579,
"reward": 0.4007231565192342,
"reward_std": 0.6231922283768654,
"rewards/cosine_scaled_reward": -0.09824953693896532,
"rewards/format_reward": 0.5972222238779068,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 2001.3611297607422,
"epoch": 0.7420736932305055,
"grad_norm": 4.371149063110352,
"kl": 1.0439453125,
"learning_rate": 1.483363816965435e-07,
"loss": 0.0871,
"reward": 0.6510265804827213,
"reward_std": 0.4398561269044876,
"rewards/cosine_scaled_reward": -0.04254225082695484,
"rewards/format_reward": 0.736111119389534,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 1622.6666870117188,
"epoch": 0.7437874892887746,
"grad_norm": 6.787911891937256,
"kl": 1.326171875,
"learning_rate": 1.469297078922642e-07,
"loss": 0.3842,
"reward": 0.46582701057195663,
"reward_std": 0.5145231448113918,
"rewards/cosine_scaled_reward": -0.14208650775253773,
"rewards/format_reward": 0.7500000149011612,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 2658.7500610351562,
"epoch": 0.7455012853470437,
"grad_norm": 2.6709697246551514,
"kl": 1.232421875,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.0821,
"reward": 0.31399114802479744,
"reward_std": 0.5854284539818764,
"rewards/cosine_scaled_reward": -0.10689331218600273,
"rewards/format_reward": 0.5277777835726738,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 1860.9583435058594,
"epoch": 0.7472150814053128,
"grad_norm": 3.8863277435302734,
"kl": 0.91796875,
"learning_rate": 1.4417536311769885e-07,
"loss": 0.2564,
"reward": 0.5377090591937304,
"reward_std": 0.5195211619138718,
"rewards/cosine_scaled_reward": -0.0853121317923069,
"rewards/format_reward": 0.7083333283662796,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 2466.2361755371094,
"epoch": 0.7489288774635818,
"grad_norm": 2.9903695583343506,
"kl": 1.16015625,
"learning_rate": 1.4282782639029128e-07,
"loss": 0.1351,
"reward": 0.4385749250650406,
"reward_std": 0.6242729276418686,
"rewards/cosine_scaled_reward": -0.10015699185896665,
"rewards/format_reward": 0.6388889029622078,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 2362.8611450195312,
"epoch": 0.7506426735218509,
"grad_norm": 1.599947214126587,
"kl": 1.189453125,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.1863,
"reward": 0.6738657765090466,
"reward_std": 0.6156510934233665,
"rewards/cosine_scaled_reward": 0.03137733961921185,
"rewards/format_reward": 0.611111119389534,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 2748.2916870117188,
"epoch": 0.7523564695801199,
"grad_norm": 2.867025136947632,
"kl": 1.033203125,
"learning_rate": 1.4019235263722034e-07,
"loss": 0.1123,
"reward": 0.22596902353689075,
"reward_std": 0.5135553628206253,
"rewards/cosine_scaled_reward": -0.13701549544930458,
"rewards/format_reward": 0.5000000074505806,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 2215.277801513672,
"epoch": 0.7540702656383891,
"grad_norm": 5.796390533447266,
"kl": 1.396484375,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.1245,
"reward": 0.6629978334531188,
"reward_std": 0.5948286652565002,
"rewards/cosine_scaled_reward": 0.0051100607961416245,
"rewards/format_reward": 0.6527777910232544,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 2402.7222595214844,
"epoch": 0.7557840616966581,
"grad_norm": 5.96156644821167,
"kl": 1.34375,
"learning_rate": 1.3763677169699217e-07,
"loss": 0.0634,
"reward": 0.5063027180731297,
"reward_std": 0.6581330522894859,
"rewards/cosine_scaled_reward": -0.08018200099468231,
"rewards/format_reward": 0.6666666716337204,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 2082.263916015625,
"epoch": 0.7574978577549272,
"grad_norm": 3.405839443206787,
"kl": 1.6640625,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.2784,
"reward": 0.5193299576640129,
"reward_std": 0.5714153945446014,
"rewards/cosine_scaled_reward": -0.05977945402264595,
"rewards/format_reward": 0.6388888880610466,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 1665.7638854980469,
"epoch": 0.7592116538131962,
"grad_norm": 5.792540550231934,
"kl": 1.0849609375,
"learning_rate": 1.351615817851748e-07,
"loss": 0.1343,
"reward": 0.6929136589169502,
"reward_std": 0.636933371424675,
"rewards/cosine_scaled_reward": -0.056320954114198685,
"rewards/format_reward": 0.8055555671453476,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 2291.27783203125,
"epoch": 0.7609254498714653,
"grad_norm": 5.360567569732666,
"kl": 1.2060546875,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.18,
"reward": 0.36108400439843535,
"reward_std": 0.5261719971895218,
"rewards/cosine_scaled_reward": -0.15973576810210943,
"rewards/format_reward": 0.6805555447936058,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 2755.4583740234375,
"epoch": 0.7626392459297343,
"grad_norm": 2.5989925861358643,
"kl": 0.875,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.1482,
"reward": 0.14870610460639,
"reward_std": 0.567838903516531,
"rewards/cosine_scaled_reward": -0.18259140476584435,
"rewards/format_reward": 0.5138888880610466,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 2247.9583435058594,
"epoch": 0.7643530419880035,
"grad_norm": 5.224709987640381,
"kl": 1.005859375,
"learning_rate": 1.316005813502869e-07,
"loss": 0.321,
"reward": 0.5112787692341954,
"reward_std": 0.6983462646603584,
"rewards/cosine_scaled_reward": -0.049916195683181286,
"rewards/format_reward": 0.6111111119389534,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 2349.3055725097656,
"epoch": 0.7660668380462725,
"grad_norm": 3.252889633178711,
"kl": 1.216796875,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.1445,
"reward": 0.32806872576475143,
"reward_std": 0.7308538854122162,
"rewards/cosine_scaled_reward": -0.1276322863996029,
"rewards/format_reward": 0.5833333358168602,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 2661.5555725097656,
"epoch": 0.7677806341045416,
"grad_norm": 2.3366446495056152,
"kl": 0.8486328125,
"learning_rate": 1.2932844562179352e-07,
"loss": 0.0872,
"reward": 0.36861317604780197,
"reward_std": 0.6462560296058655,
"rewards/cosine_scaled_reward": -0.051804508082568645,
"rewards/format_reward": 0.47222223225980997,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 2592.4444580078125,
"epoch": 0.7694944301628106,
"grad_norm": 4.123133182525635,
"kl": 1.052734375,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.2043,
"reward": 0.2611931987339631,
"reward_std": 0.7008328437805176,
"rewards/cosine_scaled_reward": -0.147181186825037,
"rewards/format_reward": 0.5555555522441864,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 1906.3333740234375,
"epoch": 0.7712082262210797,
"grad_norm": 3.090589761734009,
"kl": 0.865234375,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.1642,
"reward": 0.5996736511588097,
"reward_std": 0.5084411576390266,
"rewards/cosine_scaled_reward": -0.06821873132139444,
"rewards/format_reward": 0.736111119389534,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 2815.4166870117188,
"epoch": 0.7729220222793488,
"grad_norm": 2.466654062271118,
"kl": 0.7978515625,
"learning_rate": 1.260741462457165e-07,
"loss": 0.1236,
"reward": 0.07433861424215138,
"reward_std": 0.5574841573834419,
"rewards/cosine_scaled_reward": -0.21977514401078224,
"rewards/format_reward": 0.5138888917863369,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 2436.8611450195312,
"epoch": 0.7746358183376179,
"grad_norm": 2.854764699935913,
"kl": 0.8505859375,
"learning_rate": 1.2503063339313356e-07,
"loss": 0.1263,
"reward": 0.462260864675045,
"reward_std": 0.7514103129506111,
"rewards/cosine_scaled_reward": -0.09525846503674984,
"rewards/format_reward": 0.6527777761220932,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 2337.7361755371094,
"epoch": 0.7763496143958869,
"grad_norm": 3.1975936889648438,
"kl": 0.97265625,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.1955,
"reward": 0.39777151867747307,
"reward_std": 0.6588628813624382,
"rewards/cosine_scaled_reward": -0.11361423693597317,
"rewards/format_reward": 0.6250000074505806,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 2078.0972595214844,
"epoch": 0.778063410454156,
"grad_norm": 1.6080825328826904,
"kl": 1.15625,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.2023,
"reward": 0.7216087523847818,
"reward_std": 0.7977120280265808,
"rewards/cosine_scaled_reward": 0.006637714395765215,
"rewards/format_reward": 0.7083333283662796,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 2211.7916259765625,
"epoch": 0.779777206512425,
"grad_norm": 3.7410457134246826,
"kl": 1.00390625,
"learning_rate": 1.220245676671809e-07,
"loss": 0.2082,
"reward": 0.4414830207824707,
"reward_std": 0.6565307825803757,
"rewards/cosine_scaled_reward": -0.10564738605171442,
"rewards/format_reward": 0.6527777761220932,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 2365.52783203125,
"epoch": 0.781491002570694,
"grad_norm": 6.645061492919922,
"kl": 0.64013671875,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.2263,
"reward": 0.3324281768873334,
"reward_std": 0.5864489898085594,
"rewards/cosine_scaled_reward": -0.1046192436479032,
"rewards/format_reward": 0.5416666641831398,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 2122.5833435058594,
"epoch": 0.7832047986289632,
"grad_norm": 2.3880536556243896,
"kl": 1.04736328125,
"learning_rate": 1.2012473704494537e-07,
"loss": 0.1423,
"reward": 0.8056632168591022,
"reward_std": 0.6164202988147736,
"rewards/cosine_scaled_reward": 0.020887171383947134,
"rewards/format_reward": 0.7638888955116272,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 1959.8056030273438,
"epoch": 0.7849185946872322,
"grad_norm": 4.885958671569824,
"kl": 0.94287109375,
"learning_rate": 1.1920622611056974e-07,
"loss": 0.2544,
"reward": 0.4146232455968857,
"reward_std": 0.5990116819739342,
"rewards/cosine_scaled_reward": -0.12602169532328844,
"rewards/format_reward": 0.6666666865348816,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 2254.0694274902344,
"epoch": 0.7866323907455013,
"grad_norm": 2.2345101833343506,
"kl": 0.53564453125,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.0455,
"reward": 0.4500209465622902,
"reward_std": 0.5013090819120407,
"rewards/cosine_scaled_reward": -0.10137841757386923,
"rewards/format_reward": 0.652777798473835,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 2215.1666564941406,
"epoch": 0.7883461868037703,
"grad_norm": 4.272637367248535,
"kl": 0.92236328125,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0789,
"reward": 0.44543247297406197,
"reward_std": 0.5984909385442734,
"rewards/cosine_scaled_reward": -0.10367265064269304,
"rewards/format_reward": 0.6527777910232544,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 1930.5000305175781,
"epoch": 0.7900599828620394,
"grad_norm": 2.8687753677368164,
"kl": 1.2333984375,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.1984,
"reward": 0.4672253951430321,
"reward_std": 0.6156143024563789,
"rewards/cosine_scaled_reward": -0.13444286305457354,
"rewards/format_reward": 0.736111119389534,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 1981.9583740234375,
"epoch": 0.7917737789203085,
"grad_norm": 3.0882349014282227,
"kl": 0.7373046875,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.0448,
"reward": 0.5570826064795256,
"reward_std": 0.6341868117451668,
"rewards/cosine_scaled_reward": -0.08951424108818173,
"rewards/format_reward": 0.7361111119389534,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 1936.4306030273438,
"epoch": 0.7934875749785776,
"grad_norm": 2.4561548233032227,
"kl": 0.7119140625,
"learning_rate": 1.1492947512799328e-07,
"loss": 0.2065,
"reward": 0.5815738141536713,
"reward_std": 0.7455588281154633,
"rewards/cosine_scaled_reward": -0.08421308733522892,
"rewards/format_reward": 0.7500000149011612,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 2648.277801513672,
"epoch": 0.7952013710368466,
"grad_norm": 1.9183648824691772,
"kl": 1.189453125,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.1831,
"reward": 0.4944647327065468,
"reward_std": 0.5960628166794777,
"rewards/cosine_scaled_reward": -0.04443428758531809,
"rewards/format_reward": 0.5833333358168602,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 3089.4722290039062,
"epoch": 0.7969151670951157,
"grad_norm": 1.3800582885742188,
"kl": 0.88671875,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.1132,
"reward": 0.17487204633653164,
"reward_std": 0.6750592887401581,
"rewards/cosine_scaled_reward": -0.1278417520225048,
"rewards/format_reward": 0.430555559694767,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 2615.013916015625,
"epoch": 0.7986289631533847,
"grad_norm": 2.8072264194488525,
"kl": 0.93115234375,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.15,
"reward": 0.17807744164019823,
"reward_std": 0.6022924780845642,
"rewards/cosine_scaled_reward": -0.11929461418185383,
"rewards/format_reward": 0.4166666679084301,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 2457.3750610351562,
"epoch": 0.8003427592116538,
"grad_norm": 4.940661430358887,
"kl": 0.7890625,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.1464,
"reward": 0.24202457256615162,
"reward_std": 0.42437436431646347,
"rewards/cosine_scaled_reward": -0.19148772559128702,
"rewards/format_reward": 0.6249999925494194,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 2192.986114501953,
"epoch": 0.8020565552699229,
"grad_norm": 3.136319637298584,
"kl": 0.767578125,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.1721,
"reward": 0.6284131053835154,
"reward_std": 0.5748142190277576,
"rewards/cosine_scaled_reward": -0.012182342819869518,
"rewards/format_reward": 0.6527777761220932,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 2432.8333435058594,
"epoch": 0.803770351328192,
"grad_norm": 1.9713729619979858,
"kl": 0.8603515625,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.1523,
"reward": 0.5457211770117283,
"reward_std": 0.729132629930973,
"rewards/cosine_scaled_reward": -0.0535283163189888,
"rewards/format_reward": 0.6527777910232544,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 2276.4305725097656,
"epoch": 0.805484147386461,
"grad_norm": 3.8467977046966553,
"kl": 1.216796875,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.1232,
"reward": 0.5122000686824322,
"reward_std": 0.7733886539936066,
"rewards/cosine_scaled_reward": -0.09112219791859388,
"rewards/format_reward": 0.6944444552063942,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 2643.7083129882812,
"epoch": 0.8071979434447301,
"grad_norm": 1.1509345769882202,
"kl": 0.7216796875,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.1381,
"reward": 0.10054661217145622,
"reward_std": 0.6373118087649345,
"rewards/cosine_scaled_reward": -0.16500448435544968,
"rewards/format_reward": 0.4305555745959282,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 2733.8056030273438,
"epoch": 0.8089117395029991,
"grad_norm": 1.7471221685409546,
"kl": 0.60986328125,
"learning_rate": 1.0857018009286381e-07,
"loss": 0.1037,
"reward": 0.3268199451267719,
"reward_std": 0.7872605472803116,
"rewards/cosine_scaled_reward": -0.06575669860467315,
"rewards/format_reward": 0.4583333507180214,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 2760.416717529297,
"epoch": 0.8106255355612683,
"grad_norm": 2.182706832885742,
"kl": 0.7177734375,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.103,
"reward": 0.3022213885560632,
"reward_std": 0.5640696436166763,
"rewards/cosine_scaled_reward": -0.15444485377520323,
"rewards/format_reward": 0.611111119389534,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 2262.4166564941406,
"epoch": 0.8123393316195373,
"grad_norm": 2.2662978172302246,
"kl": 1.310546875,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.2095,
"reward": 0.817143252119422,
"reward_std": 0.5297734513878822,
"rewards/cosine_scaled_reward": 0.0960716437548399,
"rewards/format_reward": 0.6250000149011612,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 2711.9722595214844,
"epoch": 0.8140531276778064,
"grad_norm": 5.152209758758545,
"kl": 0.900390625,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0199,
"reward": 0.1260463148355484,
"reward_std": 0.5338724106550217,
"rewards/cosine_scaled_reward": -0.1939212940633297,
"rewards/format_reward": 0.5138888917863369,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 2675.7361755371094,
"epoch": 0.8157669237360754,
"grad_norm": 2.230329990386963,
"kl": 0.79931640625,
"learning_rate": 1.063017833182728e-07,
"loss": 0.1478,
"reward": 0.14864197466522455,
"reward_std": 0.6397556141018867,
"rewards/cosine_scaled_reward": -0.1756790205836296,
"rewards/format_reward": 0.5000000037252903,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 2487.75,
"epoch": 0.8174807197943444,
"grad_norm": 4.63166618347168,
"kl": 0.8720703125,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.3369,
"reward": 0.37176867201924324,
"reward_std": 0.6089313849806786,
"rewards/cosine_scaled_reward": -0.07106010848656297,
"rewards/format_reward": 0.5138888992369175,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 2181.4722595214844,
"epoch": 0.8191945158526135,
"grad_norm": 3.272205114364624,
"kl": 1.212890625,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.1721,
"reward": 0.2916110037913313,
"reward_std": 0.49708379805088043,
"rewards/cosine_scaled_reward": -0.11113895289599895,
"rewards/format_reward": 0.5138889029622078,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 2471.8194732666016,
"epoch": 0.8209083119108826,
"grad_norm": 3.132082462310791,
"kl": 1.228515625,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.208,
"reward": 0.7535388497635722,
"reward_std": 0.695548452436924,
"rewards/cosine_scaled_reward": 0.06426943093538284,
"rewards/format_reward": 0.6250000074505806,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 2416.0693969726562,
"epoch": 0.8226221079691517,
"grad_norm": 3.6008918285369873,
"kl": 0.6015625,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.1762,
"reward": 0.3882830161601305,
"reward_std": 0.6291572600603104,
"rewards/cosine_scaled_reward": -0.07669184263795614,
"rewards/format_reward": 0.5416666641831398,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 1688.6388854980469,
"epoch": 0.8243359040274207,
"grad_norm": 3.3292489051818848,
"kl": 0.94482421875,
"learning_rate": 1.0395300688680625e-07,
"loss": 0.1756,
"reward": 0.6976406946778297,
"reward_std": 0.7118247449398041,
"rewards/cosine_scaled_reward": -0.040068539790809155,
"rewards/format_reward": 0.7777777835726738,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 2275.9723205566406,
"epoch": 0.8260497000856898,
"grad_norm": 5.62246036529541,
"kl": 0.806640625,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.3047,
"reward": 0.3320089429616928,
"reward_std": 0.5019624754786491,
"rewards/cosine_scaled_reward": -0.13955109613016248,
"rewards/format_reward": 0.6111111268401146,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 2215.8055725097656,
"epoch": 0.8277634961439588,
"grad_norm": 1.8363783359527588,
"kl": 0.98046875,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.205,
"reward": 0.4018698123982176,
"reward_std": 0.5796016827225685,
"rewards/cosine_scaled_reward": -0.13239844236522913,
"rewards/format_reward": 0.6666666716337204,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 1844.236099243164,
"epoch": 0.829477292202228,
"grad_norm": 4.732890605926514,
"kl": 0.64990234375,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.229,
"reward": 0.37843877635896206,
"reward_std": 0.6878086104989052,
"rewards/cosine_scaled_reward": -0.1441139355301857,
"rewards/format_reward": 0.6666666716337204,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 2126.527801513672,
"epoch": 0.831191088260497,
"grad_norm": 3.030064821243286,
"kl": 1.2275390625,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.2121,
"reward": 0.5017230249941349,
"reward_std": 0.8949761241674423,
"rewards/cosine_scaled_reward": -0.0616384893655777,
"rewards/format_reward": 0.625,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 2167.52783203125,
"epoch": 0.8329048843187661,
"grad_norm": 2.106167793273926,
"kl": 0.8662109375,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.0938,
"reward": 0.5535758845508099,
"reward_std": 0.5298986956477165,
"rewards/cosine_scaled_reward": -0.021823172457516193,
"rewards/format_reward": 0.597222238779068,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 2419.6111450195312,
"epoch": 0.8346186803770351,
"grad_norm": 2.8747453689575195,
"kl": 0.6708984375,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.1754,
"reward": 0.2720159562304616,
"reward_std": 0.545224204659462,
"rewards/cosine_scaled_reward": -0.1487142387777567,
"rewards/format_reward": 0.5694444552063942,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 2196.486114501953,
"epoch": 0.8363324764353042,
"grad_norm": 2.825509786605835,
"kl": 0.55517578125,
"learning_rate": 1.0157821333772304e-07,
"loss": 0.1679,
"reward": 0.6998728811740875,
"reward_std": 0.6955326199531555,
"rewards/cosine_scaled_reward": 0.030491996556520462,
"rewards/format_reward": 0.6388888955116272,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 2617.722198486328,
"epoch": 0.8380462724935732,
"grad_norm": 1.9763245582580566,
"kl": 0.8466796875,
"learning_rate": 1.013262614978859e-07,
"loss": 0.1616,
"reward": 0.28653959557414055,
"reward_std": 0.6969783715903759,
"rewards/cosine_scaled_reward": -0.12061909190379083,
"rewards/format_reward": 0.5277777835726738,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 2630.0972290039062,
"epoch": 0.8397600685518424,
"grad_norm": 1.7776055335998535,
"kl": 0.73388671875,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.1053,
"reward": 0.3464082106947899,
"reward_std": 0.7413296326994896,
"rewards/cosine_scaled_reward": -0.08374034571170341,
"rewards/format_reward": 0.5138888880610466,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 2610.1666870117188,
"epoch": 0.8414738646101114,
"grad_norm": 2.362657308578491,
"kl": 1.03369140625,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.1536,
"reward": 0.054581154661718756,
"reward_std": 0.5118880867958069,
"rewards/cosine_scaled_reward": -0.27132053300738335,
"rewards/format_reward": 0.5972222238779068,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 2591.611114501953,
"epoch": 0.8431876606683805,
"grad_norm": 1.4310436248779297,
"kl": 0.748046875,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.1227,
"reward": 0.2780334800481796,
"reward_std": 0.5931698530912399,
"rewards/cosine_scaled_reward": -0.16653881408274174,
"rewards/format_reward": 0.611111119389534,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 1928.0278015136719,
"epoch": 0.8449014567266495,
"grad_norm": 4.139144420623779,
"kl": 1.39892578125,
"learning_rate": 1.005372381963547e-07,
"loss": 0.2949,
"reward": 0.36576576717197895,
"reward_std": 0.4379217103123665,
"rewards/cosine_scaled_reward": -0.15045045968145132,
"rewards/format_reward": 0.6666666716337204,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 2274.680633544922,
"epoch": 0.8466152527849186,
"grad_norm": 1.5368496179580688,
"kl": 0.958984375,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.1968,
"reward": 0.3456185795366764,
"reward_std": 0.5900547206401825,
"rewards/cosine_scaled_reward": -0.11885737907141447,
"rewards/format_reward": 0.5833333358168602,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 1835.8333740234375,
"epoch": 0.8483290488431876,
"grad_norm": 4.2471394538879395,
"kl": 1.61279296875,
"learning_rate": 1.002741278414069e-07,
"loss": 0.1987,
"reward": 0.9312632232904434,
"reward_std": 0.586229220032692,
"rewards/cosine_scaled_reward": 0.06979827064787969,
"rewards/format_reward": 0.7916666567325592,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 2454.902801513672,
"epoch": 0.8500428449014568,
"grad_norm": 2.069298505783081,
"kl": 0.9306640625,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.176,
"reward": 0.6016820748336613,
"reward_std": 0.7270394861698151,
"rewards/cosine_scaled_reward": 0.0161188212223351,
"rewards/format_reward": 0.5694444477558136,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 2282.250030517578,
"epoch": 0.8517566409597258,
"grad_norm": 2.224278688430786,
"kl": 0.7265625,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.1715,
"reward": 0.43558146245777607,
"reward_std": 0.6017558500170708,
"rewards/cosine_scaled_reward": -0.07387594413012266,
"rewards/format_reward": 0.5833333432674408,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 2533.7500610351562,
"epoch": 0.8534704370179949,
"grad_norm": 5.092855930328369,
"kl": 1.08935546875,
"learning_rate": 1.000438641958131e-07,
"loss": 0.1009,
"reward": 0.1837000446394086,
"reward_std": 0.7107623964548111,
"rewards/cosine_scaled_reward": -0.19287220388650894,
"rewards/format_reward": 0.569444440305233,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 2394.777801513672,
"epoch": 0.8551842330762639,
"grad_norm": 2.348245620727539,
"kl": 0.81884765625,
"learning_rate": 1.0001096618257236e-07,
"loss": 0.1019,
"reward": 0.8653097227215767,
"reward_std": 0.7131348252296448,
"rewards/cosine_scaled_reward": 0.13404375594109297,
"rewards/format_reward": 0.5972222238779068,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 1616.4583282470703,
"epoch": 0.856898029134533,
"grad_norm": 2.849949598312378,
"kl": 0.951171875,
"learning_rate": 1e-07,
"loss": 0.12,
"reward": 1.1779827252030373,
"reward_std": 0.6799286007881165,
"rewards/cosine_scaled_reward": 0.20010241214185953,
"rewards/format_reward": 0.7777777761220932,
"step": 500
},
{
"epoch": 0.856898029134533,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.12157149085606943,
"train_runtime": 48026.4516,
"train_samples_per_second": 0.75,
"train_steps_per_second": 0.01
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}