{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.856898029134533,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 2862.5695190429688,
      "epoch": 0.001713796058269066,
      "grad_norm": 0.16925157606601715,
      "kl": 0.0,
      "learning_rate": 2e-08,
      "loss": 0.0467,
      "reward": 0.12026740610599518,
      "reward_std": 0.47210293635725975,
      "rewards/cosine_scaled_reward": -0.1343107339926064,
      "rewards/format_reward": 0.3888888917863369,
      "step": 1
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2739.5,
      "epoch": 0.003427592116538132,
      "grad_norm": 0.18508067727088928,
      "kl": 0.0,
      "learning_rate": 4e-08,
      "loss": 0.0391,
      "reward": -0.05314926430583,
      "reward_std": 0.36226021870970726,
      "rewards/cosine_scaled_reward": -0.21407463820651174,
      "rewards/format_reward": 0.3750000111758709,
      "step": 2
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2816.1944580078125,
      "epoch": 0.005141388174807198,
      "grad_norm": 0.15574845671653748,
      "kl": 4.06801700592041e-05,
      "learning_rate": 6e-08,
      "loss": 0.024,
      "reward": -0.0735303945839405,
      "reward_std": 0.4152667075395584,
      "rewards/cosine_scaled_reward": -0.21037630829960108,
      "rewards/format_reward": 0.34722223225980997,
      "step": 3
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2746.875,
      "epoch": 0.006855184233076264,
      "grad_norm": 0.18099600076675415,
      "kl": 3.692507743835449e-05,
      "learning_rate": 8e-08,
      "loss": 0.0516,
      "reward": 0.2664791904389858,
      "reward_std": 0.8305703550577164,
      "rewards/cosine_scaled_reward": -0.07509375014342368,
      "rewards/format_reward": 0.4166666716337204,
      "step": 4
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2557.513916015625,
      "epoch": 0.00856898029134533,
      "grad_norm": 0.173630490899086,
      "kl": 2.3245811462402344e-05,
      "learning_rate": 1e-07,
      "loss": 0.0579,
      "reward": 0.4870211333036423,
      "reward_std": 0.6806018278002739,
      "rewards/cosine_scaled_reward": -0.006489435210824013,
      "rewards/format_reward": 0.5000000074505806,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3163.8333129882812,
      "epoch": 0.010282776349614395,
      "grad_norm": 0.1903219074010849,
      "kl": 4.1365623474121094e-05,
      "learning_rate": 1.2e-07,
      "loss": 0.0699,
      "reward": 0.22140773385763168,
      "reward_std": 0.614318884909153,
      "rewards/cosine_scaled_reward": -0.07679613586515188,
      "rewards/format_reward": 0.37500001303851604,
      "step": 6
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2238.3055725097656,
      "epoch": 0.011996572407883462,
      "grad_norm": 0.2037331461906433,
      "kl": 3.427267074584961e-05,
      "learning_rate": 1.4e-07,
      "loss": 0.0507,
      "reward": 0.39292821660637856,
      "reward_std": 0.6100749522447586,
      "rewards/cosine_scaled_reward": -0.08825810719281435,
      "rewards/format_reward": 0.5694444552063942,
      "step": 7
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2888.4166870117188,
      "epoch": 0.013710368466152529,
      "grad_norm": 0.1671508252620697,
      "kl": 2.8967857360839844e-05,
      "learning_rate": 1.6e-07,
      "loss": 0.0888,
      "reward": 0.5700129643082619,
      "reward_std": 1.0805757492780685,
      "rewards/cosine_scaled_reward": 0.04195092432200909,
      "rewards/format_reward": 0.486111119389534,
      "step": 8
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2740.638916015625,
      "epoch": 0.015424164524421594,
      "grad_norm": 0.2825331389904022,
      "kl": 3.212690353393555e-05,
      "learning_rate": 1.8e-07,
      "loss": 0.1025,
      "reward": 0.3288399577140808,
      "reward_std": 0.6967436075210571,
      "rewards/cosine_scaled_reward": -0.03696890315040946,
      "rewards/format_reward": 0.4027777733281255,
      "step": 9
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3010.7916870117188,
      "epoch": 0.01713796058269066,
      "grad_norm": 0.17822624742984772,
      "kl": 4.1991472244262695e-05,
      "learning_rate": 2e-07,
      "loss": 0.0471,
      "reward": 0.09832120686769485,
      "reward_std": 0.6553668975830078,
      "rewards/cosine_scaled_reward": -0.1036171680316329,
      "rewards/format_reward": 0.3055555522441864,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2748.486114501953,
      "epoch": 0.018851756640959727,
      "grad_norm": 0.2476479411125183,
      "kl": 3.9696693420410156e-05,
      "learning_rate": 2.1999999999999998e-07,
      "loss": 0.0491,
      "reward": 0.015873797237873077,
      "reward_std": 0.553259089589119,
      "rewards/cosine_scaled_reward": -0.16567421704530716,
      "rewards/format_reward": 0.3472222238779068,
      "step": 11
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2954.3472290039062,
      "epoch": 0.02056555269922879,
      "grad_norm": 0.28294840455055237,
      "kl": 3.898143768310547e-05,
      "learning_rate": 2.4e-07,
      "loss": 0.1311,
      "reward": -0.11908636894077063,
      "reward_std": 0.6466177105903625,
      "rewards/cosine_scaled_reward": -0.22620984725654125,
      "rewards/format_reward": 0.3333333367481828,
      "step": 12
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2818.986114501953,
      "epoch": 0.022279348757497857,
      "grad_norm": 0.18577341735363007,
      "kl": 4.303455352783203e-05,
      "learning_rate": 2.6e-07,
      "loss": 0.0007,
      "reward": 0.3697042800486088,
      "reward_std": 0.7059066146612167,
      "rewards/cosine_scaled_reward": -0.03042563726194203,
      "rewards/format_reward": 0.4305555559694767,
      "step": 13
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2905.3333740234375,
      "epoch": 0.023993144815766924,
      "grad_norm": 0.226650208234787,
      "kl": 3.2275915145874023e-05,
      "learning_rate": 2.8e-07,
      "loss": 0.0212,
      "reward": 0.04198750853538513,
      "reward_std": 0.5741659551858902,
      "rewards/cosine_scaled_reward": -0.14567292109131813,
      "rewards/format_reward": 0.33333333395421505,
      "step": 14
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3468.2222290039062,
      "epoch": 0.02570694087403599,
      "grad_norm": 0.1521635353565216,
      "kl": 4.279613494873047e-05,
      "learning_rate": 3e-07,
      "loss": 0.0233,
      "reward": -0.17704490013420582,
      "reward_std": 0.6536840051412582,
      "rewards/cosine_scaled_reward": -0.1996335554867983,
      "rewards/format_reward": 0.22222222574055195,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2405.263916015625,
      "epoch": 0.027420736932305057,
      "grad_norm": 0.23728908598423004,
      "kl": 2.495013177394867e-05,
      "learning_rate": 3.2e-07,
      "loss": 0.0632,
      "reward": 0.7499620914459229,
      "reward_std": 0.9962631165981293,
      "rewards/cosine_scaled_reward": 0.07636993401683867,
      "rewards/format_reward": 0.5972222238779068,
      "step": 16
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2764.875030517578,
      "epoch": 0.02913453299057412,
      "grad_norm": 0.21387562155723572,
      "kl": 2.6166439056396484e-05,
      "learning_rate": 3.4000000000000003e-07,
      "loss": 0.0416,
      "reward": 0.27334376238286495,
      "reward_std": 0.4753483533859253,
      "rewards/cosine_scaled_reward": -0.05082811089232564,
      "rewards/format_reward": 0.3750000111758709,
      "step": 17
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3252.486083984375,
      "epoch": 0.030848329048843187,
      "grad_norm": 0.209347203373909,
      "kl": 4.25875186920166e-05,
      "learning_rate": 3.6e-07,
      "loss": 0.0587,
      "reward": -0.18576696328818798,
      "reward_std": 0.5022815316915512,
      "rewards/cosine_scaled_reward": -0.19010569993406534,
      "rewards/format_reward": 0.1944444514811039,
      "step": 18
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3157.4166870117188,
      "epoch": 0.032562125107112254,
      "grad_norm": 0.22900572419166565,
      "kl": 3.084540367126465e-05,
      "learning_rate": 3.7999999999999996e-07,
      "loss": 0.0687,
      "reward": 0.03116392099764198,
      "reward_std": 0.7267041057348251,
      "rewards/cosine_scaled_reward": -0.14414026169106364,
      "rewards/format_reward": 0.3194444486871362,
      "step": 19
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3228.5972290039062,
      "epoch": 0.03427592116538132,
      "grad_norm": 0.24043872952461243,
      "kl": 2.6807188987731934e-05,
      "learning_rate": 4e-07,
      "loss": 0.1293,
      "reward": -0.1261596381664276,
      "reward_std": 0.7229140102863312,
      "rewards/cosine_scaled_reward": -0.20196872018277645,
      "rewards/format_reward": 0.2777777835726738,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2856.6805419921875,
      "epoch": 0.03598971722365039,
      "grad_norm": 0.19779175519943237,
      "kl": 3.987550735473633e-05,
      "learning_rate": 4.1999999999999995e-07,
      "loss": 0.0069,
      "reward": 0.11652377434074879,
      "reward_std": 0.8210525661706924,
      "rewards/cosine_scaled_reward": -0.12229366600513458,
      "rewards/format_reward": 0.3611111165955663,
      "step": 21
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3298.3472290039062,
      "epoch": 0.037703513281919454,
      "grad_norm": 0.13437196612358093,
      "kl": 2.828240394592285e-05,
      "learning_rate": 4.3999999999999997e-07,
      "loss": 0.0123,
      "reward": 0.1601133793592453,
      "reward_std": 0.6881751976907253,
      "rewards/cosine_scaled_reward": -0.06577664241194725,
      "rewards/format_reward": 0.2916666753590107,
      "step": 22
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3107.4583129882812,
      "epoch": 0.03941730934018852,
      "grad_norm": 0.1506253182888031,
      "kl": 2.2932887077331543e-05,
      "learning_rate": 4.6e-07,
      "loss": 0.0149,
      "reward": -0.13085854798555374,
      "reward_std": 0.5464130863547325,
      "rewards/cosine_scaled_reward": -0.20431815274059772,
      "rewards/format_reward": 0.2777777807787061,
      "step": 23
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2710.6806030273438,
      "epoch": 0.04113110539845758,
      "grad_norm": 0.24692188203334808,
      "kl": 2.8967857360839844e-05,
      "learning_rate": 4.8e-07,
      "loss": 0.1012,
      "reward": 0.24628422083333135,
      "reward_std": 0.4773574620485306,
      "rewards/cosine_scaled_reward": -0.057413444737903774,
      "rewards/format_reward": 0.3611111268401146,
      "step": 24
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2784.7361450195312,
      "epoch": 0.04284490145672665,
      "grad_norm": 0.25797340273857117,
      "kl": 2.6673078536987305e-05,
      "learning_rate": 5e-07,
      "loss": 0.106,
      "reward": 0.46540534496307373,
      "reward_std": 0.8211657330393791,
      "rewards/cosine_scaled_reward": -0.01729731634259224,
      "rewards/format_reward": 0.5000000037252903,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3136.52783203125,
      "epoch": 0.044558697514995714,
      "grad_norm": 0.14968131482601166,
      "kl": 3.291666507720947e-05,
      "learning_rate": 5.2e-07,
      "loss": 0.0512,
      "reward": -0.09118526801466942,
      "reward_std": 0.5860454589128494,
      "rewards/cosine_scaled_reward": -0.21225931122899055,
      "rewards/format_reward": 0.33333334140479565,
      "step": 26
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3040.0000610351562,
      "epoch": 0.04627249357326478,
      "grad_norm": 0.17181935906410217,
      "kl": 1.5079975128173828e-05,
      "learning_rate": 5.4e-07,
      "loss": 0.0738,
      "reward": 0.34727448783814907,
      "reward_std": 0.6153330877423286,
      "rewards/cosine_scaled_reward": -0.027751651592552662,
      "rewards/format_reward": 0.4027777947485447,
      "step": 27
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2343.1111755371094,
      "epoch": 0.04798628963153385,
      "grad_norm": 0.2077193260192871,
      "kl": 2.5130808353424072e-05,
      "learning_rate": 5.6e-07,
      "loss": 0.0598,
      "reward": 0.6073902919888496,
      "reward_std": 0.6849471032619476,
      "rewards/cosine_scaled_reward": 0.018972909078001976,
      "rewards/format_reward": 0.5694444477558136,
      "step": 28
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3073.7222290039062,
      "epoch": 0.049700085689802914,
      "grad_norm": 0.21480253338813782,
      "kl": 2.290681004524231e-05,
      "learning_rate": 5.8e-07,
      "loss": 0.0747,
      "reward": 0.17731062695384026,
      "reward_std": 0.8807300254702568,
      "rewards/cosine_scaled_reward": -0.07801135815680027,
      "rewards/format_reward": 0.3333333320915699,
      "step": 29
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2768.02783203125,
      "epoch": 0.05141388174807198,
      "grad_norm": 0.25759172439575195,
      "kl": 2.993270754814148e-05,
      "learning_rate": 6e-07,
      "loss": 0.0674,
      "reward": 0.5063075462821871,
      "reward_std": 0.771463930606842,
      "rewards/cosine_scaled_reward": -0.010735094547271729,
      "rewards/format_reward": 0.5277777910232544,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2695.6944580078125,
      "epoch": 0.05312767780634105,
      "grad_norm": 0.2701717019081116,
      "kl": 1.3127923011779785e-05,
      "learning_rate": 6.2e-07,
      "loss": 0.0971,
      "reward": 0.2706103939563036,
      "reward_std": 0.49449611082673073,
      "rewards/cosine_scaled_reward": -0.045250357885379344,
      "rewards/format_reward": 0.361111119389534,
      "step": 31
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3051.52783203125,
      "epoch": 0.054841473864610114,
      "grad_norm": 0.17947925627231598,
      "kl": 2.6337802410125732e-05,
      "learning_rate": 6.4e-07,
      "loss": 0.057,
      "reward": 0.45089754834771156,
      "reward_std": 1.1203400194644928,
      "rewards/cosine_scaled_reward": -0.02455122536048293,
      "rewards/format_reward": 0.5000000074505806,
      "step": 32
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2306.8750610351562,
      "epoch": 0.056555269922879174,
      "grad_norm": 0.21536274254322052,
      "kl": 5.3569674491882324e-05,
      "learning_rate": 6.6e-07,
      "loss": 0.0764,
      "reward": 0.8166992478072643,
      "reward_std": 0.8387185409665108,
      "rewards/cosine_scaled_reward": 0.12362739443778992,
      "rewards/format_reward": 0.5694444626569748,
      "step": 33
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2698.7083740234375,
      "epoch": 0.05826906598114824,
      "grad_norm": 0.29884466528892517,
      "kl": 0.00017189979553222656,
      "learning_rate": 6.800000000000001e-07,
      "loss": 0.1617,
      "reward": 0.057983118342235684,
      "reward_std": 0.7621737122535706,
      "rewards/cosine_scaled_reward": -0.1585084507241845,
      "rewards/format_reward": 0.3750000074505806,
      "step": 34
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3312.3055419921875,
      "epoch": 0.05998286203941731,
      "grad_norm": 0.1554093211889267,
      "kl": 9.316205978393555e-05,
      "learning_rate": 7e-07,
      "loss": 0.0273,
      "reward": -0.2900172360241413,
      "reward_std": 0.5383428931236267,
      "rewards/cosine_scaled_reward": -0.2700086124241352,
      "rewards/format_reward": 0.2500000074505806,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2631.8055725097656,
      "epoch": 0.061696658097686374,
      "grad_norm": 0.19274435937404633,
      "kl": 0.0002084970474243164,
      "learning_rate": 7.2e-07,
      "loss": 0.0306,
      "reward": 0.006275304593145847,
      "reward_std": 0.46724043786525726,
      "rewards/cosine_scaled_reward": -0.18436234444379807,
      "rewards/format_reward": 0.37500000931322575,
      "step": 36
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3124.5277709960938,
      "epoch": 0.06341045415595545,
      "grad_norm": 0.15709905326366425,
      "kl": 7.59810209274292e-05,
      "learning_rate": 7.4e-07,
      "loss": 0.0561,
      "reward": -0.008991474285721779,
      "reward_std": 0.5808551460504532,
      "rewards/cosine_scaled_reward": -0.1294957408681512,
      "rewards/format_reward": 0.25000000558793545,
      "step": 37
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3045.90283203125,
      "epoch": 0.06512425021422451,
      "grad_norm": 0.2423790842294693,
      "kl": 0.00022971630096435547,
      "learning_rate": 7.599999999999999e-07,
      "loss": 0.1263,
      "reward": 0.1536001469939947,
      "reward_std": 0.7093052342534065,
      "rewards/cosine_scaled_reward": -0.07597769796848297,
      "rewards/format_reward": 0.305555559694767,
      "step": 38
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3150.0833740234375,
      "epoch": 0.06683804627249357,
      "grad_norm": 0.13335144519805908,
      "kl": 0.0003066062927246094,
      "learning_rate": 7.799999999999999e-07,
      "loss": 0.0187,
      "reward": -0.01171512296423316,
      "reward_std": 0.48150157928466797,
      "rewards/cosine_scaled_reward": -0.1586353350430727,
      "rewards/format_reward": 0.3055555559694767,
      "step": 39
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2782.27783203125,
      "epoch": 0.06855184233076264,
      "grad_norm": 0.1773526668548584,
      "kl": 0.0007457435131072998,
      "learning_rate": 8e-07,
      "loss": 0.0236,
      "reward": 0.19545890390872955,
      "reward_std": 0.5221360512077808,
      "rewards/cosine_scaled_reward": -0.08282610075548291,
      "rewards/format_reward": 0.361111112870276,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2729.9722290039062,
      "epoch": 0.0702656383890317,
      "grad_norm": 0.2603820860385895,
      "kl": 0.0002143383026123047,
      "learning_rate": 8.199999999999999e-07,
      "loss": 0.1308,
      "reward": 0.5641986541450024,
      "reward_std": 0.7014989629387856,
      "rewards/cosine_scaled_reward": 0.05293265450745821,
      "rewards/format_reward": 0.4583333432674408,
      "step": 41
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2622.0555419921875,
      "epoch": 0.07197943444730077,
      "grad_norm": 0.19547662138938904,
      "kl": 0.0008759498596191406,
      "learning_rate": 8.399999999999999e-07,
      "loss": 0.0788,
      "reward": 0.3987229084596038,
      "reward_std": 0.6764711476862431,
      "rewards/cosine_scaled_reward": -0.05063853319734335,
      "rewards/format_reward": 0.5,
      "step": 42
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2757.3611450195312,
      "epoch": 0.07369323050556983,
      "grad_norm": 0.133390411734581,
      "kl": 0.00021369755268096924,
      "learning_rate": 8.599999999999999e-07,
      "loss": 0.0354,
      "reward": 0.5515957027673721,
      "reward_std": 0.6986619718372822,
      "rewards/cosine_scaled_reward": 0.04663117043673992,
      "rewards/format_reward": 0.4583333283662796,
      "step": 43
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2743.763916015625,
      "epoch": 0.07540702656383891,
      "grad_norm": 0.17805209755897522,
      "kl": 0.0008558034896850586,
      "learning_rate": 8.799999999999999e-07,
      "loss": 0.1039,
      "reward": 0.06273656419944018,
      "reward_std": 0.7254525497555733,
      "rewards/cosine_scaled_reward": -0.18390950025059283,
      "rewards/format_reward": 0.4305555671453476,
      "step": 44
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3018.1805419921875,
      "epoch": 0.07712082262210797,
      "grad_norm": 0.23340974748134613,
      "kl": 0.0007225275039672852,
      "learning_rate": 9e-07,
      "loss": 0.047,
      "reward": 0.12753370963037014,
      "reward_std": 0.5756559893488884,
      "rewards/cosine_scaled_reward": -0.09595536440610886,
      "rewards/format_reward": 0.31944444589316845,
      "step": 45
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2453.77783203125,
      "epoch": 0.07883461868037704,
      "grad_norm": 0.25216469168663025,
      "kl": 0.0028772354125976562,
      "learning_rate": 9.2e-07,
      "loss": 0.0976,
      "reward": 0.4031712617725134,
      "reward_std": 0.5689256861805916,
      "rewards/cosine_scaled_reward": -0.05535881780087948,
      "rewards/format_reward": 0.5138888955116272,
      "step": 46
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3180.0972290039062,
      "epoch": 0.0805484147386461,
      "grad_norm": 0.17415259778499603,
      "kl": 0.0014755725860595703,
      "learning_rate": 9.399999999999999e-07,
      "loss": 0.0718,
      "reward": -0.026270870119333267,
      "reward_std": 0.641656719148159,
      "rewards/cosine_scaled_reward": -0.15202434547245502,
      "rewards/format_reward": 0.27777778171002865,
      "step": 47
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2680.7639770507812,
      "epoch": 0.08226221079691516,
      "grad_norm": 0.20438066124916077,
      "kl": 0.001586318016052246,
      "learning_rate": 9.6e-07,
      "loss": 0.0807,
      "reward": 0.6057721227407455,
      "reward_std": 0.7416700124740601,
      "rewards/cosine_scaled_reward": 0.05288607440888882,
      "rewards/format_reward": 0.5000000074505806,
      "step": 48
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2346.055633544922,
      "epoch": 0.08397600685518423,
      "grad_norm": 0.35583311319351196,
      "kl": 0.018939971923828125,
      "learning_rate": 9.8e-07,
      "loss": 0.1404,
      "reward": 0.7048290632665157,
      "reward_std": 0.6792610064148903,
      "rewards/cosine_scaled_reward": 0.06074785813689232,
      "rewards/format_reward": 0.5833333432674408,
      "step": 49
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2833.5833740234375,
      "epoch": 0.0856898029134533,
      "grad_norm": 0.2027311623096466,
      "kl": 0.0032949447631835938,
      "learning_rate": 1e-06,
      "loss": 0.0416,
      "reward": 0.07023209612816572,
      "reward_std": 0.6861855462193489,
      "rewards/cosine_scaled_reward": -0.16627284698188305,
      "rewards/format_reward": 0.4027777872979641,
      "step": 50
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3051.2777709960938,
      "epoch": 0.08740359897172237,
      "grad_norm": 0.16748514771461487,
      "kl": 0.001615285873413086,
      "learning_rate": 9.999890338174275e-07,
      "loss": 0.069,
      "reward": 0.1449947228829842,
      "reward_std": 0.7090619504451752,
      "rewards/cosine_scaled_reward": -0.10111376643180847,
      "rewards/format_reward": 0.34722223225980997,
      "step": 51
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3181.9583740234375,
      "epoch": 0.08911739502999143,
      "grad_norm": 0.16281543672084808,
      "kl": 0.0019249916076660156,
      "learning_rate": 9.999561358041868e-07,
      "loss": 0.0803,
      "reward": -0.03632636368274689,
      "reward_std": 0.5028033927083015,
      "rewards/cosine_scaled_reward": -0.12927428726106882,
      "rewards/format_reward": 0.22222222946584225,
      "step": 52
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3303.1805419921875,
      "epoch": 0.0908311910882605,
      "grad_norm": 0.14455804228782654,
      "kl": 0.0005393028259277344,
      "learning_rate": 9.999013075636804e-07,
      "loss": 0.0318,
      "reward": -0.10013403557240963,
      "reward_std": 0.4606664590537548,
      "rewards/cosine_scaled_reward": -0.17506700940430164,
      "rewards/format_reward": 0.2500000046566129,
      "step": 53
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3313.1944580078125,
      "epoch": 0.09254498714652956,
      "grad_norm": 0.13308647274971008,
      "kl": 0.0011081695556640625,
      "learning_rate": 9.998245517681593e-07,
      "loss": 0.0055,
      "reward": 0.10159287042915821,
      "reward_std": 0.6204735822975636,
      "rewards/cosine_scaled_reward": -0.060314678063150495,
      "rewards/format_reward": 0.2222222276031971,
      "step": 54
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3481.15283203125,
      "epoch": 0.09425878320479864,
      "grad_norm": 0.13649359345436096,
      "kl": 0.0008268356323242188,
      "learning_rate": 9.997258721585931e-07,
      "loss": 0.0328,
      "reward": -0.12874329963233322,
      "reward_std": 0.5648706145584583,
      "rewards/cosine_scaled_reward": -0.1754827625118196,
      "rewards/format_reward": 0.22222222480922937,
      "step": 55
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3232.7222900390625,
      "epoch": 0.0959725792630677,
      "grad_norm": 0.19132941961288452,
      "kl": 0.0013275146484375,
      "learning_rate": 9.996052735444862e-07,
      "loss": 0.1077,
      "reward": -0.17376804118975997,
      "reward_std": 0.749246733263135,
      "rewards/cosine_scaled_reward": -0.20493957586586475,
      "rewards/format_reward": 0.2361111156642437,
      "step": 56
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3279.4584350585938,
      "epoch": 0.09768637532133675,
      "grad_norm": 0.15241067111492157,
      "kl": 0.000919342041015625,
      "learning_rate": 9.994627618036452e-07,
      "loss": 0.0282,
      "reward": 0.31643399875611067,
      "reward_std": 0.6422489807009697,
      "rewards/cosine_scaled_reward": 0.005439223721623421,
      "rewards/format_reward": 0.30555556155741215,
      "step": 57
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3047.2916870117188,
      "epoch": 0.09940017137960583,
      "grad_norm": 0.22829630970954895,
      "kl": 0.0054931640625,
      "learning_rate": 9.992983438818915e-07,
      "loss": 0.0909,
      "reward": -0.17570834839716554,
      "reward_std": 0.4780988022685051,
      "rewards/cosine_scaled_reward": -0.23368750512599945,
      "rewards/format_reward": 0.2916666679084301,
      "step": 58
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2918.5555419921875,
      "epoch": 0.10111396743787489,
      "grad_norm": 0.17409604787826538,
      "kl": 0.010187149047851562,
      "learning_rate": 9.991120277927223e-07,
      "loss": -0.0001,
      "reward": 0.6838416904211044,
      "reward_std": 0.7215724363923073,
      "rewards/cosine_scaled_reward": 0.1196986111899605,
      "rewards/format_reward": 0.4444444477558136,
      "step": 59
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3240.90283203125,
      "epoch": 0.10282776349614396,
      "grad_norm": 0.21398130059242249,
      "kl": 0.0015239715576171875,
      "learning_rate": 9.989038226169207e-07,
      "loss": 0.0841,
      "reward": -0.013310029171407223,
      "reward_std": 0.6487029865384102,
      "rewards/cosine_scaled_reward": -0.13859945815056562,
      "rewards/format_reward": 0.2638888992369175,
      "step": 60
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3323.3889770507812,
      "epoch": 0.10454155955441302,
      "grad_norm": 0.25011396408081055,
      "kl": 0.0015153884887695312,
      "learning_rate": 9.98673738502114e-07,
      "loss": 0.0677,
      "reward": -0.37927111238241196,
      "reward_std": 0.43354837596416473,
      "rewards/cosine_scaled_reward": -0.2799133397638798,
      "rewards/format_reward": 0.18055556155741215,
      "step": 61
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2683.6250610351562,
      "epoch": 0.1062553556126821,
      "grad_norm": 0.17982754111289978,
      "kl": 0.00201416015625,
      "learning_rate": 9.98421786662277e-07,
      "loss": 0.0008,
      "reward": 0.40144167095422745,
      "reward_std": 0.5826155617833138,
      "rewards/cosine_scaled_reward": -0.02844582637771964,
      "rewards/format_reward": 0.4583333432674408,
      "step": 62
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3426.2361450195312,
      "epoch": 0.10796915167095116,
      "grad_norm": 0.182517409324646,
      "kl": 0.00151824951171875,
      "learning_rate": 9.981479793771866e-07,
      "loss": 0.0294,
      "reward": -0.09498679265379906,
      "reward_std": 0.7008046992123127,
      "rewards/cosine_scaled_reward": -0.13777116686105728,
      "rewards/format_reward": 0.18055555690079927,
      "step": 63
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2847.5972290039062,
      "epoch": 0.10968294772922023,
      "grad_norm": 0.31501731276512146,
      "kl": 0.0022530555725097656,
      "learning_rate": 9.97852329991824e-07,
      "loss": 0.1548,
      "reward": 0.009381972253322601,
      "reward_std": 0.36741600558161736,
      "rewards/cosine_scaled_reward": -0.16197567898780107,
      "rewards/format_reward": 0.3333333432674408,
      "step": 64
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3167.236083984375,
      "epoch": 0.11139674378748929,
      "grad_norm": 0.4229466915130615,
      "kl": 0.0364532470703125,
      "learning_rate": 9.975348529157229e-07,
      "loss": 0.0659,
      "reward": -0.029949136078357697,
      "reward_std": 0.5782980695366859,
      "rewards/cosine_scaled_reward": -0.13997458899393678,
      "rewards/format_reward": 0.25000000838190317,
      "step": 65
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2846.8334350585938,
      "epoch": 0.11311053984575835,
      "grad_norm": 0.1699674278497696,
      "kl": 0.0013065338134765625,
      "learning_rate": 9.971955636222684e-07,
      "loss": 0.0667,
      "reward": 0.2395001295953989,
      "reward_std": 0.3902180567383766,
      "rewards/cosine_scaled_reward": -0.053861052729189396,
      "rewards/format_reward": 0.3472222313284874,
      "step": 66
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3227.4445190429688,
      "epoch": 0.11482433590402742,
      "grad_norm": 0.15845970809459686,
      "kl": 0.0022869110107421875,
      "learning_rate": 9.968344786479415e-07,
      "loss": 0.0416,
      "reward": 0.06229268200695515,
      "reward_std": 0.5577914118766785,
      "rewards/cosine_scaled_reward": -0.1285758875310421,
      "rewards/format_reward": 0.3194444552063942,
      "step": 67
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2906.3472290039062,
      "epoch": 0.11653813196229648,
      "grad_norm": 0.17754817008972168,
      "kl": 0.0027103424072265625,
      "learning_rate": 9.964516155915151e-07,
      "loss": -0.0006,
      "reward": 0.000796053558588028,
      "reward_std": 0.5399865545332432,
      "rewards/cosine_scaled_reward": -0.15932418778538704,
      "rewards/format_reward": 0.3194444449618459,
      "step": 68
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3079.4583740234375,
      "epoch": 0.11825192802056556,
      "grad_norm": 0.16689395904541016,
      "kl": 0.00244140625,
      "learning_rate": 9.960469931131936e-07,
      "loss": 0.0012,
      "reward": 0.40755608677864075,
      "reward_std": 0.592438168823719,
      "rewards/cosine_scaled_reward": 0.009333595633506775,
      "rewards/format_reward": 0.3888889029622078,
      "step": 69
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2852.6388549804688,
      "epoch": 0.11996572407883462,
      "grad_norm": 0.14442802965641022,
      "kl": 0.0042266845703125,
      "learning_rate": 9.956206309337066e-07,
      "loss": 0.023,
      "reward": 0.44340329244732857,
      "reward_std": 0.43735441006720066,
      "rewards/cosine_scaled_reward": 0.00642385333776474,
      "rewards/format_reward": 0.4305555559694767,
      "step": 70
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3119.8195190429688,
      "epoch": 0.12167952013710369,
      "grad_norm": 0.1541452407836914,
      "kl": 0.003391265869140625,
      "learning_rate": 9.951725498333448e-07,
      "loss": 0.0155,
      "reward": 0.49696624279022217,
      "reward_std": 0.9607885628938675,
      "rewards/cosine_scaled_reward": 0.07487202249467373,
      "rewards/format_reward": 0.3472222238779068,
      "step": 71
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2584.513885498047,
      "epoch": 0.12339331619537275,
      "grad_norm": 0.16282722353935242,
      "kl": 0.007266998291015625,
      "learning_rate": 9.947027716509488e-07,
      "loss": 0.0302,
      "reward": 0.4334046132862568,
      "reward_std": 0.42579157277941704,
      "rewards/cosine_scaled_reward": -0.04024216299876571,
      "rewards/format_reward": 0.5138888955116272,
      "step": 72
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3057.8611450195312,
      "epoch": 0.12510711225364182,
      "grad_norm": 0.19297440350055695,
      "kl": 0.004047393798828125,
      "learning_rate": 9.942113192828444e-07,
      "loss": -0.0268,
      "reward": 0.2504111938178539,
      "reward_std": 0.6320941485464573,
      "rewards/cosine_scaled_reward": -0.05534995626658201,
      "rewards/format_reward": 0.3611111268401146,
      "step": 73
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2350.5000610351562,
      "epoch": 0.1268209083119109,
      "grad_norm": 0.25634145736694336,
      "kl": 0.004367828369140625,
      "learning_rate": 9.93698216681727e-07,
      "loss": 0.1227,
      "reward": 0.7754522487521172,
      "reward_std": 0.8430259823799133,
      "rewards/cosine_scaled_reward": 0.07522611878812313,
      "rewards/format_reward": 0.625,
      "step": 74
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3078.013916015625,
      "epoch": 0.12853470437017994,
      "grad_norm": 0.15847010910511017,
      "kl": 0.004947662353515625,
      "learning_rate": 9.931634888554935e-07,
      "loss": 0.0447,
      "reward": 0.27387892454862595,
      "reward_std": 0.5773990303277969,
      "rewards/cosine_scaled_reward": -0.03667165897786617,
      "rewards/format_reward": 0.3472222276031971,
      "step": 75
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2247.8194427490234,
      "epoch": 0.13024850042844902,
      "grad_norm": 0.28341227769851685,
      "kl": 0.014591217041015625,
      "learning_rate": 9.926071618660237e-07,
      "loss": 0.0403,
      "reward": 0.7070811688899994,
      "reward_std": 0.7020798400044441,
      "rewards/cosine_scaled_reward": 0.06881837674882263,
      "rewards/format_reward": 0.5694444440305233,
      "step": 76
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3159.75,
      "epoch": 0.1319622964867181,
      "grad_norm": 0.13436463475227356,
      "kl": 0.0049896240234375,
      "learning_rate": 9.9202926282791e-07,
      "loss": 0.023,
      "reward": 0.35647532157599926,
      "reward_std": 0.7988947406411171,
      "rewards/cosine_scaled_reward": 0.011570994276553392,
      "rewards/format_reward": 0.33333334513008595,
      "step": 77
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3150.0139770507812,
      "epoch": 0.13367609254498714,
      "grad_norm": 0.176174134016037,
      "kl": 0.004405975341796875,
      "learning_rate": 9.91429819907136e-07,
      "loss": 0.0747,
      "reward": -0.14098340552300215,
      "reward_std": 0.5686891078948975,
      "rewards/cosine_scaled_reward": -0.18854726571589708,
      "rewards/format_reward": 0.23611112032085657,
      "step": 78
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2571.0694580078125,
      "epoch": 0.1353898886032562,
      "grad_norm": 0.1847277730703354,
      "kl": 0.008609771728515625,
      "learning_rate": 9.908088623197048e-07,
      "loss": -0.0106,
      "reward": 0.3892364539206028,
      "reward_std": 0.7569635957479477,
      "rewards/cosine_scaled_reward": -0.06927067344076931,
      "rewards/format_reward": 0.5277777835726738,
      "step": 79
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3138.5555419921875,
      "epoch": 0.13710368466152528,
      "grad_norm": 0.21640530228614807,
      "kl": 0.005603790283203125,
      "learning_rate": 9.901664203302124e-07,
      "loss": 0.1324,
      "reward": -0.1231984393671155,
      "reward_std": 0.778315082192421,
      "rewards/cosine_scaled_reward": -0.2074325531721115,
      "rewards/format_reward": 0.2916666716337204,
      "step": 80
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3191.916748046875,
      "epoch": 0.13881748071979436,
      "grad_norm": 0.1524638533592224,
      "kl": 0.014739990234375,
      "learning_rate": 9.895025252503755e-07,
      "loss": 0.0255,
      "reward": -0.14118600636720657,
      "reward_std": 0.3157992772758007,
      "rewards/cosine_scaled_reward": -0.17475967481732368,
      "rewards/format_reward": 0.20833334047347307,
      "step": 81
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2925.013916015625,
      "epoch": 0.1405312767780634,
      "grad_norm": 0.21411970257759094,
      "kl": 0.00635528564453125,
      "learning_rate": 9.888172094375033e-07,
      "loss": 0.0735,
      "reward": -0.06351233087480068,
      "reward_std": 0.5284828841686249,
      "rewards/cosine_scaled_reward": -0.18453393690288067,
      "rewards/format_reward": 0.30555555783212185,
      "step": 82
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2801.2638549804688,
      "epoch": 0.14224507283633248,
      "grad_norm": 0.18929333984851837,
      "kl": 0.0023746490478515625,
      "learning_rate": 9.881105062929221e-07,
      "loss": 0.0434,
      "reward": 0.5797148197889328,
      "reward_std": 0.8048742488026619,
      "rewards/cosine_scaled_reward": 0.03985740663483739,
      "rewards/format_reward": 0.5000000074505806,
      "step": 83
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2881.999969482422,
      "epoch": 0.14395886889460155,
      "grad_norm": 0.16995370388031006,
      "kl": 0.00823211669921875,
      "learning_rate": 9.873824502603459e-07,
      "loss": 0.0417,
      "reward": 0.1579499295912683,
      "reward_std": 0.6737323254346848,
      "rewards/cosine_scaled_reward": -0.12935838662087917,
      "rewards/format_reward": 0.4166666716337204,
      "step": 84
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2817.888916015625,
      "epoch": 0.1456726649528706,
      "grad_norm": 0.17163607478141785,
      "kl": 0.004947662353515625,
      "learning_rate": 9.866330768241983e-07,
      "loss": 0.0843,
      "reward": 0.14664312824606895,
      "reward_std": 0.6406831294298172,
      "rewards/cosine_scaled_reward": -0.10028954246081412,
      "rewards/format_reward": 0.3472222248092294,
      "step": 85
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2666.0972595214844,
      "epoch": 0.14738646101113967,
      "grad_norm": 0.23853930830955505,
      "kl": 0.0075836181640625,
      "learning_rate": 9.85862422507884e-07,
      "loss": 0.184,
      "reward": 0.15615743398666382,
      "reward_std": 0.6508499458432198,
      "rewards/cosine_scaled_reward": -0.14414352551102638,
      "rewards/format_reward": 0.4444444440305233,
      "step": 86
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3479.0000610351562,
      "epoch": 0.14910025706940874,
      "grad_norm": 0.13812494277954102,
      "kl": 0.003131866455078125,
      "learning_rate": 9.850705248720068e-07,
      "loss": 0.0273,
      "reward": -0.3952238578349352,
      "reward_std": 0.4180161654949188,
      "rewards/cosine_scaled_reward": -0.24622303992509842,
      "rewards/format_reward": 0.0972222238779068,
      "step": 87
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3411.0556030273438,
      "epoch": 0.15081405312767782,
      "grad_norm": 0.14131076633930206,
      "kl": 0.00627899169921875,
      "learning_rate": 9.8425742251254e-07,
      "loss": 0.0242,
      "reward": -0.18497492372989655,
      "reward_std": 0.3112034276127815,
      "rewards/cosine_scaled_reward": -0.15498745813965797,
      "rewards/format_reward": 0.12500000186264515,
      "step": 88
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2821.4305725097656,
      "epoch": 0.15252784918594686,
      "grad_norm": 0.23381026089191437,
      "kl": 0.00811767578125,
      "learning_rate": 9.83423155058946e-07,
      "loss": 0.1044,
      "reward": -0.15477947797626257,
      "reward_std": 0.3880116418004036,
      "rewards/cosine_scaled_reward": -0.257945304736495,
      "rewards/format_reward": 0.3611111082136631,
      "step": 89
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2741.013885498047,
      "epoch": 0.15424164524421594,
      "grad_norm": 0.3015286326408386,
      "kl": 0.005706787109375,
      "learning_rate": 9.825677631722435e-07,
      "loss": 0.146,
      "reward": 0.32925539929419756,
      "reward_std": 0.5706463847309351,
      "rewards/cosine_scaled_reward": -0.01592785632237792,
      "rewards/format_reward": 0.3611111082136631,
      "step": 90
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3004.1805419921875,
      "epoch": 0.155955441302485,
      "grad_norm": 0.2821044325828552,
      "kl": 0.013214111328125,
      "learning_rate": 9.816912885430258e-07,
      "loss": 0.1457,
      "reward": -0.23375913500785828,
      "reward_std": 0.6937631815671921,
      "rewards/cosine_scaled_reward": -0.25576844066381454,
      "rewards/format_reward": 0.2777777872979641,
      "step": 91
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2889.15283203125,
      "epoch": 0.15766923736075408,
      "grad_norm": 0.19952206313610077,
      "kl": 0.01056671142578125,
      "learning_rate": 9.807937738894303e-07,
      "loss": -0.0327,
      "reward": 0.10378427803516388,
      "reward_std": 0.6779353246092796,
      "rewards/cosine_scaled_reward": -0.13560786750167608,
      "rewards/format_reward": 0.37500000186264515,
      "step": 92
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2751.8055725097656,
      "epoch": 0.15938303341902313,
      "grad_norm": 0.18763676285743713,
      "kl": 0.00572967529296875,
      "learning_rate": 9.798752629550546e-07,
      "loss": 0.0734,
      "reward": 0.5665245279669762,
      "reward_std": 0.7802244201302528,
      "rewards/cosine_scaled_reward": 0.012428927002474666,
      "rewards/format_reward": 0.5416666604578495,
      "step": 93
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2943.9722290039062,
      "epoch": 0.1610968294772922,
      "grad_norm": 0.17491032183170319,
      "kl": 0.005878448486328125,
      "learning_rate": 9.78935800506826e-07,
      "loss": 0.0466,
      "reward": 0.36631612479686737,
      "reward_std": 0.5951685793697834,
      "rewards/cosine_scaled_reward": -0.011286390479654074,
      "rewards/format_reward": 0.38888889737427235,
      "step": 94
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2803.4862060546875,
      "epoch": 0.16281062553556128,
      "grad_norm": 0.2179604023694992,
      "kl": 0.0073699951171875,
      "learning_rate": 9.779754323328192e-07,
      "loss": 0.1111,
      "reward": 0.20993795804679394,
      "reward_std": 0.5628918968141079,
      "rewards/cosine_scaled_reward": -0.08253101143054664,
      "rewards/format_reward": 0.3750000074505806,
      "step": 95
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3062.25,
      "epoch": 0.16452442159383032,
      "grad_norm": 0.1575266271829605,
      "kl": 0.005573272705078125,
      "learning_rate": 9.769942052400235e-07,
      "loss": 0.0192,
      "reward": 0.5143513884395361,
      "reward_std": 0.9291824996471405,
      "rewards/cosine_scaled_reward": 0.021064545959234238,
      "rewards/format_reward": 0.4722222350537777,
      "step": 96
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3426.0555419921875,
      "epoch": 0.1662382176520994,
      "grad_norm": 0.152592271566391,
      "kl": 0.0096588134765625,
      "learning_rate": 9.759921670520634e-07,
      "loss": 0.0595,
      "reward": -0.316804476082325,
      "reward_std": 0.5735431797802448,
      "rewards/cosine_scaled_reward": -0.2209022343158722,
      "rewards/format_reward": 0.12500000186264515,
      "step": 97
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2718.1806030273438,
      "epoch": 0.16795201371036847,
      "grad_norm": 0.19641156494617462,
      "kl": 0.00783538818359375,
      "learning_rate": 9.749693666068663e-07,
      "loss": 0.0871,
      "reward": 0.34513735864311457,
      "reward_std": 0.7377712428569794,
      "rewards/cosine_scaled_reward": -0.09826467745006084,
      "rewards/format_reward": 0.541666679084301,
      "step": 98
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3183.8611450195312,
      "epoch": 0.16966580976863754,
      "grad_norm": 0.13990604877471924,
      "kl": 0.00958251953125,
      "learning_rate": 9.739258537542835e-07,
      "loss": 0.0408,
      "reward": 0.10082972631789744,
      "reward_std": 0.4568670317530632,
      "rewards/cosine_scaled_reward": -0.09541848301887512,
      "rewards/format_reward": 0.2916666669771075,
      "step": 99
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2988.263916015625,
      "epoch": 0.1713796058269066,
      "grad_norm": 0.1574762910604477,
      "kl": 0.01104736328125,
      "learning_rate": 9.728616793536587e-07,
      "loss": 0.02,
      "reward": 0.05844925343990326,
      "reward_std": 0.4471042864024639,
      "rewards/cosine_scaled_reward": -0.13744205003604293,
      "rewards/format_reward": 0.3333333358168602,
      "step": 100
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2955.6388549804688,
      "epoch": 0.17309340188517566,
      "grad_norm": 0.15706215798854828,
      "kl": 0.006420135498046875,
      "learning_rate": 9.717768952713511e-07,
      "loss": 0.0337,
      "reward": 0.032026506960392,
      "reward_std": 0.35832666605710983,
      "rewards/cosine_scaled_reward": -0.1298200935125351,
      "rewards/format_reward": 0.2916666753590107,
      "step": 101
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2912.0972290039062,
      "epoch": 0.17480719794344474,
      "grad_norm": 0.1945251077413559,
      "kl": 0.0088043212890625,
      "learning_rate": 9.706715543782064e-07,
      "loss": 0.072,
      "reward": 0.22132272832095623,
      "reward_std": 0.4281787723302841,
      "rewards/cosine_scaled_reward": -0.09072753041982651,
      "rewards/format_reward": 0.40277779288589954,
      "step": 102
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2903.9444580078125,
      "epoch": 0.17652099400171378,
      "grad_norm": 0.1475774347782135,
      "kl": 0.00759124755859375,
      "learning_rate": 9.695457105469804e-07,
      "loss": 0.0409,
      "reward": 0.16637181863188744,
      "reward_std": 0.6222990080714226,
      "rewards/cosine_scaled_reward": -0.10431409068405628,
      "rewards/format_reward": 0.3750000149011612,
      "step": 103
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3320.1805419921875,
      "epoch": 0.17823479005998286,
      "grad_norm": 0.16452452540397644,
      "kl": 0.006443023681640625,
      "learning_rate": 9.683994186497132e-07,
      "loss": 0.073,
      "reward": -0.04724724031984806,
      "reward_std": 0.5820007584989071,
      "rewards/cosine_scaled_reward": -0.13473473582416773,
      "rewards/format_reward": 0.2222222276031971,
      "step": 104
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2745.27783203125,
      "epoch": 0.17994858611825193,
      "grad_norm": 0.23044738173484802,
      "kl": 0.0102996826171875,
      "learning_rate": 9.672327345550543e-07,
      "loss": 0.0909,
      "reward": 0.48719315230846405,
      "reward_std": 0.9213617816567421,
      "rewards/cosine_scaled_reward": -0.01334787905216217,
      "rewards/format_reward": 0.5138888955116272,
      "step": 105
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3009.0694580078125,
      "epoch": 0.181662382176521,
      "grad_norm": 0.25429767370224,
      "kl": 0.0078125,
      "learning_rate": 9.66045715125541e-07,
      "loss": 0.127,
      "reward": 0.27888505905866623,
      "reward_std": 0.7037396281957626,
      "rewards/cosine_scaled_reward": -0.048057474195957184,
      "rewards/format_reward": 0.3750000009313226,
      "step": 106
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3172.5694580078125,
      "epoch": 0.18337617823479005,
      "grad_norm": 0.17300733923912048,
      "kl": 0.008148193359375,
      "learning_rate": 9.648384182148252e-07,
      "loss": 0.0446,
      "reward": 0.21187454462051392,
      "reward_std": 0.549411840736866,
      "rewards/cosine_scaled_reward": -0.06767383548867656,
      "rewards/format_reward": 0.34722222574055195,
      "step": 107
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3347.52783203125,
      "epoch": 0.18508997429305912,
      "grad_norm": 0.17588993906974792,
      "kl": 0.00628662109375,
      "learning_rate": 9.636109026648554e-07,
      "loss": 0.06,
      "reward": -0.038673363626003265,
      "reward_std": 0.728736087679863,
      "rewards/cosine_scaled_reward": -0.15822557546198368,
      "rewards/format_reward": 0.2777777835726738,
      "step": 108
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2627.263946533203,
      "epoch": 0.1868037703513282,
      "grad_norm": 0.29850271344184875,
      "kl": 0.01171875,
      "learning_rate": 9.623632283030077e-07,
      "loss": 0.0662,
      "reward": 0.19531617127358913,
      "reward_std": 0.4965377002954483,
      "rewards/cosine_scaled_reward": -0.09678636118769646,
      "rewards/format_reward": 0.38888889737427235,
      "step": 109
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2958.625,
      "epoch": 0.18851756640959727,
      "grad_norm": 0.46270403265953064,
      "kl": 0.0089263916015625,
      "learning_rate": 9.610954559391704e-07,
      "loss": 0.1711,
      "reward": 0.08645874005742371,
      "reward_std": 0.9684502333402634,
      "rewards/cosine_scaled_reward": -0.1512150838971138,
      "rewards/format_reward": 0.3888888955116272,
      "step": 110
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2400.3472900390625,
      "epoch": 0.19023136246786632,
      "grad_norm": 0.18343479931354523,
      "kl": 0.00701904296875,
      "learning_rate": 9.598076473627796e-07,
      "loss": 0.1083,
      "reward": 0.22095186542719603,
      "reward_std": 0.5088437423110008,
      "rewards/cosine_scaled_reward": -0.15341296698898077,
      "rewards/format_reward": 0.5277777835726738,
      "step": 111
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2780.916748046875,
      "epoch": 0.1919451585261354,
      "grad_norm": 0.16234862804412842,
      "kl": 0.007965087890625,
      "learning_rate": 9.58499865339809e-07,
      "loss": 0.0115,
      "reward": 0.19807963073253632,
      "reward_std": 0.5584643110632896,
      "rewards/cosine_scaled_reward": -0.14401574060320854,
      "rewards/format_reward": 0.48611112777143717,
      "step": 112
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2751.7361450195312,
      "epoch": 0.19365895458440446,
      "grad_norm": 0.20913416147232056,
      "kl": 0.00861358642578125,
      "learning_rate": 9.571721736097088e-07,
      "loss": 0.0851,
      "reward": 0.7618176154792309,
      "reward_std": 1.0328082591295242,
      "rewards/cosine_scaled_reward": 0.11007547879125923,
      "rewards/format_reward": 0.541666679084301,
      "step": 113
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2235.9166564941406,
      "epoch": 0.1953727506426735,
      "grad_norm": 0.20926620066165924,
      "kl": 0.008697509765625,
      "learning_rate": 9.55824636882301e-07,
      "loss": 0.0327,
      "reward": 0.2064858078956604,
      "reward_std": 0.4848344102501869,
      "rewards/cosine_scaled_reward": -0.1675904355943203,
      "rewards/format_reward": 0.5416666753590107,
      "step": 114
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2317.6250610351562,
      "epoch": 0.19708654670094258,
      "grad_norm": 0.4515492916107178,
      "kl": 0.0092010498046875,
      "learning_rate": 9.54457320834625e-07,
      "loss": 0.2531,
      "reward": 0.45756053365767,
      "reward_std": 0.7848574221134186,
      "rewards/cosine_scaled_reward": -0.04899751394987106,
      "rewards/format_reward": 0.5555555745959282,
      "step": 115
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3182.0694580078125,
      "epoch": 0.19880034275921166,
      "grad_norm": 0.17537973821163177,
      "kl": 0.0128631591796875,
      "learning_rate": 9.530702921077358e-07,
      "loss": 0.0165,
      "reward": -0.06121325120329857,
      "reward_std": 0.4434010796248913,
      "rewards/cosine_scaled_reward": -0.1694955169223249,
      "rewards/format_reward": 0.2777777798473835,
      "step": 116
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3030.7083129882812,
      "epoch": 0.20051413881748073,
      "grad_norm": 0.18003451824188232,
      "kl": 0.017120361328125,
      "learning_rate": 9.516636183034564e-07,
      "loss": 0.1071,
      "reward": 0.42929551005363464,
      "reward_std": 0.9132848009467125,
      "rewards/cosine_scaled_reward": 0.006314422586001456,
      "rewards/format_reward": 0.4166666651144624,
      "step": 117
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2533.9443969726562,
      "epoch": 0.20222793487574978,
      "grad_norm": 0.2703484296798706,
      "kl": 0.011077880859375,
      "learning_rate": 9.502373679810839e-07,
      "loss": 0.0946,
      "reward": 0.4179135374724865,
      "reward_std": 0.8737296983599663,
      "rewards/cosine_scaled_reward": -0.04798768740147352,
      "rewards/format_reward": 0.5138889029622078,
      "step": 118
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2823.5000610351562,
      "epoch": 0.20394173093401885,
      "grad_norm": 0.19636695086956024,
      "kl": 0.01312255859375,
      "learning_rate": 9.487916106540465e-07,
      "loss": 0.0303,
      "reward": 0.31334975361824036,
      "reward_std": 0.30826447159051895,
      "rewards/cosine_scaled_reward": -0.058602908393368125,
      "rewards/format_reward": 0.430555553175509,
      "step": 119
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2826.5694580078125,
      "epoch": 0.20565552699228792,
      "grad_norm": 0.2075241059064865,
      "kl": 0.016265869140625,
      "learning_rate": 9.473264167865171e-07,
      "loss": 0.094,
      "reward": 0.4697803445160389,
      "reward_std": 0.7031994387507439,
      "rewards/cosine_scaled_reward": 0.005723495967686176,
      "rewards/format_reward": 0.4583333358168602,
      "step": 120
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2887.3056030273438,
      "epoch": 0.207369323050557,
      "grad_norm": 0.19230371713638306,
      "kl": 0.0111083984375,
      "learning_rate": 9.458418577899774e-07,
      "loss": 0.086,
      "reward": 0.3282506223767996,
      "reward_std": 0.7738695293664932,
      "rewards/cosine_scaled_reward": -0.05809690523892641,
      "rewards/format_reward": 0.4444444589316845,
      "step": 121
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3083.3611450195312,
      "epoch": 0.20908311910882604,
      "grad_norm": 0.17026208341121674,
      "kl": 0.01568603515625,
      "learning_rate": 9.443380060197385e-07,
      "loss": 0.0301,
      "reward": -0.03662687446922064,
      "reward_std": 0.5345718339085579,
      "rewards/cosine_scaled_reward": -0.19886899180710316,
      "rewards/format_reward": 0.361111119389534,
      "step": 122
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2682.6805725097656,
      "epoch": 0.21079691516709512,
      "grad_norm": 0.19728592038154602,
      "kl": 0.012115478515625,
      "learning_rate": 9.428149347714143e-07,
      "loss": 0.0481,
      "reward": 0.3675118573009968,
      "reward_std": 1.058239296078682,
      "rewards/cosine_scaled_reward": -0.052355190739035606,
      "rewards/format_reward": 0.4722222248092294,
      "step": 123
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3100.1666870117188,
      "epoch": 0.2125107112253642,
      "grad_norm": 0.19675055146217346,
      "kl": 0.013641357421875,
      "learning_rate": 9.412727182773486e-07,
      "loss": 0.0775,
      "reward": 0.28848724998533726,
      "reward_std": 0.5403149202466011,
      "rewards/cosine_scaled_reward": -0.04325637500733137,
      "rewards/format_reward": 0.3750000149011612,
      "step": 124
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2862.0277709960938,
      "epoch": 0.21422450728363324,
      "grad_norm": 0.1939004808664322,
      "kl": 0.017242431640625,
      "learning_rate": 9.397114317029974e-07,
      "loss": 0.0453,
      "reward": 0.3707499373704195,
      "reward_std": 0.7198375910520554,
      "rewards/cosine_scaled_reward": -0.016013892367482185,
      "rewards/format_reward": 0.4027777807787061,
      "step": 125
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2753.486083984375,
      "epoch": 0.2159383033419023,
      "grad_norm": 0.25714027881622314,
      "kl": 0.0194091796875,
      "learning_rate": 9.381311511432658e-07,
      "loss": 0.0648,
      "reward": 0.3369361013174057,
      "reward_std": 0.5913353934884071,
      "rewards/cosine_scaled_reward": -0.08847637102007866,
      "rewards/format_reward": 0.5138888889923692,
      "step": 126
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2923.125,
      "epoch": 0.21765209940017138,
      "grad_norm": 0.2240990549325943,
      "kl": 0.016571044921875,
      "learning_rate": 9.36531953618799e-07,
      "loss": 0.076,
      "reward": -0.2184343640692532,
      "reward_std": 0.5479928515851498,
      "rewards/cosine_scaled_reward": -0.27588383853435516,
      "rewards/format_reward": 0.3333333395421505,
      "step": 127
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3276.6111450195312,
      "epoch": 0.21936589545844046,
      "grad_norm": 0.15262338519096375,
      "kl": 0.0207061767578125,
      "learning_rate": 9.34913917072228e-07,
      "loss": -0.0001,
      "reward": -0.12921499274671078,
      "reward_std": 0.5691854059696198,
      "rewards/cosine_scaled_reward": -0.1757186003960669,
      "rewards/format_reward": 0.22222223225980997,
      "step": 128
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2278.4305725097656,
      "epoch": 0.2210796915167095,
      "grad_norm": 0.3608929216861725,
      "kl": 0.019287109375,
      "learning_rate": 9.332771203643714e-07,
      "loss": 0.0927,
      "reward": 0.706303309649229,
      "reward_std": 0.7875337153673172,
      "rewards/cosine_scaled_reward": 0.04759608302265406,
      "rewards/format_reward": 0.6111111119389534,
      "step": 129
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1965.999984741211,
      "epoch": 0.22279348757497858,
      "grad_norm": 0.18217293918132782,
      "kl": 0.018463134765625,
      "learning_rate": 9.316216432703916e-07,
      "loss": 0.0064,
      "reward": 1.0708431326784194,
      "reward_std": 0.7828814685344696,
      "rewards/cosine_scaled_reward": 0.17431045067496598,
      "rewards/format_reward": 0.722222238779068,
      "step": 130
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3232.7222290039062,
      "epoch": 0.22450728363324765,
      "grad_norm": 0.1822432279586792,
      "kl": 0.0173797607421875,
      "learning_rate": 9.299475664759068e-07,
      "loss": 0.0286,
      "reward": -0.31177592277526855,
      "reward_std": 0.350917749106884,
      "rewards/cosine_scaled_reward": -0.27394353225827217,
      "rewards/format_reward": 0.23611112032085657,
      "step": 131
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2963.8055419921875,
      "epoch": 0.2262210796915167,
      "grad_norm": 0.22750675678253174,
      "kl": 0.016204833984375,
      "learning_rate": 9.282549715730579e-07,
      "loss": 0.0406,
      "reward": 0.32277560234069824,
      "reward_std": 0.8804080411791801,
      "rewards/cosine_scaled_reward": -0.07472331821918488,
      "rewards/format_reward": 0.4722222238779068,
      "step": 132
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3082.263916015625,
      "epoch": 0.22793487574978577,
      "grad_norm": 0.2046993225812912,
      "kl": 0.021820068359375,
      "learning_rate": 9.265439410565328e-07,
      "loss": 0.0353,
      "reward": 0.3385091759264469,
      "reward_std": 0.7099575102329254,
      "rewards/cosine_scaled_reward": -0.011300940066576004,
      "rewards/format_reward": 0.3611111156642437,
      "step": 133
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2206.375030517578,
      "epoch": 0.22964867180805484,
      "grad_norm": 0.19563263654708862,
      "kl": 0.017303466796875,
      "learning_rate": 9.248145583195447e-07,
      "loss": 0.0577,
      "reward": 0.640228021889925,
      "reward_std": 0.7054692879319191,
      "rewards/cosine_scaled_reward": 0.0006695720367133617,
      "rewards/format_reward": 0.6388888955116272,
      "step": 134
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2421.611114501953,
      "epoch": 0.23136246786632392,
      "grad_norm": 0.338701069355011,
      "kl": 0.0213623046875,
      "learning_rate": 9.230669076497687e-07,
      "loss": 0.1507,
      "reward": 0.6078107673674822,
      "reward_std": 0.8746988773345947,
      "rewards/cosine_scaled_reward": 0.0469609391366248,
      "rewards/format_reward": 0.5138888955116272,
      "step": 135
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2651.125030517578,
      "epoch": 0.23307626392459296,
      "grad_norm": 0.28927695751190186,
      "kl": 0.0211334228515625,
      "learning_rate": 9.213010742252327e-07,
      "loss": 0.1053,
      "reward": 0.35874155908823013,
      "reward_std": 0.7097110822796822,
      "rewards/cosine_scaled_reward": -0.015073666349053383,
      "rewards/format_reward": 0.3888888992369175,
      "step": 136
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3202.0972900390625,
      "epoch": 0.23479005998286204,
      "grad_norm": 0.17811518907546997,
      "kl": 0.02239990234375,
      "learning_rate": 9.195171441101668e-07,
      "loss": 0.0492,
      "reward": 0.5012375935912132,
      "reward_std": 0.9828417152166367,
      "rewards/cosine_scaled_reward": 0.03534099366515875,
      "rewards/format_reward": 0.43055555410683155,
      "step": 137
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2946.4166870117188,
      "epoch": 0.2365038560411311,
      "grad_norm": 0.23094090819358826,
      "kl": 0.0191650390625,
      "learning_rate": 9.177152042508077e-07,
      "loss": 0.0193,
      "reward": 0.09741606749594212,
      "reward_std": 0.5724444687366486,
      "rewards/cosine_scaled_reward": -0.13184750825166702,
      "rewards/format_reward": 0.36111111007630825,
      "step": 138
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2927.0972290039062,
      "epoch": 0.23821765209940018,
      "grad_norm": 0.19129879772663116,
      "kl": 0.024505615234375,
      "learning_rate": 9.158953424711624e-07,
      "loss": 0.0374,
      "reward": 0.1535217664204538,
      "reward_std": 0.4049301743507385,
      "rewards/cosine_scaled_reward": -0.08296133577823639,
      "rewards/format_reward": 0.3194444449618459,
      "step": 139
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2202.6944580078125,
      "epoch": 0.23993144815766923,
      "grad_norm": 0.4724877178668976,
      "kl": 0.0255584716796875,
      "learning_rate": 9.140576474687263e-07,
      "loss": 0.1836,
      "reward": 0.3395635038614273,
      "reward_std": 0.6675402373075485,
      "rewards/cosine_scaled_reward": -0.11494047567248344,
      "rewards/format_reward": 0.5694444552063942,
      "step": 140
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2910.916748046875,
      "epoch": 0.2416452442159383,
      "grad_norm": 0.18322300910949707,
      "kl": 0.02935791015625,
      "learning_rate": 9.122022088101613e-07,
      "loss": 0.0365,
      "reward": 0.045268273912370205,
      "reward_std": 0.6290135830640793,
      "rewards/cosine_scaled_reward": -0.1440325528383255,
      "rewards/format_reward": 0.3333333367481828,
      "step": 141
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3141.638916015625,
      "epoch": 0.24335904027420738,
      "grad_norm": 0.1756112426519394,
      "kl": 0.031341552734375,
      "learning_rate": 9.103291169269299e-07,
      "loss": 0.0029,
      "reward": -0.12469126284122467,
      "reward_std": 0.39061762765049934,
      "rewards/cosine_scaled_reward": -0.18734563700854778,
      "rewards/format_reward": 0.2500000009313226,
      "step": 142
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2654.4166259765625,
      "epoch": 0.24507283633247642,
      "grad_norm": 0.29079416394233704,
      "kl": 0.020843505859375,
      "learning_rate": 9.084384631108882e-07,
      "loss": 0.0697,
      "reward": 0.4159288965165615,
      "reward_std": 0.7245111912488937,
      "rewards/cosine_scaled_reward": -0.06981334753800184,
      "rewards/format_reward": 0.555555559694767,
      "step": 143
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3201.9306030273438,
      "epoch": 0.2467866323907455,
      "grad_norm": 0.197592630982399,
      "kl": 0.0269775390625,
      "learning_rate": 9.065303395098358e-07,
      "loss": 0.0373,
      "reward": -0.11726564727723598,
      "reward_std": 0.6086189821362495,
      "rewards/cosine_scaled_reward": -0.21141060069203377,
      "rewards/format_reward": 0.30555556155741215,
      "step": 144
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2885.9722900390625,
      "epoch": 0.24850042844901457,
      "grad_norm": 0.29763004183769226,
      "kl": 0.028472900390625,
      "learning_rate": 9.046048391230247e-07,
      "loss": 0.0677,
      "reward": 0.5742630921304226,
      "reward_std": 0.37366680055856705,
      "rewards/cosine_scaled_reward": 0.0649093296378851,
      "rewards/format_reward": 0.4444444440305233,
      "step": 145
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2155.9444885253906,
      "epoch": 0.25021422450728364,
      "grad_norm": 0.3611903190612793,
      "kl": 0.024993896484375,
      "learning_rate": 9.026620557966279e-07,
      "loss": 0.1546,
      "reward": 0.5257812030613422,
      "reward_std": 0.9518508315086365,
      "rewards/cosine_scaled_reward": -0.049609407782554626,
      "rewards/format_reward": 0.625,
      "step": 146
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2810.9583740234375,
      "epoch": 0.2519280205655527,
      "grad_norm": 0.2670803964138031,
      "kl": 0.031829833984375,
      "learning_rate": 9.007020842191634e-07,
      "loss": 0.0389,
      "reward": 0.11524944752454758,
      "reward_std": 0.6441401988267899,
      "rewards/cosine_scaled_reward": -0.12293083127588034,
      "rewards/format_reward": 0.36111112032085657,
      "step": 147
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3006.0416259765625,
      "epoch": 0.2536418166238218,
      "grad_norm": 0.230261892080307,
      "kl": 0.0283203125,
      "learning_rate": 8.987250199168808e-07,
      "loss": 0.0866,
      "reward": -0.06906389445066452,
      "reward_std": 0.41436275094747543,
      "rewards/cosine_scaled_reward": -0.21508748084306717,
      "rewards/format_reward": 0.3611111156642437,
      "step": 148
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2527.4305572509766,
      "epoch": 0.25535561268209084,
      "grad_norm": 0.2313620001077652,
      "kl": 0.028106689453125,
      "learning_rate": 8.967309592491052e-07,
      "loss": 0.05,
      "reward": 0.3055970072746277,
      "reward_std": 0.8265255615115166,
      "rewards/cosine_scaled_reward": -0.0972014885628596,
      "rewards/format_reward": 0.5000000074505806,
      "step": 149
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2374.0416870117188,
      "epoch": 0.2570694087403599,
      "grad_norm": 0.7321764826774597,
      "kl": 0.028839111328125,
      "learning_rate": 8.9471999940354e-07,
      "loss": 0.1817,
      "reward": 0.8978928253054619,
      "reward_std": 0.7169746980071068,
      "rewards/cosine_scaled_reward": 0.16422418132424355,
      "rewards/format_reward": 0.5694444477558136,
      "step": 150
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2437.0694580078125,
      "epoch": 0.258783204798629,
      "grad_norm": 0.7071412801742554,
      "kl": 0.041168212890625,
      "learning_rate": 8.926922383915315e-07,
      "loss": 0.2136,
      "reward": 0.06301388889551163,
      "reward_std": 0.4757090378552675,
      "rewards/cosine_scaled_reward": -0.21849306486546993,
      "rewards/format_reward": 0.5000000074505806,
      "step": 151
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2845.4583129882812,
      "epoch": 0.26049700085689803,
      "grad_norm": 0.5604143738746643,
      "kl": 0.046142578125,
      "learning_rate": 8.906477750432903e-07,
      "loss": 0.1265,
      "reward": 0.16903822124004364,
      "reward_std": 0.5248951427638531,
      "rewards/cosine_scaled_reward": -0.09603646397590637,
      "rewards/format_reward": 0.36111111380159855,
      "step": 152
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2768.4027709960938,
      "epoch": 0.2622107969151671,
      "grad_norm": 0.23171323537826538,
      "kl": 0.0499267578125,
      "learning_rate": 8.88586709003076e-07,
      "loss": 0.0185,
      "reward": 0.15082042291760445,
      "reward_std": 0.7368991822004318,
      "rewards/cosine_scaled_reward": -0.12597868964076042,
      "rewards/format_reward": 0.4027777798473835,
      "step": 153
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2804.9861450195312,
      "epoch": 0.2639245929734362,
      "grad_norm": 0.40300193428993225,
      "kl": 0.05609130859375,
      "learning_rate": 8.865091407243394e-07,
      "loss": 0.0954,
      "reward": 0.4552767127752304,
      "reward_std": 0.7285914719104767,
      "rewards/cosine_scaled_reward": -0.00847275834530592,
      "rewards/format_reward": 0.472222238779068,
      "step": 154
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3049.0416870117188,
      "epoch": 0.2656383890317052,
      "grad_norm": 0.23651528358459473,
      "kl": 0.0596923828125,
      "learning_rate": 8.844151714648274e-07,
      "loss": -0.0013,
      "reward": 0.12507159425877035,
      "reward_std": 0.8443149924278259,
      "rewards/cosine_scaled_reward": -0.10413086414337158,
      "rewards/format_reward": 0.3333333469927311,
      "step": 155
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2847.4584350585938,
      "epoch": 0.26735218508997427,
      "grad_norm": 0.3277675211429596,
      "kl": 0.05987548828125,
      "learning_rate": 8.823049032816478e-07,
      "loss": 0.032,
      "reward": 0.31620367243885994,
      "reward_std": 0.7322921454906464,
      "rewards/cosine_scaled_reward": -0.10578705929219723,
      "rewards/format_reward": 0.5277777835726738,
      "step": 156
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2717.8750610351562,
      "epoch": 0.26906598114824337,
      "grad_norm": 0.39394786953926086,
      "kl": 0.0775146484375,
      "learning_rate": 8.801784390262943e-07,
      "loss": 0.0925,
      "reward": 0.10540201608091593,
      "reward_std": 0.6488600596785545,
      "rewards/cosine_scaled_reward": -0.13479896634817123,
      "rewards/format_reward": 0.3750000037252903,
      "step": 157
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2818.125,
      "epoch": 0.2707797772065124,
      "grad_norm": 0.40347573161125183,
      "kl": 0.0806884765625,
      "learning_rate": 8.780358823396352e-07,
      "loss": 0.0484,
      "reward": 0.07575460057705641,
      "reward_std": 0.6178670972585678,
      "rewards/cosine_scaled_reward": -0.15656715538352728,
      "rewards/format_reward": 0.3888888889923692,
      "step": 158
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3166.1666259765625,
      "epoch": 0.27249357326478146,
      "grad_norm": 0.3342011868953705,
      "kl": 0.1055908203125,
      "learning_rate": 8.758773376468604e-07,
      "loss": 0.0491,
      "reward": -0.02998074982315302,
      "reward_std": 0.6097311675548553,
      "rewards/cosine_scaled_reward": -0.19554592855274677,
      "rewards/format_reward": 0.36111111380159855,
      "step": 159
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3013.7222290039062,
      "epoch": 0.27420736932305056,
      "grad_norm": 0.4173794388771057,
      "kl": 0.106689453125,
      "learning_rate": 8.737029101523929e-07,
      "loss": 0.0339,
      "reward": 0.3507204055786133,
      "reward_std": 0.6021532118320465,
      "rewards/cosine_scaled_reward": 0.0017490852624177933,
      "rewards/format_reward": 0.34722222201526165,
      "step": 160
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2572.8194274902344,
      "epoch": 0.2759211653813196,
      "grad_norm": 0.4282573163509369,
      "kl": 0.123046875,
      "learning_rate": 8.715127058347614e-07,
      "loss": 0.0645,
      "reward": 0.010059013031423092,
      "reward_std": 0.5160095170140266,
      "rewards/cosine_scaled_reward": -0.15469271643087268,
      "rewards/format_reward": 0.3194444514811039,
      "step": 161
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2242.9444274902344,
      "epoch": 0.2776349614395887,
      "grad_norm": 0.39615368843078613,
      "kl": 0.1090087890625,
      "learning_rate": 8.693068314414344e-07,
      "loss": 0.0772,
      "reward": 0.16390804119873792,
      "reward_std": 0.5712290816009045,
      "rewards/cosine_scaled_reward": -0.18887930922210217,
      "rewards/format_reward": 0.5416666716337204,
      "step": 162
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2832.9861450195312,
      "epoch": 0.27934875749785776,
      "grad_norm": 0.6331592798233032,
      "kl": 0.1380615234375,
      "learning_rate": 8.670853944836176e-07,
      "loss": 0.1022,
      "reward": 0.20613746903836727,
      "reward_std": 0.7383135333657265,
      "rewards/cosine_scaled_reward": -0.04276460176333785,
      "rewards/format_reward": 0.2916666753590107,
      "step": 163
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2359.250030517578,
      "epoch": 0.2810625535561268,
      "grad_norm": 0.8423472046852112,
      "kl": 0.1591796875,
      "learning_rate": 8.648485032310144e-07,
      "loss": 0.0087,
      "reward": 0.1561935730278492,
      "reward_std": 0.8059368506073952,
      "rewards/cosine_scaled_reward": -0.10245877737179399,
      "rewards/format_reward": 0.3611111156642437,
      "step": 164
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2781.041717529297,
      "epoch": 0.2827763496143959,
      "grad_norm": 0.5075474977493286,
      "kl": 0.17626953125,
      "learning_rate": 8.625962667065487e-07,
      "loss": 0.0061,
      "reward": 0.35288394801318645,
      "reward_std": 0.7819623723626137,
      "rewards/cosine_scaled_reward": -0.038835824467241764,
      "rewards/format_reward": 0.4305555634200573,
      "step": 165
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2699.3611450195312,
      "epoch": 0.28449014567266495,
      "grad_norm": 0.41815418004989624,
      "kl": 0.1614990234375,
      "learning_rate": 8.603287946810513e-07,
      "loss": 0.0354,
      "reward": 0.2616021269932389,
      "reward_std": 0.8704780116677284,
      "rewards/cosine_scaled_reward": -0.07753227837383747,
      "rewards/format_reward": 0.416666672565043,
      "step": 166
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2883.5972900390625,
      "epoch": 0.286203941730934,
      "grad_norm": 0.6100507378578186,
      "kl": 0.192626953125,
      "learning_rate": 8.580461976679099e-07,
      "loss": 0.1117,
      "reward": 0.6217167973518372,
      "reward_std": 1.1077049523591995,
      "rewards/cosine_scaled_reward": 0.08863616734743118,
      "rewards/format_reward": 0.4444444477558136,
      "step": 167
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2844.77783203125,
      "epoch": 0.2879177377892031,
      "grad_norm": 1.0341858863830566,
      "kl": 0.193359375,
      "learning_rate": 8.557485869176825e-07,
      "loss": 0.1403,
      "reward": 0.44696745090186596,
      "reward_std": 0.8215643167495728,
      "rewards/cosine_scaled_reward": 0.008205945428926498,
      "rewards/format_reward": 0.4305555671453476,
      "step": 168
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2149.9306030273438,
      "epoch": 0.28963153384747214,
      "grad_norm": 0.8718350529670715,
      "kl": 0.213134765625,
      "learning_rate": 8.534360744126753e-07,
      "loss": 0.0265,
      "reward": 0.24263115064240992,
      "reward_std": 0.7163522839546204,
      "rewards/cosine_scaled_reward": -0.08701775036752224,
      "rewards/format_reward": 0.416666679084301,
      "step": 169
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2499.9722900390625,
      "epoch": 0.2913453299057412,
      "grad_norm": 0.7302869558334351,
      "kl": 0.233154296875,
      "learning_rate": 8.511087728614862e-07,
      "loss": 0.1123,
      "reward": 0.1794309187680483,
      "reward_std": 0.7098504453897476,
      "rewards/cosine_scaled_reward": -0.056117892265319824,
      "rewards/format_reward": 0.2916666679084301,
      "step": 170
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2474.6250610351562,
      "epoch": 0.2930591259640103,
      "grad_norm": 0.5687596797943115,
      "kl": 0.2568359375,
      "learning_rate": 8.487667956935087e-07,
      "loss": 0.0829,
      "reward": 0.3411689009517431,
      "reward_std": 0.6550407111644745,
      "rewards/cosine_scaled_reward": -0.1280266623944044,
      "rewards/format_reward": 0.597222238779068,
      "step": 171
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2846.763916015625,
      "epoch": 0.29477292202227934,
      "grad_norm": 0.48782670497894287,
      "kl": 0.252197265625,
      "learning_rate": 8.464102570534061e-07,
      "loss": 0.0672,
      "reward": 0.22427130304276943,
      "reward_std": 0.6338695511221886,
      "rewards/cosine_scaled_reward": -0.10314211621880531,
      "rewards/format_reward": 0.4305555559694767,
      "step": 172
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2281.2638549804688,
      "epoch": 0.29648671808054844,
      "grad_norm": 1.23881196975708,
      "kl": 0.234375,
      "learning_rate": 8.440392717955475e-07,
      "loss": 0.1363,
      "reward": 0.24636091478168964,
      "reward_std": 0.725439690053463,
      "rewards/cosine_scaled_reward": -0.13376398687250912,
      "rewards/format_reward": 0.5138888899236917,
      "step": 173
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2596.7500610351562,
      "epoch": 0.2982005141388175,
      "grad_norm": 0.9256901741027832,
      "kl": 0.32763671875,
      "learning_rate": 8.416539554784089e-07,
      "loss": 0.0993,
      "reward": 0.03723787656053901,
      "reward_std": 0.669374942779541,
      "rewards/cosine_scaled_reward": -0.18276994861662388,
      "rewards/format_reward": 0.4027777835726738,
      "step": 174
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2802.7916870117188,
      "epoch": 0.29991431019708653,
      "grad_norm": 1.6108390092849731,
      "kl": 0.41748046875,
      "learning_rate": 8.392544243589427e-07,
      "loss": -0.0161,
      "reward": -0.026769233867526054,
      "reward_std": 0.7613073363900185,
      "rewards/cosine_scaled_reward": -0.1939401812851429,
      "rewards/format_reward": 0.3611111231148243,
      "step": 175
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2250.7916564941406,
      "epoch": 0.30162810625535563,
      "grad_norm": 1.3027092218399048,
      "kl": 0.29345703125,
      "learning_rate": 8.368407953869103e-07,
      "loss": 0.1672,
      "reward": 0.34848211891949177,
      "reward_std": 0.8886565566062927,
      "rewards/cosine_scaled_reward": -0.07575894566252828,
      "rewards/format_reward": 0.5000000074505806,
      "step": 176
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3187.25,
      "epoch": 0.3033419023136247,
      "grad_norm": 0.7333221435546875,
      "kl": 0.390625,
      "learning_rate": 8.344131861991828e-07,
      "loss": 0.0057,
      "reward": -0.06705992296338081,
      "reward_std": 0.5766744017601013,
      "rewards/cosine_scaled_reward": -0.16547441016882658,
      "rewards/format_reward": 0.26388889364898205,
      "step": 177
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2658.9583129882812,
      "epoch": 0.3050556983718937,
      "grad_norm": 2.110689878463745,
      "kl": 0.40185546875,
      "learning_rate": 8.319717151140072e-07,
      "loss": 0.1018,
      "reward": 0.15619678050279617,
      "reward_std": 0.5456085540354252,
      "rewards/cosine_scaled_reward": -0.15106826776172966,
      "rewards/format_reward": 0.4583333358168602,
      "step": 178
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3046.361083984375,
      "epoch": 0.3067694944301628,
      "grad_norm": 1.409805417060852,
      "kl": 0.44482421875,
      "learning_rate": 8.295165011252396e-07,
      "loss": 0.1019,
      "reward": -0.08309876918792725,
      "reward_std": 0.6837619245052338,
      "rewards/cosine_scaled_reward": -0.1665493929758668,
      "rewards/format_reward": 0.2500000027939677,
      "step": 179
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2090.263916015625,
      "epoch": 0.30848329048843187,
      "grad_norm": 1.259092926979065,
      "kl": 0.4365234375,
      "learning_rate": 8.270476638965461e-07,
      "loss": 0.1312,
      "reward": 0.5355786010622978,
      "reward_std": 0.9339739978313446,
      "rewards/cosine_scaled_reward": -0.07248848024755716,
      "rewards/format_reward": 0.6805555671453476,
      "step": 180
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2998.388916015625,
      "epoch": 0.3101970865467009,
      "grad_norm": 0.7514684796333313,
      "kl": 0.5283203125,
      "learning_rate": 8.245653237555705e-07,
      "loss": 0.0645,
      "reward": 0.0823521837592125,
      "reward_std": 0.6557292975485325,
      "rewards/cosine_scaled_reward": -0.1602128129452467,
      "rewards/format_reward": 0.4027777835726738,
      "step": 181
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3348.6806030273438,
      "epoch": 0.31191088260497,
      "grad_norm": 1.1279796361923218,
      "kl": 0.6435546875,
      "learning_rate": 8.220696016880687e-07,
      "loss": 0.0509,
      "reward": -0.29229177720844746,
      "reward_std": 0.44720375537872314,
      "rewards/cosine_scaled_reward": -0.27809032425284386,
      "rewards/format_reward": 0.26388889364898205,
      "step": 182
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2909.6806030273438,
      "epoch": 0.31362467866323906,
      "grad_norm": 0.8539410829544067,
      "kl": 0.5654296875,
      "learning_rate": 8.195606193320136e-07,
      "loss": 0.1078,
      "reward": 0.20359659614041448,
      "reward_std": 0.7151020988821983,
      "rewards/cosine_scaled_reward": -0.07875726278871298,
      "rewards/format_reward": 0.361111112870276,
      "step": 183
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2720.6666870117188,
      "epoch": 0.31533847472150817,
      "grad_norm": 1.0726344585418701,
      "kl": 0.6005859375,
      "learning_rate": 8.170384989716657e-07,
      "loss": 0.0571,
      "reward": 0.45398143492639065,
      "reward_std": 0.8964811712503433,
      "rewards/cosine_scaled_reward": -0.04384262952953577,
      "rewards/format_reward": 0.5416666716337204,
      "step": 184
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2828.9305419921875,
      "epoch": 0.3170522707797772,
      "grad_norm": 0.9460340142250061,
      "kl": 0.6220703125,
      "learning_rate": 8.145033635316128e-07,
      "loss": 0.1297,
      "reward": -0.03706150595098734,
      "reward_std": 0.7321052774786949,
      "rewards/cosine_scaled_reward": -0.1990863112732768,
      "rewards/format_reward": 0.3611111231148243,
      "step": 185
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2725.8750610351562,
      "epoch": 0.31876606683804626,
      "grad_norm": 1.0472413301467896,
      "kl": 0.5556640625,
      "learning_rate": 8.119553365707802e-07,
      "loss": 0.0507,
      "reward": -0.008732129819691181,
      "reward_std": 0.43902990967035294,
      "rewards/cosine_scaled_reward": -0.2057549599558115,
      "rewards/format_reward": 0.4027777835726738,
      "step": 186
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2761.0556030273438,
      "epoch": 0.32047986289631536,
      "grad_norm": 0.9237687587738037,
      "kl": 0.55126953125,
      "learning_rate": 8.093945422764069e-07,
      "loss": 0.1122,
      "reward": 0.34623236872721463,
      "reward_std": 0.8785705417394638,
      "rewards/cosine_scaled_reward": -0.07688381336629391,
      "rewards/format_reward": 0.5000000074505806,
      "step": 187
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3188.4027709960938,
      "epoch": 0.3221936589545844,
      "grad_norm": 1.4287723302841187,
      "kl": 0.607421875,
      "learning_rate": 8.068211054579943e-07,
      "loss": 0.0457,
      "reward": -0.10439129918813705,
      "reward_std": 0.6522045210003853,
      "rewards/cosine_scaled_reward": -0.20497343130409718,
      "rewards/format_reward": 0.30555556807667017,
      "step": 188
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2965.75,
      "epoch": 0.32390745501285345,
      "grad_norm": 1.0540153980255127,
      "kl": 0.6044921875,
      "learning_rate": 8.04235151541222e-07,
      "loss": 0.0926,
      "reward": 0.00805249996483326,
      "reward_std": 0.5005255490541458,
      "rewards/cosine_scaled_reward": -0.1904182005673647,
      "rewards/format_reward": 0.38888888992369175,
      "step": 189
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2995.3611450195312,
      "epoch": 0.32562125107112255,
      "grad_norm": 2.005993604660034,
      "kl": 0.60546875,
      "learning_rate": 8.01636806561836e-07,
      "loss": 0.1571,
      "reward": -0.2390465196222067,
      "reward_std": 0.5108147040009499,
      "rewards/cosine_scaled_reward": -0.2792454734444618,
      "rewards/format_reward": 0.3194444449618459,
      "step": 190
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2995.02783203125,
      "epoch": 0.3273350471293916,
      "grad_norm": 0.914374828338623,
      "kl": 0.537109375,
      "learning_rate": 7.990261971595048e-07,
      "loss": 0.0791,
      "reward": -0.008268387988209724,
      "reward_std": 0.7869899272918701,
      "rewards/cosine_scaled_reward": -0.19163418684911449,
      "rewards/format_reward": 0.3750000037252903,
      "step": 191
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2657.3055725097656,
      "epoch": 0.32904884318766064,
      "grad_norm": 0.9198621511459351,
      "kl": 0.6298828125,
      "learning_rate": 7.964034505716476e-07,
      "loss": 0.1016,
      "reward": 0.14560853224247694,
      "reward_std": 0.44526704400777817,
      "rewards/cosine_scaled_reward": -0.177195742726326,
      "rewards/format_reward": 0.5000000074505806,
      "step": 192
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2256.6666564941406,
      "epoch": 0.33076263924592975,
      "grad_norm": 0.9307562708854675,
      "kl": 0.5947265625,
      "learning_rate": 7.93768694627233e-07,
      "loss": 0.082,
      "reward": 0.184324630536139,
      "reward_std": 0.5673187747597694,
      "rewards/cosine_scaled_reward": -0.17867101542651653,
      "rewards/format_reward": 0.5416666679084301,
      "step": 193
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2654.013916015625,
      "epoch": 0.3324764353041988,
      "grad_norm": 1.2104908227920532,
      "kl": 0.6787109375,
      "learning_rate": 7.911220577405484e-07,
      "loss": 0.0927,
      "reward": 0.5049788989126682,
      "reward_std": 0.6255298256874084,
      "rewards/cosine_scaled_reward": 0.0024894457310438156,
      "rewards/format_reward": 0.5000000074505806,
      "step": 194
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3088.2222900390625,
      "epoch": 0.3341902313624679,
      "grad_norm": 2.0733349323272705,
      "kl": 0.787109375,
      "learning_rate": 7.884636689049422e-07,
      "loss": 0.0256,
      "reward": -0.1914132796227932,
      "reward_std": 0.39547703973948956,
      "rewards/cosine_scaled_reward": -0.23459553346037865,
      "rewards/format_reward": 0.2777777761220932,
      "step": 195
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2572.5416259765625,
      "epoch": 0.33590402742073694,
      "grad_norm": 0.9238296151161194,
      "kl": 0.6484375,
      "learning_rate": 7.857936576865356e-07,
      "loss": 0.0799,
      "reward": 0.4742476176470518,
      "reward_std": 0.8941326662898064,
      "rewards/cosine_scaled_reward": -0.04759840480983257,
      "rewards/format_reward": 0.5694444477558136,
      "step": 196
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2907.4583129882812,
      "epoch": 0.337617823479006,
      "grad_norm": 0.9024485945701599,
      "kl": 0.693359375,
      "learning_rate": 7.831121542179086e-07,
      "loss": 0.0713,
      "reward": 0.0948091521859169,
      "reward_std": 0.4578506797552109,
      "rewards/cosine_scaled_reward": -0.11926210392266512,
      "rewards/format_reward": 0.33333334513008595,
      "step": 197
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2943.4305419921875,
      "epoch": 0.3393316195372751,
      "grad_norm": 1.3114806413650513,
      "kl": 0.7470703125,
      "learning_rate": 7.804192891917571e-07,
      "loss": 0.0493,
      "reward": 0.04198681065463461,
      "reward_std": 0.5121570453047752,
      "rewards/cosine_scaled_reward": -0.1595621556043625,
      "rewards/format_reward": 0.3611111156642437,
      "step": 198
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3010.9166870117188,
      "epoch": 0.34104541559554413,
      "grad_norm": 0.6777936816215515,
      "kl": 0.697265625,
      "learning_rate": 7.777151938545235e-07,
      "loss": 0.0892,
      "reward": 0.12530913203954697,
      "reward_std": 0.5297227501869202,
      "rewards/cosine_scaled_reward": -0.145678770262748,
      "rewards/format_reward": 0.4166666716337204,
      "step": 199
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2658.4722595214844,
      "epoch": 0.3427592116538132,
      "grad_norm": 1.0869694948196411,
      "kl": 0.5361328125,
      "learning_rate": 7.75e-07,
      "loss": 0.0285,
      "reward": 0.42103337205480784,
      "reward_std": 0.5303617715835571,
      "rewards/cosine_scaled_reward": -0.04642775317188352,
      "rewards/format_reward": 0.5138888992369175,
      "step": 200
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2742.0833435058594,
      "epoch": 0.3444730077120823,
      "grad_norm": 0.6390620470046997,
      "kl": 0.56005859375,
      "learning_rate": 7.72273839962904e-07,
      "loss": 0.078,
      "reward": 0.13805552199482918,
      "reward_std": 0.5941917151212692,
      "rewards/cosine_scaled_reward": -0.18097224179655313,
      "rewards/format_reward": 0.5000000037252903,
      "step": 201
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2622.611083984375,
      "epoch": 0.3461868037703513,
      "grad_norm": 1.5139989852905273,
      "kl": 0.51416015625,
      "learning_rate": 7.695368466124296e-07,
      "loss": 0.1322,
      "reward": 0.2749571923632175,
      "reward_std": 0.6380000561475754,
      "rewards/cosine_scaled_reward": -0.09863251959905028,
      "rewards/format_reward": 0.4722222313284874,
      "step": 202
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2710.7361450195312,
      "epoch": 0.34790059982862037,
      "grad_norm": 1.517341136932373,
      "kl": 0.51171875,
      "learning_rate": 7.667891533457718e-07,
      "loss": 0.1124,
      "reward": 0.5119861587882042,
      "reward_std": 0.9760274440050125,
      "rewards/cosine_scaled_reward": -0.014840253628790379,
      "rewards/format_reward": 0.541666679084301,
      "step": 203
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2822.6112060546875,
      "epoch": 0.3496143958868895,
      "grad_norm": 1.1272459030151367,
      "kl": 0.548828125,
      "learning_rate": 7.640308940816239e-07,
      "loss": 0.0668,
      "reward": 0.03917721984907985,
      "reward_std": 0.7430369108915329,
      "rewards/cosine_scaled_reward": -0.17485582828521729,
      "rewards/format_reward": 0.38888888992369175,
      "step": 204
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2645.5694580078125,
      "epoch": 0.3513281919451585,
      "grad_norm": 1.7582755088806152,
      "kl": 0.5556640625,
      "learning_rate": 7.612622032536507e-07,
      "loss": 0.1055,
      "reward": 0.509862631559372,
      "reward_std": 0.7304475903511047,
      "rewards/cosine_scaled_reward": 0.004931296221911907,
      "rewards/format_reward": 0.5000000111758709,
      "step": 205
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2748.02783203125,
      "epoch": 0.35304198800342756,
      "grad_norm": 13.779873847961426,
      "kl": 1.0166015625,
      "learning_rate": 7.584832158039378e-07,
      "loss": 0.0928,
      "reward": 0.13606557785533369,
      "reward_std": 0.5326481983065605,
      "rewards/cosine_scaled_reward": -0.2236338797956705,
      "rewards/format_reward": 0.5833333507180214,
      "step": 206
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2727.9722900390625,
      "epoch": 0.35475578406169667,
      "grad_norm": 4.678215503692627,
      "kl": 0.8974609375,
      "learning_rate": 7.556940671764124e-07,
      "loss": 0.1124,
      "reward": 0.3662101551890373,
      "reward_std": 0.5158084109425545,
      "rewards/cosine_scaled_reward": -0.05995047930628061,
      "rewards/format_reward": 0.4861111268401146,
      "step": 207
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2660.9722900390625,
      "epoch": 0.3564695801199657,
      "grad_norm": 2.2143702507019043,
      "kl": 0.7880859375,
      "learning_rate": 7.528948933102438e-07,
      "loss": 0.067,
      "reward": 0.29765829257667065,
      "reward_std": 0.7447296231985092,
      "rewards/cosine_scaled_reward": -0.1428375095129013,
      "rewards/format_reward": 0.5833333432674408,
      "step": 208
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2533.4444580078125,
      "epoch": 0.3581833761782348,
      "grad_norm": 1.057923436164856,
      "kl": 0.626953125,
      "learning_rate": 7.500858306332172e-07,
      "loss": 0.1388,
      "reward": 0.22743514459580183,
      "reward_std": 0.8155356049537659,
      "rewards/cosine_scaled_reward": -0.15017131343483925,
      "rewards/format_reward": 0.5277777835726738,
      "step": 209
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2513.763916015625,
      "epoch": 0.35989717223650386,
      "grad_norm": 3.4706244468688965,
      "kl": 0.6640625,
      "learning_rate": 7.472670160550848e-07,
      "loss": 0.2307,
      "reward": 0.32537855207920074,
      "reward_std": 0.6403735391795635,
      "rewards/cosine_scaled_reward": -0.11508850922109559,
      "rewards/format_reward": 0.555555559694767,
      "step": 210
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3072.1806030273438,
      "epoch": 0.3616109682947729,
      "grad_norm": 0.867877721786499,
      "kl": 0.85009765625,
      "learning_rate": 7.444385869608921e-07,
      "loss": 0.1175,
      "reward": 0.021036310121417046,
      "reward_std": 0.5472413003444672,
      "rewards/cosine_scaled_reward": -0.170037392526865,
      "rewards/format_reward": 0.36111112125217915,
      "step": 211
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2580.791717529297,
      "epoch": 0.363324764353042,
      "grad_norm": 1.1602129936218262,
      "kl": 0.8037109375,
      "learning_rate": 7.416006812042827e-07,
      "loss": 0.1343,
      "reward": 0.6223988421261311,
      "reward_std": 0.851245753467083,
      "rewards/cosine_scaled_reward": -0.0013005826622247696,
      "rewards/format_reward": 0.625,
      "step": 212
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2920.2916870117188,
      "epoch": 0.36503856041131105,
      "grad_norm": 1.2226418256759644,
      "kl": 1.0029296875,
      "learning_rate": 7.387534371007797e-07,
      "loss": 0.1063,
      "reward": 0.10683083906769753,
      "reward_std": 0.6580070406198502,
      "rewards/cosine_scaled_reward": -0.21741791814565659,
      "rewards/format_reward": 0.5416666641831398,
      "step": 213
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2951.166748046875,
      "epoch": 0.3667523564695801,
      "grad_norm": 1.302587628364563,
      "kl": 1.1484375,
      "learning_rate": 7.358969934210438e-07,
      "loss": 0.1102,
      "reward": 0.16499032359570265,
      "reward_std": 0.5117045789957047,
      "rewards/cosine_scaled_reward": -0.13278261446976103,
      "rewards/format_reward": 0.4305555671453476,
      "step": 214
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2642.3056030273438,
      "epoch": 0.3684661525278492,
      "grad_norm": 1.028397560119629,
      "kl": 0.8671875,
      "learning_rate": 7.330314893841101e-07,
      "loss": 0.1035,
      "reward": 0.3645508070476353,
      "reward_std": 0.9228581190109253,
      "rewards/cosine_scaled_reward": -0.10939126997254789,
      "rewards/format_reward": 0.5833333283662796,
      "step": 215
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2541.0833129882812,
      "epoch": 0.37017994858611825,
      "grad_norm": 1.578083872795105,
      "kl": 0.86865234375,
      "learning_rate": 7.301570646506027e-07,
      "loss": 0.1489,
      "reward": 0.26378826051950455,
      "reward_std": 0.6202561929821968,
      "rewards/cosine_scaled_reward": -0.13199475780129433,
      "rewards/format_reward": 0.5277777910232544,
      "step": 216
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2184.9445190429688,
      "epoch": 0.3718937446443873,
      "grad_norm": 1.103194236755371,
      "kl": 0.6845703125,
      "learning_rate": 7.27273859315928e-07,
      "loss": 0.0844,
      "reward": 0.4086096244864166,
      "reward_std": 0.7625616788864136,
      "rewards/cosine_scaled_reward": -0.11513962969183922,
      "rewards/format_reward": 0.6388889029622078,
      "step": 217
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2404.6805725097656,
      "epoch": 0.3736075407026564,
      "grad_norm": 2.3009181022644043,
      "kl": 0.8935546875,
      "learning_rate": 7.243820139034464e-07,
      "loss": 0.1892,
      "reward": 0.4591095373034477,
      "reward_std": 0.5642153918743134,
      "rewards/cosine_scaled_reward": -0.08294522017240524,
      "rewards/format_reward": 0.625,
      "step": 218
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3044.9862060546875,
      "epoch": 0.37532133676092544,
      "grad_norm": 1.2761178016662598,
      "kl": 1.19921875,
      "learning_rate": 7.214816693576234e-07,
      "loss": 0.1195,
      "reward": 0.21450293064117432,
      "reward_std": 0.7603526711463928,
      "rewards/cosine_scaled_reward": -0.12191520072519779,
      "rewards/format_reward": 0.4583333358168602,
      "step": 219
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2793.90283203125,
      "epoch": 0.37703513281919454,
      "grad_norm": 1.6576476097106934,
      "kl": 1.166015625,
      "learning_rate": 7.185729670371604e-07,
      "loss": 0.0977,
      "reward": 0.22772593423724174,
      "reward_std": 0.5124068222939968,
      "rewards/cosine_scaled_reward": -0.18474812898784876,
      "rewards/format_reward": 0.5972222313284874,
      "step": 220
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2835.0,
      "epoch": 0.3787489288774636,
      "grad_norm": 3.882580280303955,
      "kl": 1.0927734375,
      "learning_rate": 7.156560487081051e-07,
      "loss": 0.0245,
      "reward": 0.10485807061195374,
      "reward_std": 0.5114092901349068,
      "rewards/cosine_scaled_reward": -0.20451541244983673,
      "rewards/format_reward": 0.5138888955116272,
      "step": 221
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2310.9305725097656,
      "epoch": 0.38046272493573263,
      "grad_norm": 1.3096808195114136,
      "kl": 0.87890625,
      "learning_rate": 7.127310565369415e-07,
      "loss": 0.1143,
      "reward": 0.4324228148907423,
      "reward_std": 0.5727507174015045,
      "rewards/cosine_scaled_reward": -0.13101080805063248,
      "rewards/format_reward": 0.6944444477558136,
      "step": 222
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2826.2222290039062,
      "epoch": 0.38217652099400173,
      "grad_norm": 1.182824730873108,
      "kl": 0.931640625,
      "learning_rate": 7.097981330836616e-07,
      "loss": 0.1159,
      "reward": 0.17814365401864052,
      "reward_std": 0.5051928982138634,
      "rewards/cosine_scaled_reward": -0.14703928492963314,
      "rewards/format_reward": 0.4722222238779068,
      "step": 223
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2644.8611450195312,
      "epoch": 0.3838903170522708,
      "grad_norm": 1.271640658378601,
      "kl": 0.7939453125,
      "learning_rate": 7.068574212948169e-07,
      "loss": 0.0771,
      "reward": 0.21772570302709937,
      "reward_std": 0.5406957715749741,
      "rewards/cosine_scaled_reward": -0.17585936933755875,
      "rewards/format_reward": 0.5694444477558136,
      "step": 224
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2389.749969482422,
      "epoch": 0.3856041131105398,
      "grad_norm": 2.586735486984253,
      "kl": 0.6689453125,
      "learning_rate": 7.039090644965509e-07,
      "loss": -0.0027,
      "reward": 0.5408617407083511,
      "reward_std": 0.7554269433021545,
      "rewards/cosine_scaled_reward": -0.03512469958513975,
      "rewards/format_reward": 0.6111111119389534,
      "step": 225
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2435.250030517578,
      "epoch": 0.3873179091688089,
      "grad_norm": 1.4329172372817993,
      "kl": 0.59521484375,
      "learning_rate": 7.009532063876148e-07,
      "loss": 0.0425,
      "reward": 0.21151528507471085,
      "reward_std": 0.6967541426420212,
      "rewards/cosine_scaled_reward": -0.14424235187470913,
      "rewards/format_reward": 0.5000000149011612,
      "step": 226
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2509.1945190429688,
      "epoch": 0.389031705227078,
      "grad_norm": 1.7964338064193726,
      "kl": 0.51220703125,
      "learning_rate": 6.979899910323624e-07,
      "loss": 0.0631,
      "reward": 0.3222038522362709,
      "reward_std": 0.7920150905847549,
      "rewards/cosine_scaled_reward": -0.0958425235003233,
      "rewards/format_reward": 0.5138888955116272,
      "step": 227
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2343.888885498047,
      "epoch": 0.390745501285347,
      "grad_norm": 1.312915563583374,
      "kl": 0.48828125,
      "learning_rate": 6.950195628537299e-07,
      "loss": 0.0327,
      "reward": 0.72439269348979,
      "reward_std": 0.6716032773256302,
      "rewards/cosine_scaled_reward": 0.028863003477454185,
      "rewards/format_reward": 0.6666666865348816,
      "step": 228
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2668.8333740234375,
      "epoch": 0.3924592973436161,
      "grad_norm": 1.1794544458389282,
      "kl": 0.591796875,
      "learning_rate": 6.920420666261961e-07,
      "loss": 0.0617,
      "reward": 0.4572554435580969,
      "reward_std": 0.6365808099508286,
      "rewards/cosine_scaled_reward": -0.028316727373749018,
      "rewards/format_reward": 0.5138888955116272,
      "step": 229
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2405.111114501953,
      "epoch": 0.39417309340188517,
      "grad_norm": 2.7993645668029785,
      "kl": 0.52197265625,
      "learning_rate": 6.890576474687263e-07,
      "loss": 0.193,
      "reward": 0.6450787968933582,
      "reward_std": 0.6886177062988281,
      "rewards/cosine_scaled_reward": -0.017738381633535028,
      "rewards/format_reward": 0.6805555671453476,
      "step": 230
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2889.763916015625,
      "epoch": 0.39588688946015427,
      "grad_norm": 0.9008044600486755,
      "kl": 0.59228515625,
      "learning_rate": 6.860664508377001e-07,
      "loss": 0.0832,
      "reward": 0.17632517218589783,
      "reward_std": 0.7136962860822678,
      "rewards/cosine_scaled_reward": -0.11322630103677511,
      "rewards/format_reward": 0.4027777835726738,
      "step": 231
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2096.763885498047,
      "epoch": 0.3976006855184233,
      "grad_norm": 2.9937281608581543,
      "kl": 0.44482421875,
      "learning_rate": 6.83068622519821e-07,
      "loss": 0.0198,
      "reward": 0.45767842745408416,
      "reward_std": 0.6805157586932182,
      "rewards/cosine_scaled_reward": -0.0697719173040241,
      "rewards/format_reward": 0.5972222313284874,
      "step": 232
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2095.625030517578,
      "epoch": 0.39931448157669236,
      "grad_norm": 3.1695449352264404,
      "kl": 0.52392578125,
      "learning_rate": 6.800643086250121e-07,
      "loss": 0.124,
      "reward": 0.8969383761286736,
      "reward_std": 0.8693148195743561,
      "rewards/cosine_scaled_reward": 0.10124696930870414,
      "rewards/format_reward": 0.6944444477558136,
      "step": 233
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2397.9722290039062,
      "epoch": 0.40102827763496146,
      "grad_norm": 2.038714647293091,
      "kl": 0.60302734375,
      "learning_rate": 6.770536555792944e-07,
      "loss": 0.0803,
      "reward": 0.3801136128604412,
      "reward_std": 0.6368846967816353,
      "rewards/cosine_scaled_reward": -0.11549876257777214,
      "rewards/format_reward": 0.6111111044883728,
      "step": 234
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2803.1805419921875,
      "epoch": 0.4027420736932305,
      "grad_norm": 1.1210250854492188,
      "kl": 0.76171875,
      "learning_rate": 6.740368101176495e-07,
      "loss": 0.0749,
      "reward": 0.051421504467725754,
      "reward_std": 0.46992237120866776,
      "rewards/cosine_scaled_reward": -0.21734481677412987,
      "rewards/format_reward": 0.4861111231148243,
      "step": 235
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2735.9583129882812,
      "epoch": 0.40445586975149955,
      "grad_norm": 1.5609543323516846,
      "kl": 0.6220703125,
      "learning_rate": 6.710139192768694e-07,
      "loss": 0.1051,
      "reward": 0.33594064973294735,
      "reward_std": 0.5969594717025757,
      "rewards/cosine_scaled_reward": -0.1306407954543829,
      "rewards/format_reward": 0.597222238779068,
      "step": 236
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2207.9444580078125,
      "epoch": 0.40616966580976865,
      "grad_norm": 3.293438673019409,
      "kl": 0.619140625,
      "learning_rate": 6.679851303883891e-07,
      "loss": 0.1014,
      "reward": 0.6933649554848671,
      "reward_std": 0.4978405013680458,
      "rewards/cosine_scaled_reward": -0.02831752598285675,
      "rewards/format_reward": 0.75,
      "step": 237
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2911.0972900390625,
      "epoch": 0.4078834618680377,
      "grad_norm": 1.396133303642273,
      "kl": 0.689453125,
      "learning_rate": 6.649505910711058e-07,
      "loss": 0.1308,
      "reward": 0.23781822435557842,
      "reward_std": 0.5772198215126991,
      "rewards/cosine_scaled_reward": -0.1449797886889428,
      "rewards/format_reward": 0.5277777835726738,
      "step": 238
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2865.1944580078125,
      "epoch": 0.40959725792630675,
      "grad_norm": 1.02251398563385,
      "kl": 0.775390625,
      "learning_rate": 6.619104492241847e-07,
      "loss": 0.1421,
      "reward": 0.38452258985489607,
      "reward_std": 0.7435072809457779,
      "rewards/cosine_scaled_reward": -0.0646831514313817,
      "rewards/format_reward": 0.5138888880610466,
      "step": 239
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2345.4444274902344,
      "epoch": 0.41131105398457585,
      "grad_norm": 4.698256492614746,
      "kl": 0.6904296875,
      "learning_rate": 6.588648530198504e-07,
      "loss": 0.1594,
      "reward": 0.7729744166135788,
      "reward_std": 0.8151284381747246,
      "rewards/cosine_scaled_reward": 0.10176499933004379,
      "rewards/format_reward": 0.5694444477558136,
      "step": 240
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2070.4166259765625,
      "epoch": 0.4130248500428449,
      "grad_norm": 8.216842651367188,
      "kl": 0.52490234375,
      "learning_rate": 6.558139508961654e-07,
      "loss": 0.1321,
      "reward": 0.48884235695004463,
      "reward_std": 0.5597383752465248,
      "rewards/cosine_scaled_reward": -0.06113438308238983,
      "rewards/format_reward": 0.6111111119389534,
      "step": 241
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2937.0972290039062,
      "epoch": 0.414738646101114,
      "grad_norm": 1.0033913850784302,
      "kl": 0.5947265625,
      "learning_rate": 6.527578915497951e-07,
      "loss": 0.1027,
      "reward": 0.3329106804449111,
      "reward_std": 0.626296728849411,
      "rewards/cosine_scaled_reward": -0.09048910066485405,
      "rewards/format_reward": 0.5138888955116272,
      "step": 242
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2632.3055419921875,
      "epoch": 0.41645244215938304,
      "grad_norm": 3.2918546199798584,
      "kl": 0.62109375,
      "learning_rate": 6.496968239287603e-07,
      "loss": 0.151,
      "reward": 0.423097662627697,
      "reward_std": 0.7703854739665985,
      "rewards/cosine_scaled_reward": -0.05234006140381098,
      "rewards/format_reward": 0.527777798473835,
      "step": 243
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2488.8055725097656,
      "epoch": 0.4181662382176521,
      "grad_norm": 1.382688283920288,
      "kl": 0.51220703125,
      "learning_rate": 6.466308972251785e-07,
      "loss": 0.1239,
      "reward": 0.5373616181313992,
      "reward_std": 0.642534889280796,
      "rewards/cosine_scaled_reward": -0.036874750861898065,
      "rewards/format_reward": 0.6111111119389534,
      "step": 244
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1840.0555877685547,
      "epoch": 0.4198800342759212,
      "grad_norm": 5.2921977043151855,
      "kl": 0.435546875,
      "learning_rate": 6.435602608679916e-07,
      "loss": 0.1668,
      "reward": 1.0207914784550667,
      "reward_std": 0.6237036064267159,
      "rewards/cosine_scaled_reward": 0.08678461611270905,
      "rewards/format_reward": 0.8472222536802292,
      "step": 245
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2378.77783203125,
      "epoch": 0.42159383033419023,
      "grad_norm": 4.525283336639404,
      "kl": 0.7333984375,
      "learning_rate": 6.404850645156841e-07,
      "loss": 0.2341,
      "reward": 0.8125267028808594,
      "reward_std": 0.7737091481685638,
      "rewards/cosine_scaled_reward": 0.05209667468443513,
      "rewards/format_reward": 0.7083333432674408,
      "step": 246
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2746.7916870117188,
      "epoch": 0.4233076263924593,
      "grad_norm": 1.215826392173767,
      "kl": 0.7509765625,
      "learning_rate": 6.374054580489873e-07,
      "loss": 0.124,
      "reward": 0.16153091937303543,
      "reward_std": 0.7042593955993652,
      "rewards/cosine_scaled_reward": -0.12756787613034248,
      "rewards/format_reward": 0.4166666716337204,
      "step": 247
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2420.125,
      "epoch": 0.4250214224507284,
      "grad_norm": 2.161705732345581,
      "kl": 0.86328125,
      "learning_rate": 6.343215915635761e-07,
      "loss": 0.0674,
      "reward": 0.6162599250674248,
      "reward_std": 0.7196609973907471,
      "rewards/cosine_scaled_reward": -0.05298116838093847,
      "rewards/format_reward": 0.7222222238779068,
      "step": 248
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2277.500030517578,
      "epoch": 0.4267352185089974,
      "grad_norm": 3.1015782356262207,
      "kl": 0.732421875,
      "learning_rate": 6.31233615362752e-07,
      "loss": 0.0179,
      "reward": 0.6064739339053631,
      "reward_std": 0.6056996583938599,
      "rewards/cosine_scaled_reward": -0.03704079985618591,
      "rewards/format_reward": 0.6805555671453476,
      "step": 249
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2416.9583740234375,
      "epoch": 0.4284490145672665,
      "grad_norm": 8.199381828308105,
      "kl": 0.904296875,
      "learning_rate": 6.281416799501187e-07,
      "loss": 0.0707,
      "reward": 0.5718964412808418,
      "reward_std": 0.7699461728334427,
      "rewards/cosine_scaled_reward": -0.06127400905825198,
      "rewards/format_reward": 0.6944444477558136,
      "step": 250
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2411.2916259765625,
      "epoch": 0.4301628106255356,
      "grad_norm": 7.757229328155518,
      "kl": 0.7177734375,
      "learning_rate": 6.25045936022246e-07,
      "loss": 0.0104,
      "reward": 0.6452328599989414,
      "reward_std": 0.8850838840007782,
      "rewards/cosine_scaled_reward": -0.010716899763792753,
      "rewards/format_reward": 0.6666666716337204,
      "step": 251
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2250.1805725097656,
      "epoch": 0.4318766066838046,
      "grad_norm": 50.71971893310547,
      "kl": 1.673828125,
      "learning_rate": 6.219465344613258e-07,
      "loss": 0.2537,
      "reward": 0.41996366158127785,
      "reward_std": 0.630496121942997,
      "rewards/cosine_scaled_reward": -0.16501817479729652,
      "rewards/format_reward": 0.7500000149011612,
      "step": 252
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2389.4861450195312,
      "epoch": 0.43359040274207367,
      "grad_norm": 76.95365142822266,
      "kl": 1.5126953125,
      "learning_rate": 6.188436263278172e-07,
      "loss": 0.1964,
      "reward": 0.5589244738221169,
      "reward_std": 0.8758179396390915,
      "rewards/cosine_scaled_reward": -0.019148872102960013,
      "rewards/format_reward": 0.597222238779068,
      "step": 253
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2433.6944580078125,
      "epoch": 0.43530419880034277,
      "grad_norm": 89.30572509765625,
      "kl": 1.7294921875,
      "learning_rate": 6.157373628530852e-07,
      "loss": 0.1411,
      "reward": 0.3653869954869151,
      "reward_std": 0.6425384879112244,
      "rewards/cosine_scaled_reward": -0.11591762490570545,
      "rewards/format_reward": 0.5972222313284874,
      "step": 254
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2666.02783203125,
      "epoch": 0.4370179948586118,
      "grad_norm": 9.923705101013184,
      "kl": 0.7724609375,
      "learning_rate": 6.126278954320294e-07,
      "loss": 0.1243,
      "reward": 0.3781815767288208,
      "reward_std": 0.6919418126344681,
      "rewards/cosine_scaled_reward": -0.06785366125404835,
      "rewards/format_reward": 0.5138888880610466,
      "step": 255
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2432.8056030273438,
      "epoch": 0.4387317909168809,
      "grad_norm": 9.747496604919434,
      "kl": 0.7314453125,
      "learning_rate": 6.095153756157051e-07,
      "loss": 0.083,
      "reward": 0.2219482958316803,
      "reward_std": 0.47816336899995804,
      "rewards/cosine_scaled_reward": -0.20152585953474045,
      "rewards/format_reward": 0.6250000074505806,
      "step": 256
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2205.027801513672,
      "epoch": 0.44044558697514996,
      "grad_norm": 3.7311699390411377,
      "kl": 0.53857421875,
      "learning_rate": 6.06399955103937e-07,
      "loss": 0.1269,
      "reward": 0.5375950075685978,
      "reward_std": 0.6251346915960312,
      "rewards/cosine_scaled_reward": -0.0923136118799448,
      "rewards/format_reward": 0.7222222238779068,
      "step": 257
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2471.4584045410156,
      "epoch": 0.442159383033419,
      "grad_norm": 2.448763608932495,
      "kl": 0.91796875,
      "learning_rate": 6.032817857379256e-07,
      "loss": 0.1343,
      "reward": 0.33694631792604923,
      "reward_std": 0.4810459837317467,
      "rewards/cosine_scaled_reward": -0.1440268289297819,
      "rewards/format_reward": 0.6250000149011612,
      "step": 258
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2130.763885498047,
      "epoch": 0.4438731790916881,
      "grad_norm": 1.8066775798797607,
      "kl": 0.68505859375,
      "learning_rate": 6.001610194928464e-07,
      "loss": 0.1045,
      "reward": 0.7608818560838699,
      "reward_std": 0.697891928255558,
      "rewards/cosine_scaled_reward": -0.008447982007055543,
      "rewards/format_reward": 0.7777777910232544,
      "step": 259
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2359.7638549804688,
      "epoch": 0.44558697514995715,
      "grad_norm": 1.9958624839782715,
      "kl": 1.009765625,
      "learning_rate": 5.97037808470444e-07,
      "loss": 0.1355,
      "reward": 0.7400075197219849,
      "reward_std": 0.5138791352510452,
      "rewards/cosine_scaled_reward": 0.00889264652505517,
      "rewards/format_reward": 0.7222222313284874,
      "step": 260
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2002.7361145019531,
      "epoch": 0.4473007712082262,
      "grad_norm": 5.525115966796875,
      "kl": 0.7490234375,
      "learning_rate": 5.939123048916173e-07,
      "loss": 0.1647,
      "reward": 0.7085682898759842,
      "reward_std": 0.6736738979816437,
      "rewards/cosine_scaled_reward": -0.041549197398126125,
      "rewards/format_reward": 0.7916666716337204,
      "step": 261
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2599.9028930664062,
      "epoch": 0.4490145672664953,
      "grad_norm": 4.714324474334717,
      "kl": 0.9013671875,
      "learning_rate": 5.907846610890011e-07,
      "loss": 0.1464,
      "reward": 0.7315462306141853,
      "reward_std": 0.8794215172529221,
      "rewards/cosine_scaled_reward": 0.05327310296706855,
      "rewards/format_reward": 0.6250000149011612,
      "step": 262
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2593.236083984375,
      "epoch": 0.45072836332476435,
      "grad_norm": 1.4935065507888794,
      "kl": 0.8330078125,
      "learning_rate": 5.87655029499542e-07,
      "loss": 0.1333,
      "reward": 0.26326372660696507,
      "reward_std": 0.3958895206451416,
      "rewards/cosine_scaled_reward": -0.11836813762784004,
      "rewards/format_reward": 0.5000000074505806,
      "step": 263
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2746.1111450195312,
      "epoch": 0.4524421593830334,
      "grad_norm": 2.71354341506958,
      "kl": 0.833984375,
      "learning_rate": 5.845235626570683e-07,
      "loss": 0.0506,
      "reward": 0.34650287590920925,
      "reward_std": 0.6603603884577751,
      "rewards/cosine_scaled_reward": -0.11841523088514805,
      "rewards/format_reward": 0.5833333283662796,
      "step": 264
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2810.6111450195312,
      "epoch": 0.4541559554413025,
      "grad_norm": 3.0615949630737305,
      "kl": 0.8046875,
      "learning_rate": 5.813904131848564e-07,
      "loss": 0.0807,
      "reward": 0.6215685978531837,
      "reward_std": 0.6345800720155239,
      "rewards/cosine_scaled_reward": 0.06078430451452732,
      "rewards/format_reward": 0.5000000018626451,
      "step": 265
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2483.513916015625,
      "epoch": 0.45586975149957154,
      "grad_norm": 2.4526662826538086,
      "kl": 0.689453125,
      "learning_rate": 5.78255733788191e-07,
      "loss": 0.1832,
      "reward": 0.4189574085175991,
      "reward_std": 0.4973677098751068,
      "rewards/cosine_scaled_reward": -0.0544101782143116,
      "rewards/format_reward": 0.5277777835726738,
      "step": 266
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2401.1805725097656,
      "epoch": 0.45758354755784064,
      "grad_norm": 1.3402016162872314,
      "kl": 0.7294921875,
      "learning_rate": 5.751196772469237e-07,
      "loss": 0.1157,
      "reward": 0.36370813054963946,
      "reward_std": 0.4258965626358986,
      "rewards/cosine_scaled_reward": -0.14453481137752533,
      "rewards/format_reward": 0.6527777910232544,
      "step": 267
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2708.3194580078125,
      "epoch": 0.4592973436161097,
      "grad_norm": 3.217470407485962,
      "kl": 0.599609375,
      "learning_rate": 5.71982396408026e-07,
      "loss": 0.0051,
      "reward": 0.26998334005475044,
      "reward_std": 0.49185192957520485,
      "rewards/cosine_scaled_reward": -0.15667499974370003,
      "rewards/format_reward": 0.5833333432674408,
      "step": 268
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2829.4583740234375,
      "epoch": 0.46101113967437873,
      "grad_norm": 1.457294225692749,
      "kl": 0.591796875,
      "learning_rate": 5.688440441781398e-07,
      "loss": 0.1149,
      "reward": 0.2393805852625519,
      "reward_std": 0.7379022389650345,
      "rewards/cosine_scaled_reward": -0.11642082477919757,
      "rewards/format_reward": 0.4722222313284874,
      "step": 269
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2878.2916259765625,
      "epoch": 0.46272493573264784,
      "grad_norm": 1.101474642753601,
      "kl": 0.69921875,
      "learning_rate": 5.657047735161255e-07,
      "loss": 0.0498,
      "reward": 0.37158428877592087,
      "reward_std": 0.5783374309539795,
      "rewards/cosine_scaled_reward": -0.029485642910003662,
      "rewards/format_reward": 0.430555559694767,
      "step": 270
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2793.6806030273438,
      "epoch": 0.4644387317909169,
      "grad_norm": 0.8390009999275208,
      "kl": 0.63916015625,
      "learning_rate": 5.625647374256061e-07,
      "loss": 0.0834,
      "reward": 0.38940694369375706,
      "reward_std": 0.6547586917877197,
      "rewards/cosine_scaled_reward": -0.06918542925268412,
      "rewards/format_reward": 0.5277777723968029,
      "step": 271
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2366.638885498047,
      "epoch": 0.4661525278491859,
      "grad_norm": 4.141148090362549,
      "kl": 0.4765625,
      "learning_rate": 5.594240889475106e-07,
      "loss": 0.2224,
      "reward": 0.4617117829620838,
      "reward_std": 0.6572139859199524,
      "rewards/cosine_scaled_reward": -0.060810765251517296,
      "rewards/format_reward": 0.5833333358168602,
      "step": 272
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2721.6805419921875,
      "epoch": 0.46786632390745503,
      "grad_norm": 1.906948208808899,
      "kl": 0.599609375,
      "learning_rate": 5.562829811526154e-07,
      "loss": 0.1188,
      "reward": 0.26052477210760117,
      "reward_std": 0.6570783406496048,
      "rewards/cosine_scaled_reward": -0.06418205983936787,
      "rewards/format_reward": 0.3888888992369175,
      "step": 273
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2921.3750610351562,
      "epoch": 0.4695801199657241,
      "grad_norm": 1.8937734365463257,
      "kl": 0.52685546875,
      "learning_rate": 5.531415671340826e-07,
      "loss": 0.0185,
      "reward": 0.22513618250377476,
      "reward_std": 0.6058431342244148,
      "rewards/cosine_scaled_reward": -0.16520969779230654,
      "rewards/format_reward": 0.555555559694767,
      "step": 274
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2988.15283203125,
      "epoch": 0.4712939160239931,
      "grad_norm": 1.208433985710144,
      "kl": 0.54931640625,
      "learning_rate": 5.5e-07,
      "loss": 0.0485,
      "reward": 0.11821263573256147,
      "reward_std": 0.391703762114048,
      "rewards/cosine_scaled_reward": -0.15617146715521812,
      "rewards/format_reward": 0.4305555494502187,
      "step": 275
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2643.541748046875,
      "epoch": 0.4730077120822622,
      "grad_norm": 0.9594613313674927,
      "kl": 0.4326171875,
      "learning_rate": 5.468584328659172e-07,
      "loss": 0.0807,
      "reward": 0.32308289408683777,
      "reward_std": 0.7159284129738808,
      "rewards/cosine_scaled_reward": -0.08151410473510623,
      "rewards/format_reward": 0.4861111156642437,
      "step": 276
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2926.013916015625,
      "epoch": 0.47472150814053127,
      "grad_norm": 1.3466960191726685,
      "kl": 0.5048828125,
      "learning_rate": 5.437170188473847e-07,
      "loss": 0.1283,
      "reward": 0.06485692039132118,
      "reward_std": 0.542218990623951,
      "rewards/cosine_scaled_reward": -0.17590487515553832,
      "rewards/format_reward": 0.4166666679084301,
      "step": 277
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2993.1945190429688,
      "epoch": 0.47643530419880037,
      "grad_norm": 3.963604688644409,
      "kl": 0.5546875,
      "learning_rate": 5.405759110524894e-07,
      "loss": 0.2168,
      "reward": 0.03267951123416424,
      "reward_std": 0.7076856940984726,
      "rewards/cosine_scaled_reward": -0.15727136190980673,
      "rewards/format_reward": 0.3472222276031971,
      "step": 278
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3138.6805419921875,
      "epoch": 0.4781491002570694,
      "grad_norm": 0.5958003997802734,
      "kl": 0.56201171875,
      "learning_rate": 5.37435262574394e-07,
      "loss": 0.0759,
      "reward": -0.051162030547857285,
      "reward_std": 0.5351358503103256,
      "rewards/cosine_scaled_reward": -0.2478032372891903,
      "rewards/format_reward": 0.4444444477558136,
      "step": 279
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2737.1944885253906,
      "epoch": 0.47986289631533846,
      "grad_norm": 1.6760090589523315,
      "kl": 0.59228515625,
      "learning_rate": 5.342952264838747e-07,
      "loss": 0.1096,
      "reward": 0.45366813987493515,
      "reward_std": 0.7812162339687347,
      "rewards/cosine_scaled_reward": -0.0023325812071561813,
      "rewards/format_reward": 0.4583333367481828,
      "step": 280
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2423.2222290039062,
      "epoch": 0.48157669237360756,
      "grad_norm": 2.0177345275878906,
      "kl": 0.461669921875,
      "learning_rate": 5.311559558218603e-07,
      "loss": 0.1262,
      "reward": 0.38226850144565105,
      "reward_std": 0.6752881184220314,
      "rewards/cosine_scaled_reward": -0.07969908323138952,
      "rewards/format_reward": 0.5416666585952044,
      "step": 281
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2740.3611450195312,
      "epoch": 0.4832904884318766,
      "grad_norm": 1.8314719200134277,
      "kl": 0.6337890625,
      "learning_rate": 5.28017603591974e-07,
      "loss": 0.1455,
      "reward": 0.21622517937794328,
      "reward_std": 0.6346057131886482,
      "rewards/cosine_scaled_reward": -0.15577631071209908,
      "rewards/format_reward": 0.5277777910232544,
      "step": 282
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2889.2500610351562,
      "epoch": 0.48500428449014565,
      "grad_norm": 0.8971331119537354,
      "kl": 0.6259765625,
      "learning_rate": 5.248803227530763e-07,
      "loss": 0.1266,
      "reward": 0.03935375134460628,
      "reward_std": 0.6441294327378273,
      "rewards/cosine_scaled_reward": -0.20254534482955933,
      "rewards/format_reward": 0.4444444477558136,
      "step": 283
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2276.9584045410156,
      "epoch": 0.48671808054841476,
      "grad_norm": 6.356135368347168,
      "kl": 0.4833984375,
      "learning_rate": 5.21744266211809e-07,
      "loss": 0.3013,
      "reward": 0.32465188996866345,
      "reward_std": 0.6560942605137825,
      "rewards/cosine_scaled_reward": -0.09461849741637707,
      "rewards/format_reward": 0.5138889029622078,
      "step": 284
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2624.9306640625,
      "epoch": 0.4884318766066838,
      "grad_norm": 1.9299744367599487,
      "kl": 0.5576171875,
      "learning_rate": 5.186095868151436e-07,
      "loss": 0.1493,
      "reward": 0.21753913909196854,
      "reward_std": 0.683107927441597,
      "rewards/cosine_scaled_reward": -0.16206377279013395,
      "rewards/format_reward": 0.5416666716337204,
      "step": 285
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2555.7361450195312,
      "epoch": 0.49014567266495285,
      "grad_norm": 2.2400879859924316,
      "kl": 0.62890625,
      "learning_rate": 5.154764373429315e-07,
      "loss": 0.1583,
      "reward": 0.4158199355006218,
      "reward_std": 0.6385739594697952,
      "rewards/cosine_scaled_reward": -0.021256705978885293,
      "rewards/format_reward": 0.4583333320915699,
      "step": 286
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2526.52783203125,
      "epoch": 0.49185946872322195,
      "grad_norm": 3.190502405166626,
      "kl": 0.916015625,
      "learning_rate": 5.123449705004581e-07,
      "loss": 0.2192,
      "reward": 0.4090676587074995,
      "reward_std": 0.7619837448000908,
      "rewards/cosine_scaled_reward": -0.0662995120510459,
      "rewards/format_reward": 0.5416666641831398,
      "step": 287
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2134.65283203125,
      "epoch": 0.493573264781491,
      "grad_norm": 6.452578544616699,
      "kl": 1.103515625,
      "learning_rate": 5.09215338910999e-07,
      "loss": 0.368,
      "reward": 0.44275959208607674,
      "reward_std": 0.6151050329208374,
      "rewards/cosine_scaled_reward": -0.09806465543806553,
      "rewards/format_reward": 0.6388888955116272,
      "step": 288
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2290.013885498047,
      "epoch": 0.4952870608397601,
      "grad_norm": 1.6855307817459106,
      "kl": 0.82275390625,
      "learning_rate": 5.060876951083828e-07,
      "loss": 0.1464,
      "reward": 0.4334092391654849,
      "reward_std": 0.6985170915722847,
      "rewards/cosine_scaled_reward": -0.0819065012037754,
      "rewards/format_reward": 0.5972222164273262,
      "step": 289
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1840.6388549804688,
      "epoch": 0.49700085689802914,
      "grad_norm": 2.9146785736083984,
      "kl": 0.79296875,
      "learning_rate": 5.02962191529556e-07,
      "loss": 0.0283,
      "reward": 0.7537698708474636,
      "reward_std": 0.5962013602256775,
      "rewards/cosine_scaled_reward": 0.008829381316900253,
      "rewards/format_reward": 0.736111119389534,
      "step": 290
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2200.4444274902344,
      "epoch": 0.4987146529562982,
      "grad_norm": 1.7983882427215576,
      "kl": 0.927734375,
      "learning_rate": 4.998389805071536e-07,
      "loss": 0.2134,
      "reward": 0.614590086042881,
      "reward_std": 0.9522670358419418,
      "rewards/cosine_scaled_reward": -0.012149423826485872,
      "rewards/format_reward": 0.638888880610466,
      "step": 291
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1986.0694885253906,
      "epoch": 0.5004284490145673,
      "grad_norm": 2.5195839405059814,
      "kl": 0.9873046875,
      "learning_rate": 4.967182142620745e-07,
      "loss": 0.1778,
      "reward": 0.7197987511754036,
      "reward_std": 0.8348591700196266,
      "rewards/cosine_scaled_reward": 0.01267714286223054,
      "rewards/format_reward": 0.6944444477558136,
      "step": 292
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1970.9999694824219,
      "epoch": 0.5021422450728363,
      "grad_norm": 3.093557596206665,
      "kl": 1.0625,
      "learning_rate": 4.93600044896063e-07,
      "loss": 0.1538,
      "reward": 0.29343970119953156,
      "reward_std": 0.49155813455581665,
      "rewards/cosine_scaled_reward": -0.13105794228613377,
      "rewards/format_reward": 0.555555559694767,
      "step": 293
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1924.7916870117188,
      "epoch": 0.5038560411311054,
      "grad_norm": 6.1597208976745605,
      "kl": 0.87158203125,
      "learning_rate": 4.904846243842949e-07,
      "loss": 0.3554,
      "reward": 0.4494058433920145,
      "reward_std": 0.747850589454174,
      "rewards/cosine_scaled_reward": -0.08085263520479202,
      "rewards/format_reward": 0.6111111119389534,
      "step": 294
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2000.6388854980469,
      "epoch": 0.5055698371893744,
      "grad_norm": 2.868744134902954,
      "kl": 0.8369140625,
      "learning_rate": 4.873721045679706e-07,
      "loss": 0.1929,
      "reward": 0.23984116781502962,
      "reward_std": 0.4878830164670944,
      "rewards/cosine_scaled_reward": -0.22730162646621466,
      "rewards/format_reward": 0.6944444477558136,
      "step": 295
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2322.5556030273438,
      "epoch": 0.5072836332476436,
      "grad_norm": 3.0267629623413086,
      "kl": 1.2763671875,
      "learning_rate": 4.842626371469149e-07,
      "loss": 0.2229,
      "reward": 0.4730116240680218,
      "reward_std": 0.672569528222084,
      "rewards/cosine_scaled_reward": 0.0003947049845010042,
      "rewards/format_reward": 0.4722222276031971,
      "step": 296
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1968.2222595214844,
      "epoch": 0.5089974293059126,
      "grad_norm": 3.3123202323913574,
      "kl": 1.0166015625,
      "learning_rate": 4.811563736721829e-07,
      "loss": 0.2608,
      "reward": 0.4829604886472225,
      "reward_std": 0.6525571122765541,
      "rewards/cosine_scaled_reward": -0.08490864699706435,
      "rewards/format_reward": 0.6527777686715126,
      "step": 297
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2157.8055725097656,
      "epoch": 0.5107112253641817,
      "grad_norm": 4.5067524909973145,
      "kl": 1.609375,
      "learning_rate": 4.780534655386743e-07,
      "loss": 0.379,
      "reward": 0.673854373395443,
      "reward_std": 0.728430263698101,
      "rewards/cosine_scaled_reward": 0.024427177384495735,
      "rewards/format_reward": 0.6250000149011612,
      "step": 298
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1613.5000305175781,
      "epoch": 0.5124250214224507,
      "grad_norm": 3.3076987266540527,
      "kl": 1.0537109375,
      "learning_rate": 4.749540639777539e-07,
      "loss": 0.2441,
      "reward": 0.8705739304423332,
      "reward_std": 0.7832073271274567,
      "rewards/cosine_scaled_reward": 0.053342508152127266,
      "rewards/format_reward": 0.7638888955116272,
      "step": 299
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2283.0000610351562,
      "epoch": 0.5141388174807198,
      "grad_norm": 4.202121734619141,
      "kl": 1.794921875,
      "learning_rate": 4.7185832004988133e-07,
      "loss": 0.2233,
      "reward": 0.41079268511384726,
      "reward_std": 0.49303294718265533,
      "rewards/cosine_scaled_reward": -0.07238144427537918,
      "rewards/format_reward": 0.5555555522441864,
      "step": 300
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1734.0139465332031,
      "epoch": 0.5158526135389888,
      "grad_norm": 2.733738660812378,
      "kl": 1.271484375,
      "learning_rate": 4.68766384637248e-07,
      "loss": 0.3623,
      "reward": 0.5950284972786903,
      "reward_std": 0.5468220561742783,
      "rewards/cosine_scaled_reward": -0.042763520032167435,
      "rewards/format_reward": 0.6805555671453476,
      "step": 301
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2734.5138549804688,
      "epoch": 0.517566409597258,
      "grad_norm": 5.247328281402588,
      "kl": 1.837890625,
      "learning_rate": 4.656784084364238e-07,
      "loss": 0.1992,
      "reward": 0.1533558116061613,
      "reward_std": 0.5701670944690704,
      "rewards/cosine_scaled_reward": -0.10387765569612384,
      "rewards/format_reward": 0.36111111380159855,
      "step": 302
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1955.3333435058594,
      "epoch": 0.519280205655527,
      "grad_norm": 2.1991233825683594,
      "kl": 1.4111328125,
      "learning_rate": 4.6259454195101267e-07,
      "loss": 0.2534,
      "reward": 0.4565839725546539,
      "reward_std": 0.5375222712755203,
      "rewards/cosine_scaled_reward": -0.13281912542879581,
      "rewards/format_reward": 0.7222222164273262,
      "step": 303
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1990.6805419921875,
      "epoch": 0.5209940017137961,
      "grad_norm": 3.0468897819519043,
      "kl": 1.685546875,
      "learning_rate": 4.59514935484316e-07,
      "loss": 0.2866,
      "reward": 0.589899554848671,
      "reward_std": 0.6861986592411995,
      "rewards/cosine_scaled_reward": -0.03143910859944299,
      "rewards/format_reward": 0.6527777910232544,
      "step": 304
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1996.5416870117188,
      "epoch": 0.5227077977720651,
      "grad_norm": 4.011932849884033,
      "kl": 1.462890625,
      "learning_rate": 4.5643973913200837e-07,
      "loss": 0.2531,
      "reward": 0.3674123687669635,
      "reward_std": 0.6027099043130875,
      "rewards/cosine_scaled_reward": -0.10796047560870647,
      "rewards/format_reward": 0.5833333283662796,
      "step": 305
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1896.9166870117188,
      "epoch": 0.5244215938303342,
      "grad_norm": 2.977555751800537,
      "kl": 1.01171875,
      "learning_rate": 4.5336910277482155e-07,
      "loss": 0.1578,
      "reward": 0.43058543652296066,
      "reward_std": 0.5935798361897469,
      "rewards/cosine_scaled_reward": -0.12498506158590317,
      "rewards/format_reward": 0.6805555671453476,
      "step": 306
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2022.4584045410156,
      "epoch": 0.5261353898886033,
      "grad_norm": 2.7941606044769287,
      "kl": 1.24609375,
      "learning_rate": 4.503031760712397e-07,
      "loss": 0.2441,
      "reward": 0.7062125951051712,
      "reward_std": 0.8090076595544815,
      "rewards/cosine_scaled_reward": -0.0010603656992316246,
      "rewards/format_reward": 0.7083333283662796,
      "step": 307
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2287.125030517578,
      "epoch": 0.5278491859468724,
      "grad_norm": 1.6866544485092163,
      "kl": 1.287109375,
      "learning_rate": 4.4724210845020494e-07,
      "loss": 0.2517,
      "reward": 0.650560175999999,
      "reward_std": 0.742147371172905,
      "rewards/cosine_scaled_reward": 0.033613420091569424,
      "rewards/format_reward": 0.5833333507180214,
      "step": 308
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2066.5694580078125,
      "epoch": 0.5295629820051414,
      "grad_norm": 2.5993359088897705,
      "kl": 1.2255859375,
      "learning_rate": 4.441860491038345e-07,
      "loss": 0.1916,
      "reward": 0.6525638625025749,
      "reward_std": 0.6665498167276382,
      "rewards/cosine_scaled_reward": 0.02767082443460822,
      "rewards/format_reward": 0.5972222313284874,
      "step": 309
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2370.763885498047,
      "epoch": 0.5312767780634104,
      "grad_norm": 3.1745495796203613,
      "kl": 1.205078125,
      "learning_rate": 4.4113514698014953e-07,
      "loss": 0.167,
      "reward": 0.4759225994348526,
      "reward_std": 0.8299422115087509,
      "rewards/cosine_scaled_reward": -0.04676092881709337,
      "rewards/format_reward": 0.569444440305233,
      "step": 310
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2503.5972290039062,
      "epoch": 0.5329905741216795,
      "grad_norm": 2.2356581687927246,
      "kl": 1.376953125,
      "learning_rate": 4.3808955077581546e-07,
      "loss": 0.2558,
      "reward": 0.2966647706925869,
      "reward_std": 0.5752375796437263,
      "rewards/cosine_scaled_reward": -0.15722317062318325,
      "rewards/format_reward": 0.6111111119389534,
      "step": 311
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2278.0277404785156,
      "epoch": 0.5347043701799485,
      "grad_norm": 1.4038455486297607,
      "kl": 1.203125,
      "learning_rate": 4.350494089288943e-07,
      "loss": 0.241,
      "reward": 0.8164278883486986,
      "reward_std": 0.7621838673949242,
      "rewards/cosine_scaled_reward": 0.08876948896795511,
      "rewards/format_reward": 0.638888880610466,
      "step": 312
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1964.8333435058594,
      "epoch": 0.5364181662382177,
      "grad_norm": 5.611269950866699,
      "kl": 1.6904296875,
      "learning_rate": 4.3201486961161093e-07,
      "loss": 0.1802,
      "reward": 0.5501147694885731,
      "reward_std": 0.48398981615900993,
      "rewards/cosine_scaled_reward": -0.07910929806530476,
      "rewards/format_reward": 0.7083333283662796,
      "step": 313
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2596.0139770507812,
      "epoch": 0.5381319622964867,
      "grad_norm": 2.7361557483673096,
      "kl": 0.904296875,
      "learning_rate": 4.2898608072313045e-07,
      "loss": 0.1967,
      "reward": 0.43212154414504766,
      "reward_std": 0.8072051256895065,
      "rewards/cosine_scaled_reward": -0.04088366776704788,
      "rewards/format_reward": 0.5138888880610466,
      "step": 314
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2164.986083984375,
      "epoch": 0.5398457583547558,
      "grad_norm": 2.377624988555908,
      "kl": 1.009765625,
      "learning_rate": 4.2596318988235037e-07,
      "loss": 0.2612,
      "reward": 0.345002256333828,
      "reward_std": 0.6840208172798157,
      "rewards/cosine_scaled_reward": -0.13305442477576435,
      "rewards/format_reward": 0.611111119389534,
      "step": 315
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2112.861114501953,
      "epoch": 0.5415595544130248,
      "grad_norm": 2.4749066829681396,
      "kl": 1.1328125,
      "learning_rate": 4.2294634442070553e-07,
      "loss": 0.2521,
      "reward": 0.3993903249502182,
      "reward_std": 0.7594646960496902,
      "rewards/cosine_scaled_reward": -0.09891596343368292,
      "rewards/format_reward": 0.5972222238779068,
      "step": 316
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2763.791717529297,
      "epoch": 0.5432733504712939,
      "grad_norm": 3.959906816482544,
      "kl": 1.5537109375,
      "learning_rate": 4.1993569137498776e-07,
      "loss": 0.1382,
      "reward": 0.3366717994213104,
      "reward_std": 0.5981347486376762,
      "rewards/cosine_scaled_reward": -0.10944187548011541,
      "rewards/format_reward": 0.5555555559694767,
      "step": 317
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1693.9583129882812,
      "epoch": 0.5449871465295629,
      "grad_norm": 7.122504711151123,
      "kl": 1.0234375,
      "learning_rate": 4.1693137748017915e-07,
      "loss": 0.3755,
      "reward": 1.1441613137722015,
      "reward_std": 0.5525609478354454,
      "rewards/cosine_scaled_reward": 0.21791397035121918,
      "rewards/format_reward": 0.7083333432674408,
      "step": 318
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2302.7222595214844,
      "epoch": 0.5467009425878321,
      "grad_norm": 1.7870283126831055,
      "kl": 1.1513671875,
      "learning_rate": 4.1393354916230005e-07,
      "loss": 0.2157,
      "reward": 0.44585637911222875,
      "reward_std": 0.5990823060274124,
      "rewards/cosine_scaled_reward": -0.09651626879349351,
      "rewards/format_reward": 0.638888880610466,
      "step": 319
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2418.6112060546875,
      "epoch": 0.5484147386461011,
      "grad_norm": 4.37020206451416,
      "kl": 1.3271484375,
      "learning_rate": 4.1094235253127374e-07,
      "loss": 0.1373,
      "reward": 0.7411398887634277,
      "reward_std": 0.8346492722630501,
      "rewards/cosine_scaled_reward": 0.030292170122265816,
      "rewards/format_reward": 0.6805555522441864,
      "step": 320
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2477.4305419921875,
      "epoch": 0.5501285347043702,
      "grad_norm": 2.8605029582977295,
      "kl": 0.9931640625,
      "learning_rate": 4.079579333738039e-07,
      "loss": 0.1566,
      "reward": 0.3635707888752222,
      "reward_std": 0.6505779251456261,
      "rewards/cosine_scaled_reward": -0.14460349176079035,
      "rewards/format_reward": 0.6527777835726738,
      "step": 321
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1934.0694885253906,
      "epoch": 0.5518423307626392,
      "grad_norm": 2.2090539932250977,
      "kl": 1.033935546875,
      "learning_rate": 4.0498043714627006e-07,
      "loss": 0.1413,
      "reward": 0.43895523250102997,
      "reward_std": 0.6110691279172897,
      "rewards/cosine_scaled_reward": -0.14857794775161892,
      "rewards/format_reward": 0.7361111342906952,
      "step": 322
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2659.2083129882812,
      "epoch": 0.5535561268209083,
      "grad_norm": 1.6186331510543823,
      "kl": 0.92578125,
      "learning_rate": 4.020100089676376e-07,
      "loss": 0.1545,
      "reward": 0.43137288000434637,
      "reward_std": 0.5587008334696293,
      "rewards/cosine_scaled_reward": -0.041258019395172596,
      "rewards/format_reward": 0.5138888955116272,
      "step": 323
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2133.0,
      "epoch": 0.5552699228791774,
      "grad_norm": 2.309271812438965,
      "kl": 0.646484375,
      "learning_rate": 3.9904679361238526e-07,
      "loss": 0.2071,
      "reward": 0.8334337323904037,
      "reward_std": 0.7556089013814926,
      "rewards/cosine_scaled_reward": 0.0486613066168502,
      "rewards/format_reward": 0.7361111044883728,
      "step": 324
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2852.5416259765625,
      "epoch": 0.5569837189374465,
      "grad_norm": 1.367803692817688,
      "kl": 1.1455078125,
      "learning_rate": 3.9609093550344907e-07,
      "loss": 0.2217,
      "reward": 0.30423190630972385,
      "reward_std": 0.6387892812490463,
      "rewards/cosine_scaled_reward": -0.13955070948577486,
      "rewards/format_reward": 0.5833333432674408,
      "step": 325
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2254.2638549804688,
      "epoch": 0.5586975149957155,
      "grad_norm": 2.456948757171631,
      "kl": 0.90966796875,
      "learning_rate": 3.931425787051832e-07,
      "loss": 0.2717,
      "reward": 0.7499970353674144,
      "reward_std": 0.5177437886595726,
      "rewards/cosine_scaled_reward": 0.06249852292239666,
      "rewards/format_reward": 0.625,
      "step": 326
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2643.263916015625,
      "epoch": 0.5604113110539846,
      "grad_norm": 2.975080728530884,
      "kl": 0.96875,
      "learning_rate": 3.902018669163384e-07,
      "loss": 0.2016,
      "reward": 0.37286074459552765,
      "reward_std": 0.6334929168224335,
      "rewards/cosine_scaled_reward": -0.06356962397694588,
      "rewards/format_reward": 0.5000000074505806,
      "step": 327
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2430.6666564941406,
      "epoch": 0.5621251071122536,
      "grad_norm": 2.2276480197906494,
      "kl": 1.1484375,
      "learning_rate": 3.872689434630585e-07,
      "loss": 0.2054,
      "reward": 0.45167311653494835,
      "reward_std": 0.6782046630978584,
      "rewards/cosine_scaled_reward": -0.07971900515258312,
      "rewards/format_reward": 0.6111111268401146,
      "step": 328
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2118.3194885253906,
      "epoch": 0.5638389031705227,
      "grad_norm": 1.439645767211914,
      "kl": 0.9873046875,
      "learning_rate": 3.843439512918949e-07,
      "loss": 0.1558,
      "reward": 0.3222229927778244,
      "reward_std": 0.5387292131781578,
      "rewards/cosine_scaled_reward": -0.186110720038414,
      "rewards/format_reward": 0.6944444477558136,
      "step": 329
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2615.3750610351562,
      "epoch": 0.5655526992287918,
      "grad_norm": 1.3368608951568604,
      "kl": 0.822265625,
      "learning_rate": 3.8142703296283953e-07,
      "loss": 0.0814,
      "reward": 0.19894374161958694,
      "reward_std": 0.4315878227353096,
      "rewards/cosine_scaled_reward": -0.17136146454140544,
      "rewards/format_reward": 0.5416666641831398,
      "step": 330
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2637.2084045410156,
      "epoch": 0.5672664952870609,
      "grad_norm": 1.4436960220336914,
      "kl": 0.78125,
      "learning_rate": 3.785183306423767e-07,
      "loss": 0.1763,
      "reward": 0.555847343057394,
      "reward_std": 0.7657169997692108,
      "rewards/cosine_scaled_reward": 0.020979220047593117,
      "rewards/format_reward": 0.5138888955116272,
      "step": 331
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2674.9444580078125,
      "epoch": 0.5689802913453299,
      "grad_norm": 3.6536855697631836,
      "kl": 1.30078125,
      "learning_rate": 3.7561798609655373e-07,
      "loss": 0.1134,
      "reward": 0.44342901557683945,
      "reward_std": 0.6001273989677429,
      "rewards/cosine_scaled_reward": -0.09772994555532932,
      "rewards/format_reward": 0.6388888880610466,
      "step": 332
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2896.0277709960938,
      "epoch": 0.570694087403599,
      "grad_norm": 1.9911452531814575,
      "kl": 1.0078125,
      "learning_rate": 3.72726140684072e-07,
      "loss": 0.1034,
      "reward": -0.17525828257203102,
      "reward_std": 0.4928872212767601,
      "rewards/cosine_scaled_reward": -0.24040691554546356,
      "rewards/format_reward": 0.30555556528270245,
      "step": 333
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2810.861083984375,
      "epoch": 0.572407883461868,
      "grad_norm": 1.8073278665542603,
      "kl": 1.068359375,
      "learning_rate": 3.6984293534939737e-07,
      "loss": 0.1821,
      "reward": -0.006253276020288467,
      "reward_std": 0.5080604404211044,
      "rewards/cosine_scaled_reward": -0.2114599784836173,
      "rewards/format_reward": 0.4166666753590107,
      "step": 334
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2462.65283203125,
      "epoch": 0.5741216795201372,
      "grad_norm": 1.5005158185958862,
      "kl": 0.771484375,
      "learning_rate": 3.6696851061588994e-07,
      "loss": 0.1649,
      "reward": 0.45248544216156006,
      "reward_std": 0.5002452582120895,
      "rewards/cosine_scaled_reward": -0.07931282371282578,
      "rewards/format_reward": 0.611111119389534,
      "step": 335
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2160.125030517578,
      "epoch": 0.5758354755784062,
      "grad_norm": 3.1605849266052246,
      "kl": 0.548828125,
      "learning_rate": 3.641030065789562e-07,
      "loss": 0.1752,
      "reward": 0.24482397455722094,
      "reward_std": 0.515994019806385,
      "rewards/cosine_scaled_reward": -0.17619912140071392,
      "rewards/format_reward": 0.597222238779068,
      "step": 336
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2403.3194580078125,
      "epoch": 0.5775492716366752,
      "grad_norm": 2.8763837814331055,
      "kl": 0.736328125,
      "learning_rate": 3.612465628992203e-07,
      "loss": 0.1929,
      "reward": 0.5475254282355309,
      "reward_std": 0.6548926681280136,
      "rewards/cosine_scaled_reward": -0.038737302646040916,
      "rewards/format_reward": 0.6249999925494194,
      "step": 337
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2418.013946533203,
      "epoch": 0.5792630676949443,
      "grad_norm": 2.0177650451660156,
      "kl": 0.6171875,
      "learning_rate": 3.5839931879571725e-07,
      "loss": 0.2226,
      "reward": 0.5356792770326138,
      "reward_std": 0.9891562312841415,
      "rewards/cosine_scaled_reward": -0.01688259281218052,
      "rewards/format_reward": 0.5694444552063942,
      "step": 338
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2717.888916015625,
      "epoch": 0.5809768637532133,
      "grad_norm": 2.011136293411255,
      "kl": 0.8603515625,
      "learning_rate": 3.555614130391079e-07,
      "loss": 0.0711,
      "reward": 0.31583554670214653,
      "reward_std": 0.6509723365306854,
      "rewards/cosine_scaled_reward": -0.14069335255771875,
      "rewards/format_reward": 0.5972222313284874,
      "step": 339
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2609.1805114746094,
      "epoch": 0.5826906598114824,
      "grad_norm": 2.1439146995544434,
      "kl": 0.60546875,
      "learning_rate": 3.5273298394491515e-07,
      "loss": 0.1085,
      "reward": 0.26384788006544113,
      "reward_std": 0.5596405640244484,
      "rewards/cosine_scaled_reward": -0.12502050958573818,
      "rewards/format_reward": 0.5138888917863369,
      "step": 340
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2366.8055725097656,
      "epoch": 0.5844044558697515,
      "grad_norm": 3.087810516357422,
      "kl": 1.16796875,
      "learning_rate": 3.4991416936678276e-07,
      "loss": 0.2266,
      "reward": 0.19622072577476501,
      "reward_std": 0.43179403990507126,
      "rewards/cosine_scaled_reward": -0.20050075091421604,
      "rewards/format_reward": 0.5972222313284874,
      "step": 341
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2665.3055419921875,
      "epoch": 0.5861182519280206,
      "grad_norm": 2.5300474166870117,
      "kl": 0.7939453125,
      "learning_rate": 3.471051066897562e-07,
      "loss": 0.1537,
      "reward": 0.24374699965119362,
      "reward_std": 0.7871415168046951,
      "rewards/cosine_scaled_reward": -0.1350709474645555,
      "rewards/format_reward": 0.5138889029622078,
      "step": 342
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2231.2222900390625,
      "epoch": 0.5878320479862896,
      "grad_norm": 6.685680389404297,
      "kl": 0.60302734375,
      "learning_rate": 3.4430593282358777e-07,
      "loss": 0.3184,
      "reward": 0.4801894012489356,
      "reward_std": 0.4852745458483696,
      "rewards/cosine_scaled_reward": -0.02379419095814228,
      "rewards/format_reward": 0.5277777835726738,
      "step": 343
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2502.8472290039062,
      "epoch": 0.5895458440445587,
      "grad_norm": 2.49977970123291,
      "kl": 1.0634765625,
      "learning_rate": 3.4151678419606233e-07,
      "loss": 0.2075,
      "reward": 0.22286849096417427,
      "reward_std": 0.51853808760643,
      "rewards/cosine_scaled_reward": -0.18023241311311722,
      "rewards/format_reward": 0.5833333432674408,
      "step": 344
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2553.8472290039062,
      "epoch": 0.5912596401028277,
      "grad_norm": 1.4922689199447632,
      "kl": 0.7724609375,
      "learning_rate": 3.387377967463493e-07,
      "loss": 0.11,
      "reward": 0.40987285412847996,
      "reward_std": 0.6866099908947945,
      "rewards/cosine_scaled_reward": -0.03117468417622149,
      "rewards/format_reward": 0.4722222313284874,
      "step": 345
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2434.250030517578,
      "epoch": 0.5929734361610969,
      "grad_norm": 2.287896156311035,
      "kl": 0.73828125,
      "learning_rate": 3.359691059183761e-07,
      "loss": 0.1042,
      "reward": 0.566213920712471,
      "reward_std": 0.6637867465615273,
      "rewards/cosine_scaled_reward": -0.05022636614739895,
      "rewards/format_reward": 0.6666666865348816,
      "step": 346
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2606.3056030273438,
      "epoch": 0.5946872322193659,
      "grad_norm": 5.195223808288574,
      "kl": 0.8203125,
      "learning_rate": 3.3321084665422803e-07,
      "loss": 0.0878,
      "reward": 0.30292151868343353,
      "reward_std": 0.6232884004712105,
      "rewards/cosine_scaled_reward": -0.09853924717754126,
      "rewards/format_reward": 0.5000000074505806,
      "step": 347
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2294.3194580078125,
      "epoch": 0.596401028277635,
      "grad_norm": 2.7749757766723633,
      "kl": 1.12744140625,
      "learning_rate": 3.3046315338757026e-07,
      "loss": 0.113,
      "reward": 0.10566018056124449,
      "reward_std": 0.4547986686229706,
      "rewards/cosine_scaled_reward": -0.23883657529950142,
      "rewards/format_reward": 0.5833333507180214,
      "step": 348
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2798.375,
      "epoch": 0.598114824335904,
      "grad_norm": 4.372980117797852,
      "kl": 0.76708984375,
      "learning_rate": 3.2772616003709616e-07,
      "loss": 0.0724,
      "reward": 0.19539665430784225,
      "reward_std": 0.547118715941906,
      "rewards/cosine_scaled_reward": -0.0898016735445708,
      "rewards/format_reward": 0.37500000186264515,
      "step": 349
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2323.1806030273438,
      "epoch": 0.5998286203941731,
      "grad_norm": 0.9666920900344849,
      "kl": 0.42724609375,
      "learning_rate": 3.250000000000001e-07,
      "loss": 0.1139,
      "reward": 0.3154673893004656,
      "reward_std": 0.6931511759757996,
      "rewards/cosine_scaled_reward": -0.1408774359151721,
      "rewards/format_reward": 0.5972222238779068,
      "step": 350
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2554.791717529297,
      "epoch": 0.6015424164524421,
      "grad_norm": 1.4160966873168945,
      "kl": 0.673828125,
      "learning_rate": 3.222848061454764e-07,
      "loss": 0.1226,
      "reward": 0.2959921658039093,
      "reward_std": 0.6351921036839485,
      "rewards/cosine_scaled_reward": -0.1922817062586546,
      "rewards/format_reward": 0.6805555522441864,
      "step": 351
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2883.0972900390625,
      "epoch": 0.6032562125107113,
      "grad_norm": 1.3862693309783936,
      "kl": 0.57470703125,
      "learning_rate": 3.195807108082429e-07,
      "loss": 0.0944,
      "reward": -0.15852557588368654,
      "reward_std": 0.3618531711399555,
      "rewards/cosine_scaled_reward": -0.22509612515568733,
      "rewards/format_reward": 0.29166666977107525,
      "step": 352
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2766.3611755371094,
      "epoch": 0.6049700085689803,
      "grad_norm": 2.0312767028808594,
      "kl": 0.47900390625,
      "learning_rate": 3.168878457820915e-07,
      "loss": 0.1735,
      "reward": 0.0013820715248584747,
      "reward_std": 0.4379913955926895,
      "rewards/cosine_scaled_reward": -0.2284756200388074,
      "rewards/format_reward": 0.4583333283662796,
      "step": 353
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2303.4444885253906,
      "epoch": 0.6066838046272494,
      "grad_norm": 1.9564874172210693,
      "kl": 0.583984375,
      "learning_rate": 3.142063423134644e-07,
      "loss": 0.1414,
      "reward": 0.8931210651062429,
      "reward_std": 0.7761038094758987,
      "rewards/cosine_scaled_reward": 0.12017163541167974,
      "rewards/format_reward": 0.6527777649462223,
      "step": 354
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2350.3333435058594,
      "epoch": 0.6083976006855184,
      "grad_norm": 3.0156519412994385,
      "kl": 0.501953125,
      "learning_rate": 3.115363310950578e-07,
      "loss": 0.0948,
      "reward": 0.21542136371135712,
      "reward_std": 0.4440060332417488,
      "rewards/cosine_scaled_reward": -0.15617820341140032,
      "rewards/format_reward": 0.5277777761220932,
      "step": 355
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2587.5694580078125,
      "epoch": 0.6101113967437874,
      "grad_norm": 1.8885806798934937,
      "kl": 0.7080078125,
      "learning_rate": 3.0887794225945143e-07,
      "loss": 0.1358,
      "reward": 0.2837059774901718,
      "reward_std": 0.5896440669894218,
      "rewards/cosine_scaled_reward": -0.1637025810778141,
      "rewards/format_reward": 0.6111111119389534,
      "step": 356
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2377.9722290039062,
      "epoch": 0.6118251928020566,
      "grad_norm": 2.1884989738464355,
      "kl": 0.7119140625,
      "learning_rate": 3.062313053727671e-07,
      "loss": 0.2069,
      "reward": 0.5028799092397094,
      "reward_std": 0.6587233692407608,
      "rewards/cosine_scaled_reward": -0.04022672958672047,
      "rewards/format_reward": 0.5833333358168602,
      "step": 357
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2724.02783203125,
      "epoch": 0.6135389888603257,
      "grad_norm": 1.1445249319076538,
      "kl": 0.5654296875,
      "learning_rate": 3.0359654942835247e-07,
      "loss": 0.1018,
      "reward": 0.2546988914255053,
      "reward_std": 0.7167258933186531,
      "rewards/cosine_scaled_reward": -0.10876166447997093,
      "rewards/format_reward": 0.4722222238779068,
      "step": 358
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2532.2638549804688,
      "epoch": 0.6152527849185947,
      "grad_norm": 2.5419840812683105,
      "kl": 0.68505859375,
      "learning_rate": 3.0097380284049523e-07,
      "loss": 0.0873,
      "reward": 0.21593652665615082,
      "reward_std": 0.6653575152158737,
      "rewards/cosine_scaled_reward": -0.11425395932747051,
      "rewards/format_reward": 0.4444444477558136,
      "step": 359
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2583.513916015625,
      "epoch": 0.6169665809768637,
      "grad_norm": 3.3858025074005127,
      "kl": 0.7587890625,
      "learning_rate": 2.9836319343816397e-07,
      "loss": 0.0324,
      "reward": 0.317771688933135,
      "reward_std": 0.6041048616170883,
      "rewards/cosine_scaled_reward": -0.09111416153609753,
      "rewards/format_reward": 0.4999999962747097,
      "step": 360
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2334.1944274902344,
      "epoch": 0.6186803770351328,
      "grad_norm": 0.9652746915817261,
      "kl": 0.50732421875,
      "learning_rate": 2.9576484845877793e-07,
      "loss": 0.0994,
      "reward": 0.6510396376252174,
      "reward_std": 0.8507343530654907,
      "rewards/cosine_scaled_reward": 0.019964261911809444,
      "rewards/format_reward": 0.6111111119389534,
      "step": 361
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3043.9445190429688,
      "epoch": 0.6203941730934018,
      "grad_norm": 0.7930195927619934,
      "kl": 0.67626953125,
      "learning_rate": 2.931788945420058e-07,
      "loss": 0.0974,
      "reward": 0.13819648325443268,
      "reward_std": 0.5064943730831146,
      "rewards/cosine_scaled_reward": -0.1531239915639162,
      "rewards/format_reward": 0.4444444552063942,
      "step": 362
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2825.7916259765625,
      "epoch": 0.622107969151671,
      "grad_norm": 0.6896006464958191,
      "kl": 0.64794921875,
      "learning_rate": 2.9060545772359305e-07,
      "loss": 0.1145,
      "reward": 0.08600431494414806,
      "reward_std": 0.5438744425773621,
      "rewards/cosine_scaled_reward": -0.17227561306208372,
      "rewards/format_reward": 0.4305555559694767,
      "step": 363
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2422.3334045410156,
      "epoch": 0.62382176520994,
      "grad_norm": 1.5994491577148438,
      "kl": 0.5615234375,
      "learning_rate": 2.8804466342921987e-07,
      "loss": 0.0825,
      "reward": 0.5049359295517206,
      "reward_std": 0.7525297850370407,
      "rewards/cosine_scaled_reward": -0.06697649694979191,
      "rewards/format_reward": 0.6388888955116272,
      "step": 364
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2465.0556030273438,
      "epoch": 0.6255355612682091,
      "grad_norm": 7.009267330169678,
      "kl": 0.66259765625,
      "learning_rate": 2.854966364683872e-07,
      "loss": 0.2849,
      "reward": 0.15274390950798988,
      "reward_std": 0.48094654455780983,
      "rewards/cosine_scaled_reward": -0.18057249579578638,
      "rewards/format_reward": 0.5138888880610466,
      "step": 365
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2835.9861450195312,
      "epoch": 0.6272493573264781,
      "grad_norm": 1.8245147466659546,
      "kl": 0.6435546875,
      "learning_rate": 2.829615010283344e-07,
      "loss": 0.0872,
      "reward": 0.43825584976002574,
      "reward_std": 0.4411094859242439,
      "rewards/cosine_scaled_reward": 0.031627919524908066,
      "rewards/format_reward": 0.37500001210719347,
      "step": 366
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2408.2500610351562,
      "epoch": 0.6289631533847472,
      "grad_norm": 1.2879042625427246,
      "kl": 0.64208984375,
      "learning_rate": 2.8043938066798645e-07,
      "loss": 0.1245,
      "reward": 0.5816475376486778,
      "reward_std": 0.8480053022503853,
      "rewards/cosine_scaled_reward": 0.026934866793453693,
      "rewards/format_reward": 0.5277777761220932,
      "step": 367
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3011.8472290039062,
      "epoch": 0.6306769494430163,
      "grad_norm": 1.5342601537704468,
      "kl": 0.9501953125,
      "learning_rate": 2.7793039831193133e-07,
      "loss": 0.1117,
      "reward": -0.08722967363428324,
      "reward_std": 0.4681037962436676,
      "rewards/cosine_scaled_reward": -0.20333705097436905,
      "rewards/format_reward": 0.31944445334374905,
      "step": 368
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2088.1111450195312,
      "epoch": 0.6323907455012854,
      "grad_norm": 5.395845413208008,
      "kl": 0.52392578125,
      "learning_rate": 2.7543467624442956e-07,
      "loss": 0.278,
      "reward": 0.8531668335199356,
      "reward_std": 0.7198526412248611,
      "rewards/cosine_scaled_reward": 0.07936117798089981,
      "rewards/format_reward": 0.6944444477558136,
      "step": 369
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2144.4583587646484,
      "epoch": 0.6341045415595544,
      "grad_norm": 1.9326905012130737,
      "kl": 0.5986328125,
      "learning_rate": 2.729523361034538e-07,
      "loss": 0.165,
      "reward": 0.7016473673284054,
      "reward_std": 0.35017503798007965,
      "rewards/cosine_scaled_reward": 0.04526812210679054,
      "rewards/format_reward": 0.6111111268401146,
      "step": 370
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2226.8333740234375,
      "epoch": 0.6358183376178235,
      "grad_norm": 6.064547538757324,
      "kl": 0.47265625,
      "learning_rate": 2.7048349887476037e-07,
      "loss": 0.2747,
      "reward": 0.36747913248836994,
      "reward_std": 0.47022923082113266,
      "rewards/cosine_scaled_reward": -0.12876042909920216,
      "rewards/format_reward": 0.6250000149011612,
      "step": 371
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2646.166717529297,
      "epoch": 0.6375321336760925,
      "grad_norm": 1.8105882406234741,
      "kl": 0.66259765625,
      "learning_rate": 2.6802828488599294e-07,
      "loss": 0.1523,
      "reward": 0.2788702640682459,
      "reward_std": 0.7272945195436478,
      "rewards/cosine_scaled_reward": -0.082787093706429,
      "rewards/format_reward": 0.4444444440305233,
      "step": 372
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2658.513916015625,
      "epoch": 0.6392459297343616,
      "grad_norm": 0.861741304397583,
      "kl": 0.56201171875,
      "learning_rate": 2.655868138008171e-07,
      "loss": 0.127,
      "reward": 0.25536923203617334,
      "reward_std": 0.549317829310894,
      "rewards/cosine_scaled_reward": -0.16398204606957734,
      "rewards/format_reward": 0.5833333358168602,
      "step": 373
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2475.7916564941406,
      "epoch": 0.6409597257926307,
      "grad_norm": 5.120214462280273,
      "kl": 0.66943359375,
      "learning_rate": 2.631592046130896e-07,
      "loss": -0.0041,
      "reward": 0.4401531554758549,
      "reward_std": 0.5939441919326782,
      "rewards/cosine_scaled_reward": -0.10631232312880456,
      "rewards/format_reward": 0.6527777686715126,
      "step": 374
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2241.4166564941406,
      "epoch": 0.6426735218508998,
      "grad_norm": 4.6052021980285645,
      "kl": 0.68896484375,
      "learning_rate": 2.6074557564105724e-07,
      "loss": 0.258,
      "reward": 0.40211474522948265,
      "reward_std": 0.6810158491134644,
      "rewards/cosine_scaled_reward": -0.13922041468322277,
      "rewards/format_reward": 0.6805555522441864,
      "step": 375
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2345.541717529297,
      "epoch": 0.6443873179091688,
      "grad_norm": 1.593520164489746,
      "kl": 0.65869140625,
      "learning_rate": 2.583460445215911e-07,
      "loss": 0.1983,
      "reward": 0.4966873601078987,
      "reward_std": 0.6450872495770454,
      "rewards/cosine_scaled_reward": -0.0363785345107317,
      "rewards/format_reward": 0.569444440305233,
      "step": 376
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2599.9583129882812,
      "epoch": 0.6461011139674379,
      "grad_norm": 1.0820523500442505,
      "kl": 0.54345703125,
      "learning_rate": 2.5596072820445254e-07,
      "loss": 0.1269,
      "reward": 0.3041490036994219,
      "reward_std": 0.5556300804018974,
      "rewards/cosine_scaled_reward": -0.11181438341736794,
      "rewards/format_reward": 0.5277777910232544,
      "step": 377
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2423.1944274902344,
      "epoch": 0.6478149100257069,
      "grad_norm": 3.9577648639678955,
      "kl": 0.46435546875,
      "learning_rate": 2.5358974294659373e-07,
      "loss": 0.2539,
      "reward": 0.3343656752258539,
      "reward_std": 0.6136218756437302,
      "rewards/cosine_scaled_reward": -0.14531716238707304,
      "rewards/format_reward": 0.625,
      "step": 378
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1991.0555419921875,
      "epoch": 0.6495287060839761,
      "grad_norm": 6.228683948516846,
      "kl": 0.7626953125,
      "learning_rate": 2.512332043064913e-07,
      "loss": 0.2113,
      "reward": 0.7852881997823715,
      "reward_std": 0.7995356619358063,
      "rewards/cosine_scaled_reward": 0.031532974913716316,
      "rewards/format_reward": 0.722222238779068,
      "step": 379
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2142.0972595214844,
      "epoch": 0.6512425021422451,
      "grad_norm": 4.392513751983643,
      "kl": 0.8505859375,
      "learning_rate": 2.488912271385139e-07,
      "loss": 0.166,
      "reward": 0.774210050702095,
      "reward_std": 0.9235591739416122,
      "rewards/cosine_scaled_reward": 0.01904946379363537,
      "rewards/format_reward": 0.736111119389534,
      "step": 380
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2721.166748046875,
      "epoch": 0.6529562982005142,
      "grad_norm": 0.7555143237113953,
      "kl": 0.54931640625,
      "learning_rate": 2.465639255873246e-07,
      "loss": 0.1157,
      "reward": 0.06699353083968163,
      "reward_std": 0.6024204641580582,
      "rewards/cosine_scaled_reward": -0.18872546032071114,
      "rewards/format_reward": 0.4444444477558136,
      "step": 381
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2592.3472595214844,
      "epoch": 0.6546700942587832,
      "grad_norm": 1.4892374277114868,
      "kl": 0.5986328125,
      "learning_rate": 2.4425141308231765e-07,
      "loss": 0.164,
      "reward": 0.4388514533638954,
      "reward_std": 0.7740809172391891,
      "rewards/cosine_scaled_reward": -0.07918539177626371,
      "rewards/format_reward": 0.597222238779068,
      "step": 382
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2303.125,
      "epoch": 0.6563838903170522,
      "grad_norm": 1.8696836233139038,
      "kl": 0.63427734375,
      "learning_rate": 2.4195380233209006e-07,
      "loss": 0.0839,
      "reward": 0.2414467092603445,
      "reward_std": 0.5401086919009686,
      "rewards/cosine_scaled_reward": -0.14316555112600327,
      "rewards/format_reward": 0.5277777835726738,
      "step": 383
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2894.138916015625,
      "epoch": 0.6580976863753213,
      "grad_norm": 2.512624740600586,
      "kl": 0.7236328125,
      "learning_rate": 2.3967120531894857e-07,
      "loss": 0.188,
      "reward": -0.111133978003636,
      "reward_std": 0.4146636873483658,
      "rewards/cosine_scaled_reward": -0.256955873221159,
      "rewards/format_reward": 0.4027777798473835,
      "step": 384
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2577.1805725097656,
      "epoch": 0.6598114824335904,
      "grad_norm": 1.5134508609771729,
      "kl": 0.75341796875,
      "learning_rate": 2.374037332934512e-07,
      "loss": 0.1359,
      "reward": 0.12209473713301122,
      "reward_std": 0.42869339138269424,
      "rewards/cosine_scaled_reward": -0.18200820498168468,
      "rewards/format_reward": 0.4861111156642437,
      "step": 385
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2836.388916015625,
      "epoch": 0.6615252784918595,
      "grad_norm": 1.6320090293884277,
      "kl": 0.71435546875,
      "learning_rate": 2.3515149676898552e-07,
      "loss": 0.1314,
      "reward": 0.027245239354670048,
      "reward_std": 0.5338631048798561,
      "rewards/cosine_scaled_reward": -0.21554404497146606,
      "rewards/format_reward": 0.4583333432674408,
      "step": 386
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2370.5416870117188,
      "epoch": 0.6632390745501285,
      "grad_norm": 2.790175437927246,
      "kl": 0.69384765625,
      "learning_rate": 2.3291460551638237e-07,
      "loss": 0.1291,
      "reward": 0.45274626836180687,
      "reward_std": 0.5044268742203712,
      "rewards/cosine_scaled_reward": -0.07223799102939665,
      "rewards/format_reward": 0.5972222313284874,
      "step": 387
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2043.15283203125,
      "epoch": 0.6649528706083976,
      "grad_norm": 2.196779251098633,
      "kl": 0.91259765625,
      "learning_rate": 2.306931685585657e-07,
      "loss": 0.1933,
      "reward": 0.6870926842093468,
      "reward_std": 0.7499307841062546,
      "rewards/cosine_scaled_reward": -0.017564778798259795,
      "rewards/format_reward": 0.722222238779068,
      "step": 388
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2507.999969482422,
      "epoch": 0.6666666666666666,
      "grad_norm": 4.833599090576172,
      "kl": 0.7890625,
      "learning_rate": 2.2848729416523859e-07,
      "loss": 0.1485,
      "reward": 0.48738833516836166,
      "reward_std": 0.3942640535533428,
      "rewards/cosine_scaled_reward": -0.047972507774829865,
      "rewards/format_reward": 0.5833333283662796,
      "step": 389
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2180.5833740234375,
      "epoch": 0.6683804627249358,
      "grad_norm": 4.208037853240967,
      "kl": 0.57275390625,
      "learning_rate": 2.2629708984760706e-07,
      "loss": 0.2068,
      "reward": 0.6339845806360245,
      "reward_std": 0.8561032116413116,
      "rewards/cosine_scaled_reward": -0.009396598441526294,
      "rewards/format_reward": 0.6527777761220932,
      "step": 390
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2664.0,
      "epoch": 0.6700942587832048,
      "grad_norm": 1.711565375328064,
      "kl": 0.57421875,
      "learning_rate": 2.2412266235313973e-07,
      "loss": 0.2027,
      "reward": 0.2511326225940138,
      "reward_std": 0.7724436074495316,
      "rewards/cosine_scaled_reward": -0.13137813284993172,
      "rewards/format_reward": 0.5138888955116272,
      "step": 391
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2048.1945190429688,
      "epoch": 0.6718080548414739,
      "grad_norm": 7.40539026260376,
      "kl": 0.576171875,
      "learning_rate": 2.2196411766036487e-07,
      "loss": 0.2932,
      "reward": 0.390616811811924,
      "reward_std": 0.46938444674015045,
      "rewards/cosine_scaled_reward": -0.1519138067960739,
      "rewards/format_reward": 0.6944444552063942,
      "step": 392
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2850.15283203125,
      "epoch": 0.6735218508997429,
      "grad_norm": 2.314105272293091,
      "kl": 0.712890625,
      "learning_rate": 2.1982156097370557e-07,
      "loss": 0.1382,
      "reward": 0.12740344926714897,
      "reward_std": 0.5854331143200397,
      "rewards/cosine_scaled_reward": -0.1307427268475294,
      "rewards/format_reward": 0.3888888992369175,
      "step": 393
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2054.3334045410156,
      "epoch": 0.675235646958012,
      "grad_norm": 3.0562775135040283,
      "kl": 0.958984375,
      "learning_rate": 2.1769509671835223e-07,
      "loss": 0.1762,
      "reward": 0.33383211493492126,
      "reward_std": 0.6097967401146889,
      "rewards/cosine_scaled_reward": -0.15947283059358597,
      "rewards/format_reward": 0.6527777835726738,
      "step": 394
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2001.4027709960938,
      "epoch": 0.676949443016281,
      "grad_norm": 2.749018907546997,
      "kl": 1.3720703125,
      "learning_rate": 2.1558482853517253e-07,
      "loss": 0.2474,
      "reward": 0.46511383540928364,
      "reward_std": 0.5483391135931015,
      "rewards/cosine_scaled_reward": -0.11466531874611974,
      "rewards/format_reward": 0.6944444626569748,
      "step": 395
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2663.9722900390625,
      "epoch": 0.6786632390745502,
      "grad_norm": 1.3055802583694458,
      "kl": 0.8837890625,
      "learning_rate": 2.134908592756607e-07,
      "loss": 0.1475,
      "reward": 0.21666064485907555,
      "reward_std": 0.8081866502761841,
      "rewards/cosine_scaled_reward": -0.14861411787569523,
      "rewards/format_reward": 0.5138889029622078,
      "step": 396
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2148.027801513672,
      "epoch": 0.6803770351328192,
      "grad_norm": 2.2016310691833496,
      "kl": 1.193359375,
      "learning_rate": 2.1141329099692406e-07,
      "loss": 0.2683,
      "reward": 0.3172401809133589,
      "reward_std": 0.5794945135712624,
      "rewards/cosine_scaled_reward": -0.13999101985245943,
      "rewards/format_reward": 0.5972222164273262,
      "step": 397
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2772.2778930664062,
      "epoch": 0.6820908311910883,
      "grad_norm": 5.671627044677734,
      "kl": 0.994140625,
      "learning_rate": 2.0935222495670968e-07,
      "loss": 0.1316,
      "reward": 0.22605895064771175,
      "reward_std": 0.528959184885025,
      "rewards/cosine_scaled_reward": -0.12308163847774267,
      "rewards/format_reward": 0.4722222313284874,
      "step": 398
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1907.2083740234375,
      "epoch": 0.6838046272493573,
      "grad_norm": 4.919534206390381,
      "kl": 1.0986328125,
      "learning_rate": 2.0730776160846853e-07,
      "loss": 0.1523,
      "reward": 0.8099863529205322,
      "reward_std": 0.7783814370632172,
      "rewards/cosine_scaled_reward": 0.03693760558962822,
      "rewards/format_reward": 0.7361111119389534,
      "step": 399
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2289.3195190429688,
      "epoch": 0.6855184233076264,
      "grad_norm": 4.336697578430176,
      "kl": 0.9921875,
      "learning_rate": 2.0528000059645995e-07,
      "loss": 0.125,
      "reward": 0.08474167913664132,
      "reward_std": 0.4911258965730667,
      "rewards/cosine_scaled_reward": -0.21457360684871674,
      "rewards/format_reward": 0.5138888955116272,
      "step": 400
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2537.0833740234375,
      "epoch": 0.6872322193658955,
      "grad_norm": 1.9845013618469238,
      "kl": 0.7470703125,
      "learning_rate": 2.032690407508949e-07,
      "loss": 0.1521,
      "reward": 0.20629934733733535,
      "reward_std": 0.5084620639681816,
      "rewards/cosine_scaled_reward": -0.17462810222059488,
      "rewards/format_reward": 0.5555555671453476,
      "step": 401
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2364.2222595214844,
      "epoch": 0.6889460154241646,
      "grad_norm": 4.490449905395508,
      "kl": 0.85595703125,
      "learning_rate": 2.0127498008311922e-07,
      "loss": 0.0678,
      "reward": 0.2729727178812027,
      "reward_std": 0.40766991674900055,
      "rewards/cosine_scaled_reward": -0.16906920075416565,
      "rewards/format_reward": 0.6111111044883728,
      "step": 402
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2377.500030517578,
      "epoch": 0.6906598114824336,
      "grad_norm": 2.0314667224884033,
      "kl": 0.9287109375,
      "learning_rate": 1.9929791578083655e-07,
      "loss": 0.1243,
      "reward": 0.7173348069190979,
      "reward_std": 0.6178643703460693,
      "rewards/cosine_scaled_reward": 0.03922295683878474,
      "rewards/format_reward": 0.6388888955116272,
      "step": 403
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2395.3472595214844,
      "epoch": 0.6923736075407027,
      "grad_norm": 3.6266534328460693,
      "kl": 1.1142578125,
      "learning_rate": 1.9733794420337213e-07,
      "loss": 0.1186,
      "reward": 0.37652647122740746,
      "reward_std": 0.6333749815821648,
      "rewards/cosine_scaled_reward": -0.11034788191318512,
      "rewards/format_reward": 0.5972222238779068,
      "step": 404
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2634.7222595214844,
      "epoch": 0.6940874035989717,
      "grad_norm": 1.629310131072998,
      "kl": 0.87353515625,
      "learning_rate": 1.9539516087697517e-07,
      "loss": 0.1284,
      "reward": 0.30899196676909924,
      "reward_std": 0.5874167829751968,
      "rewards/cosine_scaled_reward": -0.11633734963834286,
      "rewards/format_reward": 0.541666679084301,
      "step": 405
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1720.5416870117188,
      "epoch": 0.6958011996572407,
      "grad_norm": 3.2341248989105225,
      "kl": 1.0625,
      "learning_rate": 1.934696604901642e-07,
      "loss": 0.1949,
      "reward": 0.5183681361377239,
      "reward_std": 0.5259700566530228,
      "rewards/cosine_scaled_reward": -0.08109369967132807,
      "rewards/format_reward": 0.680555559694767,
      "step": 406
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2396.875,
      "epoch": 0.6975149957155099,
      "grad_norm": 2.575775146484375,
      "kl": 0.56005859375,
      "learning_rate": 1.915615368891117e-07,
      "loss": 0.1577,
      "reward": 0.16498053632676601,
      "reward_std": 0.6976238563656807,
      "rewards/cosine_scaled_reward": -0.17445417866110802,
      "rewards/format_reward": 0.5138888955116272,
      "step": 407
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2211.986114501953,
      "epoch": 0.699228791773779,
      "grad_norm": 5.147465229034424,
      "kl": 0.939453125,
      "learning_rate": 1.8967088307307e-07,
      "loss": 0.264,
      "reward": 0.8435009941458702,
      "reward_std": 0.8539558947086334,
      "rewards/cosine_scaled_reward": 0.1370282769203186,
      "rewards/format_reward": 0.569444440305233,
      "step": 408
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2221.791717529297,
      "epoch": 0.700942587832048,
      "grad_norm": 3.616407632827759,
      "kl": 1.2978515625,
      "learning_rate": 1.8779779118983867e-07,
      "loss": 0.2031,
      "reward": 0.5767598450183868,
      "reward_std": 0.6021636947989464,
      "rewards/cosine_scaled_reward": -0.01717562135308981,
      "rewards/format_reward": 0.611111119389534,
      "step": 409
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2195.5833129882812,
      "epoch": 0.702656383890317,
      "grad_norm": 4.223770618438721,
      "kl": 0.9541015625,
      "learning_rate": 1.8594235253127372e-07,
      "loss": 0.0719,
      "reward": 0.4589345343410969,
      "reward_std": 0.5643011257052422,
      "rewards/cosine_scaled_reward": -0.09692162275314331,
      "rewards/format_reward": 0.6527777835726738,
      "step": 410
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2020.15283203125,
      "epoch": 0.7043701799485861,
      "grad_norm": 4.778375148773193,
      "kl": 0.9111328125,
      "learning_rate": 1.8410465752883758e-07,
      "loss": 0.2462,
      "reward": 0.4322133334353566,
      "reward_std": 0.5240239724516869,
      "rewards/cosine_scaled_reward": -0.11722666956484318,
      "rewards/format_reward": 0.666666679084301,
      "step": 411
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2159.5694580078125,
      "epoch": 0.7060839760068551,
      "grad_norm": 5.010425090789795,
      "kl": 1.271484375,
      "learning_rate": 1.822847957491922e-07,
      "loss": 0.1504,
      "reward": 0.27721285074949265,
      "reward_std": 0.3799732178449631,
      "rewards/cosine_scaled_reward": -0.19472691789269447,
      "rewards/format_reward": 0.6666666567325592,
      "step": 412
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1994.1805419921875,
      "epoch": 0.7077977720651243,
      "grad_norm": 3.7414398193359375,
      "kl": 0.79296875,
      "learning_rate": 1.804828558898332e-07,
      "loss": 0.2209,
      "reward": 0.4823665115982294,
      "reward_std": 0.8085788935422897,
      "rewards/cosine_scaled_reward": -0.12687229178845882,
      "rewards/format_reward": 0.736111119389534,
      "step": 413
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2582.513885498047,
      "epoch": 0.7095115681233933,
      "grad_norm": 2.3787803649902344,
      "kl": 0.955078125,
      "learning_rate": 1.7869892577476722e-07,
      "loss": 0.234,
      "reward": 0.35093772783875465,
      "reward_std": 0.7004451155662537,
      "rewards/cosine_scaled_reward": -0.06758668273687363,
      "rewards/format_reward": 0.4861111119389534,
      "step": 414
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2286.027801513672,
      "epoch": 0.7112253641816624,
      "grad_norm": 2.242143154144287,
      "kl": 1.1669921875,
      "learning_rate": 1.7693309235023127e-07,
      "loss": 0.2089,
      "reward": 0.0793907418847084,
      "reward_std": 0.4775719493627548,
      "rewards/cosine_scaled_reward": -0.2519712895154953,
      "rewards/format_reward": 0.5833333320915699,
      "step": 415
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2716.8194885253906,
      "epoch": 0.7129391602399314,
      "grad_norm": 1.0189129114151,
      "kl": 1.0341796875,
      "learning_rate": 1.7518544168045524e-07,
      "loss": 0.1319,
      "reward": 0.3344786912202835,
      "reward_std": 0.6283555030822754,
      "rewards/cosine_scaled_reward": -0.09664955246262252,
      "rewards/format_reward": 0.5277777835726738,
      "step": 416
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2145.777786254883,
      "epoch": 0.7146529562982005,
      "grad_norm": 3.3594307899475098,
      "kl": 0.630859375,
      "learning_rate": 1.7345605894346726e-07,
      "loss": 0.1719,
      "reward": 0.29561759158968925,
      "reward_std": 0.45837917923927307,
      "rewards/cosine_scaled_reward": -0.16469121165573597,
      "rewards/format_reward": 0.6250000074505806,
      "step": 417
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2316.4722595214844,
      "epoch": 0.7163667523564696,
      "grad_norm": 3.205843448638916,
      "kl": 0.9482421875,
      "learning_rate": 1.7174502842694212e-07,
      "loss": 0.1905,
      "reward": 0.1763996873050928,
      "reward_std": 0.35552147775888443,
      "rewards/cosine_scaled_reward": -0.16874459758400917,
      "rewards/format_reward": 0.5138888880610466,
      "step": 418
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1993.9166564941406,
      "epoch": 0.7180805484147387,
      "grad_norm": 5.31653356552124,
      "kl": 1.2333984375,
      "learning_rate": 1.7005243352409333e-07,
      "loss": 0.1316,
      "reward": 0.38526383973658085,
      "reward_std": 0.3629095181822777,
      "rewards/cosine_scaled_reward": -0.16153474483871832,
      "rewards/format_reward": 0.7083333283662796,
      "step": 419
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2585.3472290039062,
      "epoch": 0.7197943444730077,
      "grad_norm": 3.0050301551818848,
      "kl": 1.2138671875,
      "learning_rate": 1.6837835672960831e-07,
      "loss": 0.2609,
      "reward": 0.03709686268121004,
      "reward_std": 0.45924656093120575,
      "rewards/cosine_scaled_reward": -0.1967293554916978,
      "rewards/format_reward": 0.430555559694767,
      "step": 420
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2077.4166870117188,
      "epoch": 0.7215081405312768,
      "grad_norm": 2.9638571739196777,
      "kl": 1.1162109375,
      "learning_rate": 1.6672287963562852e-07,
      "loss": 0.1967,
      "reward": 0.4120505638420582,
      "reward_std": 0.7001288831233978,
      "rewards/cosine_scaled_reward": -0.09258583001792431,
      "rewards/format_reward": 0.5972222313284874,
      "step": 421
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1782.4722290039062,
      "epoch": 0.7232219365895458,
      "grad_norm": 2.496225595474243,
      "kl": 1.1123046875,
      "learning_rate": 1.6508608292777203e-07,
      "loss": 0.3012,
      "reward": 0.3580199657008052,
      "reward_std": 0.5790654197335243,
      "rewards/cosine_scaled_reward": -0.18904556892812252,
      "rewards/format_reward": 0.7361111044883728,
      "step": 422
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2290.388916015625,
      "epoch": 0.7249357326478149,
      "grad_norm": 2.5555100440979004,
      "kl": 0.8837890625,
      "learning_rate": 1.6346804638120098e-07,
      "loss": 0.1268,
      "reward": 0.4261997193098068,
      "reward_std": 0.6714624091982841,
      "rewards/cosine_scaled_reward": -0.1341223642230034,
      "rewards/format_reward": 0.6944444552063942,
      "step": 423
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2174.3055419921875,
      "epoch": 0.726649528706084,
      "grad_norm": 4.850281715393066,
      "kl": 0.9345703125,
      "learning_rate": 1.6186884885673413e-07,
      "loss": 0.2657,
      "reward": 0.31390602327883244,
      "reward_std": 0.5223901495337486,
      "rewards/cosine_scaled_reward": -0.1833247635513544,
      "rewards/format_reward": 0.6805555820465088,
      "step": 424
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2507.8334045410156,
      "epoch": 0.7283633247643531,
      "grad_norm": 3.5151827335357666,
      "kl": 1.26953125,
      "learning_rate": 1.6028856829700258e-07,
      "loss": 0.1979,
      "reward": 0.6333100497722626,
      "reward_std": 0.7416208535432816,
      "rewards/cosine_scaled_reward": 0.031932787562254816,
      "rewards/format_reward": 0.569444440305233,
      "step": 425
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1761.2083740234375,
      "epoch": 0.7300771208226221,
      "grad_norm": 3.2045891284942627,
      "kl": 1.076171875,
      "learning_rate": 1.5872728172265146e-07,
      "loss": 0.1263,
      "reward": 0.9971873387694359,
      "reward_std": 0.7048115953803062,
      "rewards/cosine_scaled_reward": 0.0749825444072485,
      "rewards/format_reward": 0.8472222238779068,
      "step": 426
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2577.9861450195312,
      "epoch": 0.7317909168808912,
      "grad_norm": 1.8627033233642578,
      "kl": 1.14453125,
      "learning_rate": 1.5718506522858572e-07,
      "loss": 0.2595,
      "reward": 0.2531158346682787,
      "reward_std": 0.6184235513210297,
      "rewards/cosine_scaled_reward": -0.10955319553613663,
      "rewards/format_reward": 0.4722222276031971,
      "step": 427
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1856.7222595214844,
      "epoch": 0.7335047129391602,
      "grad_norm": 4.033189296722412,
      "kl": 1.1201171875,
      "learning_rate": 1.5566199398026147e-07,
      "loss": 0.263,
      "reward": 0.7039023488759995,
      "reward_std": 0.8175256699323654,
      "rewards/cosine_scaled_reward": -0.009159944485872984,
      "rewards/format_reward": 0.7222222238779068,
      "step": 428
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2663.4861450195312,
      "epoch": 0.7352185089974294,
      "grad_norm": 3.801396369934082,
      "kl": 1.130859375,
      "learning_rate": 1.5415814221002265e-07,
      "loss": 0.15,
      "reward": 0.34896907582879066,
      "reward_std": 0.5518276765942574,
      "rewards/cosine_scaled_reward": -0.12412657774984837,
      "rewards/format_reward": 0.597222238779068,
      "step": 429
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2153.555618286133,
      "epoch": 0.7369323050556984,
      "grad_norm": 2.9870073795318604,
      "kl": 0.9130859375,
      "learning_rate": 1.5267358321348285e-07,
      "loss": 0.1497,
      "reward": 0.44994640722870827,
      "reward_std": 0.3946686089038849,
      "rewards/cosine_scaled_reward": -0.08752679079771042,
      "rewards/format_reward": 0.6250000074505806,
      "step": 430
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2261.75,
      "epoch": 0.7386461011139674,
      "grad_norm": 6.578658580780029,
      "kl": 1.78515625,
      "learning_rate": 1.5120838934595337e-07,
      "loss": 0.1464,
      "reward": 0.35643661208450794,
      "reward_std": 0.5088120512664318,
      "rewards/cosine_scaled_reward": -0.12039280403405428,
      "rewards/format_reward": 0.5972222238779068,
      "step": 431
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2386.652801513672,
      "epoch": 0.7403598971722365,
      "grad_norm": 4.385483741760254,
      "kl": 1.3447265625,
      "learning_rate": 1.4976263201891613e-07,
      "loss": 0.3579,
      "reward": 0.4007231565192342,
      "reward_std": 0.6231922283768654,
      "rewards/cosine_scaled_reward": -0.09824953693896532,
      "rewards/format_reward": 0.5972222238779068,
      "step": 432
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2001.3611297607422,
      "epoch": 0.7420736932305055,
      "grad_norm": 4.371149063110352,
      "kl": 1.0439453125,
      "learning_rate": 1.483363816965435e-07,
      "loss": 0.0871,
      "reward": 0.6510265804827213,
      "reward_std": 0.4398561269044876,
      "rewards/cosine_scaled_reward": -0.04254225082695484,
      "rewards/format_reward": 0.736111119389534,
      "step": 433
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1622.6666870117188,
      "epoch": 0.7437874892887746,
      "grad_norm": 6.787911891937256,
      "kl": 1.326171875,
      "learning_rate": 1.469297078922642e-07,
      "loss": 0.3842,
      "reward": 0.46582701057195663,
      "reward_std": 0.5145231448113918,
      "rewards/cosine_scaled_reward": -0.14208650775253773,
      "rewards/format_reward": 0.7500000149011612,
      "step": 434
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2658.7500610351562,
      "epoch": 0.7455012853470437,
      "grad_norm": 2.6709697246551514,
      "kl": 1.232421875,
      "learning_rate": 1.4554267916537495e-07,
      "loss": 0.0821,
      "reward": 0.31399114802479744,
      "reward_std": 0.5854284539818764,
      "rewards/cosine_scaled_reward": -0.10689331218600273,
      "rewards/format_reward": 0.5277777835726738,
      "step": 435
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1860.9583435058594,
      "epoch": 0.7472150814053128,
      "grad_norm": 3.8863277435302734,
      "kl": 0.91796875,
      "learning_rate": 1.4417536311769885e-07,
      "loss": 0.2564,
      "reward": 0.5377090591937304,
      "reward_std": 0.5195211619138718,
      "rewards/cosine_scaled_reward": -0.0853121317923069,
      "rewards/format_reward": 0.7083333283662796,
      "step": 436
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2466.2361755371094,
      "epoch": 0.7489288774635818,
      "grad_norm": 2.9903695583343506,
      "kl": 1.16015625,
      "learning_rate": 1.4282782639029128e-07,
      "loss": 0.1351,
      "reward": 0.4385749250650406,
      "reward_std": 0.6242729276418686,
      "rewards/cosine_scaled_reward": -0.10015699185896665,
      "rewards/format_reward": 0.6388889029622078,
      "step": 437
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2362.8611450195312,
      "epoch": 0.7506426735218509,
      "grad_norm": 1.599947214126587,
      "kl": 1.189453125,
      "learning_rate": 1.4150013466019114e-07,
      "loss": 0.1863,
      "reward": 0.6738657765090466,
      "reward_std": 0.6156510934233665,
      "rewards/cosine_scaled_reward": 0.03137733961921185,
      "rewards/format_reward": 0.611111119389534,
      "step": 438
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2748.2916870117188,
      "epoch": 0.7523564695801199,
      "grad_norm": 2.867025136947632,
      "kl": 1.033203125,
      "learning_rate": 1.4019235263722034e-07,
      "loss": 0.1123,
      "reward": 0.22596902353689075,
      "reward_std": 0.5135553628206253,
      "rewards/cosine_scaled_reward": -0.13701549544930458,
      "rewards/format_reward": 0.5000000074505806,
      "step": 439
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2215.277801513672,
      "epoch": 0.7540702656383891,
      "grad_norm": 5.796390533447266,
      "kl": 1.396484375,
      "learning_rate": 1.3890454406082956e-07,
      "loss": 0.1245,
      "reward": 0.6629978334531188,
      "reward_std": 0.5948286652565002,
      "rewards/cosine_scaled_reward": 0.0051100607961416245,
      "rewards/format_reward": 0.6527777910232544,
      "step": 440
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2402.7222595214844,
      "epoch": 0.7557840616966581,
      "grad_norm": 5.96156644821167,
      "kl": 1.34375,
      "learning_rate": 1.3763677169699217e-07,
      "loss": 0.0634,
      "reward": 0.5063027180731297,
      "reward_std": 0.6581330522894859,
      "rewards/cosine_scaled_reward": -0.08018200099468231,
      "rewards/format_reward": 0.6666666716337204,
      "step": 441
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2082.263916015625,
      "epoch": 0.7574978577549272,
      "grad_norm": 3.405839443206787,
      "kl": 1.6640625,
      "learning_rate": 1.3638909733514452e-07,
      "loss": 0.2784,
      "reward": 0.5193299576640129,
      "reward_std": 0.5714153945446014,
      "rewards/cosine_scaled_reward": -0.05977945402264595,
      "rewards/format_reward": 0.6388888880610466,
      "step": 442
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1665.7638854980469,
      "epoch": 0.7592116538131962,
      "grad_norm": 5.792540550231934,
      "kl": 1.0849609375,
      "learning_rate": 1.351615817851748e-07,
      "loss": 0.1343,
      "reward": 0.6929136589169502,
      "reward_std": 0.636933371424675,
      "rewards/cosine_scaled_reward": -0.056320954114198685,
      "rewards/format_reward": 0.8055555671453476,
      "step": 443
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2291.27783203125,
      "epoch": 0.7609254498714653,
      "grad_norm": 5.360567569732666,
      "kl": 1.2060546875,
      "learning_rate": 1.3395428487445914e-07,
      "loss": 0.18,
      "reward": 0.36108400439843535,
      "reward_std": 0.5261719971895218,
      "rewards/cosine_scaled_reward": -0.15973576810210943,
      "rewards/format_reward": 0.6805555447936058,
      "step": 444
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2755.4583740234375,
      "epoch": 0.7626392459297343,
      "grad_norm": 2.5989925861358643,
      "kl": 0.875,
      "learning_rate": 1.3276726544494571e-07,
      "loss": 0.1482,
      "reward": 0.14870610460639,
      "reward_std": 0.567838903516531,
      "rewards/cosine_scaled_reward": -0.18259140476584435,
      "rewards/format_reward": 0.5138888880610466,
      "step": 445
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2247.9583435058594,
      "epoch": 0.7643530419880035,
      "grad_norm": 5.224709987640381,
      "kl": 1.005859375,
      "learning_rate": 1.316005813502869e-07,
      "loss": 0.321,
      "reward": 0.5112787692341954,
      "reward_std": 0.6983462646603584,
      "rewards/cosine_scaled_reward": -0.049916195683181286,
      "rewards/format_reward": 0.6111111119389534,
      "step": 446
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2349.3055725097656,
      "epoch": 0.7660668380462725,
      "grad_norm": 3.252889633178711,
      "kl": 1.216796875,
      "learning_rate": 1.3045428945301953e-07,
      "loss": 0.1445,
      "reward": 0.32806872576475143,
      "reward_std": 0.7308538854122162,
      "rewards/cosine_scaled_reward": -0.1276322863996029,
      "rewards/format_reward": 0.5833333358168602,
      "step": 447
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2661.5555725097656,
      "epoch": 0.7677806341045416,
      "grad_norm": 2.3366446495056152,
      "kl": 0.8486328125,
      "learning_rate": 1.2932844562179352e-07,
      "loss": 0.0872,
      "reward": 0.36861317604780197,
      "reward_std": 0.6462560296058655,
      "rewards/cosine_scaled_reward": -0.051804508082568645,
      "rewards/format_reward": 0.47222223225980997,
      "step": 448
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2592.4444580078125,
      "epoch": 0.7694944301628106,
      "grad_norm": 4.123133182525635,
      "kl": 1.052734375,
      "learning_rate": 1.2822310472864885e-07,
      "loss": 0.2043,
      "reward": 0.2611931987339631,
      "reward_std": 0.7008328437805176,
      "rewards/cosine_scaled_reward": -0.147181186825037,
      "rewards/format_reward": 0.5555555522441864,
      "step": 449
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1906.3333740234375,
      "epoch": 0.7712082262210797,
      "grad_norm": 3.090589761734009,
      "kl": 0.865234375,
      "learning_rate": 1.2713832064634125e-07,
      "loss": 0.1642,
      "reward": 0.5996736511588097,
      "reward_std": 0.5084411576390266,
      "rewards/cosine_scaled_reward": -0.06821873132139444,
      "rewards/format_reward": 0.736111119389534,
      "step": 450
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2815.4166870117188,
      "epoch": 0.7729220222793488,
      "grad_norm": 2.466654062271118,
      "kl": 0.7978515625,
      "learning_rate": 1.260741462457165e-07,
      "loss": 0.1236,
      "reward": 0.07433861424215138,
      "reward_std": 0.5574841573834419,
      "rewards/cosine_scaled_reward": -0.21977514401078224,
      "rewards/format_reward": 0.5138888917863369,
      "step": 451
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2436.8611450195312,
      "epoch": 0.7746358183376179,
      "grad_norm": 2.854764699935913,
      "kl": 0.8505859375,
      "learning_rate": 1.2503063339313356e-07,
      "loss": 0.1263,
      "reward": 0.462260864675045,
      "reward_std": 0.7514103129506111,
      "rewards/cosine_scaled_reward": -0.09525846503674984,
      "rewards/format_reward": 0.6527777761220932,
      "step": 452
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2337.7361755371094,
      "epoch": 0.7763496143958869,
      "grad_norm": 3.1975936889648438,
      "kl": 0.97265625,
      "learning_rate": 1.2400783294793668e-07,
      "loss": 0.1955,
      "reward": 0.39777151867747307,
      "reward_std": 0.6588628813624382,
      "rewards/cosine_scaled_reward": -0.11361423693597317,
      "rewards/format_reward": 0.6250000074505806,
      "step": 453
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2078.0972595214844,
      "epoch": 0.778063410454156,
      "grad_norm": 1.6080825328826904,
      "kl": 1.15625,
      "learning_rate": 1.2300579475997657e-07,
      "loss": 0.2023,
      "reward": 0.7216087523847818,
      "reward_std": 0.7977120280265808,
      "rewards/cosine_scaled_reward": 0.006637714395765215,
      "rewards/format_reward": 0.7083333283662796,
      "step": 454
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2211.7916259765625,
      "epoch": 0.779777206512425,
      "grad_norm": 3.7410457134246826,
      "kl": 1.00390625,
      "learning_rate": 1.220245676671809e-07,
      "loss": 0.2082,
      "reward": 0.4414830207824707,
      "reward_std": 0.6565307825803757,
      "rewards/cosine_scaled_reward": -0.10564738605171442,
      "rewards/format_reward": 0.6527777761220932,
      "step": 455
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2365.52783203125,
      "epoch": 0.781491002570694,
      "grad_norm": 6.645061492919922,
      "kl": 0.64013671875,
      "learning_rate": 1.2106419949317388e-07,
      "loss": 0.2263,
      "reward": 0.3324281768873334,
      "reward_std": 0.5864489898085594,
      "rewards/cosine_scaled_reward": -0.1046192436479032,
      "rewards/format_reward": 0.5416666641831398,
      "step": 456
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2122.5833435058594,
      "epoch": 0.7832047986289632,
      "grad_norm": 2.3880536556243896,
      "kl": 1.04736328125,
      "learning_rate": 1.2012473704494537e-07,
      "loss": 0.1423,
      "reward": 0.8056632168591022,
      "reward_std": 0.6164202988147736,
      "rewards/cosine_scaled_reward": 0.020887171383947134,
      "rewards/format_reward": 0.7638888955116272,
      "step": 457
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1959.8056030273438,
      "epoch": 0.7849185946872322,
      "grad_norm": 4.885958671569824,
      "kl": 0.94287109375,
      "learning_rate": 1.1920622611056974e-07,
      "loss": 0.2544,
      "reward": 0.4146232455968857,
      "reward_std": 0.5990116819739342,
      "rewards/cosine_scaled_reward": -0.12602169532328844,
      "rewards/format_reward": 0.6666666865348816,
      "step": 458
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2254.0694274902344,
      "epoch": 0.7866323907455013,
      "grad_norm": 2.2345101833343506,
      "kl": 0.53564453125,
      "learning_rate": 1.1830871145697412e-07,
      "loss": 0.0455,
      "reward": 0.4500209465622902,
      "reward_std": 0.5013090819120407,
      "rewards/cosine_scaled_reward": -0.10137841757386923,
      "rewards/format_reward": 0.652777798473835,
      "step": 459
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2215.1666564941406,
      "epoch": 0.7883461868037703,
      "grad_norm": 4.272637367248535,
      "kl": 0.92236328125,
      "learning_rate": 1.1743223682775649e-07,
      "loss": 0.0789,
      "reward": 0.44543247297406197,
      "reward_std": 0.5984909385442734,
      "rewards/cosine_scaled_reward": -0.10367265064269304,
      "rewards/format_reward": 0.6527777910232544,
      "step": 460
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1930.5000305175781,
      "epoch": 0.7900599828620394,
      "grad_norm": 2.8687753677368164,
      "kl": 1.2333984375,
      "learning_rate": 1.1657684494105386e-07,
      "loss": 0.1984,
      "reward": 0.4672253951430321,
      "reward_std": 0.6156143024563789,
      "rewards/cosine_scaled_reward": -0.13444286305457354,
      "rewards/format_reward": 0.736111119389534,
      "step": 461
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1981.9583740234375,
      "epoch": 0.7917737789203085,
      "grad_norm": 3.0882349014282227,
      "kl": 0.7373046875,
      "learning_rate": 1.1574257748745986e-07,
      "loss": 0.0448,
      "reward": 0.5570826064795256,
      "reward_std": 0.6341868117451668,
      "rewards/cosine_scaled_reward": -0.08951424108818173,
      "rewards/format_reward": 0.7361111119389534,
      "step": 462
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1936.4306030273438,
      "epoch": 0.7934875749785776,
      "grad_norm": 2.4561548233032227,
      "kl": 0.7119140625,
      "learning_rate": 1.1492947512799328e-07,
      "loss": 0.2065,
      "reward": 0.5815738141536713,
      "reward_std": 0.7455588281154633,
      "rewards/cosine_scaled_reward": -0.08421308733522892,
      "rewards/format_reward": 0.7500000149011612,
      "step": 463
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2648.277801513672,
      "epoch": 0.7952013710368466,
      "grad_norm": 1.9183648824691772,
      "kl": 1.189453125,
      "learning_rate": 1.1413757749211602e-07,
      "loss": 0.1831,
      "reward": 0.4944647327065468,
      "reward_std": 0.5960628166794777,
      "rewards/cosine_scaled_reward": -0.04443428758531809,
      "rewards/format_reward": 0.5833333358168602,
      "step": 464
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3089.4722290039062,
      "epoch": 0.7969151670951157,
      "grad_norm": 1.3800582885742188,
      "kl": 0.88671875,
      "learning_rate": 1.1336692317580158e-07,
      "loss": 0.1132,
      "reward": 0.17487204633653164,
      "reward_std": 0.6750592887401581,
      "rewards/cosine_scaled_reward": -0.1278417520225048,
      "rewards/format_reward": 0.430555559694767,
      "step": 465
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2615.013916015625,
      "epoch": 0.7986289631533847,
      "grad_norm": 2.8072264194488525,
      "kl": 0.93115234375,
      "learning_rate": 1.1261754973965422e-07,
      "loss": 0.15,
      "reward": 0.17807744164019823,
      "reward_std": 0.6022924780845642,
      "rewards/cosine_scaled_reward": -0.11929461418185383,
      "rewards/format_reward": 0.4166666679084301,
      "step": 466
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2457.3750610351562,
      "epoch": 0.8003427592116538,
      "grad_norm": 4.940661430358887,
      "kl": 0.7890625,
      "learning_rate": 1.1188949370707787e-07,
      "loss": 0.1464,
      "reward": 0.24202457256615162,
      "reward_std": 0.42437436431646347,
      "rewards/cosine_scaled_reward": -0.19148772559128702,
      "rewards/format_reward": 0.6249999925494194,
      "step": 467
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2192.986114501953,
      "epoch": 0.8020565552699229,
      "grad_norm": 3.136319637298584,
      "kl": 0.767578125,
      "learning_rate": 1.1118279056249653e-07,
      "loss": 0.1721,
      "reward": 0.6284131053835154,
      "reward_std": 0.5748142190277576,
      "rewards/cosine_scaled_reward": -0.012182342819869518,
      "rewards/format_reward": 0.6527777761220932,
      "step": 468
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2432.8333435058594,
      "epoch": 0.803770351328192,
      "grad_norm": 1.9713729619979858,
      "kl": 0.8603515625,
      "learning_rate": 1.1049747474962444e-07,
      "loss": 0.1523,
      "reward": 0.5457211770117283,
      "reward_std": 0.729132629930973,
      "rewards/cosine_scaled_reward": -0.0535283163189888,
      "rewards/format_reward": 0.6527777910232544,
      "step": 469
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2276.4305725097656,
      "epoch": 0.805484147386461,
      "grad_norm": 3.8467977046966553,
      "kl": 1.216796875,
      "learning_rate": 1.0983357966978745e-07,
      "loss": 0.1232,
      "reward": 0.5122000686824322,
      "reward_std": 0.7733886539936066,
      "rewards/cosine_scaled_reward": -0.09112219791859388,
      "rewards/format_reward": 0.6944444552063942,
      "step": 470
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2643.7083129882812,
      "epoch": 0.8071979434447301,
      "grad_norm": 1.1509345769882202,
      "kl": 0.7216796875,
      "learning_rate": 1.0919113768029517e-07,
      "loss": 0.1381,
      "reward": 0.10054661217145622,
      "reward_std": 0.6373118087649345,
      "rewards/cosine_scaled_reward": -0.16500448435544968,
      "rewards/format_reward": 0.4305555745959282,
      "step": 471
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2733.8056030273438,
      "epoch": 0.8089117395029991,
      "grad_norm": 1.7471221685409546,
      "kl": 0.60986328125,
      "learning_rate": 1.0857018009286381e-07,
      "loss": 0.1037,
      "reward": 0.3268199451267719,
      "reward_std": 0.7872605472803116,
      "rewards/cosine_scaled_reward": -0.06575669860467315,
      "rewards/format_reward": 0.4583333507180214,
      "step": 472
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2760.416717529297,
      "epoch": 0.8106255355612683,
      "grad_norm": 2.182706832885742,
      "kl": 0.7177734375,
      "learning_rate": 1.0797073717209013e-07,
      "loss": 0.103,
      "reward": 0.3022213885560632,
      "reward_std": 0.5640696436166763,
      "rewards/cosine_scaled_reward": -0.15444485377520323,
      "rewards/format_reward": 0.611111119389534,
      "step": 473
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2262.4166564941406,
      "epoch": 0.8123393316195373,
      "grad_norm": 2.2662978172302246,
      "kl": 1.310546875,
      "learning_rate": 1.0739283813397639e-07,
      "loss": 0.2095,
      "reward": 0.817143252119422,
      "reward_std": 0.5297734513878822,
      "rewards/cosine_scaled_reward": 0.0960716437548399,
      "rewards/format_reward": 0.6250000149011612,
      "step": 474
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2711.9722595214844,
      "epoch": 0.8140531276778064,
      "grad_norm": 5.152209758758545,
      "kl": 0.900390625,
      "learning_rate": 1.068365111445064e-07,
      "loss": 0.0199,
      "reward": 0.1260463148355484,
      "reward_std": 0.5338724106550217,
      "rewards/cosine_scaled_reward": -0.1939212940633297,
      "rewards/format_reward": 0.5138888917863369,
      "step": 475
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2675.7361755371094,
      "epoch": 0.8157669237360754,
      "grad_norm": 2.230329990386963,
      "kl": 0.79931640625,
      "learning_rate": 1.063017833182728e-07,
      "loss": 0.1478,
      "reward": 0.14864197466522455,
      "reward_std": 0.6397556141018867,
      "rewards/cosine_scaled_reward": -0.1756790205836296,
      "rewards/format_reward": 0.5000000037252903,
      "step": 476
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2487.75,
      "epoch": 0.8174807197943444,
      "grad_norm": 4.63166618347168,
      "kl": 0.8720703125,
      "learning_rate": 1.0578868071715544e-07,
      "loss": 0.3369,
      "reward": 0.37176867201924324,
      "reward_std": 0.6089313849806786,
      "rewards/cosine_scaled_reward": -0.07106010848656297,
      "rewards/format_reward": 0.5138888992369175,
      "step": 477
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2181.4722595214844,
      "epoch": 0.8191945158526135,
      "grad_norm": 3.272205114364624,
      "kl": 1.212890625,
      "learning_rate": 1.0529722834905125e-07,
      "loss": 0.1721,
      "reward": 0.2916110037913313,
      "reward_std": 0.49708379805088043,
      "rewards/cosine_scaled_reward": -0.11113895289599895,
      "rewards/format_reward": 0.5138889029622078,
      "step": 478
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2471.8194732666016,
      "epoch": 0.8209083119108826,
      "grad_norm": 3.132082462310791,
      "kl": 1.228515625,
      "learning_rate": 1.0482745016665526e-07,
      "loss": 0.208,
      "reward": 0.7535388497635722,
      "reward_std": 0.695548452436924,
      "rewards/cosine_scaled_reward": 0.06426943093538284,
      "rewards/format_reward": 0.6250000074505806,
      "step": 479
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2416.0693969726562,
      "epoch": 0.8226221079691517,
      "grad_norm": 3.6008918285369873,
      "kl": 0.6015625,
      "learning_rate": 1.0437936906629334e-07,
      "loss": 0.1762,
      "reward": 0.3882830161601305,
      "reward_std": 0.6291572600603104,
      "rewards/cosine_scaled_reward": -0.07669184263795614,
      "rewards/format_reward": 0.5416666641831398,
      "step": 480
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1688.6388854980469,
      "epoch": 0.8243359040274207,
      "grad_norm": 3.3292489051818848,
      "kl": 0.94482421875,
      "learning_rate": 1.0395300688680625e-07,
      "loss": 0.1756,
      "reward": 0.6976406946778297,
      "reward_std": 0.7118247449398041,
      "rewards/cosine_scaled_reward": -0.040068539790809155,
      "rewards/format_reward": 0.7777777835726738,
      "step": 481
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2275.9723205566406,
      "epoch": 0.8260497000856898,
      "grad_norm": 5.62246036529541,
      "kl": 0.806640625,
      "learning_rate": 1.0354838440848501e-07,
      "loss": 0.3047,
      "reward": 0.3320089429616928,
      "reward_std": 0.5019624754786491,
      "rewards/cosine_scaled_reward": -0.13955109613016248,
      "rewards/format_reward": 0.6111111268401146,
      "step": 482
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2215.8055725097656,
      "epoch": 0.8277634961439588,
      "grad_norm": 1.8363783359527588,
      "kl": 0.98046875,
      "learning_rate": 1.0316552135205837e-07,
      "loss": 0.205,
      "reward": 0.4018698123982176,
      "reward_std": 0.5796016827225685,
      "rewards/cosine_scaled_reward": -0.13239844236522913,
      "rewards/format_reward": 0.6666666716337204,
      "step": 483
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1844.236099243164,
      "epoch": 0.829477292202228,
      "grad_norm": 4.732890605926514,
      "kl": 0.64990234375,
      "learning_rate": 1.0280443637773163e-07,
      "loss": 0.229,
      "reward": 0.37843877635896206,
      "reward_std": 0.6878086104989052,
      "rewards/cosine_scaled_reward": -0.1441139355301857,
      "rewards/format_reward": 0.6666666716337204,
      "step": 484
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2126.527801513672,
      "epoch": 0.831191088260497,
      "grad_norm": 3.030064821243286,
      "kl": 1.2275390625,
      "learning_rate": 1.0246514708427701e-07,
      "loss": 0.2121,
      "reward": 0.5017230249941349,
      "reward_std": 0.8949761241674423,
      "rewards/cosine_scaled_reward": -0.0616384893655777,
      "rewards/format_reward": 0.625,
      "step": 485
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2167.52783203125,
      "epoch": 0.8329048843187661,
      "grad_norm": 2.106167793273926,
      "kl": 0.8662109375,
      "learning_rate": 1.0214767000817596e-07,
      "loss": 0.0938,
      "reward": 0.5535758845508099,
      "reward_std": 0.5298986956477165,
      "rewards/cosine_scaled_reward": -0.021823172457516193,
      "rewards/format_reward": 0.597222238779068,
      "step": 486
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2419.6111450195312,
      "epoch": 0.8346186803770351,
      "grad_norm": 2.8747453689575195,
      "kl": 0.6708984375,
      "learning_rate": 1.0185202062281336e-07,
      "loss": 0.1754,
      "reward": 0.2720159562304616,
      "reward_std": 0.545224204659462,
      "rewards/cosine_scaled_reward": -0.1487142387777567,
      "rewards/format_reward": 0.5694444552063942,
      "step": 487
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2196.486114501953,
      "epoch": 0.8363324764353042,
      "grad_norm": 2.825509786605835,
      "kl": 0.55517578125,
      "learning_rate": 1.0157821333772304e-07,
      "loss": 0.1679,
      "reward": 0.6998728811740875,
      "reward_std": 0.6955326199531555,
      "rewards/cosine_scaled_reward": 0.030491996556520462,
      "rewards/format_reward": 0.6388888955116272,
      "step": 488
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2617.722198486328,
      "epoch": 0.8380462724935732,
      "grad_norm": 1.9763245582580566,
      "kl": 0.8466796875,
      "learning_rate": 1.013262614978859e-07,
      "loss": 0.1616,
      "reward": 0.28653959557414055,
      "reward_std": 0.6969783715903759,
      "rewards/cosine_scaled_reward": -0.12061909190379083,
      "rewards/format_reward": 0.5277777835726738,
      "step": 489
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2630.0972290039062,
      "epoch": 0.8397600685518424,
      "grad_norm": 1.7776055335998535,
      "kl": 0.73388671875,
      "learning_rate": 1.0109617738307911e-07,
      "loss": 0.1053,
      "reward": 0.3464082106947899,
      "reward_std": 0.7413296326994896,
      "rewards/cosine_scaled_reward": -0.08374034571170341,
      "rewards/format_reward": 0.5138888880610466,
      "step": 490
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2610.1666870117188,
      "epoch": 0.8414738646101114,
      "grad_norm": 2.362657308578491,
      "kl": 1.03369140625,
      "learning_rate": 1.0088797220727779e-07,
      "loss": 0.1536,
      "reward": 0.054581154661718756,
      "reward_std": 0.5118880867958069,
      "rewards/cosine_scaled_reward": -0.27132053300738335,
      "rewards/format_reward": 0.5972222238779068,
      "step": 491
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2591.611114501953,
      "epoch": 0.8431876606683805,
      "grad_norm": 1.4310436248779297,
      "kl": 0.748046875,
      "learning_rate": 1.0070165611810855e-07,
      "loss": 0.1227,
      "reward": 0.2780334800481796,
      "reward_std": 0.5931698530912399,
      "rewards/cosine_scaled_reward": -0.16653881408274174,
      "rewards/format_reward": 0.611111119389534,
      "step": 492
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1928.0278015136719,
      "epoch": 0.8449014567266495,
      "grad_norm": 4.139144420623779,
      "kl": 1.39892578125,
      "learning_rate": 1.005372381963547e-07,
      "loss": 0.2949,
      "reward": 0.36576576717197895,
      "reward_std": 0.4379217103123665,
      "rewards/cosine_scaled_reward": -0.15045045968145132,
      "rewards/format_reward": 0.6666666716337204,
      "step": 493
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2274.680633544922,
      "epoch": 0.8466152527849186,
      "grad_norm": 1.5368496179580688,
      "kl": 0.958984375,
      "learning_rate": 1.0039472645551372e-07,
      "loss": 0.1968,
      "reward": 0.3456185795366764,
      "reward_std": 0.5900547206401825,
      "rewards/cosine_scaled_reward": -0.11885737907141447,
      "rewards/format_reward": 0.5833333358168602,
      "step": 494
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1835.8333740234375,
      "epoch": 0.8483290488431876,
      "grad_norm": 4.2471394538879395,
      "kl": 1.61279296875,
      "learning_rate": 1.002741278414069e-07,
      "loss": 0.1987,
      "reward": 0.9312632232904434,
      "reward_std": 0.586229220032692,
      "rewards/cosine_scaled_reward": 0.06979827064787969,
      "rewards/format_reward": 0.7916666567325592,
      "step": 495
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2454.902801513672,
      "epoch": 0.8500428449014568,
      "grad_norm": 2.069298505783081,
      "kl": 0.9306640625,
      "learning_rate": 1.0017544823184055e-07,
      "loss": 0.176,
      "reward": 0.6016820748336613,
      "reward_std": 0.7270394861698151,
      "rewards/cosine_scaled_reward": 0.0161188212223351,
      "rewards/format_reward": 0.5694444477558136,
      "step": 496
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2282.250030517578,
      "epoch": 0.8517566409597258,
      "grad_norm": 2.224278688430786,
      "kl": 0.7265625,
      "learning_rate": 1.0009869243631952e-07,
      "loss": 0.1715,
      "reward": 0.43558146245777607,
      "reward_std": 0.6017558500170708,
      "rewards/cosine_scaled_reward": -0.07387594413012266,
      "rewards/format_reward": 0.5833333432674408,
      "step": 497
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2533.7500610351562,
      "epoch": 0.8534704370179949,
      "grad_norm": 5.092855930328369,
      "kl": 1.08935546875,
      "learning_rate": 1.000438641958131e-07,
      "loss": 0.1009,
      "reward": 0.1837000446394086,
      "reward_std": 0.7107623964548111,
      "rewards/cosine_scaled_reward": -0.19287220388650894,
      "rewards/format_reward": 0.569444440305233,
      "step": 498
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2394.777801513672,
      "epoch": 0.8551842330762639,
      "grad_norm": 2.348245620727539,
      "kl": 0.81884765625,
      "learning_rate": 1.0001096618257236e-07,
      "loss": 0.1019,
      "reward": 0.8653097227215767,
      "reward_std": 0.7131348252296448,
      "rewards/cosine_scaled_reward": 0.13404375594109297,
      "rewards/format_reward": 0.5972222238779068,
      "step": 499
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1616.4583282470703,
      "epoch": 0.856898029134533,
      "grad_norm": 2.849949598312378,
      "kl": 0.951171875,
      "learning_rate": 1e-07,
      "loss": 0.12,
      "reward": 1.1779827252030373,
      "reward_std": 0.6799286007881165,
      "rewards/cosine_scaled_reward": 0.20010241214185953,
      "rewards/format_reward": 0.7777777761220932,
      "step": 500
    },
    {
      "epoch": 0.856898029134533,
      "step": 500,
      "total_flos": 0.0,
      "train_loss": 0.12157149085606943,
      "train_runtime": 48026.4516,
      "train_samples_per_second": 0.75,
      "train_steps_per_second": 0.01
    }
  ],
  "logging_steps": 1,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 6,
  "trial_name": null,
  "trial_params": null
}