DAPO-No-DS-8B / trainer_state.json
kangdawei's picture
Model save
72e9612 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.22857142857142856,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_fraction": 0.0,
"completion_length": 2523.270866394043,
"epoch": 0.001142857142857143,
"grad_norm": 0.0744374468922615,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0007,
"reward": 0.17862090840935707,
"reward_std": 0.539480353705585,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/cosine_scaled_reward": -0.0713790925219655,
"step": 1
},
{
"clip_fraction": 0.0,
"completion_length": 2684.583366394043,
"epoch": 0.002285714285714286,
"grad_norm": 0.07803362607955933,
"kl": 0.0,
"learning_rate": 1e-07,
"loss": 0.0338,
"reward": 0.33918463438749313,
"reward_std": 0.4111455399543047,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/cosine_scaled_reward": 0.047517990693449974,
"step": 2
},
{
"clip_fraction": 0.0,
"completion_length": 2981.3541717529297,
"epoch": 0.0034285714285714284,
"grad_norm": 0.07460929453372955,
"kl": 5.2601099014282227e-05,
"learning_rate": 2e-07,
"loss": -0.0527,
"reward": 0.0897666085511446,
"reward_std": 0.43747894931584597,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.014400066807866096,
"step": 3
},
{
"clip_fraction": 0.0,
"completion_length": 1419.8333740234375,
"epoch": 0.004571428571428572,
"grad_norm": 0.11164188385009766,
"kl": 4.7594308853149414e-05,
"learning_rate": 3e-07,
"loss": -0.0508,
"reward": 0.09919259510934353,
"reward_std": 0.6438650283962488,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.06747407512739301,
"step": 4
},
{
"clip_fraction": 0.0,
"completion_length": 3174.0625228881836,
"epoch": 0.005714285714285714,
"grad_norm": 0.06975400447845459,
"kl": 5.8650970458984375e-05,
"learning_rate": 4e-07,
"loss": 0.0271,
"reward": -0.15678282314911485,
"reward_std": 0.33761502243578434,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.19844949309481308,
"step": 5
},
{
"clip_fraction": 0.0,
"completion_length": 2732.250045776367,
"epoch": 0.006857142857142857,
"grad_norm": 0.07184051722288132,
"kl": 4.495866596698761e-05,
"learning_rate": 5e-07,
"loss": -0.0141,
"reward": -0.13207554817199707,
"reward_std": 0.2536952579393983,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.1737422114238143,
"step": 6
},
{
"clip_fraction": 0.0,
"completion_length": 2609.5208740234375,
"epoch": 0.008,
"grad_norm": 0.06629231572151184,
"kl": 3.341585397720337e-05,
"learning_rate": 6e-07,
"loss": 0.0435,
"reward": 0.09007547050714493,
"reward_std": 0.3771515293046832,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/cosine_scaled_reward": -0.07659117877483368,
"step": 7
},
{
"clip_fraction": 0.0,
"completion_length": 2417.1458892822266,
"epoch": 0.009142857142857144,
"grad_norm": 0.07080549746751785,
"kl": 3.287196159362793e-05,
"learning_rate": 7e-07,
"loss": -0.0181,
"reward": 0.4757417570799589,
"reward_std": 0.6338865607976913,
"rewards/accuracy_reward": 0.3541666753590107,
"rewards/cosine_scaled_reward": 0.12157511284749489,
"step": 8
},
{
"clip_fraction": 0.0,
"completion_length": 2861.041702270508,
"epoch": 0.010285714285714285,
"grad_norm": 0.1069210022687912,
"kl": 4.7460198402404785e-05,
"learning_rate": 8e-07,
"loss": 0.0098,
"reward": -0.021934514865279198,
"reward_std": 0.4319152287207544,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.1261011796505045,
"step": 9
},
{
"clip_fraction": 0.0,
"completion_length": 2603.104179382324,
"epoch": 0.011428571428571429,
"grad_norm": 0.08398132026195526,
"kl": 5.1975250244140625e-05,
"learning_rate": 9e-07,
"loss": -0.0231,
"reward": 0.13638958521187305,
"reward_std": 0.5910755675286055,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/cosine_scaled_reward": -0.051110414788126945,
"step": 10
},
{
"clip_fraction": 0.0,
"completion_length": 3321.750030517578,
"epoch": 0.012571428571428572,
"grad_norm": 0.04589095339179039,
"kl": 4.659593105316162e-05,
"learning_rate": 1e-06,
"loss": 0.0195,
"reward": -0.061404408887028694,
"reward_std": 0.3960698740556836,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.16557107982225716,
"step": 11
},
{
"clip_fraction": 0.0,
"completion_length": 2037.8750457763672,
"epoch": 0.013714285714285714,
"grad_norm": 0.0902877002954483,
"kl": 4.0978193283081055e-05,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0965,
"reward": 0.18073058780282736,
"reward_std": 0.5450206631794572,
"rewards/accuracy_reward": 0.1875000074505806,
"rewards/cosine_scaled_reward": -0.006769413128495216,
"step": 12
},
{
"clip_fraction": 0.0,
"completion_length": 2972.312515258789,
"epoch": 0.014857142857142857,
"grad_norm": 0.05695594474673271,
"kl": 3.445148468017578e-05,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0496,
"reward": 0.18564651999622583,
"reward_std": 0.5214524045586586,
"rewards/accuracy_reward": 0.2291666753590107,
"rewards/cosine_scaled_reward": -0.04352014325559139,
"step": 13
},
{
"clip_fraction": 0.0,
"completion_length": 2374.9375228881836,
"epoch": 0.016,
"grad_norm": 0.0727272629737854,
"kl": 3.678351640701294e-05,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0125,
"reward": 0.08268034365028143,
"reward_std": 0.5252711391076446,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.08398633264005184,
"step": 14
},
{
"clip_fraction": 0.0,
"completion_length": 2724.062515258789,
"epoch": 0.017142857142857144,
"grad_norm": 0.07843177765607834,
"kl": 3.783963620662689e-05,
"learning_rate": 9.956206309337066e-07,
"loss": -0.0409,
"reward": 0.2568634729832411,
"reward_std": 0.3660598713904619,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/cosine_scaled_reward": 0.006863469257950783,
"step": 15
},
{
"clip_fraction": 0.0,
"completion_length": 3526.7083435058594,
"epoch": 0.018285714285714287,
"grad_norm": 0.05552014708518982,
"kl": 5.0380825996398926e-05,
"learning_rate": 9.931634888554935e-07,
"loss": -0.0203,
"reward": -0.260313069447875,
"reward_std": 0.15844993200153112,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.26031307131052017,
"step": 16
},
{
"clip_fraction": 0.0,
"completion_length": 2162.083366394043,
"epoch": 0.019428571428571427,
"grad_norm": 0.130440354347229,
"kl": 5.4463744163513184e-05,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0201,
"reward": 0.16820077877491713,
"reward_std": 0.6183132668957114,
"rewards/accuracy_reward": 0.2291666753590107,
"rewards/cosine_scaled_reward": -0.0609658882021904,
"step": 17
},
{
"clip_fraction": 0.0,
"completion_length": 2849.8750534057617,
"epoch": 0.02057142857142857,
"grad_norm": 0.05553295090794563,
"kl": 3.0259601771831512e-05,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0625,
"reward": 0.28691262751817703,
"reward_std": 0.5530234389007092,
"rewards/accuracy_reward": 0.2708333358168602,
"rewards/cosine_scaled_reward": 0.0160792883252725,
"step": 18
},
{
"clip_fraction": 0.0,
"completion_length": 2852.1250534057617,
"epoch": 0.021714285714285714,
"grad_norm": 0.069038525223732,
"kl": 3.6716461181640625e-05,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0466,
"reward": 0.41487319313455373,
"reward_std": 0.5478954035788774,
"rewards/accuracy_reward": 0.3541666753590107,
"rewards/cosine_scaled_reward": 0.060706520453095436,
"step": 19
},
{
"clip_fraction": 0.0,
"completion_length": 1917.3125457763672,
"epoch": 0.022857142857142857,
"grad_norm": 0.09311036020517349,
"kl": 3.442913293838501e-05,
"learning_rate": 9.779754323328192e-07,
"loss": 0.0638,
"reward": 0.34980504028499126,
"reward_std": 0.5843578286003321,
"rewards/accuracy_reward": 0.31250000186264515,
"rewards/cosine_scaled_reward": 0.037305014207959175,
"step": 20
},
{
"clip_fraction": 0.0,
"completion_length": 2692.729232788086,
"epoch": 0.024,
"grad_norm": 0.10610220581293106,
"kl": 4.506111145019531e-05,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0482,
"reward": 0.3050496280193329,
"reward_std": 0.5608191061764956,
"rewards/accuracy_reward": 0.2916666753590107,
"rewards/cosine_scaled_reward": 0.013382963836193085,
"step": 21
},
{
"clip_fraction": 0.0,
"completion_length": 1465.395866394043,
"epoch": 0.025142857142857144,
"grad_norm": 0.12254074215888977,
"kl": 3.9443373680114746e-05,
"learning_rate": 9.672327345550543e-07,
"loss": -0.0519,
"reward": 0.3392514977604151,
"reward_std": 0.30786877777427435,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/cosine_scaled_reward": 0.0059181563556194305,
"step": 22
},
{
"clip_fraction": 0.0,
"completion_length": 2394.937530517578,
"epoch": 0.026285714285714287,
"grad_norm": 0.10998130589723587,
"kl": 3.953278064727783e-05,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0996,
"reward": 0.14149342849850655,
"reward_std": 0.34754151944071054,
"rewards/accuracy_reward": 0.20833333395421505,
"rewards/cosine_scaled_reward": -0.0668399203568697,
"step": 23
},
{
"clip_fraction": 0.0,
"completion_length": 2533.875045776367,
"epoch": 0.027428571428571427,
"grad_norm": 0.08649002015590668,
"kl": 3.2901763916015625e-05,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0447,
"reward": 0.5146787296980619,
"reward_std": 0.5441905837506056,
"rewards/accuracy_reward": 0.39583334140479565,
"rewards/cosine_scaled_reward": 0.118845384567976,
"step": 24
},
{
"clip_fraction": 0.0,
"completion_length": 2268.791679382324,
"epoch": 0.02857142857142857,
"grad_norm": 0.08303514868021011,
"kl": 4.312890814617276e-05,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0186,
"reward": -0.08262800239026546,
"reward_std": 0.36995193734765053,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.1659613372758031,
"step": 25
},
{
"clip_fraction": 0.0,
"completion_length": 2869.7500228881836,
"epoch": 0.029714285714285714,
"grad_norm": 0.061481326818466187,
"kl": 5.170702934265137e-05,
"learning_rate": 9.397114317029974e-07,
"loss": -0.0101,
"reward": 0.006060175597667694,
"reward_std": 0.3566547529771924,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/cosine_scaled_reward": -0.11893982626497746,
"step": 26
},
{
"clip_fraction": 0.0,
"completion_length": 2821.687530517578,
"epoch": 0.030857142857142857,
"grad_norm": 0.08310137689113617,
"kl": 6.3285231590271e-05,
"learning_rate": 9.316216432703916e-07,
"loss": -0.004,
"reward": -0.13682544324547052,
"reward_std": 0.38389212638139725,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.19932544289622456,
"step": 27
},
{
"clip_fraction": 0.0,
"completion_length": 2828.333366394043,
"epoch": 0.032,
"grad_norm": 0.07233916968107224,
"kl": 4.972517490386963e-05,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0209,
"reward": 0.25604306906461716,
"reward_std": 0.5071917362511158,
"rewards/accuracy_reward": 0.27083333767950535,
"rewards/cosine_scaled_reward": -0.014790281420573592,
"step": 28
},
{
"clip_fraction": 0.0,
"completion_length": 3092.6458587646484,
"epoch": 0.03314285714285714,
"grad_norm": 0.07088705897331238,
"kl": 4.079937934875488e-05,
"learning_rate": 9.140576474687263e-07,
"loss": -0.0988,
"reward": -0.12730359099805355,
"reward_std": 0.25404997263103724,
"rewards/accuracy_reward": 0.0625,
"rewards/cosine_scaled_reward": -0.18980359099805355,
"step": 29
},
{
"clip_fraction": 0.0,
"completion_length": 2785.1458740234375,
"epoch": 0.03428571428571429,
"grad_norm": 0.0721890851855278,
"kl": 4.616379737854004e-05,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0538,
"reward": 0.26715128123760223,
"reward_std": 0.5637835282832384,
"rewards/accuracy_reward": 0.22916667349636555,
"rewards/cosine_scaled_reward": 0.0379846110008657,
"step": 30
},
{
"clip_fraction": 0.0,
"completion_length": 3028.0416679382324,
"epoch": 0.03542857142857143,
"grad_norm": 0.07398995757102966,
"kl": 4.813075065612793e-05,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0307,
"reward": -0.1556811612099409,
"reward_std": 0.3613749761134386,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.21818116027861834,
"step": 31
},
{
"clip_fraction": 0.0,
"completion_length": 2944.3958740234375,
"epoch": 0.036571428571428574,
"grad_norm": 0.06509231775999069,
"kl": 5.62518835067749e-05,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0698,
"reward": 0.305016117868945,
"reward_std": 0.3333305884152651,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/cosine_scaled_reward": 0.013349458575248718,
"step": 32
},
{
"clip_fraction": 0.0,
"completion_length": 3250.5208587646484,
"epoch": 0.037714285714285714,
"grad_norm": 0.061484675854444504,
"kl": 5.4955482482910156e-05,
"learning_rate": 8.737029101523929e-07,
"loss": 0.025,
"reward": 0.008718075230717659,
"reward_std": 0.5328585915267467,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.1371152652427554,
"step": 33
},
{
"clip_fraction": 0.0,
"completion_length": 2223.6666870117188,
"epoch": 0.038857142857142854,
"grad_norm": 0.08283665031194687,
"kl": 4.7266483306884766e-05,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0628,
"reward": 0.5019577480852604,
"reward_std": 0.6730294618755579,
"rewards/accuracy_reward": 0.3750000111758709,
"rewards/cosine_scaled_reward": 0.1269577438943088,
"step": 34
},
{
"clip_fraction": 0.0,
"completion_length": 3116.520896911621,
"epoch": 0.04,
"grad_norm": 0.10011623799800873,
"kl": 6.172060966491699e-05,
"learning_rate": 8.511087728614862e-07,
"loss": -0.0185,
"reward": -0.1340523180551827,
"reward_std": 0.38020466081798077,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.1965523138642311,
"step": 35
},
{
"clip_fraction": 0.0,
"completion_length": 3130.75,
"epoch": 0.04114285714285714,
"grad_norm": 0.06970471888780594,
"kl": 5.842745304107666e-05,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0422,
"reward": -0.17425783909857273,
"reward_std": 0.3371034972369671,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.21592450886964798,
"step": 36
},
{
"clip_fraction": 0.0,
"completion_length": 3406.125,
"epoch": 0.04228571428571429,
"grad_norm": 0.0568542517721653,
"kl": 4.285573959350586e-05,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0194,
"reward": -0.044786570593714714,
"reward_std": 0.24069493543356657,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.12811991199851036,
"step": 37
},
{
"clip_fraction": 0.0,
"completion_length": 3252.375015258789,
"epoch": 0.04342857142857143,
"grad_norm": 0.053038232028484344,
"kl": 5.0187110900878906e-05,
"learning_rate": 8.145033635316128e-07,
"loss": -0.0011,
"reward": 0.00859471783041954,
"reward_std": 0.3808100689202547,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.1164052952080965,
"step": 38
},
{
"clip_fraction": 0.0,
"completion_length": 2861.833335876465,
"epoch": 0.044571428571428574,
"grad_norm": 0.06544026732444763,
"kl": 3.574788570404053e-05,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0258,
"reward": 0.2788538106251508,
"reward_std": 0.21739591227378696,
"rewards/accuracy_reward": 0.27083333395421505,
"rewards/cosine_scaled_reward": 0.008020471781492233,
"step": 39
},
{
"clip_fraction": 0.0,
"completion_length": 2407.020896911621,
"epoch": 0.045714285714285714,
"grad_norm": 0.07022753357887268,
"kl": 3.733113408088684e-05,
"learning_rate": 7.884636689049422e-07,
"loss": -0.0064,
"reward": 0.11176938330754638,
"reward_std": 0.3469166085124016,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.05489729158580303,
"step": 40
},
{
"clip_fraction": 0.0,
"completion_length": 3272.0625915527344,
"epoch": 0.046857142857142854,
"grad_norm": 0.04923461005091667,
"kl": 4.369020462036133e-05,
"learning_rate": 7.75e-07,
"loss": -0.0115,
"reward": 0.19393274933099747,
"reward_std": 0.6539879608899355,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/cosine_scaled_reward": -0.014400593005120754,
"step": 41
},
{
"clip_fraction": 0.0,
"completion_length": 2870.9166774749756,
"epoch": 0.048,
"grad_norm": 0.10464286059141159,
"kl": 7.665157318115234e-05,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0068,
"reward": -0.23928624275140464,
"reward_std": 0.23300944967195392,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.26011957973241806,
"step": 42
},
{
"clip_fraction": 0.0,
"completion_length": 2943.6666717529297,
"epoch": 0.04914285714285714,
"grad_norm": 0.05960472300648689,
"kl": 4.680454730987549e-05,
"learning_rate": 7.472670160550848e-07,
"loss": -0.0069,
"reward": -0.09865465015172958,
"reward_std": 0.264129894785583,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.18198798224329948,
"step": 43
},
{
"clip_fraction": 0.0,
"completion_length": 2371.2291946411133,
"epoch": 0.05028571428571429,
"grad_norm": 0.0943843275308609,
"kl": 4.684180021286011e-05,
"learning_rate": 7.330314893841101e-07,
"loss": 0.012,
"reward": 0.23478438053280115,
"reward_std": 0.5152947697788477,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/cosine_scaled_reward": 0.02645104774273932,
"step": 44
},
{
"clip_fraction": 0.0,
"completion_length": 3218.7084045410156,
"epoch": 0.05142857142857143,
"grad_norm": 0.056803327053785324,
"kl": 4.836916923522949e-05,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0238,
"reward": 0.06656578462570906,
"reward_std": 0.46774430200457573,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.0792675418779254,
"step": 45
},
{
"clip_fraction": 0.0,
"completion_length": 3084.5,
"epoch": 0.052571428571428575,
"grad_norm": 0.06916145235300064,
"kl": 4.89652156829834e-05,
"learning_rate": 7.039090644965509e-07,
"loss": -0.0137,
"reward": -0.15745936054736376,
"reward_std": 0.30057619512081146,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.19912602473050356,
"step": 46
},
{
"clip_fraction": 0.0,
"completion_length": 2617.854202270508,
"epoch": 0.053714285714285714,
"grad_norm": 0.0891217514872551,
"kl": 3.8154423236846924e-05,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0131,
"reward": 0.15285246446728706,
"reward_std": 0.609043394215405,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/cosine_scaled_reward": -0.05548086855560541,
"step": 47
},
{
"clip_fraction": 0.0,
"completion_length": 2537.8542098999023,
"epoch": 0.054857142857142854,
"grad_norm": 0.0810699462890625,
"kl": 3.608688712120056e-05,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0076,
"reward": 0.22076446935534477,
"reward_std": 0.6259710285812616,
"rewards/accuracy_reward": 0.20833333395421505,
"rewards/cosine_scaled_reward": 0.012431120034307241,
"step": 48
},
{
"clip_fraction": 0.0,
"completion_length": 1916.0625305175781,
"epoch": 0.056,
"grad_norm": 0.11279745399951935,
"kl": 2.9848888516426086e-05,
"learning_rate": 6.588648530198504e-07,
"loss": -0.0081,
"reward": 0.31437641754746437,
"reward_std": 0.4195905257947743,
"rewards/accuracy_reward": 0.291666679084301,
"rewards/cosine_scaled_reward": 0.022709736600518227,
"step": 49
},
{
"clip_fraction": 0.0,
"completion_length": 3009.8333740234375,
"epoch": 0.05714285714285714,
"grad_norm": 0.06987818330526352,
"kl": 3.094226121902466e-05,
"learning_rate": 6.435602608679916e-07,
"loss": -0.0053,
"reward": 0.3973438460379839,
"reward_std": 0.30630784668028355,
"rewards/accuracy_reward": 0.3125,
"rewards/cosine_scaled_reward": 0.084843834862113,
"step": 50
},
{
"clip_fraction": 0.0,
"completion_length": 2353.6041870117188,
"epoch": 0.05828571428571429,
"grad_norm": 0.0980035811662674,
"kl": 4.871189594268799e-05,
"learning_rate": 6.281416799501187e-07,
"loss": -0.0059,
"reward": -0.023236393928527832,
"reward_std": 0.37473731534555554,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.14823639206588268,
"step": 51
},
{
"clip_fraction": 0.0,
"completion_length": 2840.583366394043,
"epoch": 0.05942857142857143,
"grad_norm": 0.0852833166718483,
"kl": 4.4442713260650635e-05,
"learning_rate": 6.126278954320294e-07,
"loss": -0.007,
"reward": 0.46025677397847176,
"reward_std": 0.6503263358026743,
"rewards/accuracy_reward": 0.3750000037252903,
"rewards/cosine_scaled_reward": 0.0852567870169878,
"step": 52
},
{
"clip_fraction": 0.0,
"completion_length": 2741.3958892822266,
"epoch": 0.060571428571428575,
"grad_norm": 0.08544100821018219,
"kl": 5.532801151275635e-05,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0061,
"reward": 0.1790762129239738,
"reward_std": 0.6837563626468182,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": -0.05009045993210748,
"step": 53
},
{
"clip_fraction": 0.0,
"completion_length": 2533.4792098999023,
"epoch": 0.061714285714285715,
"grad_norm": 0.08326774090528488,
"kl": 3.949552774429321e-05,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0983,
"reward": 0.6619744710624218,
"reward_std": 0.7842050231993198,
"rewards/accuracy_reward": 0.45833335258066654,
"rewards/cosine_scaled_reward": 0.2036411385051906,
"step": 54
},
{
"clip_fraction": 0.0,
"completion_length": 2908.062530517578,
"epoch": 0.06285714285714286,
"grad_norm": 0.057915616780519485,
"kl": 3.5546720027923584e-05,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0188,
"reward": 0.27089552767574787,
"reward_std": 0.5088877673260868,
"rewards/accuracy_reward": 0.27083333395421505,
"rewards/cosine_scaled_reward": 6.21667131781578e-05,
"step": 55
},
{
"clip_fraction": 0.0,
"completion_length": 3113.5625228881836,
"epoch": 0.064,
"grad_norm": 0.06461653858423233,
"kl": 3.863126039505005e-05,
"learning_rate": 5.5e-07,
"loss": -0.0314,
"reward": -0.09405255503952503,
"reward_std": 0.4130659643560648,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.17738589039072394,
"step": 56
},
{
"clip_fraction": 0.0,
"completion_length": 3084.500030517578,
"epoch": 0.06514285714285714,
"grad_norm": 0.052722394466400146,
"kl": 3.3408403396606445e-05,
"learning_rate": 5.342952264838747e-07,
"loss": -0.0609,
"reward": 0.0041184090077877045,
"reward_std": 0.5053744353353977,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.14171493193134665,
"step": 57
},
{
"clip_fraction": 0.0,
"completion_length": 2236.958381652832,
"epoch": 0.06628571428571428,
"grad_norm": 0.09903010725975037,
"kl": 2.588331699371338e-05,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0531,
"reward": 0.36401716619729996,
"reward_std": 0.4780782051384449,
"rewards/accuracy_reward": 0.3333333395421505,
"rewards/cosine_scaled_reward": 0.030683835968375206,
"step": 58
},
{
"clip_fraction": 0.0,
"completion_length": 2744.979179382324,
"epoch": 0.06742857142857143,
"grad_norm": 0.07801861315965652,
"kl": 3.4183263778686523e-05,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0011,
"reward": -0.1735086990520358,
"reward_std": 0.38626679591834545,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.2568420314928517,
"step": 59
},
{
"clip_fraction": 0.0,
"completion_length": 2849.458366394043,
"epoch": 0.06857142857142857,
"grad_norm": 0.06434936821460724,
"kl": 3.138929605484009e-05,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0188,
"reward": -0.17126354575157166,
"reward_std": 0.2597244749777019,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.21293021738529205,
"step": 60
},
{
"clip_fraction": 0.0,
"completion_length": 2868.1042404174805,
"epoch": 0.06971428571428571,
"grad_norm": 0.058776307851076126,
"kl": 2.8677284717559814e-05,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0473,
"reward": 0.13987798150628805,
"reward_std": 0.4987390795722604,
"rewards/accuracy_reward": 0.1875000037252903,
"rewards/cosine_scaled_reward": -0.04762201849371195,
"step": 61
},
{
"clip_fraction": 0.0,
"completion_length": 2286.62504196167,
"epoch": 0.07085714285714285,
"grad_norm": 0.07639817148447037,
"kl": 1.909933052957058e-05,
"learning_rate": 4.5643973913200837e-07,
"loss": -0.0389,
"reward": 0.2486275490373373,
"reward_std": 0.3404446765780449,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/cosine_scaled_reward": -0.0013724502641707659,
"step": 62
},
{
"clip_fraction": 0.0,
"completion_length": 1874.6875457763672,
"epoch": 0.072,
"grad_norm": 0.08694759756326675,
"kl": 3.208965063095093e-05,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0497,
"reward": 0.283432574942708,
"reward_std": 0.6469337809830904,
"rewards/accuracy_reward": 0.27083333767950535,
"rewards/cosine_scaled_reward": 0.012599228648468852,
"step": 63
},
{
"clip_fraction": 0.0,
"completion_length": 2791.3750228881836,
"epoch": 0.07314285714285715,
"grad_norm": 0.058527979999780655,
"kl": 4.375725984573364e-05,
"learning_rate": 4.2596318988235037e-07,
"loss": -0.0143,
"reward": 0.11854150518774986,
"reward_std": 0.557681780308485,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/cosine_scaled_reward": -0.06895849623833783,
"step": 64
},
{
"clip_fraction": 0.0,
"completion_length": 2645.5625228881836,
"epoch": 0.07428571428571429,
"grad_norm": 0.07280135899782181,
"kl": 2.8699636459350586e-05,
"learning_rate": 4.1094235253127374e-07,
"loss": -0.0051,
"reward": 0.27093657973455265,
"reward_std": 0.2569840718060732,
"rewards/accuracy_reward": 0.27083333395421505,
"rewards/cosine_scaled_reward": 0.00010325387120246887,
"step": 65
},
{
"clip_fraction": 0.0,
"completion_length": 2068.145839691162,
"epoch": 0.07542857142857143,
"grad_norm": 0.13709966838359833,
"kl": 2.857297658920288e-05,
"learning_rate": 3.9609093550344907e-07,
"loss": -0.0312,
"reward": 0.37133984826505184,
"reward_std": 0.4443043200299144,
"rewards/accuracy_reward": 0.3541666753590107,
"rewards/cosine_scaled_reward": 0.01717321015894413,
"step": 66
},
{
"clip_fraction": 0.0,
"completion_length": 3338.5416870117188,
"epoch": 0.07657142857142857,
"grad_norm": 0.04955720156431198,
"kl": 2.9280781745910645e-05,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0486,
"reward": -0.2874361127614975,
"reward_std": 0.17634079419076443,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.28743611462414265,
"step": 67
},
{
"clip_fraction": 0.0,
"completion_length": 1655.2916793823242,
"epoch": 0.07771428571428571,
"grad_norm": 0.10328181087970734,
"kl": 3.488361835479736e-05,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0563,
"reward": 0.04894767206860706,
"reward_std": 0.46508038230240345,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/cosine_scaled_reward": -0.11771899089217186,
"step": 68
},
{
"clip_fraction": 0.0,
"completion_length": 2115.520866394043,
"epoch": 0.07885714285714286,
"grad_norm": 0.08769083023071289,
"kl": 5.054473876953125e-05,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0664,
"reward": -0.21063880110159516,
"reward_std": 0.22452263766899705,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.23147212620824575,
"step": 69
},
{
"clip_fraction": 0.0,
"completion_length": 2957.250045776367,
"epoch": 0.08,
"grad_norm": 0.07507078349590302,
"kl": 0.00010189414024353027,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0422,
"reward": 0.047607121989130974,
"reward_std": 0.4603970441967249,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.09822620265185833,
"step": 70
},
{
"clip_fraction": 0.0,
"completion_length": 2741.500011444092,
"epoch": 0.08114285714285714,
"grad_norm": 0.0938272476196289,
"kl": 3.9443373680114746e-05,
"learning_rate": 3.250000000000001e-07,
"loss": -0.0011,
"reward": -0.011160964146256447,
"reward_std": 0.43730730563402176,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.09449429312371649,
"step": 71
},
{
"clip_fraction": 0.0,
"completion_length": 2641.395835876465,
"epoch": 0.08228571428571428,
"grad_norm": 0.08971308171749115,
"kl": 4.0397047996520996e-05,
"learning_rate": 3.115363310950578e-07,
"loss": -0.03,
"reward": -0.12160800583660603,
"reward_std": 0.33957473933696747,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.16327467001974583,
"step": 72
},
{
"clip_fraction": 0.0,
"completion_length": 3444.104217529297,
"epoch": 0.08342857142857144,
"grad_norm": 0.047388430684804916,
"kl": 3.725290298461914e-05,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0397,
"reward": 0.0910368449985981,
"reward_std": 0.47095555253326893,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.07562982058152556,
"step": 73
},
{
"clip_fraction": 0.0,
"completion_length": 2376.2916946411133,
"epoch": 0.08457142857142858,
"grad_norm": 0.09753943979740143,
"kl": 6.413459777832031e-05,
"learning_rate": 2.854966364683872e-07,
"loss": 0.1156,
"reward": 0.36098406091332436,
"reward_std": 0.521298123523593,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/cosine_scaled_reward": 0.06931740138679743,
"step": 74
},
{
"clip_fraction": 0.0,
"completion_length": 2761.0208587646484,
"epoch": 0.08571428571428572,
"grad_norm": 0.05892053619027138,
"kl": 3.839470446109772e-05,
"learning_rate": 2.729523361034538e-07,
"loss": -0.0033,
"reward": 0.2974963430315256,
"reward_std": 0.32638865895569324,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/cosine_scaled_reward": 0.047496337443590164,
"step": 75
},
{
"clip_fraction": 0.0,
"completion_length": 2590.500045776367,
"epoch": 0.08685714285714285,
"grad_norm": 0.11792360991239548,
"kl": 3.809481859207153e-05,
"learning_rate": 2.6074557564105724e-07,
"loss": -0.0438,
"reward": -0.22247323859483004,
"reward_std": 0.22486078180372715,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.2433065790683031,
"step": 76
},
{
"clip_fraction": 0.0,
"completion_length": 2898.000030517578,
"epoch": 0.088,
"grad_norm": 0.06427934020757675,
"kl": 4.331767559051514e-05,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0008,
"reward": -0.0680793123319745,
"reward_std": 0.2837136909365654,
"rewards/accuracy_reward": 0.0625,
"rewards/cosine_scaled_reward": -0.1305793197825551,
"step": 77
},
{
"clip_fraction": 0.0,
"completion_length": 3283.354217529297,
"epoch": 0.08914285714285715,
"grad_norm": 0.05109477415680885,
"kl": 3.820657730102539e-05,
"learning_rate": 2.374037332934512e-07,
"loss": 0.01,
"reward": 0.009964404162019491,
"reward_std": 0.5483912099152803,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.13586892932653427,
"step": 78
},
{
"clip_fraction": 0.0,
"completion_length": 2252.250045776367,
"epoch": 0.09028571428571429,
"grad_norm": 0.0837658941745758,
"kl": 2.925097942352295e-05,
"learning_rate": 2.2629708984760706e-07,
"loss": -0.004,
"reward": 0.2518207300454378,
"reward_std": 0.42728718742728233,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/cosine_scaled_reward": 0.0018207323737442493,
"step": 79
},
{
"clip_fraction": 0.0,
"completion_length": 3298.041702270508,
"epoch": 0.09142857142857143,
"grad_norm": 0.057548362761735916,
"kl": 4.651397466659546e-05,
"learning_rate": 2.1558482853517253e-07,
"loss": -0.005,
"reward": -0.07602027803659439,
"reward_std": 0.3821410443633795,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.18018695712089539,
"step": 80
},
{
"clip_fraction": 0.0,
"completion_length": 3132.81254196167,
"epoch": 0.09257142857142857,
"grad_norm": 0.07149508595466614,
"kl": 5.067884922027588e-05,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0055,
"reward": 0.011346326675266027,
"reward_std": 0.5589314438402653,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.13448702194727957,
"step": 81
},
{
"clip_fraction": 0.0,
"completion_length": 2614.4166870117188,
"epoch": 0.09371428571428571,
"grad_norm": 0.07767566293478012,
"kl": 4.256516695022583e-05,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0811,
"reward": 0.0747821144759655,
"reward_std": 0.6653982885181904,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.0918845571577549,
"step": 82
},
{
"clip_fraction": 0.0,
"completion_length": 2527.520835876465,
"epoch": 0.09485714285714286,
"grad_norm": 0.097532257437706,
"kl": 4.713237285614014e-05,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.025,
"reward": 0.1644680369645357,
"reward_std": 0.38445328176021576,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.023031978867948055,
"step": 83
},
{
"clip_fraction": 0.0,
"completion_length": 2790.479217529297,
"epoch": 0.096,
"grad_norm": 0.06318508833646774,
"kl": 3.428757190704346e-05,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0002,
"reward": 0.28921602852642536,
"reward_std": 0.5463491454720497,
"rewards/accuracy_reward": 0.291666679084301,
"rewards/cosine_scaled_reward": -0.002450621104799211,
"step": 84
},
{
"clip_fraction": 0.0,
"completion_length": 2910.7084045410156,
"epoch": 0.09714285714285714,
"grad_norm": 0.05188002064824104,
"kl": 3.795698285102844e-05,
"learning_rate": 1.6837835672960831e-07,
"loss": -0.0004,
"reward": -0.15216440707445145,
"reward_std": 0.3802374005317688,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.21466441452503204,
"step": 85
},
{
"clip_fraction": 0.0,
"completion_length": 2774.500045776367,
"epoch": 0.09828571428571428,
"grad_norm": 0.08824881166219711,
"kl": 4.828721284866333e-05,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.0431,
"reward": -0.10176742170006037,
"reward_std": 0.28106818813830614,
"rewards/accuracy_reward": 0.0625,
"rewards/cosine_scaled_reward": -0.16426740679889917,
"step": 86
},
{
"clip_fraction": 0.0,
"completion_length": 2453.0000381469727,
"epoch": 0.09942857142857142,
"grad_norm": 0.07480672001838684,
"kl": 4.8995018005371094e-05,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.078,
"reward": 0.26724753249436617,
"reward_std": 0.574736475944519,
"rewards/accuracy_reward": 0.25000000931322575,
"rewards/cosine_scaled_reward": 0.017247509211301804,
"step": 87
},
{
"clip_fraction": 0.0,
"completion_length": 2125.4583435058594,
"epoch": 0.10057142857142858,
"grad_norm": 0.10394497215747833,
"kl": 3.460049629211426e-05,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.1439,
"reward": 0.4671774273738265,
"reward_std": 0.8694365136325359,
"rewards/accuracy_reward": 0.33333334513008595,
"rewards/cosine_scaled_reward": 0.13384409341961145,
"step": 88
},
{
"clip_fraction": 0.0,
"completion_length": 2954.4583740234375,
"epoch": 0.10171428571428572,
"grad_norm": 0.05468936264514923,
"kl": 3.0003488063812256e-05,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0426,
"reward": 0.09598893485963345,
"reward_std": 0.49872371926903725,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.07067773153539747,
"step": 89
},
{
"clip_fraction": 0.0,
"completion_length": 2314.6666946411133,
"epoch": 0.10285714285714286,
"grad_norm": 0.11714471131563187,
"kl": 5.67510724067688e-05,
"learning_rate": 1.3276726544494571e-07,
"loss": -0.028,
"reward": -0.16322916094213724,
"reward_std": 0.30263339448720217,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.20489584188908339,
"step": 90
},
{
"clip_fraction": 0.0,
"completion_length": 2949.854202270508,
"epoch": 0.104,
"grad_norm": 0.06450404226779938,
"kl": 4.634261131286621e-05,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0308,
"reward": 0.009420277085155249,
"reward_std": 0.4902005009353161,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.13641306199133396,
"step": 91
},
{
"clip_fraction": 0.0,
"completion_length": 2546.2083740234375,
"epoch": 0.10514285714285715,
"grad_norm": 0.08860377222299576,
"kl": 3.190338611602783e-05,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0264,
"reward": -0.07203258201479912,
"reward_std": 0.37115267012268305,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.134532586671412,
"step": 92
},
{
"clip_fraction": 0.0,
"completion_length": 3584.0,
"epoch": 0.10628571428571429,
"grad_norm": 0.05677548050880432,
"kl": 5.491077899932861e-05,
"learning_rate": 1.1743223682775649e-07,
"loss": -0.0,
"reward": -0.2040251363068819,
"reward_std": 0.14581821858882904,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.20402513444423676,
"step": 93
},
{
"clip_fraction": 0.0,
"completion_length": 2474.3333740234375,
"epoch": 0.10742857142857143,
"grad_norm": 0.0738380178809166,
"kl": 5.2111921831965446e-05,
"learning_rate": 1.1336692317580158e-07,
"loss": -0.0028,
"reward": 0.18377637607045472,
"reward_std": 0.35242756828665733,
"rewards/accuracy_reward": 0.20833333395421505,
"rewards/cosine_scaled_reward": -0.024556951597332954,
"step": 94
},
{
"clip_fraction": 0.0,
"completion_length": 3511.8958435058594,
"epoch": 0.10857142857142857,
"grad_norm": 0.04296368733048439,
"kl": 3.476440906524658e-05,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0,
"reward": 0.0025116736069321632,
"reward_std": 0.38779854215681553,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.12248833384364843,
"step": 95
},
{
"clip_fraction": 0.0,
"completion_length": 2279.9583740234375,
"epoch": 0.10971428571428571,
"grad_norm": 0.08072325587272644,
"kl": 3.765523433685303e-05,
"learning_rate": 1.068365111445064e-07,
"loss": -0.006,
"reward": -0.024050889536738396,
"reward_std": 0.4769116332754493,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.16988423094153404,
"step": 96
},
{
"clip_fraction": 0.0,
"completion_length": 3010.7708740234375,
"epoch": 0.11085714285714286,
"grad_norm": 0.06400413811206818,
"kl": 3.889948129653931e-05,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0745,
"reward": 0.06319739483296871,
"reward_std": 0.37303004786372185,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.10346927866339684,
"step": 97
},
{
"clip_fraction": 0.0,
"completion_length": 2826.0208435058594,
"epoch": 0.112,
"grad_norm": 0.05574605613946915,
"kl": 3.8623809814453125e-05,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.1032,
"reward": 0.11388520710170269,
"reward_std": 0.3436102643609047,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.07361481338739395,
"step": 98
},
{
"clip_fraction": 0.0,
"completion_length": 2823.6250076293945,
"epoch": 0.11314285714285714,
"grad_norm": 0.090733103454113,
"kl": 3.966689109802246e-05,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0091,
"reward": 0.06398104969412088,
"reward_std": 0.5714576002210379,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.10268562845885754,
"step": 99
},
{
"clip_fraction": 0.0,
"completion_length": 2570.083381652832,
"epoch": 0.11428571428571428,
"grad_norm": 0.07998774200677872,
"kl": 4.67151403427124e-05,
"learning_rate": 1.002741278414069e-07,
"loss": -0.0209,
"reward": 0.07360539212822914,
"reward_std": 0.48094008676707745,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.07222793623805046,
"step": 100
},
{
"clip_fraction": 0.0,
"completion_length": 2722.187515258789,
"epoch": 0.11542857142857142,
"grad_norm": 0.07589790225028992,
"kl": 4.192441701889038e-05,
"learning_rate": 1e-07,
"loss": 0.069,
"reward": 0.08873439207673073,
"reward_std": 0.2866000607609749,
"rewards/accuracy_reward": 0.1875,
"rewards/cosine_scaled_reward": -0.09876561164855957,
"step": 101
},
{
"clip_fraction": 0.0,
"completion_length": 1838.4166946411133,
"epoch": 0.11657142857142858,
"grad_norm": 0.09402992576360703,
"kl": 3.657490015029907e-05,
"learning_rate": 6.203955092681039e-07,
"loss": 0.0156,
"reward": 0.06690541142597795,
"reward_std": 0.18542613834142685,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.05809460300952196,
"step": 102
},
{
"clip_fraction": 0.0,
"completion_length": 2870.6667137145996,
"epoch": 0.11771428571428572,
"grad_norm": 0.07137490063905716,
"kl": 4.7534704208374023e-05,
"learning_rate": 6.126278954320294e-07,
"loss": -0.0495,
"reward": 0.24454372422769666,
"reward_std": 0.4555186741054058,
"rewards/accuracy_reward": 0.27083333395421505,
"rewards/cosine_scaled_reward": -0.026289615780115128,
"step": 103
},
{
"clip_fraction": 0.0,
"completion_length": 2432.3333778381348,
"epoch": 0.11885714285714286,
"grad_norm": 0.09176477044820786,
"kl": 3.963988274335861e-05,
"learning_rate": 6.048412045323164e-07,
"loss": 0.0414,
"reward": 0.011347562074661255,
"reward_std": 0.4153030626475811,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.11365244910120964,
"step": 104
},
{
"clip_fraction": 0.0,
"completion_length": 2421.7500381469727,
"epoch": 0.12,
"grad_norm": 0.08922665566205978,
"kl": 3.612414002418518e-05,
"learning_rate": 5.97037808470444e-07,
"loss": 0.1048,
"reward": 0.26649707183241844,
"reward_std": 0.5726894475519657,
"rewards/accuracy_reward": 0.25000000931322575,
"rewards/cosine_scaled_reward": 0.016497071366757154,
"step": 105
},
{
"clip_fraction": 0.0,
"completion_length": 2394.250015258789,
"epoch": 0.12114285714285715,
"grad_norm": 0.08171434700489044,
"kl": 1.9472092390060425e-05,
"learning_rate": 5.892200842364462e-07,
"loss": -0.0643,
"reward": 0.6783079504966736,
"reward_std": 0.5645040161907673,
"rewards/accuracy_reward": 0.479166679084301,
"rewards/cosine_scaled_reward": 0.19914129562675953,
"step": 106
},
{
"clip_fraction": 0.0,
"completion_length": 2945.1041717529297,
"epoch": 0.12228571428571429,
"grad_norm": 0.07043629884719849,
"kl": 3.892183303833008e-05,
"learning_rate": 5.813904131848564e-07,
"loss": -0.0082,
"reward": 0.09549596160650253,
"reward_std": 0.38008159026503563,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.07117070630192757,
"step": 107
},
{
"clip_fraction": 0.0,
"completion_length": 2612.541732788086,
"epoch": 0.12342857142857143,
"grad_norm": 0.08020760118961334,
"kl": 6.224215030670166e-05,
"learning_rate": 5.735511803093248e-07,
"loss": 0.0319,
"reward": -0.06708686612546444,
"reward_std": 0.372654527425766,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.17125353403389454,
"step": 108
},
{
"clip_fraction": 0.0,
"completion_length": 3040.041679382324,
"epoch": 0.12457142857142857,
"grad_norm": 0.05782421678304672,
"kl": 3.9560720324516296e-05,
"learning_rate": 5.657047735161255e-07,
"loss": -0.0298,
"reward": 0.06678529269993305,
"reward_std": 0.23496104590594769,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.079048041254282,
"step": 109
},
{
"clip_fraction": 0.0,
"completion_length": 2703.375030517578,
"epoch": 0.12571428571428572,
"grad_norm": 0.07456289976835251,
"kl": 4.085153341293335e-05,
"learning_rate": 5.578535828967777e-07,
"loss": 0.0608,
"reward": 0.09353478881530464,
"reward_std": 0.6213366910815239,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/cosine_scaled_reward": -0.11479855328798294,
"step": 110
},
{
"clip_fraction": 0.0,
"completion_length": 3046.000045776367,
"epoch": 0.12685714285714286,
"grad_norm": 0.06936592608690262,
"kl": 4.996359348297119e-05,
"learning_rate": 5.5e-07,
"loss": 0.0529,
"reward": 0.04783104546368122,
"reward_std": 0.406554002314806,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.09800229035317898,
"step": 111
},
{
"clip_fraction": 0.0,
"completion_length": 3252.4791870117188,
"epoch": 0.128,
"grad_norm": 0.056168291717767715,
"kl": 4.392117261886597e-05,
"learning_rate": 5.421464171032224e-07,
"loss": -0.0406,
"reward": 0.160230646841228,
"reward_std": 0.3509800494648516,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.027269369922578335,
"step": 112
},
{
"clip_fraction": 0.0,
"completion_length": 2445.0833435058594,
"epoch": 0.12914285714285714,
"grad_norm": 0.09087050706148148,
"kl": 5.398690700531006e-05,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0283,
"reward": -0.02850266359746456,
"reward_std": 0.3581254305317998,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.1118359980173409,
"step": 113
},
{
"clip_fraction": 0.0,
"completion_length": 2361.2708854675293,
"epoch": 0.13028571428571428,
"grad_norm": 0.0836726725101471,
"kl": 2.1731480956077576e-05,
"learning_rate": 5.264488196906752e-07,
"loss": 0.0029,
"reward": -0.08279290050268173,
"reward_std": 0.2787897954694927,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.16612621676176786,
"step": 114
},
{
"clip_fraction": 0.0,
"completion_length": 2874.3333435058594,
"epoch": 0.13142857142857142,
"grad_norm": 0.0852808952331543,
"kl": 3.896281123161316e-05,
"learning_rate": 5.186095868151436e-07,
"loss": -0.0246,
"reward": 0.2715425807982683,
"reward_std": 0.3593301521614194,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/cosine_scaled_reward": -0.02012409595772624,
"step": 115
},
{
"clip_fraction": 0.0,
"completion_length": 3274.2708435058594,
"epoch": 0.13257142857142856,
"grad_norm": 0.05964406952261925,
"kl": 4.2632222175598145e-05,
"learning_rate": 5.107799157635538e-07,
"loss": -0.0429,
"reward": -0.0861705094575882,
"reward_std": 0.2727812984958291,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.16950384341180325,
"step": 116
},
{
"clip_fraction": 0.0,
"completion_length": 3269.625,
"epoch": 0.1337142857142857,
"grad_norm": 0.05711478367447853,
"kl": 5.498528480529785e-05,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0143,
"reward": -0.2802669182419777,
"reward_std": 0.18606608174741268,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.2802669182419777,
"step": 117
},
{
"clip_fraction": 0.0,
"completion_length": 2989.854202270508,
"epoch": 0.13485714285714287,
"grad_norm": 0.05973963439464569,
"kl": 3.249943256378174e-05,
"learning_rate": 4.951587954676837e-07,
"loss": 0.0042,
"reward": 0.4425080083310604,
"reward_std": 0.6226732302457094,
"rewards/accuracy_reward": 0.354166679084301,
"rewards/cosine_scaled_reward": 0.0883413702249527,
"step": 118
},
{
"clip_fraction": 0.0,
"completion_length": 1957.4583740234375,
"epoch": 0.136,
"grad_norm": 0.09650486707687378,
"kl": 5.301833152770996e-05,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0763,
"reward": 0.2231588363647461,
"reward_std": 0.4853620417416096,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": -0.006007825839333236,
"step": 119
},
{
"clip_fraction": 0.0,
"completion_length": 2861.604179382324,
"epoch": 0.13714285714285715,
"grad_norm": 0.08811937272548676,
"kl": 4.668533802032471e-05,
"learning_rate": 4.79604490731896e-07,
"loss": 0.069,
"reward": 0.18268701434135437,
"reward_std": 0.5529882707633078,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/cosine_scaled_reward": -0.025646327529102564,
"step": 120
},
{
"clip_fraction": 0.0,
"completion_length": 2120.895881652832,
"epoch": 0.1382857142857143,
"grad_norm": 0.08887775242328644,
"kl": 3.911927342414856e-05,
"learning_rate": 4.7185832004988133e-07,
"loss": -0.0243,
"reward": 0.0723479799926281,
"reward_std": 0.4600833263248205,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.07348535116761923,
"step": 121
},
{
"clip_fraction": 0.0,
"completion_length": 3076.645866394043,
"epoch": 0.13942857142857143,
"grad_norm": 0.07256121933460236,
"kl": 4.369020462036133e-05,
"learning_rate": 4.641359520805548e-07,
"loss": 0.0058,
"reward": 0.306396946310997,
"reward_std": 0.5766899082809687,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/cosine_scaled_reward": 0.05639694258570671,
"step": 122
},
{
"clip_fraction": 0.0,
"completion_length": 2826.8750228881836,
"epoch": 0.14057142857142857,
"grad_norm": 0.06723114103078842,
"kl": 4.945695400238037e-05,
"learning_rate": 4.5643973913200837e-07,
"loss": -0.0474,
"reward": -0.022131433710455894,
"reward_std": 0.3866724129766226,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.16796477201114612,
"step": 123
},
{
"clip_fraction": 0.0,
"completion_length": 2317.583354949951,
"epoch": 0.1417142857142857,
"grad_norm": 0.08971195667982101,
"kl": 3.826618194580078e-05,
"learning_rate": 4.4877202554526084e-07,
"loss": 0.0437,
"reward": 0.25669101858511567,
"reward_std": 0.4173083985224366,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/cosine_scaled_reward": -0.034975672140717506,
"step": 124
},
{
"clip_fraction": 0.0,
"completion_length": 2852.875030517578,
"epoch": 0.14285714285714285,
"grad_norm": 0.060154370963573456,
"kl": 3.0517578125e-05,
"learning_rate": 4.4113514698014953e-07,
"loss": -0.0046,
"reward": 0.3323164558969438,
"reward_std": 0.5723556145094335,
"rewards/accuracy_reward": 0.31250000558793545,
"rewards/cosine_scaled_reward": 0.019816441694274545,
"step": 125
},
{
"clip_fraction": 0.0,
"completion_length": 2835.3750610351562,
"epoch": 0.144,
"grad_norm": 0.06273169815540314,
"kl": 2.740137279033661e-05,
"learning_rate": 4.3353142970386557e-07,
"loss": 0.0187,
"reward": 0.07377211796119809,
"reward_std": 0.414914159104228,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/cosine_scaled_reward": -0.07206120900809765,
"step": 126
},
{
"clip_fraction": 0.0,
"completion_length": 3493.5,
"epoch": 0.14514285714285713,
"grad_norm": 0.051632124930620193,
"kl": 3.935769200325012e-05,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0178,
"reward": -0.10274199862033129,
"reward_std": 0.32821359671652317,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.14440866466611624,
"step": 127
},
{
"clip_fraction": 0.0,
"completion_length": 2737.229217529297,
"epoch": 0.1462857142857143,
"grad_norm": 0.06725908070802689,
"kl": 2.730637788772583e-05,
"learning_rate": 4.1843273287476854e-07,
"loss": 0.0261,
"reward": 0.4665216477587819,
"reward_std": 0.5292752608656883,
"rewards/accuracy_reward": 0.3333333395421505,
"rewards/cosine_scaled_reward": 0.13318830379284918,
"step": 128
},
{
"clip_fraction": 0.0,
"completion_length": 3285.9791870117188,
"epoch": 0.14742857142857144,
"grad_norm": 0.059827111661434174,
"kl": 4.4018030166625977e-05,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0326,
"reward": -0.1473593506962061,
"reward_std": 0.34279950708150864,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.20985935255885124,
"step": 129
},
{
"clip_fraction": 0.0,
"completion_length": 3269.8958435058594,
"epoch": 0.14857142857142858,
"grad_norm": 0.0539652518928051,
"kl": 4.6581029891967773e-05,
"learning_rate": 4.034943304942796e-07,
"loss": -0.0338,
"reward": 0.012523974291980267,
"reward_std": 0.3652627067640424,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.09164270292967558,
"step": 130
},
{
"clip_fraction": 0.0,
"completion_length": 2738.0416946411133,
"epoch": 0.14971428571428572,
"grad_norm": 0.07965836673974991,
"kl": 3.647804260253906e-05,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0134,
"reward": 0.40422316640615463,
"reward_std": 0.4564414999913424,
"rewards/accuracy_reward": 0.3750000074505806,
"rewards/cosine_scaled_reward": 0.029223157092928886,
"step": 131
},
{
"clip_fraction": 0.0,
"completion_length": 2821.604179382324,
"epoch": 0.15085714285714286,
"grad_norm": 0.06730424612760544,
"kl": 3.738701343536377e-05,
"learning_rate": 3.8873442270461485e-07,
"loss": -0.0595,
"reward": 0.14993381313979626,
"reward_std": 0.24461174756288528,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": -0.07923284359276295,
"step": 132
},
{
"clip_fraction": 0.0,
"completion_length": 3298.9166870117188,
"epoch": 0.152,
"grad_norm": 0.0547853484749794,
"kl": 3.6306679248809814e-05,
"learning_rate": 3.8142703296283953e-07,
"loss": -0.0196,
"reward": -0.0262713935226202,
"reward_std": 0.28523722756654024,
"rewards/accuracy_reward": 0.1875,
"rewards/cosine_scaled_reward": -0.21377139631658792,
"step": 133
},
{
"clip_fraction": 0.0,
"completion_length": 2631.020851135254,
"epoch": 0.15314285714285714,
"grad_norm": 0.09167379140853882,
"kl": 5.6549906730651855e-05,
"learning_rate": 3.7417099217982686e-07,
"loss": -0.0388,
"reward": 0.13690885063260794,
"reward_std": 0.33062932174652815,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.05059114011237398,
"step": 134
},
{
"clip_fraction": 0.0,
"completion_length": 1884.020866394043,
"epoch": 0.15428571428571428,
"grad_norm": 0.09686336666345596,
"kl": 4.8333313316106796e-05,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0075,
"reward": 0.5446690749377012,
"reward_std": 0.5788980075158179,
"rewards/accuracy_reward": 0.41666666977107525,
"rewards/cosine_scaled_reward": 0.12800239364150912,
"step": 135
},
{
"clip_fraction": 0.0,
"completion_length": 2724.2083740234375,
"epoch": 0.15542857142857142,
"grad_norm": 0.06881242245435715,
"kl": 3.3229589462280273e-05,
"learning_rate": 3.5982178221668533e-07,
"loss": -0.0172,
"reward": 0.25056808441877365,
"reward_std": 0.6991038620471954,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/cosine_scaled_reward": 0.0005680816248059273,
"step": 136
},
{
"clip_fraction": 0.0,
"completion_length": 3288.2916717529297,
"epoch": 0.15657142857142858,
"grad_norm": 0.04798026755452156,
"kl": 3.696233034133911e-05,
"learning_rate": 3.5273298394491515e-07,
"loss": -0.0012,
"reward": -0.2087622880935669,
"reward_std": 0.2625069562345743,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.25042895757360384,
"step": 137
},
{
"clip_fraction": 0.0,
"completion_length": 2845.5416946411133,
"epoch": 0.15771428571428572,
"grad_norm": 0.06401628255844116,
"kl": 2.7161091566085815e-05,
"learning_rate": 3.45704275117204e-07,
"loss": -0.007,
"reward": 0.04433951433748007,
"reward_std": 0.16650092136114836,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.08066049311310053,
"step": 138
},
{
"clip_fraction": 0.0,
"completion_length": 2961.604202270508,
"epoch": 0.15885714285714286,
"grad_norm": 0.06384900212287903,
"kl": 4.455633461475372e-05,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0591,
"reward": -0.00648902915418148,
"reward_std": 0.3278183531947434,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.11065569380298257,
"step": 139
},
{
"clip_fraction": 0.0,
"completion_length": 3147.437530517578,
"epoch": 0.16,
"grad_norm": 0.0742841511964798,
"kl": 4.8547983169555664e-05,
"learning_rate": 3.3183567088914833e-07,
"loss": 0.0277,
"reward": 0.2934675266733393,
"reward_std": 0.46008346043527126,
"rewards/accuracy_reward": 0.2291666679084301,
"rewards/cosine_scaled_reward": 0.06430085934698582,
"step": 140
},
{
"clip_fraction": 0.0,
"completion_length": 3212.6458740234375,
"epoch": 0.16114285714285714,
"grad_norm": 0.05506473779678345,
"kl": 3.645569086074829e-05,
"learning_rate": 3.250000000000001e-07,
"loss": -0.0265,
"reward": 0.12175815179944038,
"reward_std": 0.5772441830486059,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/cosine_scaled_reward": -0.06574184074997902,
"step": 141
},
{
"clip_fraction": 0.0,
"completion_length": 2816.729232788086,
"epoch": 0.16228571428571428,
"grad_norm": 0.06666780263185501,
"kl": 4.5670196413993835e-05,
"learning_rate": 3.182328662904756e-07,
"loss": 0.0521,
"reward": 0.16211201017722487,
"reward_std": 0.5409660097211599,
"rewards/accuracy_reward": 0.22916667349636555,
"rewards/cosine_scaled_reward": -0.06705465726554394,
"step": 142
},
{
"clip_fraction": 0.0,
"completion_length": 2639.291702270508,
"epoch": 0.16342857142857142,
"grad_norm": 0.08498696982860565,
"kl": 4.427810199558735e-05,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0497,
"reward": -0.27455265261232853,
"reward_std": 0.23319050949066877,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.2953859841218218,
"step": 143
},
{
"clip_fraction": 0.0,
"completion_length": 3100.437511444092,
"epoch": 0.16457142857142856,
"grad_norm": 0.07933609187602997,
"kl": 3.787130117416382e-05,
"learning_rate": 3.0491243424323783e-07,
"loss": -0.0187,
"reward": 0.18983200006186962,
"reward_std": 0.3597974181175232,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": -0.03933467622846365,
"step": 144
},
{
"clip_fraction": 0.0,
"completion_length": 2241.604179382324,
"epoch": 0.1657142857142857,
"grad_norm": 0.09724698215723038,
"kl": 3.6539509892463684e-05,
"learning_rate": 2.9836319343816397e-07,
"loss": -0.0007,
"reward": 0.24542317104896938,
"reward_std": 0.4150366364046931,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/cosine_scaled_reward": -0.04624348785728216,
"step": 145
},
{
"clip_fraction": 0.0,
"completion_length": 2525.7292098999023,
"epoch": 0.16685714285714287,
"grad_norm": 0.06342972815036774,
"kl": 2.726912498474121e-05,
"learning_rate": 2.918906036420294e-07,
"loss": -0.0081,
"reward": -0.045263445004820824,
"reward_std": 0.40356126986443996,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.14943011198192835,
"step": 146
},
{
"clip_fraction": 0.0,
"completion_length": 3529.729217529297,
"epoch": 0.168,
"grad_norm": 0.0595870241522789,
"kl": 4.3898820877075195e-05,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0118,
"reward": 0.07939862087368965,
"reward_std": 0.5240298006683588,
"rewards/accuracy_reward": 0.1875000037252903,
"rewards/cosine_scaled_reward": -0.10810139158274978,
"step": 147
},
{
"clip_fraction": 0.0,
"completion_length": 2771.041702270508,
"epoch": 0.16914285714285715,
"grad_norm": 0.0586538165807724,
"kl": 2.527981996536255e-05,
"learning_rate": 2.791832395815782e-07,
"loss": -0.0089,
"reward": 0.07017020601779222,
"reward_std": 0.2498982846736908,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.0756631288677454,
"step": 148
},
{
"clip_fraction": 0.0,
"completion_length": 2869.4375534057617,
"epoch": 0.1702857142857143,
"grad_norm": 0.07042869925498962,
"kl": 3.543868660926819e-05,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0055,
"reward": 0.2183131380006671,
"reward_std": 0.27962948102504015,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": -0.010853511281311512,
"step": 149
},
{
"clip_fraction": 0.0,
"completion_length": 2761.625030517578,
"epoch": 0.17142857142857143,
"grad_norm": 0.07804974168539047,
"kl": 4.456937313079834e-05,
"learning_rate": 2.6680582402757324e-07,
"loss": 0.0089,
"reward": -0.214060353115201,
"reward_std": 0.2980783907696605,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.25572701659984887,
"step": 150
},
{
"clip_fraction": 0.0,
"completion_length": 2819.5833740234375,
"epoch": 0.17257142857142857,
"grad_norm": 0.07131638377904892,
"kl": 4.2825937271118164e-05,
"learning_rate": 2.6074557564105724e-07,
"loss": -0.0174,
"reward": 0.032599929720163345,
"reward_std": 0.4547419548034668,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.1132334005087614,
"step": 151
},
{
"clip_fraction": 0.0,
"completion_length": 3040.833354949951,
"epoch": 0.1737142857142857,
"grad_norm": 0.10568142682313919,
"kl": 5.511939525604248e-05,
"learning_rate": 2.547734369542718e-07,
"loss": 0.0503,
"reward": -0.10969539848156273,
"reward_std": 0.40426155179739,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.1721954019740224,
"step": 152
},
{
"clip_fraction": 0.0,
"completion_length": 2793.0417251586914,
"epoch": 0.17485714285714285,
"grad_norm": 0.09443458169698715,
"kl": 4.933774471282959e-05,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0244,
"reward": -0.0962764136493206,
"reward_std": 0.34763350896537304,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.17960975063033402,
"step": 153
},
{
"clip_fraction": 0.0,
"completion_length": 3132.8125610351562,
"epoch": 0.176,
"grad_norm": 0.05638415366411209,
"kl": 2.7516856789588928e-05,
"learning_rate": 2.4310073797187573e-07,
"loss": -0.0348,
"reward": 0.449524587020278,
"reward_std": 0.5751714678481221,
"rewards/accuracy_reward": 0.3541666753590107,
"rewards/cosine_scaled_reward": 0.09535788558423519,
"step": 154
},
{
"clip_fraction": 0.0,
"completion_length": 2529.104202270508,
"epoch": 0.17714285714285713,
"grad_norm": 0.08164854347705841,
"kl": 3.826734609901905e-05,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0305,
"reward": 0.2544417988974601,
"reward_std": 0.46309633809141815,
"rewards/accuracy_reward": 0.25000000931322575,
"rewards/cosine_scaled_reward": 0.004441759781911969,
"step": 155
},
{
"clip_fraction": 0.0,
"completion_length": 2993.270866394043,
"epoch": 0.1782857142857143,
"grad_norm": 0.06250454485416412,
"kl": 3.37064266204834e-05,
"learning_rate": 2.3180194846605364e-07,
"loss": -0.0066,
"reward": -0.026538243517279625,
"reward_std": 0.4186902232468128,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.15153825469315052,
"step": 156
},
{
"clip_fraction": 0.0,
"completion_length": 3177.8333435058594,
"epoch": 0.17942857142857144,
"grad_norm": 0.053174857050180435,
"kl": 3.8951635360717773e-05,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0154,
"reward": -0.12036301381886005,
"reward_std": 0.44441020861268044,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.20369636011309922,
"step": 157
},
{
"clip_fraction": 0.0,
"completion_length": 2696.041702270508,
"epoch": 0.18057142857142858,
"grad_norm": 0.06273730844259262,
"kl": 4.3585896492004395e-05,
"learning_rate": 2.2089083427137329e-07,
"loss": 0.0106,
"reward": 0.20966186001896858,
"reward_std": 0.24054066091775894,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/cosine_scaled_reward": 0.0013285204768180847,
"step": 158
},
{
"clip_fraction": 0.0,
"completion_length": 3259.7708435058594,
"epoch": 0.18171428571428572,
"grad_norm": 0.05220724642276764,
"kl": 2.981722354888916e-05,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0177,
"reward": -0.20584244281053543,
"reward_std": 0.28025806602090597,
"rewards/accuracy_reward": 0.0625,
"rewards/cosine_scaled_reward": -0.2683424372226,
"step": 159
},
{
"clip_fraction": 0.0,
"completion_length": 3075.4792098999023,
"epoch": 0.18285714285714286,
"grad_norm": 0.06919407844543457,
"kl": 5.105137825012207e-05,
"learning_rate": 2.1038068889975259e-07,
"loss": 0.0367,
"reward": -0.008766223094426095,
"reward_std": 0.4496794454753399,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.13376623298972845,
"step": 160
},
{
"clip_fraction": 0.0,
"completion_length": 2116.8958892822266,
"epoch": 0.184,
"grad_norm": 0.08072680234909058,
"kl": 4.120171070098877e-05,
"learning_rate": 2.0528000059645995e-07,
"loss": -0.056,
"reward": 0.15352233685553074,
"reward_std": 0.5114505719393492,
"rewards/accuracy_reward": 0.1875,
"rewards/cosine_scaled_reward": -0.03397766686975956,
"step": 161
},
{
"clip_fraction": 0.0,
"completion_length": 3499.2916870117188,
"epoch": 0.18514285714285714,
"grad_norm": 0.0527619905769825,
"kl": 3.422051668167114e-05,
"learning_rate": 2.0028431734436308e-07,
"loss": -0.0021,
"reward": -0.038823087234050035,
"reward_std": 0.4940961766988039,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/cosine_scaled_reward": -0.1638230886310339,
"step": 162
},
{
"clip_fraction": 0.0,
"completion_length": 2734.8333892822266,
"epoch": 0.18628571428571428,
"grad_norm": 0.07632818818092346,
"kl": 3.5293400287628174e-05,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0764,
"reward": 0.4233822599053383,
"reward_std": 0.4737155893817544,
"rewards/accuracy_reward": 0.3541666716337204,
"rewards/cosine_scaled_reward": 0.06921557523310184,
"step": 163
},
{
"clip_fraction": 0.0,
"completion_length": 2648.4375381469727,
"epoch": 0.18742857142857142,
"grad_norm": 0.08120272308588028,
"kl": 4.1447579860687256e-05,
"learning_rate": 1.9061402047871833e-07,
"loss": -0.0271,
"reward": 0.2842383498791605,
"reward_std": 0.5444171037524939,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/cosine_scaled_reward": 0.034238346852362156,
"step": 164
},
{
"clip_fraction": 0.0,
"completion_length": 3249.2083435058594,
"epoch": 0.18857142857142858,
"grad_norm": 0.06324990838766098,
"kl": 4.744529724121094e-05,
"learning_rate": 1.8594235253127372e-07,
"loss": -0.0558,
"reward": -0.2445811154320836,
"reward_std": 0.2507223319262266,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.2654144521802664,
"step": 165
},
{
"clip_fraction": 0.0,
"completion_length": 2949.8541717529297,
"epoch": 0.18971428571428572,
"grad_norm": 0.06484034657478333,
"kl": 4.217028617858887e-05,
"learning_rate": 1.8138158006995363e-07,
"loss": 0.0424,
"reward": -0.1878061555325985,
"reward_std": 0.31927773356437683,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.22947282809764147,
"step": 166
},
{
"clip_fraction": 0.0,
"completion_length": 2445.354202270508,
"epoch": 0.19085714285714286,
"grad_norm": 0.08651679754257202,
"kl": 3.0443072319030762e-05,
"learning_rate": 1.7693309235023127e-07,
"loss": -0.0164,
"reward": 0.3632579315453768,
"reward_std": 0.48152059130370617,
"rewards/accuracy_reward": 0.29166666977107525,
"rewards/cosine_scaled_reward": 0.07159125246107578,
"step": 167
},
{
"clip_fraction": 0.0,
"completion_length": 3308.1458740234375,
"epoch": 0.192,
"grad_norm": 0.05902985855937004,
"kl": 4.501640796661377e-05,
"learning_rate": 1.7259824442455923e-07,
"loss": 0.014,
"reward": 0.1645604595541954,
"reward_std": 0.5049289520829916,
"rewards/accuracy_reward": 0.1875000074505806,
"rewards/cosine_scaled_reward": -0.02293952787294984,
"step": 168
},
{
"clip_fraction": 0.0,
"completion_length": 2684.062545776367,
"epoch": 0.19314285714285714,
"grad_norm": 0.055804040282964706,
"kl": 3.5997480154037476e-05,
"learning_rate": 1.6837835672960831e-07,
"loss": -0.0215,
"reward": 0.6388495257124305,
"reward_std": 0.4502353947609663,
"rewards/accuracy_reward": 0.47916667722165585,
"rewards/cosine_scaled_reward": 0.15968285594135523,
"step": 169
},
{
"clip_fraction": 0.0,
"completion_length": 2689.9167404174805,
"epoch": 0.19428571428571428,
"grad_norm": 0.07801216095685959,
"kl": 2.8800219297409058e-05,
"learning_rate": 1.6427471468404952e-07,
"loss": -0.0261,
"reward": 0.21209237910807133,
"reward_std": 0.18574862275272608,
"rewards/accuracy_reward": 0.25,
"rewards/cosine_scaled_reward": -0.037907619029283524,
"step": 170
},
{
"clip_fraction": 0.0,
"completion_length": 2724.9583435058594,
"epoch": 0.19542857142857142,
"grad_norm": 0.08230967074632645,
"kl": 4.396587610244751e-05,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.1104,
"reward": 0.2708106115460396,
"reward_std": 0.4123786874115467,
"rewards/accuracy_reward": 0.2500000111758709,
"rewards/cosine_scaled_reward": 0.020810591988265514,
"step": 171
},
{
"clip_fraction": 0.0,
"completion_length": 2835.2291870117188,
"epoch": 0.19657142857142856,
"grad_norm": 0.0861576646566391,
"kl": 4.382431507110596e-05,
"learning_rate": 1.5642113178727193e-07,
"loss": -0.0178,
"reward": 0.3303783554583788,
"reward_std": 0.26864094100892544,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/cosine_scaled_reward": 0.03871168568730354,
"step": 172
},
{
"clip_fraction": 0.0,
"completion_length": 2145.4791717529297,
"epoch": 0.1977142857142857,
"grad_norm": 0.11213409900665283,
"kl": 2.8414186090230942e-05,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0598,
"reward": -0.1945440210402012,
"reward_std": 0.2684439942240715,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.236210685223341,
"step": 173
},
{
"clip_fraction": 0.0,
"completion_length": 2457.000045776367,
"epoch": 0.19885714285714284,
"grad_norm": 0.07888836413621902,
"kl": 4.423223435878754e-05,
"learning_rate": 1.4904706411523448e-07,
"loss": 0.0072,
"reward": 0.028682731091976166,
"reward_std": 0.3702436462044716,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/cosine_scaled_reward": -0.117150594945997,
"step": 174
},
{
"clip_fraction": 0.0,
"completion_length": 2821.541702270508,
"epoch": 0.2,
"grad_norm": 0.06768631935119629,
"kl": 3.556848969310522e-05,
"learning_rate": 1.4554267916537495e-07,
"loss": -0.0158,
"reward": 0.3141335854306817,
"reward_std": 0.3858940042555332,
"rewards/accuracy_reward": 0.25,
"rewards/cosine_scaled_reward": 0.06413357798010111,
"step": 175
},
{
"clip_fraction": 0.0,
"completion_length": 2702.479217529297,
"epoch": 0.20114285714285715,
"grad_norm": 0.09728685021400452,
"kl": 3.568828105926514e-05,
"learning_rate": 1.4216149583350755e-07,
"loss": 0.0125,
"reward": 0.4325672350823879,
"reward_std": 0.7816441245377064,
"rewards/accuracy_reward": 0.33333334140479565,
"rewards/cosine_scaled_reward": 0.09923387924209237,
"step": 176
},
{
"clip_fraction": 0.0,
"completion_length": 2954.2500534057617,
"epoch": 0.2022857142857143,
"grad_norm": 0.07380012422800064,
"kl": 5.640089511871338e-05,
"learning_rate": 1.3890454406082956e-07,
"loss": -0.038,
"reward": 0.03029090305790305,
"reward_std": 0.3226492116227746,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/cosine_scaled_reward": -0.0947091169655323,
"step": 177
},
{
"clip_fraction": 0.0,
"completion_length": 2251.062530517578,
"epoch": 0.20342857142857143,
"grad_norm": 0.09012287110090256,
"kl": 3.452599048614502e-05,
"learning_rate": 1.3577281594640182e-07,
"loss": 0.0711,
"reward": 0.21392884047236294,
"reward_std": 0.4973601717501879,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/cosine_scaled_reward": 0.0055954959243535995,
"step": 178
},
{
"clip_fraction": 0.0,
"completion_length": 2972.9375,
"epoch": 0.20457142857142857,
"grad_norm": 0.0639650970697403,
"kl": 3.37846577167511e-05,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.0246,
"reward": -0.15617204643785954,
"reward_std": 0.33500426076352596,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.2186720399186015,
"step": 179
},
{
"clip_fraction": 0.0,
"completion_length": 2245.6666870117188,
"epoch": 0.2057142857142857,
"grad_norm": 0.11034268140792847,
"kl": 4.6290457248687744e-05,
"learning_rate": 1.2988880807625927e-07,
"loss": -0.0654,
"reward": 0.39349728263914585,
"reward_std": 0.5014538783580065,
"rewards/accuracy_reward": 0.3333333395421505,
"rewards/cosine_scaled_reward": 0.06016395008191466,
"step": 180
},
{
"clip_fraction": 0.0,
"completion_length": 3190.9791679382324,
"epoch": 0.20685714285714285,
"grad_norm": 0.07291311770677567,
"kl": 3.898981958627701e-05,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0136,
"reward": -0.06048195622861385,
"reward_std": 0.25065876357257366,
"rewards/accuracy_reward": 0.1041666716337204,
"rewards/cosine_scaled_reward": -0.16464862413704395,
"step": 181
},
{
"clip_fraction": 0.0,
"completion_length": 2276.5833587646484,
"epoch": 0.208,
"grad_norm": 0.07662935554981232,
"kl": 2.4404376745224e-05,
"learning_rate": 1.2451664098030743e-07,
"loss": -0.0495,
"reward": 0.17493994487449527,
"reward_std": 0.6582750072702765,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/cosine_scaled_reward": -0.03339339280501008,
"step": 182
},
{
"clip_fraction": 0.0,
"completion_length": 1754.2708587646484,
"epoch": 0.20914285714285713,
"grad_norm": 0.11352153867483139,
"kl": 4.573538899421692e-05,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0516,
"reward": 0.18977578077465296,
"reward_std": 0.6147185508161783,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/cosine_scaled_reward": -0.018557552015408874,
"step": 183
},
{
"clip_fraction": 0.0,
"completion_length": 3000.437530517578,
"epoch": 0.2102857142857143,
"grad_norm": 0.07938341796398163,
"kl": 4.5321881771087646e-05,
"learning_rate": 1.1966285981663407e-07,
"loss": -0.0348,
"reward": -0.057322083972394466,
"reward_std": 0.22675575967878103,
"rewards/accuracy_reward": 0.1041666716337204,
"rewards/cosine_scaled_reward": -0.16148875933140516,
"step": 184
},
{
"clip_fraction": 0.0,
"completion_length": 2710.31254196167,
"epoch": 0.21142857142857144,
"grad_norm": 0.11325710266828537,
"kl": 3.845244646072388e-05,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0851,
"reward": 0.04408662021160126,
"reward_std": 0.36012868769466877,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.10174670163542032,
"step": 185
},
{
"clip_fraction": 0.0,
"completion_length": 3046.1458435058594,
"epoch": 0.21257142857142858,
"grad_norm": 0.05957398936152458,
"kl": 3.740936517715454e-05,
"learning_rate": 1.1533337816991931e-07,
"loss": 0.0061,
"reward": 0.14046389423310757,
"reward_std": 0.48607588466256857,
"rewards/accuracy_reward": 0.1875000037252903,
"rewards/cosine_scaled_reward": -0.04703611321747303,
"step": 186
},
{
"clip_fraction": 0.0,
"completion_length": 2773.4167098999023,
"epoch": 0.21371428571428572,
"grad_norm": 0.08205266296863556,
"kl": 4.443526268005371e-05,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0641,
"reward": -0.12049626559019089,
"reward_std": 0.27329744305461645,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.2038295976817608,
"step": 187
},
{
"clip_fraction": 0.0,
"completion_length": 3556.8125,
"epoch": 0.21485714285714286,
"grad_norm": 0.04973941296339035,
"kl": 4.267692565917969e-05,
"learning_rate": 1.1153347084664419e-07,
"loss": -0.0033,
"reward": -0.09900977090001106,
"reward_std": 0.36740162037312984,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.1615097806788981,
"step": 188
},
{
"clip_fraction": 0.0,
"completion_length": 2301.937515258789,
"epoch": 0.216,
"grad_norm": 0.08138614147901535,
"kl": 4.2557716369628906e-05,
"learning_rate": 1.0983357966978745e-07,
"loss": -0.0459,
"reward": -0.12572868075221777,
"reward_std": 0.2826405204832554,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.16739534121006727,
"step": 189
},
{
"clip_fraction": 0.0,
"completion_length": 3100.750030517578,
"epoch": 0.21714285714285714,
"grad_norm": 0.054819922894239426,
"kl": 2.2755935788154602e-05,
"learning_rate": 1.0826776744855121e-07,
"loss": -0.0095,
"reward": 0.31759869679808617,
"reward_std": 0.4463086621835828,
"rewards/accuracy_reward": 0.27083333767950535,
"rewards/cosine_scaled_reward": 0.04676533304154873,
"step": 190
},
{
"clip_fraction": 0.0,
"completion_length": 2485.2916870117188,
"epoch": 0.21828571428571428,
"grad_norm": 0.07485377788543701,
"kl": 4.908442497253418e-05,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0541,
"reward": 0.18139376863837242,
"reward_std": 0.3567335680127144,
"rewards/accuracy_reward": 0.20833333395421505,
"rewards/cosine_scaled_reward": -0.026939572766423225,
"step": 191
},
{
"clip_fraction": 0.0,
"completion_length": 3492.1458740234375,
"epoch": 0.21942857142857142,
"grad_norm": 0.05016130581498146,
"kl": 3.1597912311553955e-05,
"learning_rate": 1.0554024673218806e-07,
"loss": -0.0201,
"reward": -0.03553268779069185,
"reward_std": 0.31929558888077736,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/cosine_scaled_reward": -0.16053267009556293,
"step": 192
},
{
"clip_fraction": 0.0,
"completion_length": 2811.1250228881836,
"epoch": 0.22057142857142858,
"grad_norm": 0.07492359727621078,
"kl": 3.6617740988731384e-05,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0277,
"reward": 0.020672958344221115,
"reward_std": 0.48316178657114506,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.12516038585454226,
"step": 193
},
{
"clip_fraction": 0.0,
"completion_length": 3140.041732788086,
"epoch": 0.22171428571428572,
"grad_norm": 0.05980326607823372,
"kl": 4.427880048751831e-05,
"learning_rate": 1.0335423176140511e-07,
"loss": -0.0036,
"reward": 0.5903252474963665,
"reward_std": 0.5727798100560904,
"rewards/accuracy_reward": 0.4166666753590107,
"rewards/cosine_scaled_reward": 0.17365856003016233,
"step": 194
},
{
"clip_fraction": 0.0,
"completion_length": 2799.9791717529297,
"epoch": 0.22285714285714286,
"grad_norm": 0.059899453073740005,
"kl": 4.7072768211364746e-05,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0319,
"reward": -0.16166013106703758,
"reward_std": 0.2635766211897135,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.20332680083811283,
"step": 195
},
{
"clip_fraction": 0.0,
"completion_length": 3475.625030517578,
"epoch": 0.224,
"grad_norm": 0.05154493823647499,
"kl": 4.3954700231552124e-05,
"learning_rate": 1.017123858587145e-07,
"loss": 0.0012,
"reward": -0.05280750431120396,
"reward_std": 0.37235557474195957,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.1569741666316986,
"step": 196
},
{
"clip_fraction": 0.0,
"completion_length": 2883.5416870117188,
"epoch": 0.22514285714285714,
"grad_norm": 0.08069178462028503,
"kl": 4.8279762268066406e-05,
"learning_rate": 1.0109617738307911e-07,
"loss": -0.0765,
"reward": 0.35078890819568187,
"reward_std": 0.5930481739342213,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/cosine_scaled_reward": 0.05912225344218314,
"step": 197
},
{
"clip_fraction": 0.0,
"completion_length": 2874.5625076293945,
"epoch": 0.22628571428571428,
"grad_norm": 0.08328583091497421,
"kl": 4.653632640838623e-05,
"learning_rate": 1.0061670936044178e-07,
"loss": 0.0162,
"reward": 0.08681692741811275,
"reward_std": 0.3849259242415428,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.07984975911676884,
"step": 198
},
{
"clip_fraction": 0.0,
"completion_length": 3574.8958435058594,
"epoch": 0.22742857142857142,
"grad_norm": 0.049314841628074646,
"kl": 4.60892915725708e-05,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0031,
"reward": -0.26533746905624866,
"reward_std": 0.16454033181071281,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.2653374746441841,
"step": 199
},
{
"clip_fraction": 0.0,
"completion_length": 2369.1667098999023,
"epoch": 0.22857142857142856,
"grad_norm": 0.07246367633342743,
"kl": 3.355741500854492e-05,
"learning_rate": 1.0006853717962393e-07,
"loss": 0.0212,
"reward": 0.30690951086580753,
"reward_std": 0.45345655642449856,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/cosine_scaled_reward": 0.05690951179713011,
"step": 200
},
{
"epoch": 0.22857142857142856,
"step": 200,
"total_flos": 0.0,
"train_loss": 0.0035571102210087704,
"train_runtime": 35198.8637,
"train_samples_per_second": 0.273,
"train_steps_per_second": 0.006
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}