checkpoint_book_80 / trainer_state.json
AravindS373's picture
Upload folder using huggingface_hub
5554f95 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9907120743034056,
"eval_steps": 500,
"global_step": 80,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 222.078125,
"epoch": 0.01238390092879257,
"grad_norm": 0.5330060720443726,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0,
"reward": 2.1595765352249146,
"reward_std": 0.4799410402774811,
"rewards/ngram_similarity_reward_func": 0.22074851393699646,
"rewards/reasoning_quality_reward_func": 0.7317968606948853,
"rewards/soft_format_reward_func": 0.2109375,
"rewards/sql_execution_reward_func": 0.5703125,
"rewards/xmlcount_reward_func": 0.42578125,
"step": 1
},
{
"completion_length": 228.609375,
"epoch": 0.02476780185758514,
"grad_norm": 0.38014766573905945,
"kl": 0.0,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 2.1653581261634827,
"reward_std": 0.35478534549474716,
"rewards/ngram_similarity_reward_func": 0.2899237535893917,
"rewards/reasoning_quality_reward_func": 0.7039499878883362,
"rewards/soft_format_reward_func": 0.203125,
"rewards/sql_execution_reward_func": 0.5406249971129,
"rewards/xmlcount_reward_func": 0.427734375,
"step": 2
},
{
"completion_length": 249.984375,
"epoch": 0.03715170278637771,
"grad_norm": 0.4404110908508301,
"kl": 0.00019386520580155775,
"learning_rate": 1e-05,
"loss": 0.0,
"reward": 2.2775589525699615,
"reward_std": 0.2987724468111992,
"rewards/ngram_similarity_reward_func": 0.14325893856585026,
"rewards/reasoning_quality_reward_func": 0.7573468536138535,
"rewards/soft_format_reward_func": 0.23828125,
"rewards/sql_execution_reward_func": 0.66015625,
"rewards/xmlcount_reward_func": 0.478515625,
"step": 3
},
{
"completion_length": 219.28125,
"epoch": 0.04953560371517028,
"grad_norm": 0.4848824143409729,
"kl": 0.00020055885761394165,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.0,
"reward": 1.9259249866008759,
"reward_std": 0.49138037860393524,
"rewards/ngram_similarity_reward_func": 0.16651681065559387,
"rewards/reasoning_quality_reward_func": 0.7145218700170517,
"rewards/soft_format_reward_func": 0.18359375,
"rewards/sql_execution_reward_func": 0.4745738562196493,
"rewards/xmlcount_reward_func": 0.38671875,
"step": 4
},
{
"completion_length": 240.875,
"epoch": 0.06191950464396285,
"grad_norm": 0.3829522430896759,
"kl": 0.00025230981555068865,
"learning_rate": 2e-05,
"loss": 0.0,
"reward": 2.286948025226593,
"reward_std": 0.46134958416223526,
"rewards/ngram_similarity_reward_func": 0.23377767577767372,
"rewards/reasoning_quality_reward_func": 0.7305140793323517,
"rewards/soft_format_reward_func": 0.21484375,
"rewards/sql_execution_reward_func": 0.6585937514901161,
"rewards/xmlcount_reward_func": 0.44921875,
"step": 5
},
{
"completion_length": 220.421875,
"epoch": 0.07430340557275542,
"grad_norm": 0.41778889298439026,
"kl": 0.0005900838441448286,
"learning_rate": 2.5e-05,
"loss": 0.0,
"reward": 2.2411254048347473,
"reward_std": 0.41372712701559067,
"rewards/ngram_similarity_reward_func": 0.21116455644369125,
"rewards/reasoning_quality_reward_func": 0.7018359005451202,
"rewards/soft_format_reward_func": 0.234375,
"rewards/sql_execution_reward_func": 0.625,
"rewards/xmlcount_reward_func": 0.46875,
"step": 6
},
{
"completion_length": 243.671875,
"epoch": 0.08668730650154799,
"grad_norm": 0.362834632396698,
"kl": 0.0015416233654832467,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.0001,
"reward": 2.479892313480377,
"reward_std": 0.42981887608766556,
"rewards/ngram_similarity_reward_func": 0.31569236889481544,
"rewards/reasoning_quality_reward_func": 0.7431062161922455,
"rewards/soft_format_reward_func": 0.23046875,
"rewards/sql_execution_reward_func": 0.7257812470197678,
"rewards/xmlcount_reward_func": 0.46484375,
"step": 7
},
{
"completion_length": 272.546875,
"epoch": 0.09907120743034056,
"grad_norm": 0.42216944694519043,
"kl": 0.002694789902307093,
"learning_rate": 3.5000000000000004e-05,
"loss": 0.0001,
"reward": 2.703847050666809,
"reward_std": 0.29045113176107407,
"rewards/ngram_similarity_reward_func": 0.3112502619624138,
"rewards/reasoning_quality_reward_func": 0.7433781176805496,
"rewards/soft_format_reward_func": 0.2421875,
"rewards/sql_execution_reward_func": 0.9187500029802322,
"rewards/xmlcount_reward_func": 0.48828125,
"step": 8
},
{
"completion_length": 265.84375,
"epoch": 0.11145510835913312,
"grad_norm": 0.34325331449508667,
"kl": 0.004968019318766892,
"learning_rate": 4e-05,
"loss": 0.0002,
"reward": 2.6481465101242065,
"reward_std": 0.24203401803970337,
"rewards/ngram_similarity_reward_func": 0.30987007170915604,
"rewards/reasoning_quality_reward_func": 0.7656203359365463,
"rewards/soft_format_reward_func": 0.24609375,
"rewards/sql_execution_reward_func": 0.8304687440395355,
"rewards/xmlcount_reward_func": 0.49609375,
"step": 9
},
{
"completion_length": 265.515625,
"epoch": 0.1238390092879257,
"grad_norm": 0.36068692803382874,
"kl": 0.005839241901412606,
"learning_rate": 3.9982867988473446e-05,
"loss": 0.0002,
"reward": 2.4154441356658936,
"reward_std": 0.2717306688427925,
"rewards/ngram_similarity_reward_func": 0.30530440993607044,
"rewards/reasoning_quality_reward_func": 0.7639749944210052,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.5961647690273821,
"rewards/xmlcount_reward_func": 0.5,
"step": 10
},
{
"completion_length": 271.234375,
"epoch": 0.13622291021671826,
"grad_norm": 0.37997934222221375,
"kl": 0.018266105791553855,
"learning_rate": 3.993150456565143e-05,
"loss": 0.0007,
"reward": 2.8274221420288086,
"reward_std": 0.2109035775065422,
"rewards/ngram_similarity_reward_func": 0.3601767495274544,
"rewards/reasoning_quality_reward_func": 0.7848234474658966,
"rewards/soft_format_reward_func": 0.24609375,
"rewards/sql_execution_reward_func": 0.9421875029802322,
"rewards/xmlcount_reward_func": 0.494140625,
"step": 11
},
{
"completion_length": 296.265625,
"epoch": 0.14860681114551083,
"grad_norm": 0.3279929757118225,
"kl": 0.011964517878368497,
"learning_rate": 3.9846007504728593e-05,
"loss": 0.0005,
"reward": 2.734043776988983,
"reward_std": 0.16971006244421005,
"rewards/ngram_similarity_reward_func": 0.3088625408709049,
"rewards/reasoning_quality_reward_func": 0.8142437487840652,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.8609375059604645,
"rewards/xmlcount_reward_func": 0.5,
"step": 12
},
{
"completion_length": 311.25,
"epoch": 0.1609907120743034,
"grad_norm": 0.31357380747795105,
"kl": 0.018573109060525894,
"learning_rate": 3.972653955421975e-05,
"loss": 0.0007,
"reward": 2.4877208471298218,
"reward_std": 0.2542732544243336,
"rewards/ngram_similarity_reward_func": 0.40954745560884476,
"rewards/reasoning_quality_reward_func": 0.8195796459913254,
"rewards/soft_format_reward_func": 0.2421875,
"rewards/sql_execution_reward_func": 0.5242187529802322,
"rewards/xmlcount_reward_func": 0.4921875,
"step": 13
},
{
"completion_length": 305.96875,
"epoch": 0.17337461300309598,
"grad_norm": 0.35947704315185547,
"kl": 0.02572451764717698,
"learning_rate": 3.95733281281588e-05,
"loss": 0.001,
"reward": 2.84596848487854,
"reward_std": 0.16431875061243773,
"rewards/ngram_similarity_reward_func": 0.4371529445052147,
"rewards/reasoning_quality_reward_func": 0.7978781312704086,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.8609375059604645,
"rewards/xmlcount_reward_func": 0.5,
"step": 14
},
{
"completion_length": 316.8125,
"epoch": 0.18575851393188855,
"grad_norm": 0.3095603883266449,
"kl": 0.031108289025723934,
"learning_rate": 3.938666487320323e-05,
"loss": 0.0012,
"reward": 2.543521463871002,
"reward_std": 0.21121197938919067,
"rewards/ngram_similarity_reward_func": 0.33944912999868393,
"rewards/reasoning_quality_reward_func": 0.7945765405893326,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.6614488586783409,
"rewards/xmlcount_reward_func": 0.498046875,
"step": 15
},
{
"completion_length": 333.65625,
"epoch": 0.19814241486068113,
"grad_norm": 0.32227563858032227,
"kl": 0.027149478904902935,
"learning_rate": 3.9166905113468086e-05,
"loss": 0.0011,
"reward": 2.5172452330589294,
"reward_std": 0.23450876772403717,
"rewards/ngram_similarity_reward_func": 0.33389993757009506,
"rewards/reasoning_quality_reward_func": 0.7915484458208084,
"rewards/soft_format_reward_func": 0.24609375,
"rewards/sql_execution_reward_func": 0.6476562395691872,
"rewards/xmlcount_reward_func": 0.498046875,
"step": 16
},
{
"completion_length": 370.265625,
"epoch": 0.21052631578947367,
"grad_norm": 0.31235265731811523,
"kl": 0.031742531806230545,
"learning_rate": 3.891446717414635e-05,
"loss": 0.0013,
"reward": 2.688088834285736,
"reward_std": 0.11317835189402103,
"rewards/ngram_similarity_reward_func": 0.2212217040359974,
"rewards/reasoning_quality_reward_func": 0.8551484197378159,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.8617187440395355,
"rewards/xmlcount_reward_func": 0.5,
"step": 17
},
{
"completion_length": 340.03125,
"epoch": 0.22291021671826625,
"grad_norm": 0.3352001905441284,
"kl": 0.043277411721646786,
"learning_rate": 3.862983158520316e-05,
"loss": 0.0017,
"reward": 2.741812765598297,
"reward_std": 0.14071671105921268,
"rewards/ngram_similarity_reward_func": 0.28818774223327637,
"rewards/reasoning_quality_reward_func": 0.8387812525033951,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.8648437410593033,
"rewards/xmlcount_reward_func": 0.5,
"step": 18
},
{
"completion_length": 333.21875,
"epoch": 0.23529411764705882,
"grad_norm": 0.31716179847717285,
"kl": 0.04549229796975851,
"learning_rate": 3.83135401666597e-05,
"loss": 0.0018,
"reward": 2.625436544418335,
"reward_std": 0.13139259070158005,
"rewards/ngram_similarity_reward_func": 0.35064588487148285,
"rewards/reasoning_quality_reward_func": 0.8470562547445297,
"rewards/soft_format_reward_func": 0.24609375,
"rewards/sql_execution_reward_func": 0.68359375,
"rewards/xmlcount_reward_func": 0.498046875,
"step": 19
},
{
"completion_length": 330.203125,
"epoch": 0.2476780185758514,
"grad_norm": 0.31764793395996094,
"kl": 0.04236258752644062,
"learning_rate": 3.796619499720799e-05,
"loss": 0.0017,
"reward": 2.899237811565399,
"reward_std": 0.20555716007947922,
"rewards/ngram_similarity_reward_func": 0.4203330874443054,
"rewards/reasoning_quality_reward_func": 0.8171859234571457,
"rewards/soft_format_reward_func": 0.2421875,
"rewards/sql_execution_reward_func": 0.9234375059604645,
"rewards/xmlcount_reward_func": 0.49609375,
"step": 20
},
{
"completion_length": 412.453125,
"epoch": 0.26006191950464397,
"grad_norm": 0.30428770184516907,
"kl": 0.036752122454345226,
"learning_rate": 3.75884572681199e-05,
"loss": 0.0015,
"reward": 2.610611140727997,
"reward_std": 0.15251119248569012,
"rewards/ngram_similarity_reward_func": 0.32537825778126717,
"rewards/reasoning_quality_reward_func": 0.8832796812057495,
"rewards/soft_format_reward_func": 0.24609375,
"rewards/sql_execution_reward_func": 0.6578125059604645,
"rewards/xmlcount_reward_func": 0.498046875,
"step": 21
},
{
"completion_length": 354.265625,
"epoch": 0.2724458204334365,
"grad_norm": 0.3025682270526886,
"kl": 0.04639334697276354,
"learning_rate": 3.718104602463194e-05,
"loss": 0.0019,
"reward": 2.575563907623291,
"reward_std": 0.21309060789644718,
"rewards/ngram_similarity_reward_func": 0.42503100633621216,
"rewards/reasoning_quality_reward_func": 0.8571734130382538,
"rewards/soft_format_reward_func": 0.24609375,
"rewards/sql_execution_reward_func": 0.5492187440395355,
"rewards/xmlcount_reward_func": 0.498046875,
"step": 22
},
{
"completion_length": 379.46875,
"epoch": 0.2848297213622291,
"grad_norm": 0.28553083539009094,
"kl": 0.04365707188844681,
"learning_rate": 3.6744736797201856e-05,
"loss": 0.0017,
"reward": 2.4477399587631226,
"reward_std": 0.2514076679944992,
"rewards/ngram_similarity_reward_func": 0.45473647862672806,
"rewards/reasoning_quality_reward_func": 0.8824515789747238,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.36055195331573486,
"rewards/xmlcount_reward_func": 0.5,
"step": 23
},
{
"completion_length": 346.921875,
"epoch": 0.29721362229102166,
"grad_norm": 0.3103869557380676,
"kl": 0.05476447567343712,
"learning_rate": 3.6280360125242234e-05,
"loss": 0.0022,
"reward": 2.7398348450660706,
"reward_std": 0.23609711229801178,
"rewards/ngram_similarity_reward_func": 0.4382130652666092,
"rewards/reasoning_quality_reward_func": 0.8634234368801117,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.6881983578205109,
"rewards/xmlcount_reward_func": 0.5,
"step": 24
},
{
"completion_length": 420.453125,
"epoch": 0.30959752321981426,
"grad_norm": 0.2726235091686249,
"kl": 0.054436798207461834,
"learning_rate": 3.578879997614161e-05,
"loss": 0.0022,
"reward": 2.6918256878852844,
"reward_std": 0.14829625003039837,
"rewards/ngram_similarity_reward_func": 0.38556934148073196,
"rewards/reasoning_quality_reward_func": 0.9156312495470047,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.640625,
"rewards/xmlcount_reward_func": 0.5,
"step": 25
},
{
"completion_length": 385.9375,
"epoch": 0.3219814241486068,
"grad_norm": 0.31836339831352234,
"kl": 0.05234138946980238,
"learning_rate": 3.5270992062582236e-05,
"loss": 0.0021,
"reward": 2.4975169897079468,
"reward_std": 0.3400815278291702,
"rewards/ngram_similarity_reward_func": 0.35228848457336426,
"rewards/reasoning_quality_reward_func": 0.852295309305191,
"rewards/soft_format_reward_func": 0.24609375,
"rewards/sql_execution_reward_func": 0.5526988655328751,
"rewards/xmlcount_reward_func": 0.494140625,
"step": 26
},
{
"completion_length": 344.109375,
"epoch": 0.33436532507739936,
"grad_norm": 0.28171250224113464,
"kl": 0.06604294572025537,
"learning_rate": 3.472792206135786e-05,
"loss": 0.0026,
"reward": 2.8997097611427307,
"reward_std": 0.1521923691034317,
"rewards/ngram_similarity_reward_func": 0.5297628864645958,
"rewards/reasoning_quality_reward_func": 0.8543218523263931,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.765625,
"rewards/xmlcount_reward_func": 0.5,
"step": 27
},
{
"completion_length": 367.171875,
"epoch": 0.34674922600619196,
"grad_norm": 0.2663843631744385,
"kl": 0.06475986633449793,
"learning_rate": 3.4160623737081886e-05,
"loss": 0.0026,
"reward": 3.0965726375579834,
"reward_std": 0.09447081200778484,
"rewards/ngram_similarity_reward_func": 0.4422616958618164,
"rewards/reasoning_quality_reward_func": 0.9062640517950058,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 1.0,
"rewards/xmlcount_reward_func": 0.498046875,
"step": 28
},
{
"completion_length": 409.859375,
"epoch": 0.3591331269349845,
"grad_norm": 0.28277286887168884,
"kl": 0.047645531594753265,
"learning_rate": 3.3570176974357714e-05,
"loss": 0.0019,
"reward": 2.482310175895691,
"reward_std": 0.155114084482193,
"rewards/ngram_similarity_reward_func": 0.17045550420880318,
"rewards/reasoning_quality_reward_func": 0.874354675412178,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.6875,
"rewards/xmlcount_reward_func": 0.5,
"step": 29
},
{
"completion_length": 359.9375,
"epoch": 0.3715170278637771,
"grad_norm": 0.2870733439922333,
"kl": 0.0618684496730566,
"learning_rate": 3.295770572215697e-05,
"loss": 0.0025,
"reward": 2.976421356201172,
"reward_std": 0.17351308092474937,
"rewards/ngram_similarity_reward_func": 0.5148354731500149,
"rewards/reasoning_quality_reward_func": 0.8990859091281891,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.8125,
"rewards/xmlcount_reward_func": 0.5,
"step": 30
},
{
"completion_length": 364.09375,
"epoch": 0.38390092879256965,
"grad_norm": 0.3511677086353302,
"kl": 0.07197799813002348,
"learning_rate": 3.232437585431883e-05,
"loss": 0.0029,
"reward": 2.9660959243774414,
"reward_std": 0.1771723162382841,
"rewards/ngram_similarity_reward_func": 0.4719693809747696,
"rewards/reasoning_quality_reward_func": 0.8925640434026718,
"rewards/soft_format_reward_func": 0.24609375,
"rewards/sql_execution_reward_func": 0.859375,
"rewards/xmlcount_reward_func": 0.49609375,
"step": 31
},
{
"completion_length": 388.890625,
"epoch": 0.39628482972136225,
"grad_norm": 0.27826693654060364,
"kl": 0.061267949640750885,
"learning_rate": 3.1671392950242836e-05,
"loss": 0.0025,
"reward": 2.6841952204704285,
"reward_std": 0.15280664712190628,
"rewards/ngram_similarity_reward_func": 0.3556499555706978,
"rewards/reasoning_quality_reward_func": 0.9199515581130981,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.6585937440395355,
"rewards/xmlcount_reward_func": 0.5,
"step": 32
},
{
"completion_length": 366.859375,
"epoch": 0.4086687306501548,
"grad_norm": 0.2917367219924927,
"kl": 0.06815788336098194,
"learning_rate": 3.1e-05,
"loss": 0.0027,
"reward": 2.9397113919258118,
"reward_std": 0.1146523468196392,
"rewards/ngram_similarity_reward_func": 0.296003520488739,
"rewards/reasoning_quality_reward_func": 0.905426561832428,
"rewards/soft_format_reward_func": 0.2421875,
"rewards/sql_execution_reward_func": 1.0,
"rewards/xmlcount_reward_func": 0.49609375,
"step": 33
},
{
"completion_length": 349.6875,
"epoch": 0.42105263157894735,
"grad_norm": 0.26540401577949524,
"kl": 0.07279603462666273,
"learning_rate": 3.0311475038230616e-05,
"loss": 0.0029,
"reward": 2.89504611492157,
"reward_std": 0.19008862972259521,
"rewards/ngram_similarity_reward_func": 0.4920961260795593,
"rewards/reasoning_quality_reward_func": 0.9310749769210815,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.7218750044703484,
"rewards/xmlcount_reward_func": 0.5,
"step": 34
},
{
"completion_length": 349.3125,
"epoch": 0.43343653250773995,
"grad_norm": 0.30334243178367615,
"kl": 0.06882389821112156,
"learning_rate": 2.960712871133259e-05,
"loss": 0.0028,
"reward": 2.8439746499061584,
"reward_std": 0.24623795598745346,
"rewards/ngram_similarity_reward_func": 0.5152419656515121,
"rewards/reasoning_quality_reward_func": 0.9223406165838242,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.6563920453190804,
"rewards/xmlcount_reward_func": 0.5,
"step": 35
},
{
"completion_length": 385.125,
"epoch": 0.4458204334365325,
"grad_norm": 0.28430214524269104,
"kl": 0.06620587687939405,
"learning_rate": 2.8888301782571618e-05,
"loss": 0.0026,
"reward": 2.7404602766036987,
"reward_std": 0.25473709031939507,
"rewards/ngram_similarity_reward_func": 0.3008290082216263,
"rewards/reasoning_quality_reward_func": 0.934943750500679,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.7546875029802322,
"rewards/xmlcount_reward_func": 0.5,
"step": 36
},
{
"completion_length": 340.421875,
"epoch": 0.4582043343653251,
"grad_norm": 0.2889171838760376,
"kl": 0.07523482665419579,
"learning_rate": 2.8156362579862042e-05,
"loss": 0.003,
"reward": 2.897187650203705,
"reward_std": 0.25312334299087524,
"rewards/ngram_similarity_reward_func": 0.41222984343767166,
"rewards/reasoning_quality_reward_func": 0.925192192196846,
"rewards/soft_format_reward_func": 0.24609375,
"rewards/sql_execution_reward_func": 0.8156249970197678,
"rewards/xmlcount_reward_func": 0.498046875,
"step": 37
},
{
"completion_length": 320.171875,
"epoch": 0.47058823529411764,
"grad_norm": 0.2764835059642792,
"kl": 0.07314357813447714,
"learning_rate": 2.7412704391076914e-05,
"loss": 0.0029,
"reward": 3.0073145031929016,
"reward_std": 0.1429160237312317,
"rewards/ngram_similarity_reward_func": 0.4768817350268364,
"rewards/reasoning_quality_reward_func": 0.9210578054189682,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.859375,
"rewards/xmlcount_reward_func": 0.5,
"step": 38
},
{
"completion_length": 332.09375,
"epoch": 0.48297213622291024,
"grad_norm": 0.32125160098075867,
"kl": 0.07088590506464243,
"learning_rate": 2.6658742811845377e-05,
"loss": 0.0028,
"reward": 3.05528324842453,
"reward_std": 0.1555289849638939,
"rewards/ngram_similarity_reward_func": 0.42346765100955963,
"rewards/reasoning_quality_reward_func": 0.9286906123161316,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.953125,
"rewards/xmlcount_reward_func": 0.5,
"step": 39
},
{
"completion_length": 329.9375,
"epoch": 0.4953560371517028,
"grad_norm": 0.29124927520751953,
"kl": 0.06565980054438114,
"learning_rate": 2.5895913050885853e-05,
"loss": 0.0026,
"reward": 2.8043496012687683,
"reward_std": 0.18914231285452843,
"rewards/ngram_similarity_reward_func": 0.39962920919060707,
"rewards/reasoning_quality_reward_func": 0.9246421754360199,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.7320312410593033,
"rewards/xmlcount_reward_func": 0.498046875,
"step": 40
},
{
"completion_length": 317.640625,
"epoch": 0.5077399380804953,
"grad_norm": 0.28270959854125977,
"kl": 0.07616368494927883,
"learning_rate": 2.512566719800475e-05,
"loss": 0.003,
"reward": 3.0060057044029236,
"reward_std": 0.20819612592458725,
"rewards/ngram_similarity_reward_func": 0.46407629549503326,
"rewards/reasoning_quality_reward_func": 0.9291453063488007,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.8627840876579285,
"rewards/xmlcount_reward_func": 0.5,
"step": 41
},
{
"completion_length": 320.1875,
"epoch": 0.5201238390092879,
"grad_norm": 0.29271021485328674,
"kl": 0.07423507608473301,
"learning_rate": 2.4349471459960935e-05,
"loss": 0.003,
"reward": 2.8800657987594604,
"reward_std": 0.12723891995847225,
"rewards/ngram_similarity_reward_func": 0.39660485088825226,
"rewards/reasoning_quality_reward_func": 0.902210921049118,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.8312499970197678,
"rewards/xmlcount_reward_func": 0.5,
"step": 42
},
{
"completion_length": 344.484375,
"epoch": 0.5325077399380805,
"grad_norm": 0.28183987736701965,
"kl": 0.07508978061378002,
"learning_rate": 2.356880336945785e-05,
"loss": 0.003,
"reward": 2.802961766719818,
"reward_std": 0.15244293212890625,
"rewards/ngram_similarity_reward_func": 0.28034142404794693,
"rewards/reasoning_quality_reward_func": 0.9257453233003616,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.8468749970197678,
"rewards/xmlcount_reward_func": 0.5,
"step": 43
},
{
"completion_length": 331.1875,
"epoch": 0.544891640866873,
"grad_norm": 0.2783842086791992,
"kl": 0.0693344809114933,
"learning_rate": 2.2785148972576052e-05,
"loss": 0.0028,
"reward": 2.757667899131775,
"reward_std": 0.18025352619588375,
"rewards/ngram_similarity_reward_func": 0.48256489634513855,
"rewards/reasoning_quality_reward_func": 0.9297906160354614,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.5953124985098839,
"rewards/xmlcount_reward_func": 0.5,
"step": 44
},
{
"completion_length": 324.140625,
"epoch": 0.5572755417956656,
"grad_norm": 0.2986034154891968,
"kl": 0.07126997038722038,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.0029,
"reward": 2.700526773929596,
"reward_std": 0.1273169182240963,
"rewards/ngram_similarity_reward_func": 0.2901314552873373,
"rewards/reasoning_quality_reward_func": 0.9182077944278717,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.7421875,
"rewards/xmlcount_reward_func": 0.5,
"step": 45
},
{
"completion_length": 312.859375,
"epoch": 0.5696594427244582,
"grad_norm": 0.2958148717880249,
"kl": 0.07780578825622797,
"learning_rate": 2.1214851027423954e-05,
"loss": 0.0031,
"reward": 2.8383458256721497,
"reward_std": 0.14491115603595972,
"rewards/ngram_similarity_reward_func": 0.40570230409502983,
"rewards/reasoning_quality_reward_func": 0.916734367609024,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.7659090906381607,
"rewards/xmlcount_reward_func": 0.5,
"step": 46
},
{
"completion_length": 311.828125,
"epoch": 0.5820433436532507,
"grad_norm": 0.2920028567314148,
"kl": 0.06001428607851267,
"learning_rate": 2.0431196630542152e-05,
"loss": 0.0024,
"reward": 3.046830952167511,
"reward_std": 0.2057624664157629,
"rewards/ngram_similarity_reward_func": 0.6009372770786285,
"rewards/reasoning_quality_reward_func": 0.9173781126737595,
"rewards/soft_format_reward_func": 0.24609375,
"rewards/sql_execution_reward_func": 0.7843749970197678,
"rewards/xmlcount_reward_func": 0.498046875,
"step": 47
},
{
"completion_length": 305.65625,
"epoch": 0.5944272445820433,
"grad_norm": 0.2988949418067932,
"kl": 0.06703130528330803,
"learning_rate": 1.9650528540039077e-05,
"loss": 0.0027,
"reward": 2.6817620396614075,
"reward_std": 0.12368473783135414,
"rewards/ngram_similarity_reward_func": 0.38093067705631256,
"rewards/reasoning_quality_reward_func": 0.9133312404155731,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.637499988079071,
"rewards/xmlcount_reward_func": 0.5,
"step": 48
},
{
"completion_length": 312.8125,
"epoch": 0.6068111455108359,
"grad_norm": 0.27996182441711426,
"kl": 0.08176139369606972,
"learning_rate": 1.8874332801995258e-05,
"loss": 0.0033,
"reward": 3.1290236711502075,
"reward_std": 0.09270750731229782,
"rewards/ngram_similarity_reward_func": 0.44197215139865875,
"rewards/reasoning_quality_reward_func": 0.9370515793561935,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 1.0,
"rewards/xmlcount_reward_func": 0.5,
"step": 49
},
{
"completion_length": 344.09375,
"epoch": 0.6191950464396285,
"grad_norm": 0.27371323108673096,
"kl": 0.07049352023750544,
"learning_rate": 1.810408694911415e-05,
"loss": 0.0028,
"reward": 2.818117916584015,
"reward_std": 0.21658625453710556,
"rewards/ngram_similarity_reward_func": 0.41760701686143875,
"rewards/reasoning_quality_reward_func": 0.9512921720743179,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.703125,
"rewards/xmlcount_reward_func": 0.49609375,
"step": 50
},
{
"completion_length": 317.46875,
"epoch": 0.631578947368421,
"grad_norm": 0.30640900135040283,
"kl": 0.0756241325289011,
"learning_rate": 1.7341257188154625e-05,
"loss": 0.003,
"reward": 2.9922556281089783,
"reward_std": 0.10434877779334784,
"rewards/ngram_similarity_reward_func": 0.4318556822836399,
"rewards/reasoning_quality_reward_func": 0.9354000091552734,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.875,
"rewards/xmlcount_reward_func": 0.5,
"step": 51
},
{
"completion_length": 362.546875,
"epoch": 0.6439628482972136,
"grad_norm": 0.2744141221046448,
"kl": 0.062476624734699726,
"learning_rate": 1.6587295608923088e-05,
"loss": 0.0025,
"reward": 2.807952642440796,
"reward_std": 0.12682343646883965,
"rewards/ngram_similarity_reward_func": 0.25346523337066174,
"rewards/reasoning_quality_reward_func": 0.9455031156539917,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.8609375059604645,
"rewards/xmlcount_reward_func": 0.498046875,
"step": 52
},
{
"completion_length": 326.640625,
"epoch": 0.6563467492260062,
"grad_norm": 0.2845337688922882,
"kl": 0.06813267059624195,
"learning_rate": 1.5843637420137964e-05,
"loss": 0.0027,
"reward": 2.9080602526664734,
"reward_std": 0.15239747613668442,
"rewards/ngram_similarity_reward_func": 0.47624945640563965,
"rewards/reasoning_quality_reward_func": 0.9552484601736069,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.7265625,
"rewards/xmlcount_reward_func": 0.5,
"step": 53
},
{
"completion_length": 336.140625,
"epoch": 0.6687306501547987,
"grad_norm": 0.27873164415359497,
"kl": 0.06789861433207989,
"learning_rate": 1.5111698217428385e-05,
"loss": 0.0027,
"reward": 2.9421083331108093,
"reward_std": 0.14753830805420876,
"rewards/ngram_similarity_reward_func": 0.4633248597383499,
"rewards/reasoning_quality_reward_func": 0.9520296901464462,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.7767538130283356,
"rewards/xmlcount_reward_func": 0.5,
"step": 54
},
{
"completion_length": 342.953125,
"epoch": 0.6811145510835913,
"grad_norm": 0.29634758830070496,
"kl": 0.0591394891962409,
"learning_rate": 1.4392871288667415e-05,
"loss": 0.0024,
"reward": 2.8162970542907715,
"reward_std": 0.11586539167910814,
"rewards/ngram_similarity_reward_func": 0.28963764756917953,
"rewards/reasoning_quality_reward_func": 0.92978435754776,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.8468749970197678,
"rewards/xmlcount_reward_func": 0.5,
"step": 55
},
{
"completion_length": 326.34375,
"epoch": 0.6934984520123839,
"grad_norm": 0.3019636273384094,
"kl": 0.07066548988223076,
"learning_rate": 1.3688524961769396e-05,
"loss": 0.0028,
"reward": 2.952807366847992,
"reward_std": 0.11193438991904259,
"rewards/ngram_similarity_reward_func": 0.4892011173069477,
"rewards/reasoning_quality_reward_func": 0.9323562532663345,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.78125,
"rewards/xmlcount_reward_func": 0.5,
"step": 56
},
{
"completion_length": 341.09375,
"epoch": 0.7058823529411765,
"grad_norm": 0.27949875593185425,
"kl": 0.08039782661944628,
"learning_rate": 1.3000000000000006e-05,
"loss": 0.0032,
"reward": 3.017184257507324,
"reward_std": 0.12472959142178297,
"rewards/ngram_similarity_reward_func": 0.4284828044474125,
"rewards/reasoning_quality_reward_func": 0.9480765461921692,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.890625,
"rewards/xmlcount_reward_func": 0.5,
"step": 57
},
{
"completion_length": 348.234375,
"epoch": 0.718266253869969,
"grad_norm": 0.26645585894584656,
"kl": 0.06729870289564133,
"learning_rate": 1.232860704975717e-05,
"loss": 0.0027,
"reward": 2.8991669416427612,
"reward_std": 0.04610138572752476,
"rewards/ngram_similarity_reward_func": 0.29805589467287064,
"rewards/reasoning_quality_reward_func": 0.9761109352111816,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.875,
"rewards/xmlcount_reward_func": 0.5,
"step": 58
},
{
"completion_length": 322.765625,
"epoch": 0.7306501547987616,
"grad_norm": 0.2988031804561615,
"kl": 0.07192371133714914,
"learning_rate": 1.1675624145681177e-05,
"loss": 0.0029,
"reward": 2.894783914089203,
"reward_std": 0.19001621287316084,
"rewards/ngram_similarity_reward_func": 0.5530954301357269,
"rewards/reasoning_quality_reward_func": 0.9504609107971191,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.6412276178598404,
"rewards/xmlcount_reward_func": 0.5,
"step": 59
},
{
"completion_length": 355.75,
"epoch": 0.7430340557275542,
"grad_norm": 0.2641834020614624,
"kl": 0.06368108931928873,
"learning_rate": 1.1042294277843029e-05,
"loss": 0.0025,
"reward": 2.89058655500412,
"reward_std": 0.12556276191025972,
"rewards/ngram_similarity_reward_func": 0.40262240916490555,
"rewards/reasoning_quality_reward_func": 0.9567140638828278,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.78125,
"rewards/xmlcount_reward_func": 0.5,
"step": 60
},
{
"completion_length": 300.3125,
"epoch": 0.7554179566563467,
"grad_norm": 0.29292401671409607,
"kl": 0.08026566356420517,
"learning_rate": 1.0429823025642292e-05,
"loss": 0.0032,
"reward": 3.160749673843384,
"reward_std": 0.2943864706903696,
"rewards/ngram_similarity_reward_func": 0.5859435126185417,
"rewards/reasoning_quality_reward_func": 0.9517593681812286,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.875,
"rewards/xmlcount_reward_func": 0.498046875,
"step": 61
},
{
"completion_length": 307.1875,
"epoch": 0.7678018575851393,
"grad_norm": 0.29157254099845886,
"kl": 0.07550233788788319,
"learning_rate": 9.839376262918117e-06,
"loss": 0.003,
"reward": 2.749490201473236,
"reward_std": 0.1510023418813944,
"rewards/ngram_similarity_reward_func": 0.4469105303287506,
"rewards/reasoning_quality_reward_func": 0.9275796860456467,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.625,
"rewards/xmlcount_reward_func": 0.5,
"step": 62
},
{
"completion_length": 320.5625,
"epoch": 0.7801857585139319,
"grad_norm": 0.26841044425964355,
"kl": 0.07417850941419601,
"learning_rate": 9.272077938642147e-06,
"loss": 0.003,
"reward": 3.029486298561096,
"reward_std": 0.14038624800741673,
"rewards/ngram_similarity_reward_func": 0.5038420185446739,
"rewards/reasoning_quality_reward_func": 0.9472562223672867,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.828388050198555,
"rewards/xmlcount_reward_func": 0.5,
"step": 63
},
{
"completion_length": 347.9375,
"epoch": 0.7925696594427245,
"grad_norm": 0.2752821743488312,
"kl": 0.0751224858686328,
"learning_rate": 8.72900793741777e-06,
"loss": 0.003,
"reward": 3.038633167743683,
"reward_std": 0.08897098805755377,
"rewards/ngram_similarity_reward_func": 0.4439566656947136,
"rewards/reasoning_quality_reward_func": 0.9696765691041946,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.875,
"rewards/xmlcount_reward_func": 0.5,
"step": 64
},
{
"completion_length": 339.515625,
"epoch": 0.804953560371517,
"grad_norm": 0.31212714314460754,
"kl": 0.08936411328613758,
"learning_rate": 8.2112000238584e-06,
"loss": 0.0036,
"reward": 3.021269977092743,
"reward_std": 0.10588092915713787,
"rewards/ngram_similarity_reward_func": 0.5628981739282608,
"rewards/reasoning_quality_reward_func": 0.9739968627691269,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.734375,
"rewards/xmlcount_reward_func": 0.5,
"step": 65
},
{
"completion_length": 376.796875,
"epoch": 0.8173374613003096,
"grad_norm": 0.26753705739974976,
"kl": 0.0706013347953558,
"learning_rate": 7.71963987475777e-06,
"loss": 0.0028,
"reward": 2.8900545239448547,
"reward_std": 0.11279613338410854,
"rewards/ngram_similarity_reward_func": 0.31394050642848015,
"rewards/reasoning_quality_reward_func": 0.9636140614748001,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.8624999970197678,
"rewards/xmlcount_reward_func": 0.5,
"step": 66
},
{
"completion_length": 325.171875,
"epoch": 0.8297213622291022,
"grad_norm": 0.2971738278865814,
"kl": 0.07812906242907047,
"learning_rate": 7.255263202798146e-06,
"loss": 0.0031,
"reward": 2.962576150894165,
"reward_std": 0.0649077408015728,
"rewards/ngram_similarity_reward_func": 0.3916136734187603,
"rewards/reasoning_quality_reward_func": 0.9459624886512756,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.875,
"rewards/xmlcount_reward_func": 0.5,
"step": 67
},
{
"completion_length": 339.921875,
"epoch": 0.8421052631578947,
"grad_norm": 0.2827884256839752,
"kl": 0.08069668803364038,
"learning_rate": 6.818953975368061e-06,
"loss": 0.0032,
"reward": 3.235768675804138,
"reward_std": 0.1414957493543625,
"rewards/ngram_similarity_reward_func": 0.5445858836174011,
"rewards/reasoning_quality_reward_func": 0.9724328070878983,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.96875,
"rewards/xmlcount_reward_func": 0.5,
"step": 68
},
{
"completion_length": 374.375,
"epoch": 0.8544891640866873,
"grad_norm": 0.2604800760746002,
"kl": 0.07065233774483204,
"learning_rate": 6.411542731880104e-06,
"loss": 0.0028,
"reward": 3.021795392036438,
"reward_std": 0.15352355409413576,
"rewards/ngram_similarity_reward_func": 0.34035639837384224,
"rewards/reasoning_quality_reward_func": 0.9814390540122986,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.9500000029802322,
"rewards/xmlcount_reward_func": 0.5,
"step": 69
},
{
"completion_length": 346.640625,
"epoch": 0.8668730650154799,
"grad_norm": 0.2746836543083191,
"kl": 0.06909866165369749,
"learning_rate": 6.03380500279201e-06,
"loss": 0.0028,
"reward": 2.642494022846222,
"reward_std": 0.19972242414951324,
"rewards/ngram_similarity_reward_func": 0.4592840112745762,
"rewards/reasoning_quality_reward_func": 0.9604828059673309,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.4727272689342499,
"rewards/xmlcount_reward_func": 0.5,
"step": 70
},
{
"completion_length": 345.78125,
"epoch": 0.8792569659442725,
"grad_norm": 0.3155825734138489,
"kl": 0.07083407044410706,
"learning_rate": 5.686459833340302e-06,
"loss": 0.0028,
"reward": 2.6602479815483093,
"reward_std": 0.1187733905389905,
"rewards/ngram_similarity_reward_func": 0.4456479325890541,
"rewards/reasoning_quality_reward_func": 0.9665531367063522,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.498046875,
"step": 71
},
{
"completion_length": 347.640625,
"epoch": 0.891640866873065,
"grad_norm": 0.2810722589492798,
"kl": 0.0792484674602747,
"learning_rate": 5.370168414796839e-06,
"loss": 0.0032,
"reward": 2.9334104657173157,
"reward_std": 0.15719584189355373,
"rewards/ngram_similarity_reward_func": 0.4541776664555073,
"rewards/reasoning_quality_reward_func": 0.9636077880859375,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.765625,
"rewards/xmlcount_reward_func": 0.5,
"step": 72
},
{
"completion_length": 331.265625,
"epoch": 0.9040247678018576,
"grad_norm": 0.2836628556251526,
"kl": 0.08229007571935654,
"learning_rate": 5.085532825853651e-06,
"loss": 0.0033,
"reward": 3.0361828804016113,
"reward_std": 0.19561532977968454,
"rewards/ngram_similarity_reward_func": 0.40529073029756546,
"rewards/reasoning_quality_reward_func": 0.9590171575546265,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.921875,
"rewards/xmlcount_reward_func": 0.5,
"step": 73
},
{
"completion_length": 345.25,
"epoch": 0.9164086687306502,
"grad_norm": 0.2591722309589386,
"kl": 0.0776594839990139,
"learning_rate": 4.833094886531918e-06,
"loss": 0.0031,
"reward": 2.752366602420807,
"reward_std": 0.1025167815387249,
"rewards/ngram_similarity_reward_func": 0.4143087863922119,
"rewards/reasoning_quality_reward_func": 0.9630578011274338,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.625,
"rewards/xmlcount_reward_func": 0.5,
"step": 74
},
{
"completion_length": 377.71875,
"epoch": 0.9287925696594427,
"grad_norm": 0.25510552525520325,
"kl": 0.07739250548183918,
"learning_rate": 4.613335126796773e-06,
"loss": 0.0031,
"reward": 3.047813057899475,
"reward_std": 0.18943855166435242,
"rewards/ngram_similarity_reward_func": 0.36462873220443726,
"rewards/reasoning_quality_reward_func": 0.9753718972206116,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.9578125029802322,
"rewards/xmlcount_reward_func": 0.5,
"step": 75
},
{
"completion_length": 353.359375,
"epoch": 0.9411764705882353,
"grad_norm": 0.26207593083381653,
"kl": 0.0766323795542121,
"learning_rate": 4.4266718718412e-06,
"loss": 0.0031,
"reward": 3.032430052757263,
"reward_std": 0.06077156774699688,
"rewards/ngram_similarity_reward_func": 0.4302191510796547,
"rewards/reasoning_quality_reward_func": 0.9772109389305115,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.875,
"rewards/xmlcount_reward_func": 0.5,
"step": 76
},
{
"completion_length": 330.796875,
"epoch": 0.9535603715170279,
"grad_norm": 0.26912447810173035,
"kl": 0.07224935106933117,
"learning_rate": 4.2734604457802565e-06,
"loss": 0.0029,
"reward": 2.9969308972358704,
"reward_std": 0.11895978637039661,
"rewards/ngram_similarity_reward_func": 0.5269326269626617,
"rewards/reasoning_quality_reward_func": 0.9542312771081924,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.7657670378684998,
"rewards/xmlcount_reward_func": 0.5,
"step": 77
},
{
"completion_length": 359.5625,
"epoch": 0.9659442724458205,
"grad_norm": 0.25813353061676025,
"kl": 0.07646154798567295,
"learning_rate": 4.153992495271414e-06,
"loss": 0.0031,
"reward": 3.1742658615112305,
"reward_std": 0.15984882693737745,
"rewards/ngram_similarity_reward_func": 0.4850096367299557,
"rewards/reasoning_quality_reward_func": 0.9642562717199326,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.9749999940395355,
"rewards/xmlcount_reward_func": 0.5,
"step": 78
},
{
"completion_length": 348.515625,
"epoch": 0.978328173374613,
"grad_norm": 0.269113689661026,
"kl": 0.07896707952022552,
"learning_rate": 4.0684954343485806e-06,
"loss": 0.0032,
"reward": 2.8017610907554626,
"reward_std": 0.23625551722943783,
"rewards/ngram_similarity_reward_func": 0.47032520920038223,
"rewards/reasoning_quality_reward_func": 0.9892484545707703,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.5921874865889549,
"rewards/xmlcount_reward_func": 0.5,
"step": 79
},
{
"completion_length": 375.296875,
"epoch": 0.9907120743034056,
"grad_norm": 0.2751859724521637,
"kl": 0.06904378905892372,
"learning_rate": 4.01713201152656e-06,
"loss": 0.0028,
"reward": 2.791142702102661,
"reward_std": 0.14453750429674983,
"rewards/ngram_similarity_reward_func": 0.31529772467911243,
"rewards/reasoning_quality_reward_func": 0.9427484273910522,
"rewards/soft_format_reward_func": 0.25,
"rewards/sql_execution_reward_func": 0.783096581697464,
"rewards/xmlcount_reward_func": 0.5,
"step": 80
}
],
"logging_steps": 1,
"max_steps": 80,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}