text2sql-grpo-intermediate-reward / trainer_state.json
genies-llm's picture
Model save
46f3827 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9943502824858759,
"eval_steps": 100,
"global_step": 220,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 193.8046875,
"epoch": 0.00903954802259887,
"grad_norm": 0.299434095621109,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0276,
"num_tokens": 503964.0,
"reward": 4.863432988524437,
"reward_std": 1.8696988988667727,
"rewards/accuracy_reward": 0.345703125,
"rewards/exec_out_all_reward": 0.7421875,
"rewards/exec_out_step_reward": 0.9397887196391821,
"rewards/format_reward": 0.642578125,
"rewards/keywords_iou_reward": 0.3189462535083294,
"rewards/sql_step_keywords_recall_reward": 0.5181736210361123,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 188.369140625,
"epoch": 0.01807909604519774,
"grad_norm": 0.3049573004245758,
"kl": 8.754432201385498e-08,
"learning_rate": 1.3636363636363637e-07,
"loss": 0.0311,
"num_tokens": 1008385.0,
"reward": 4.9951048865914345,
"reward_std": 1.8348420038819313,
"rewards/accuracy_reward": 0.34765625,
"rewards/exec_out_all_reward": 0.787109375,
"rewards/exec_out_step_reward": 0.9488211516290903,
"rewards/format_reward": 0.642578125,
"rewards/keywords_iou_reward": 0.35271753335837275,
"rewards/sql_step_keywords_recall_reward": 0.5205361591652036,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 190.701171875,
"epoch": 0.02711864406779661,
"grad_norm": 0.3177661597728729,
"kl": 0.00011102110147476196,
"learning_rate": 2.7272727272727274e-07,
"loss": 0.0338,
"num_tokens": 1511588.0,
"reward": 4.939835079014301,
"reward_std": 1.7858339007943869,
"rewards/accuracy_reward": 0.322265625,
"rewards/exec_out_all_reward": 0.80078125,
"rewards/exec_out_step_reward": 0.9515764508396387,
"rewards/format_reward": 0.65625,
"rewards/keywords_iou_reward": 0.3603124172659591,
"rewards/sql_step_keywords_recall_reward": 0.5215400578454137,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 194.2421875,
"epoch": 0.03615819209039548,
"grad_norm": 0.29520705342292786,
"kl": 0.00011537596583366394,
"learning_rate": 4.0909090909090906e-07,
"loss": 0.0418,
"num_tokens": 2016412.0,
"reward": 4.9831836223602295,
"reward_std": 1.7916885651648045,
"rewards/accuracy_reward": 0.37109375,
"rewards/exec_out_all_reward": 0.751953125,
"rewards/exec_out_step_reward": 0.9367489777505398,
"rewards/format_reward": 0.630859375,
"rewards/keywords_iou_reward": 0.32331358385272324,
"rewards/sql_step_keywords_recall_reward": 0.5326199810951948,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 192.185546875,
"epoch": 0.04519774011299435,
"grad_norm": 0.2962106168270111,
"kl": 0.00011671334505081177,
"learning_rate": 5.454545454545455e-07,
"loss": 0.0337,
"num_tokens": 2520743.0,
"reward": 4.651097267866135,
"reward_std": 1.9646679311990738,
"rewards/accuracy_reward": 0.318359375,
"rewards/exec_out_all_reward": 0.763671875,
"rewards/exec_out_step_reward": 0.9430377371609211,
"rewards/format_reward": 0.595703125,
"rewards/keywords_iou_reward": 0.29655176820233464,
"rewards/sql_step_keywords_recall_reward": 0.48214349802583456,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 190.55078125,
"epoch": 0.05423728813559322,
"grad_norm": 0.3201525807380676,
"kl": 0.00012993812561035156,
"learning_rate": 6.818181818181818e-07,
"loss": 0.0321,
"num_tokens": 3023701.0,
"reward": 4.995345205068588,
"reward_std": 2.013074729591608,
"rewards/accuracy_reward": 0.36328125,
"rewards/exec_out_all_reward": 0.7734375,
"rewards/exec_out_step_reward": 0.9453125055879354,
"rewards/format_reward": 0.623046875,
"rewards/keywords_iou_reward": 0.33588895108550787,
"rewards/sql_step_keywords_recall_reward": 0.5286454004235566,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 187.921875,
"epoch": 0.06327683615819209,
"grad_norm": 0.2839597165584564,
"kl": 0.00017252564430236816,
"learning_rate": 8.181818181818181e-07,
"loss": 0.0222,
"num_tokens": 3524501.0,
"reward": 5.12370303273201,
"reward_std": 1.872809598222375,
"rewards/accuracy_reward": 0.38671875,
"rewards/exec_out_all_reward": 0.771484375,
"rewards/exec_out_step_reward": 0.9374604746699333,
"rewards/format_reward": 0.662109375,
"rewards/keywords_iou_reward": 0.3198722831439227,
"rewards/sql_step_keywords_recall_reward": 0.5660292678512633,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 194.861328125,
"epoch": 0.07231638418079096,
"grad_norm": 0.264257550239563,
"kl": 0.0003757178783416748,
"learning_rate": 9.545454545454546e-07,
"loss": 0.0214,
"num_tokens": 4033750.0,
"reward": 4.592174172401428,
"reward_std": 1.6819164399057627,
"rewards/accuracy_reward": 0.287109375,
"rewards/exec_out_all_reward": 0.72265625,
"rewards/exec_out_step_reward": 0.9274584576487541,
"rewards/format_reward": 0.662109375,
"rewards/keywords_iou_reward": 0.31115873460657895,
"rewards/sql_step_keywords_recall_reward": 0.5091950967907906,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 190.73046875,
"epoch": 0.08135593220338982,
"grad_norm": 0.2734237313270569,
"kl": 0.0005688667297363281,
"learning_rate": 1.090909090909091e-06,
"loss": 0.0149,
"num_tokens": 4536888.0,
"reward": 5.064344555139542,
"reward_std": 1.7968224007636309,
"rewards/accuracy_reward": 0.35546875,
"rewards/exec_out_all_reward": 0.7109375,
"rewards/exec_out_step_reward": 0.9287527892738581,
"rewards/format_reward": 0.7265625,
"rewards/keywords_iou_reward": 0.354521602508612,
"rewards/sql_step_keywords_recall_reward": 0.5671735098585486,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 177.17578125,
"epoch": 0.0903954802259887,
"grad_norm": 0.22836743295192719,
"kl": 0.002085447311401367,
"learning_rate": 1.2272727272727274e-06,
"loss": 0.002,
"num_tokens": 5032606.0,
"reward": 5.785055458545685,
"reward_std": 1.7521540587767959,
"rewards/accuracy_reward": 0.4609375,
"rewards/exec_out_all_reward": 0.759765625,
"rewards/exec_out_step_reward": 0.9359297584742308,
"rewards/format_reward": 0.826171875,
"rewards/keywords_iou_reward": 0.38180301152169704,
"rewards/sql_step_keywords_recall_reward": 0.6558322114869952,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 177.865234375,
"epoch": 0.09943502824858758,
"grad_norm": 0.2427971363067627,
"kl": 0.0032596588134765625,
"learning_rate": 1.3636363636363636e-06,
"loss": 0.0119,
"num_tokens": 5529005.0,
"reward": 5.771241188049316,
"reward_std": 1.641195336356759,
"rewards/accuracy_reward": 0.4375,
"rewards/exec_out_all_reward": 0.783203125,
"rewards/exec_out_step_reward": 0.9457356799393892,
"rewards/format_reward": 0.869140625,
"rewards/keywords_iou_reward": 0.39362214831635356,
"rewards/sql_step_keywords_recall_reward": 0.6359174773097038,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 173.3203125,
"epoch": 0.10847457627118644,
"grad_norm": 0.23091059923171997,
"kl": 0.003955364227294922,
"learning_rate": 1.5e-06,
"loss": 0.0064,
"num_tokens": 6024409.0,
"reward": 6.000889599323273,
"reward_std": 1.5886465199291706,
"rewards/accuracy_reward": 0.46875,
"rewards/exec_out_all_reward": 0.76953125,
"rewards/exec_out_step_reward": 0.9391183033585548,
"rewards/format_reward": 0.8984375,
"rewards/keywords_iou_reward": 0.42962841456755996,
"rewards/sql_step_keywords_recall_reward": 0.6595456739887595,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 171.25,
"epoch": 0.11751412429378531,
"grad_norm": 0.2572859823703766,
"kl": 0.007582187652587891,
"learning_rate": 1.6363636363636363e-06,
"loss": 0.0009,
"num_tokens": 6520105.0,
"reward": 5.847834274172783,
"reward_std": 1.4467571768909693,
"rewards/accuracy_reward": 0.46484375,
"rewards/exec_out_all_reward": 0.744140625,
"rewards/exec_out_step_reward": 0.9297774098813534,
"rewards/format_reward": 0.904296875,
"rewards/keywords_iou_reward": 0.3783310679718852,
"rewards/sql_step_keywords_recall_reward": 0.6535822190344334,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 160.04296875,
"epoch": 0.12655367231638417,
"grad_norm": 0.23153281211853027,
"kl": 0.012262344360351562,
"learning_rate": 1.7727272727272729e-06,
"loss": -0.0025,
"num_tokens": 7007539.0,
"reward": 5.9820186495780945,
"reward_std": 1.6872543934732676,
"rewards/accuracy_reward": 0.431640625,
"rewards/exec_out_all_reward": 0.82421875,
"rewards/exec_out_step_reward": 0.9496279824525118,
"rewards/format_reward": 0.90234375,
"rewards/keywords_iou_reward": 0.44878690084442496,
"rewards/sql_step_keywords_recall_reward": 0.6816918756812811,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 159.3203125,
"epoch": 0.13559322033898305,
"grad_norm": 0.22347486019134521,
"kl": 0.015293121337890625,
"learning_rate": 1.909090909090909e-06,
"loss": 0.001,
"num_tokens": 7495787.0,
"reward": 5.671351440250874,
"reward_std": 1.3719452489167452,
"rewards/accuracy_reward": 0.357421875,
"rewards/exec_out_all_reward": 0.771484375,
"rewards/exec_out_step_reward": 0.939460875466466,
"rewards/format_reward": 0.93359375,
"rewards/keywords_iou_reward": 0.46561355609446764,
"rewards/sql_step_keywords_recall_reward": 0.6658978424966335,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 158.810546875,
"epoch": 0.14463276836158193,
"grad_norm": 0.2615886926651001,
"kl": 0.021930694580078125,
"learning_rate": 2.0454545454545453e-06,
"loss": -0.003,
"num_tokens": 7983430.0,
"reward": 5.687411919236183,
"reward_std": 1.400244857184589,
"rewards/accuracy_reward": 0.38671875,
"rewards/exec_out_all_reward": 0.791015625,
"rewards/exec_out_step_reward": 0.9499209504574537,
"rewards/format_reward": 0.88671875,
"rewards/keywords_iou_reward": 0.4251784700900316,
"rewards/sql_step_keywords_recall_reward": 0.6625246489420533,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 155.96484375,
"epoch": 0.1536723163841808,
"grad_norm": 0.2243858426809311,
"kl": 0.019573211669921875,
"learning_rate": 2.181818181818182e-06,
"loss": 0.0037,
"num_tokens": 8467588.0,
"reward": 6.101026564836502,
"reward_std": 1.3565897848457098,
"rewards/accuracy_reward": 0.447265625,
"rewards/exec_out_all_reward": 0.826171875,
"rewards/exec_out_step_reward": 0.9590797107666731,
"rewards/format_reward": 0.93359375,
"rewards/keywords_iou_reward": 0.4580969992093742,
"rewards/sql_step_keywords_recall_reward": 0.6769247055053711,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 157.99609375,
"epoch": 0.16271186440677965,
"grad_norm": 0.23680506646633148,
"kl": 0.0247344970703125,
"learning_rate": 2.318181818181818e-06,
"loss": 0.0027,
"num_tokens": 8954138.0,
"reward": 6.041056051850319,
"reward_std": 1.2283777361735702,
"rewards/accuracy_reward": 0.431640625,
"rewards/exec_out_all_reward": 0.845703125,
"rewards/exec_out_step_reward": 0.959887308999896,
"rewards/format_reward": 0.9375,
"rewards/keywords_iou_reward": 0.44093527970835567,
"rewards/sql_step_keywords_recall_reward": 0.6895325118675828,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 156.0703125,
"epoch": 0.17175141242937852,
"grad_norm": 0.2468118816614151,
"kl": 0.0263519287109375,
"learning_rate": 2.454545454545455e-06,
"loss": 0.0033,
"num_tokens": 9439242.0,
"reward": 6.419172838330269,
"reward_std": 1.4407691890373826,
"rewards/accuracy_reward": 0.49609375,
"rewards/exec_out_all_reward": 0.83984375,
"rewards/exec_out_step_reward": 0.9605569522827864,
"rewards/format_reward": 0.9375,
"rewards/keywords_iou_reward": 0.4862754005007446,
"rewards/sql_step_keywords_recall_reward": 0.7243463154882193,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 158.8828125,
"epoch": 0.1807909604519774,
"grad_norm": 0.2642815411090851,
"kl": 0.028415679931640625,
"learning_rate": 2.590909090909091e-06,
"loss": 0.0059,
"num_tokens": 9927642.0,
"reward": 5.962770789861679,
"reward_std": 1.364680239930749,
"rewards/accuracy_reward": 0.423828125,
"rewards/exec_out_all_reward": 0.875,
"rewards/exec_out_step_reward": 0.9687531031668186,
"rewards/format_reward": 0.9375,
"rewards/keywords_iou_reward": 0.4039350217208266,
"rewards/sql_step_keywords_recall_reward": 0.678335040807724,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 157.8203125,
"epoch": 0.18983050847457628,
"grad_norm": 0.24316003918647766,
"kl": 0.032009124755859375,
"learning_rate": 2.7272727272727272e-06,
"loss": -0.0003,
"num_tokens": 10415362.0,
"reward": 6.179068893194199,
"reward_std": 1.4617707338184118,
"rewards/accuracy_reward": 0.462890625,
"rewards/exec_out_all_reward": 0.845703125,
"rewards/exec_out_step_reward": 0.9599469937384129,
"rewards/format_reward": 0.9375,
"rewards/keywords_iou_reward": 0.4459744766354561,
"rewards/sql_step_keywords_recall_reward": 0.6924073351547122,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 157.458984375,
"epoch": 0.19887005649717515,
"grad_norm": 0.24961793422698975,
"kl": 0.03668212890625,
"learning_rate": 2.863636363636364e-06,
"loss": -0.004,
"num_tokens": 10901521.0,
"reward": 6.071441277861595,
"reward_std": 1.1777024501934648,
"rewards/accuracy_reward": 0.453125,
"rewards/exec_out_all_reward": 0.830078125,
"rewards/exec_out_step_reward": 0.9483445044606924,
"rewards/format_reward": 0.953125,
"rewards/keywords_iou_reward": 0.42953827418386936,
"rewards/sql_step_keywords_recall_reward": 0.6683171540498734,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 158.517578125,
"epoch": 0.207909604519774,
"grad_norm": 0.26023003458976746,
"kl": 0.03612518310546875,
"learning_rate": 3e-06,
"loss": 0.0019,
"num_tokens": 11389862.0,
"reward": 6.313733980059624,
"reward_std": 1.3131706872954965,
"rewards/accuracy_reward": 0.47265625,
"rewards/exec_out_all_reward": 0.880859375,
"rewards/exec_out_step_reward": 0.9664326030761003,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.46862596087157726,
"rewards/sql_step_keywords_recall_reward": 0.6971588619053364,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 162.736328125,
"epoch": 0.21694915254237288,
"grad_norm": 0.25574371218681335,
"kl": 0.039890289306640625,
"learning_rate": 2.9998111915108126e-06,
"loss": -0.0018,
"num_tokens": 11879287.0,
"reward": 5.985842078924179,
"reward_std": 1.2227760329842567,
"rewards/accuracy_reward": 0.423828125,
"rewards/exec_out_all_reward": 0.876953125,
"rewards/exec_out_step_reward": 0.9674719516187906,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.4203591588884592,
"rewards/sql_step_keywords_recall_reward": 0.6659330297261477,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 162.716796875,
"epoch": 0.22598870056497175,
"grad_norm": 0.2357674390077591,
"kl": 0.039127349853515625,
"learning_rate": 2.9992448135747778e-06,
"loss": -0.0065,
"num_tokens": 12367230.0,
"reward": 6.357031494379044,
"reward_std": 1.352663902565837,
"rewards/accuracy_reward": 0.482421875,
"rewards/exec_out_all_reward": 0.89453125,
"rewards/exec_out_step_reward": 0.9736126679927111,
"rewards/format_reward": 0.935546875,
"rewards/keywords_iou_reward": 0.462111447006464,
"rewards/sql_step_keywords_recall_reward": 0.6994303409010172,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 163.759765625,
"epoch": 0.23502824858757063,
"grad_norm": 3.8160126209259033,
"kl": 0.41811370849609375,
"learning_rate": 2.998301008774512e-06,
"loss": 0.0131,
"num_tokens": 12855263.0,
"reward": 6.066176131367683,
"reward_std": 1.418117775581777,
"rewards/accuracy_reward": 0.453125,
"rewards/exec_out_all_reward": 0.865234375,
"rewards/exec_out_step_reward": 0.9645538832992315,
"rewards/format_reward": 0.9140625,
"rewards/keywords_iou_reward": 0.4158592028543353,
"rewards/sql_step_keywords_recall_reward": 0.678106939420104,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 172.52734375,
"epoch": 0.2440677966101695,
"grad_norm": 0.23165632784366608,
"kl": 0.03951263427734375,
"learning_rate": 2.9969800147078265e-06,
"loss": 0.0075,
"num_tokens": 13348781.0,
"reward": 6.2546906769275665,
"reward_std": 1.2166710263118148,
"rewards/accuracy_reward": 0.451171875,
"rewards/exec_out_all_reward": 0.859375,
"rewards/exec_out_step_reward": 0.960680965334177,
"rewards/format_reward": 0.9453125,
"rewards/keywords_iou_reward": 0.4882043502293527,
"rewards/sql_step_keywords_recall_reward": 0.7082259934395552,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 174.0,
"epoch": 0.25310734463276835,
"grad_norm": 0.2445058375597,
"kl": 0.04166412353515625,
"learning_rate": 2.9952821639279137e-06,
"loss": 0.0028,
"num_tokens": 494680.0,
"reward": 6.440436959266663,
"reward_std": 1.2339025381952524,
"rewards/accuracy_reward": 0.50390625,
"rewards/exec_out_all_reward": 0.83984375,
"rewards/exec_out_step_reward": 0.9568103682249784,
"rewards/format_reward": 0.947265625,
"rewards/keywords_iou_reward": 0.48325524432584643,
"rewards/sql_step_keywords_recall_reward": 0.7143817320466042,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 178.755859375,
"epoch": 0.2621468926553672,
"grad_norm": 0.23020873963832855,
"kl": 0.04241943359375,
"learning_rate": 2.993207883859627e-06,
"loss": -0.003,
"num_tokens": 991863.0,
"reward": 5.925758346915245,
"reward_std": 1.3979150608647615,
"rewards/accuracy_reward": 0.4140625,
"rewards/exec_out_all_reward": 0.865234375,
"rewards/exec_out_step_reward": 0.9639307502657175,
"rewards/format_reward": 0.931640625,
"rewards/keywords_iou_reward": 0.41449853405356407,
"rewards/sql_step_keywords_recall_reward": 0.6797055369243026,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 182.48828125,
"epoch": 0.2711864406779661,
"grad_norm": 0.22643530368804932,
"kl": 0.04361724853515625,
"learning_rate": 2.990757696691881e-06,
"loss": 0.0059,
"num_tokens": 1490665.0,
"reward": 6.013850957155228,
"reward_std": 1.3883078750222921,
"rewards/accuracy_reward": 0.43359375,
"rewards/exec_out_all_reward": 0.833984375,
"rewards/exec_out_step_reward": 0.9505758639425039,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.43903162656351924,
"rewards/sql_step_keywords_recall_reward": 0.6773993754759431,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 190.423828125,
"epoch": 0.280225988700565,
"grad_norm": 0.2063753306865692,
"kl": 0.0457611083984375,
"learning_rate": 2.987932219246193e-06,
"loss": 0.0075,
"num_tokens": 1993394.0,
"reward": 5.88956793397665,
"reward_std": 1.3650804716162384,
"rewards/accuracy_reward": 0.40625,
"rewards/exec_out_all_reward": 0.810546875,
"rewards/exec_out_step_reward": 0.9385083485394716,
"rewards/format_reward": 0.921875,
"rewards/keywords_iou_reward": 0.4571849275380373,
"rewards/sql_step_keywords_recall_reward": 0.679267879575491,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 193.060546875,
"epoch": 0.28926553672316385,
"grad_norm": 0.22804522514343262,
"kl": 0.0471343994140625,
"learning_rate": 2.984732162821399e-06,
"loss": 0.0114,
"num_tokens": 2497401.0,
"reward": 5.931887894868851,
"reward_std": 1.4683727947995067,
"rewards/accuracy_reward": 0.435546875,
"rewards/exec_out_all_reward": 0.822265625,
"rewards/exec_out_step_reward": 0.9442894347012043,
"rewards/format_reward": 0.90625,
"rewards/keywords_iou_reward": 0.4232069947756827,
"rewards/sql_step_keywords_recall_reward": 0.670481245033443,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 194.625,
"epoch": 0.2983050847457627,
"grad_norm": 0.20885376632213593,
"kl": 0.04837799072265625,
"learning_rate": 2.9811583330145917e-06,
"loss": 0.0136,
"num_tokens": 3002817.0,
"reward": 6.591131284832954,
"reward_std": 1.272476114332676,
"rewards/accuracy_reward": 0.5390625,
"rewards/exec_out_all_reward": 0.8359375,
"rewards/exec_out_step_reward": 0.9475725479424,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.4941097451373935,
"rewards/sql_step_keywords_recall_reward": 0.7217455059289932,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 201.3046875,
"epoch": 0.3073446327683616,
"grad_norm": 0.21211469173431396,
"kl": 0.048919677734375,
"learning_rate": 2.9772116295183124e-06,
"loss": -0.001,
"num_tokens": 3512913.0,
"reward": 6.256010413169861,
"reward_std": 1.339096024632454,
"rewards/accuracy_reward": 0.482421875,
"rewards/exec_out_all_reward": 0.853515625,
"rewards/exec_out_step_reward": 0.9534575026482344,
"rewards/format_reward": 0.93359375,
"rewards/keywords_iou_reward": 0.4364256302360445,
"rewards/sql_step_keywords_recall_reward": 0.7129048258066177,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 204.94140625,
"epoch": 0.3163841807909605,
"grad_norm": 0.22092854976654053,
"kl": 0.0505828857421875,
"learning_rate": 2.97289304589406e-06,
"loss": 0.017,
"num_tokens": 4024451.0,
"reward": 5.7779867351055145,
"reward_std": 1.4151953971013427,
"rewards/accuracy_reward": 0.376953125,
"rewards/exec_out_all_reward": 0.8359375,
"rewards/exec_out_step_reward": 0.9576846230775118,
"rewards/format_reward": 0.91015625,
"rewards/keywords_iou_reward": 0.4494855832308531,
"rewards/sql_step_keywords_recall_reward": 0.6674247067421675,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 205.88671875,
"epoch": 0.3254237288135593,
"grad_norm": 0.20495107769966125,
"kl": 0.0490264892578125,
"learning_rate": 2.9682036693221684e-06,
"loss": 0.0146,
"num_tokens": 4537929.0,
"reward": 6.179159179329872,
"reward_std": 1.19661252386868,
"rewards/accuracy_reward": 0.458984375,
"rewards/exec_out_all_reward": 0.837890625,
"rewards/exec_out_step_reward": 0.9454868901520967,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.48097023693844676,
"rewards/sql_step_keywords_recall_reward": 0.6564974309876561,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 205.6953125,
"epoch": 0.3344632768361582,
"grad_norm": 0.20743127167224884,
"kl": 0.05022430419921875,
"learning_rate": 2.963144680328111e-06,
"loss": 0.0123,
"num_tokens": 5048565.0,
"reward": 6.279510959982872,
"reward_std": 1.4437304949387908,
"rewards/accuracy_reward": 0.478515625,
"rewards/exec_out_all_reward": 0.861328125,
"rewards/exec_out_step_reward": 0.95453792065382,
"rewards/format_reward": 0.923828125,
"rewards/keywords_iou_reward": 0.4697499736212194,
"rewards/sql_step_keywords_recall_reward": 0.686254383996129,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 208.59765625,
"epoch": 0.34350282485875705,
"grad_norm": 0.20503395795822144,
"kl": 0.05016326904296875,
"learning_rate": 2.9577173524853125e-06,
"loss": -0.0049,
"num_tokens": 5560463.0,
"reward": 5.8292489647865295,
"reward_std": 1.3312111617997289,
"rewards/accuracy_reward": 0.3984375,
"rewards/exec_out_all_reward": 0.837890625,
"rewards/exec_out_step_reward": 0.9564809743314981,
"rewards/format_reward": 0.91796875,
"rewards/keywords_iou_reward": 0.430765890982002,
"rewards/sql_step_keywords_recall_reward": 0.6616268502548337,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 204.892578125,
"epoch": 0.3525423728813559,
"grad_norm": 0.19791673123836517,
"kl": 0.04753875732421875,
"learning_rate": 2.9519230520945346e-06,
"loss": -0.0044,
"num_tokens": 6072524.0,
"reward": 6.163229390978813,
"reward_std": 1.2822516057640314,
"rewards/accuracy_reward": 0.453125,
"rewards/exec_out_all_reward": 0.859375,
"rewards/exec_out_step_reward": 0.963240172713995,
"rewards/format_reward": 0.9453125,
"rewards/keywords_iou_reward": 0.4430888262577355,
"rewards/sql_step_keywords_recall_reward": 0.696624081581831,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 206.439453125,
"epoch": 0.3615819209039548,
"grad_norm": 0.1895550638437271,
"kl": 0.048187255859375,
"learning_rate": 2.9457632378399134e-06,
"loss": 0.0102,
"num_tokens": 6585445.0,
"reward": 5.869170263409615,
"reward_std": 1.2297673234716058,
"rewards/accuracy_reward": 0.37890625,
"rewards/exec_out_all_reward": 0.85546875,
"rewards/exec_out_step_reward": 0.9623821955174208,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.4554209231864661,
"rewards/sql_step_keywords_recall_reward": 0.6834461260586977,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 201.962890625,
"epoch": 0.3706214689265537,
"grad_norm": 0.20803290605545044,
"kl": 0.0442962646484375,
"learning_rate": 2.9392394604217463e-06,
"loss": 0.0043,
"num_tokens": 7094046.0,
"reward": 6.21223983168602,
"reward_std": 1.2842160500586033,
"rewards/accuracy_reward": 0.4765625,
"rewards/exec_out_all_reward": 0.875,
"rewards/exec_out_step_reward": 0.9636959079653025,
"rewards/format_reward": 0.931640625,
"rewards/keywords_iou_reward": 0.43379027443006635,
"rewards/sql_step_keywords_recall_reward": 0.6680727442726493,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 206.572265625,
"epoch": 0.37966101694915255,
"grad_norm": 0.19538187980651855,
"kl": 0.044647216796875,
"learning_rate": 2.932353362166111e-06,
"loss": 0.0142,
"num_tokens": 7608915.0,
"reward": 6.033717706799507,
"reward_std": 1.3723872043192387,
"rewards/accuracy_reward": 0.431640625,
"rewards/exec_out_all_reward": 0.857421875,
"rewards/exec_out_step_reward": 0.9629634786397219,
"rewards/format_reward": 0.95703125,
"rewards/keywords_iou_reward": 0.429211582057178,
"rewards/sql_step_keywords_recall_reward": 0.6713154595345259,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 203.2421875,
"epoch": 0.3887005649717514,
"grad_norm": 0.20477567613124847,
"kl": 0.04170989990234375,
"learning_rate": 2.9251066766114183e-06,
"loss": 0.0111,
"num_tokens": 8120303.0,
"reward": 5.484863147139549,
"reward_std": 1.2587912240996957,
"rewards/accuracy_reward": 0.3046875,
"rewards/exec_out_all_reward": 0.87109375,
"rewards/exec_out_step_reward": 0.9700288362801075,
"rewards/format_reward": 0.93359375,
"rewards/keywords_iou_reward": 0.42473567882552743,
"rewards/sql_step_keywords_recall_reward": 0.641925479285419,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 198.205078125,
"epoch": 0.3977401129943503,
"grad_norm": 0.19639819860458374,
"kl": 0.042938232421875,
"learning_rate": 2.9175012280720027e-06,
"loss": -0.0058,
"num_tokens": 8629068.0,
"reward": 5.85739204287529,
"reward_std": 1.3158389078453183,
"rewards/accuracy_reward": 0.39453125,
"rewards/exec_out_all_reward": 0.828125,
"rewards/exec_out_step_reward": 0.9519577771425247,
"rewards/format_reward": 0.9375,
"rewards/keywords_iou_reward": 0.44756252504885197,
"rewards/sql_step_keywords_recall_reward": 0.6665591625496745,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 199.18359375,
"epoch": 0.4067796610169492,
"grad_norm": 0.19431763887405396,
"kl": 0.041290283203125,
"learning_rate": 2.9095389311788626e-06,
"loss": 0.0103,
"num_tokens": 9137718.0,
"reward": 6.069079004228115,
"reward_std": 1.3046986246481538,
"rewards/accuracy_reward": 0.451171875,
"rewards/exec_out_all_reward": 0.8359375,
"rewards/exec_out_step_reward": 0.9571854863315821,
"rewards/format_reward": 0.943359375,
"rewards/keywords_iou_reward": 0.4311121259815991,
"rewards/sql_step_keywords_recall_reward": 0.6656848564743996,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 197.578125,
"epoch": 0.415819209039548,
"grad_norm": 0.18736235797405243,
"kl": 0.04170989990234375,
"learning_rate": 2.9012217903976603e-06,
"loss": 0.0009,
"num_tokens": 9644030.0,
"reward": 6.115010187029839,
"reward_std": 1.2594214268028736,
"rewards/accuracy_reward": 0.443359375,
"rewards/exec_out_all_reward": 0.87890625,
"rewards/exec_out_step_reward": 0.9649127330631018,
"rewards/format_reward": 0.94921875,
"rewards/keywords_iou_reward": 0.42334456741809845,
"rewards/sql_step_keywords_recall_reward": 0.7018458480015397,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 194.72265625,
"epoch": 0.4248587570621469,
"grad_norm": 0.21365734934806824,
"kl": 0.03936004638671875,
"learning_rate": 2.892551899524109e-06,
"loss": -0.0027,
"num_tokens": 10148012.0,
"reward": 6.157889060676098,
"reward_std": 1.2168289944529533,
"rewards/accuracy_reward": 0.447265625,
"rewards/exec_out_all_reward": 0.90234375,
"rewards/exec_out_step_reward": 0.9766919370740652,
"rewards/format_reward": 0.943359375,
"rewards/keywords_iou_reward": 0.43009675364010036,
"rewards/sql_step_keywords_recall_reward": 0.6862379219383001,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 187.716796875,
"epoch": 0.43389830508474575,
"grad_norm": 0.1940770298242569,
"kl": 0.0411834716796875,
"learning_rate": 2.8835314411568722e-06,
"loss": 0.0058,
"num_tokens": 10649115.0,
"reward": 6.114228963851929,
"reward_std": 1.1688512060791254,
"rewards/accuracy_reward": 0.423828125,
"rewards/exec_out_all_reward": 0.876953125,
"rewards/exec_out_step_reward": 0.9754417818039656,
"rewards/format_reward": 0.94921875,
"rewards/keywords_iou_reward": 0.4551887298002839,
"rewards/sql_step_keywords_recall_reward": 0.7069253623485565,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 188.994140625,
"epoch": 0.4429378531073446,
"grad_norm": 0.1919233798980713,
"kl": 0.040985107421875,
"learning_rate": 2.8741626861481045e-06,
"loss": 0.0096,
"num_tokens": 11150488.0,
"reward": 6.672178938984871,
"reward_std": 1.279738076031208,
"rewards/accuracy_reward": 0.51171875,
"rewards/exec_out_all_reward": 0.90234375,
"rewards/exec_out_step_reward": 0.9763346407562494,
"rewards/format_reward": 0.970703125,
"rewards/keywords_iou_reward": 0.5284834480844438,
"rewards/sql_step_keywords_recall_reward": 0.7189555410295725,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 191.287109375,
"epoch": 0.4519774011299435,
"grad_norm": 0.20888246595859528,
"kl": 0.041839599609375,
"learning_rate": 2.8644479930317777e-06,
"loss": 0.0116,
"num_tokens": 11654247.0,
"reward": 6.019737772643566,
"reward_std": 1.2559357401914895,
"rewards/accuracy_reward": 0.423828125,
"rewards/exec_out_all_reward": 0.87109375,
"rewards/exec_out_step_reward": 0.9652940593659878,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.44982042722404003,
"rewards/sql_step_keywords_recall_reward": 0.6469903746619821,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 188.822265625,
"epoch": 0.4610169491525424,
"grad_norm": 0.20416894555091858,
"kl": 0.04168701171875,
"learning_rate": 2.854389807429932e-06,
"loss": 0.0077,
"num_tokens": 12156812.0,
"reward": 5.93711394071579,
"reward_std": 1.3493517027236521,
"rewards/accuracy_reward": 0.419921875,
"rewards/exec_out_all_reward": 0.859375,
"rewards/exec_out_step_reward": 0.9683663547039032,
"rewards/format_reward": 0.95703125,
"rewards/keywords_iou_reward": 0.4109305152669549,
"rewards/sql_step_keywords_recall_reward": 0.6507927812635899,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 186.201171875,
"epoch": 0.47005649717514125,
"grad_norm": 0.2009599655866623,
"kl": 0.04203033447265625,
"learning_rate": 2.843990661437004e-06,
"loss": -0.0079,
"num_tokens": 12656615.0,
"reward": 6.319135099649429,
"reward_std": 1.1828816812485456,
"rewards/accuracy_reward": 0.490234375,
"rewards/exec_out_all_reward": 0.87109375,
"rewards/exec_out_step_reward": 0.9687747992575169,
"rewards/format_reward": 0.966796875,
"rewards/keywords_iou_reward": 0.44148961594328284,
"rewards/sql_step_keywords_recall_reward": 0.668552921153605,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 184.240234375,
"epoch": 0.47909604519774013,
"grad_norm": 0.19552090764045715,
"kl": 0.040004730224609375,
"learning_rate": 2.8332531729823854e-06,
"loss": 0.0091,
"num_tokens": 13154062.0,
"reward": 6.084091693162918,
"reward_std": 1.3168746987357736,
"rewards/accuracy_reward": 0.439453125,
"rewards/exec_out_all_reward": 0.89453125,
"rewards/exec_out_step_reward": 0.9761377759277821,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.41599453624803573,
"rewards/sql_step_keywords_recall_reward": 0.6822148254141212,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 186.546875,
"epoch": 0.488135593220339,
"grad_norm": 0.18975664675235748,
"kl": 0.041107177734375,
"learning_rate": 2.822180045171373e-06,
"loss": 0.0031,
"num_tokens": 13654138.0,
"reward": 6.5647883862257,
"reward_std": 1.142817527987063,
"rewards/accuracy_reward": 0.5390625,
"rewards/exec_out_all_reward": 0.890625,
"rewards/exec_out_step_reward": 0.972067216411233,
"rewards/format_reward": 0.951171875,
"rewards/keywords_iou_reward": 0.45550795644521713,
"rewards/sql_step_keywords_recall_reward": 0.6836584862321615,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 185.6171875,
"epoch": 0.4971751412429379,
"grad_norm": 0.19297046959400177,
"kl": 0.04022216796875,
"learning_rate": 2.8107740656046774e-06,
"loss": 0.0018,
"num_tokens": 14153762.0,
"reward": 6.067966505885124,
"reward_std": 1.339047422632575,
"rewards/accuracy_reward": 0.419921875,
"rewards/exec_out_all_reward": 0.87109375,
"rewards/exec_out_step_reward": 0.9681291859596968,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.4687476740218699,
"rewards/sql_step_keywords_recall_reward": 0.6701545566320419,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 183.33203125,
"epoch": 0.5062146892655367,
"grad_norm": 0.19569052755832672,
"kl": 0.03861236572265625,
"learning_rate": 2.7990381056766585e-06,
"loss": -0.0018,
"num_tokens": 14653404.0,
"reward": 6.230767786502838,
"reward_std": 1.3125396608375013,
"rewards/accuracy_reward": 0.4609375,
"rewards/exec_out_all_reward": 0.892578125,
"rewards/exec_out_step_reward": 0.9760687928646803,
"rewards/format_reward": 0.951171875,
"rewards/keywords_iou_reward": 0.44265242759138346,
"rewards/sql_step_keywords_recall_reward": 0.681894151493907,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 191.53125,
"epoch": 0.5152542372881356,
"grad_norm": 0.19364245235919952,
"kl": 0.0371551513671875,
"learning_rate": 2.7869751198524656e-06,
"loss": -0.0058,
"num_tokens": 15157368.0,
"reward": 6.011801972985268,
"reward_std": 1.4972982537001371,
"rewards/accuracy_reward": 0.435546875,
"rewards/exec_out_all_reward": 0.841796875,
"rewards/exec_out_step_reward": 0.9621403776109219,
"rewards/format_reward": 0.935546875,
"rewards/keywords_iou_reward": 0.4371058586984873,
"rewards/sql_step_keywords_recall_reward": 0.6559187090024352,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 189.95703125,
"epoch": 0.5242937853107345,
"grad_norm": 0.1767347753047943,
"kl": 0.037456512451171875,
"learning_rate": 2.7745881449242716e-06,
"loss": 0.0095,
"num_tokens": 15662582.0,
"reward": 6.229088187217712,
"reward_std": 1.2932423749007285,
"rewards/accuracy_reward": 0.4921875,
"rewards/exec_out_all_reward": 0.853515625,
"rewards/exec_out_step_reward": 0.9643019940704107,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.4135230230167508,
"rewards/sql_step_keywords_recall_reward": 0.6760213691741228,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 191.84765625,
"epoch": 0.5333333333333333,
"grad_norm": 0.1982230544090271,
"kl": 0.036502838134765625,
"learning_rate": 2.761880299246772e-06,
"loss": -0.0078,
"num_tokens": 16165460.0,
"reward": 6.063759118318558,
"reward_std": 1.3337543765082955,
"rewards/accuracy_reward": 0.439453125,
"rewards/exec_out_all_reward": 0.8828125,
"rewards/exec_out_step_reward": 0.9730437770485878,
"rewards/format_reward": 0.95703125,
"rewards/keywords_iou_reward": 0.42756529804319143,
"rewards/sql_step_keywords_recall_reward": 0.6379284737631679,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 191.296875,
"epoch": 0.5423728813559322,
"grad_norm": 0.19018128514289856,
"kl": 0.036468505859375,
"learning_rate": 2.748854781952157e-06,
"loss": -0.008,
"num_tokens": 16671384.0,
"reward": 6.285549536347389,
"reward_std": 1.3978888802230358,
"rewards/accuracy_reward": 0.490234375,
"rewards/exec_out_all_reward": 0.861328125,
"rewards/exec_out_step_reward": 0.9668580982834101,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.4441379075869918,
"rewards/sql_step_keywords_recall_reward": 0.6686968319118023,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 203.625,
"epoch": 0.5514124293785311,
"grad_norm": 0.19416409730911255,
"kl": 0.033016204833984375,
"learning_rate": 2.735514872144749e-06,
"loss": -0.008,
"num_tokens": 17181944.0,
"reward": 6.115775644779205,
"reward_std": 1.6173988990485668,
"rewards/accuracy_reward": 0.453125,
"rewards/exec_out_all_reward": 0.873046875,
"rewards/exec_out_step_reward": 0.9669309612363577,
"rewards/format_reward": 0.953125,
"rewards/keywords_iou_reward": 0.4201988475397229,
"rewards/sql_step_keywords_recall_reward": 0.6697751097381115,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 197.470703125,
"epoch": 0.56045197740113,
"grad_norm": 0.19720803201198578,
"kl": 0.033603668212890625,
"learning_rate": 2.721863928075504e-06,
"loss": 0.0067,
"num_tokens": 17690761.0,
"reward": 6.248985543847084,
"reward_std": 1.3238589530810714,
"rewards/accuracy_reward": 0.4609375,
"rewards/exec_out_all_reward": 0.876953125,
"rewards/exec_out_step_reward": 0.9681136887520552,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.4740994907915592,
"rewards/sql_step_keywords_recall_reward": 0.6705634454265237,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 202.150390625,
"epoch": 0.5694915254237288,
"grad_norm": 0.19422629475593567,
"kl": 0.032680511474609375,
"learning_rate": 2.707905386296588e-06,
"loss": -0.0065,
"num_tokens": 18198586.0,
"reward": 6.26141269505024,
"reward_std": 1.2987114731222391,
"rewards/accuracy_reward": 0.451171875,
"rewards/exec_out_all_reward": 0.884765625,
"rewards/exec_out_step_reward": 0.9684066604822874,
"rewards/format_reward": 0.947265625,
"rewards/keywords_iou_reward": 0.4992506830021739,
"rewards/sql_step_keywords_recall_reward": 0.6577859437093139,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 206.173828125,
"epoch": 0.5785310734463277,
"grad_norm": 0.17673034965991974,
"kl": 0.034145355224609375,
"learning_rate": 2.6936427607962483e-06,
"loss": 0.0066,
"num_tokens": 18710943.0,
"reward": 5.954583629965782,
"reward_std": 1.3590196399018168,
"rewards/accuracy_reward": 0.43359375,
"rewards/exec_out_all_reward": 0.86328125,
"rewards/exec_out_step_reward": 0.9674510210752487,
"rewards/format_reward": 0.935546875,
"rewards/keywords_iou_reward": 0.3994289576075971,
"rewards/sql_step_keywords_recall_reward": 0.6550715854391456,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 205.908203125,
"epoch": 0.5875706214689266,
"grad_norm": 0.1863545924425125,
"kl": 0.035305023193359375,
"learning_rate": 2.6790796421141813e-06,
"loss": 0.0025,
"num_tokens": 19222232.0,
"reward": 6.260747715830803,
"reward_std": 1.2453271835111082,
"rewards/accuracy_reward": 0.48046875,
"rewards/exec_out_all_reward": 0.876953125,
"rewards/exec_out_step_reward": 0.9697412867099047,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.4435248177032918,
"rewards/sql_step_keywords_recall_reward": 0.6637223660945892,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 208.349609375,
"epoch": 0.5966101694915255,
"grad_norm": 0.18152064085006714,
"kl": 0.034008026123046875,
"learning_rate": 2.6642196964376354e-06,
"loss": 0.005,
"num_tokens": 19736371.0,
"reward": 6.178658068180084,
"reward_std": 1.2107095727697015,
"rewards/accuracy_reward": 0.439453125,
"rewards/exec_out_all_reward": 0.875,
"rewards/exec_out_step_reward": 0.9720323383808136,
"rewards/format_reward": 0.947265625,
"rewards/keywords_iou_reward": 0.47833117935806513,
"rewards/sql_step_keywords_recall_reward": 0.6698852656409144,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 213.107421875,
"epoch": 0.6056497175141243,
"grad_norm": 0.28298619389533997,
"kl": 0.06278610229492188,
"learning_rate": 2.649066664678467e-06,
"loss": -0.001,
"num_tokens": 20249726.0,
"reward": 6.54718804359436,
"reward_std": 1.2923204032704234,
"rewards/accuracy_reward": 0.5546875,
"rewards/exec_out_all_reward": 0.888671875,
"rewards/exec_out_step_reward": 0.9712689146399498,
"rewards/format_reward": 0.943359375,
"rewards/keywords_iou_reward": 0.41723292297683656,
"rewards/sql_step_keywords_recall_reward": 0.6906719226390123,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 216.087890625,
"epoch": 0.6146892655367232,
"grad_norm": 0.1797921508550644,
"kl": 0.037776947021484375,
"learning_rate": 2.6336243615313876e-06,
"loss": -0.0023,
"num_tokens": 20765263.0,
"reward": 6.382973074913025,
"reward_std": 1.2423311527818441,
"rewards/accuracy_reward": 0.486328125,
"rewards/exec_out_all_reward": 0.892578125,
"rewards/exec_out_step_reward": 0.9681625198572874,
"rewards/format_reward": 0.9296875,
"rewards/keywords_iou_reward": 0.48494360502809286,
"rewards/sql_step_keywords_recall_reward": 0.677345173433423,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 218.08203125,
"epoch": 0.6237288135593221,
"grad_norm": 0.171707421541214,
"kl": 0.03691864013671875,
"learning_rate": 2.6178966745136323e-06,
"loss": -0.0042,
"num_tokens": 21284597.0,
"reward": 6.1321365386247635,
"reward_std": 1.2671317560598254,
"rewards/accuracy_reward": 0.45703125,
"rewards/exec_out_all_reward": 0.849609375,
"rewards/exec_out_step_reward": 0.9618140794336796,
"rewards/format_reward": 0.943359375,
"rewards/keywords_iou_reward": 0.43983029294759035,
"rewards/sql_step_keywords_recall_reward": 0.6695681791752577,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 215.431640625,
"epoch": 0.632768361581921,
"grad_norm": 0.18202371895313263,
"kl": 0.0379486083984375,
"learning_rate": 2.6018875629862996e-06,
"loss": -0.0007,
"num_tokens": 21802886.0,
"reward": 6.155725434422493,
"reward_std": 1.3498500874266028,
"rewards/accuracy_reward": 0.447265625,
"rewards/exec_out_all_reward": 0.88671875,
"rewards/exec_out_step_reward": 0.9724082369357347,
"rewards/format_reward": 0.927734375,
"rewards/keywords_iou_reward": 0.45339376712217927,
"rewards/sql_step_keywords_recall_reward": 0.673013923689723,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 222.77734375,
"epoch": 0.6418079096045197,
"grad_norm": 0.1931043118238449,
"kl": 0.04012298583984375,
"learning_rate": 2.585601057157605e-06,
"loss": -0.0014,
"num_tokens": 22324500.0,
"reward": 6.071022488176823,
"reward_std": 1.363126328913495,
"rewards/accuracy_reward": 0.458984375,
"rewards/exec_out_all_reward": 0.845703125,
"rewards/exec_out_step_reward": 0.9601764027029276,
"rewards/format_reward": 0.91015625,
"rewards/keywords_iou_reward": 0.42694294080138206,
"rewards/sql_step_keywords_recall_reward": 0.6651632944121957,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 216.185546875,
"epoch": 0.6508474576271186,
"grad_norm": 0.18399913609027863,
"kl": 0.03882598876953125,
"learning_rate": 2.5690412570682945e-06,
"loss": -0.0003,
"num_tokens": 22841407.0,
"reward": 6.496973499655724,
"reward_std": 1.1896542916074395,
"rewards/accuracy_reward": 0.525390625,
"rewards/exec_out_all_reward": 0.87109375,
"rewards/exec_out_step_reward": 0.9670332632958889,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.45264067919924855,
"rewards/sql_step_keywords_recall_reward": 0.7105963062494993,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 216.98828125,
"epoch": 0.6598870056497175,
"grad_norm": 0.17450737953186035,
"kl": 0.037841796875,
"learning_rate": 2.552212331559482e-06,
"loss": -0.0076,
"num_tokens": 23356665.0,
"reward": 6.324795305728912,
"reward_std": 1.2927344804629683,
"rewards/accuracy_reward": 0.482421875,
"rewards/exec_out_all_reward": 0.916015625,
"rewards/exec_out_step_reward": 0.9763090629130602,
"rewards/format_reward": 0.927734375,
"rewards/keywords_iou_reward": 0.4405778916552663,
"rewards/sql_step_keywords_recall_reward": 0.6938929669559002,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 222.453125,
"epoch": 0.6689265536723163,
"grad_norm": 0.18439583480358124,
"kl": 0.03949737548828125,
"learning_rate": 2.535118517223168e-06,
"loss": 0.0112,
"num_tokens": 23875141.0,
"reward": 6.2983558177948,
"reward_std": 1.3202420324087143,
"rewards/accuracy_reward": 0.494140625,
"rewards/exec_out_all_reward": 0.880859375,
"rewards/exec_out_step_reward": 0.9673696402460337,
"rewards/format_reward": 0.931640625,
"rewards/keywords_iou_reward": 0.41860854998230934,
"rewards/sql_step_keywords_recall_reward": 0.7047065645456314,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 217.3203125,
"epoch": 0.6779661016949152,
"grad_norm": 0.17797358334064484,
"kl": 0.037334442138671875,
"learning_rate": 2.5177641173356982e-06,
"loss": 0.0013,
"num_tokens": 24391073.0,
"reward": 6.193948924541473,
"reward_std": 1.4235245073214173,
"rewards/accuracy_reward": 0.466796875,
"rewards/exec_out_all_reward": 0.8984375,
"rewards/exec_out_step_reward": 0.9762230291962624,
"rewards/format_reward": 0.91796875,
"rewards/keywords_iou_reward": 0.42891340190544724,
"rewards/sql_step_keywords_recall_reward": 0.6763053219765425,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 222.564453125,
"epoch": 0.6870056497175141,
"grad_norm": 0.17648960649967194,
"kl": 0.03826904296875,
"learning_rate": 2.5001535007744377e-06,
"loss": 0.0017,
"num_tokens": 24910594.0,
"reward": 6.196133196353912,
"reward_std": 1.3525635278783739,
"rewards/accuracy_reward": 0.4609375,
"rewards/exec_out_all_reward": 0.8671875,
"rewards/exec_out_step_reward": 0.9637571293860674,
"rewards/format_reward": 0.93359375,
"rewards/keywords_iou_reward": 0.4552514897659421,
"rewards/sql_step_keywords_recall_reward": 0.677341865375638,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 217.21875,
"epoch": 0.696045197740113,
"grad_norm": 0.1959611028432846,
"kl": 0.042377471923828125,
"learning_rate": 2.4822911009179277e-06,
"loss": 0.0062,
"num_tokens": 25428242.0,
"reward": 6.28637857735157,
"reward_std": 1.4062156137079,
"rewards/accuracy_reward": 0.466796875,
"rewards/exec_out_all_reward": 0.876953125,
"rewards/exec_out_step_reward": 0.9695289265364408,
"rewards/format_reward": 0.91796875,
"rewards/keywords_iou_reward": 0.4715513661503792,
"rewards/sql_step_keywords_recall_reward": 0.7116375369951129,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 218.865234375,
"epoch": 0.7050847457627119,
"grad_norm": 0.1893206089735031,
"kl": 0.03656005859375,
"learning_rate": 2.464181414529809e-06,
"loss": 0.0047,
"num_tokens": 25947605.0,
"reward": 5.8394907265901566,
"reward_std": 1.3573181126266718,
"rewards/accuracy_reward": 0.390625,
"rewards/exec_out_all_reward": 0.84375,
"rewards/exec_out_step_reward": 0.9585681743919849,
"rewards/format_reward": 0.92578125,
"rewards/keywords_iou_reward": 0.4459369848482311,
"rewards/sql_step_keywords_recall_reward": 0.6570173809304833,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 212.64453125,
"epoch": 0.7141242937853107,
"grad_norm": 0.17911891639232635,
"kl": 0.037021636962890625,
"learning_rate": 2.4458290006267837e-06,
"loss": -0.0001,
"num_tokens": 26462715.0,
"reward": 6.297634035348892,
"reward_std": 1.2602604366838932,
"rewards/accuracy_reward": 0.4609375,
"rewards/exec_out_all_reward": 0.8671875,
"rewards/exec_out_step_reward": 0.9640160016715527,
"rewards/format_reward": 0.91796875,
"rewards/keywords_iou_reward": 0.4979046704247594,
"rewards/sql_step_keywords_recall_reward": 0.7089024959132075,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 210.369140625,
"epoch": 0.7231638418079096,
"grad_norm": 0.18120358884334564,
"kl": 0.035877227783203125,
"learning_rate": 2.427238479330908e-06,
"loss": 0.0027,
"num_tokens": 26975792.0,
"reward": 6.11888575553894,
"reward_std": 1.5019584177061915,
"rewards/accuracy_reward": 0.431640625,
"rewards/exec_out_all_reward": 0.892578125,
"rewards/exec_out_step_reward": 0.9738637823611498,
"rewards/format_reward": 0.90234375,
"rewards/keywords_iou_reward": 0.4661337183788419,
"rewards/sql_step_keywords_recall_reward": 0.6912701558321714,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 210.333984375,
"epoch": 0.7322033898305085,
"grad_norm": 0.18645890057086945,
"kl": 0.03778076171875,
"learning_rate": 2.4084145307065e-06,
"loss": 0.0031,
"num_tokens": 27488767.0,
"reward": 6.057440027594566,
"reward_std": 1.336686883121729,
"rewards/accuracy_reward": 0.41796875,
"rewards/exec_out_all_reward": 0.8671875,
"rewards/exec_out_step_reward": 0.9673099610954523,
"rewards/format_reward": 0.90625,
"rewards/keywords_iou_reward": 0.49649542290717363,
"rewards/sql_step_keywords_recall_reward": 0.6518266946077347,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 206.6796875,
"epoch": 0.7412429378531074,
"grad_norm": 0.18546722829341888,
"kl": 0.035541534423828125,
"learning_rate": 2.389361893581961e-06,
"loss": -0.0067,
"num_tokens": 28001731.0,
"reward": 6.299026131629944,
"reward_std": 1.3064639195799828,
"rewards/accuracy_reward": 0.490234375,
"rewards/exec_out_all_reward": 0.892578125,
"rewards/exec_out_step_reward": 0.9757549054920673,
"rewards/format_reward": 0.931640625,
"rewards/keywords_iou_reward": 0.4211979394312948,
"rewards/sql_step_keywords_recall_reward": 0.6957191359251738,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 205.423828125,
"epoch": 0.7502824858757062,
"grad_norm": 0.18241924047470093,
"kl": 0.03655242919921875,
"learning_rate": 2.3700853643567976e-06,
"loss": 0.0047,
"num_tokens": 28512732.0,
"reward": 6.236706480383873,
"reward_std": 1.2026822408661246,
"rewards/accuracy_reward": 0.49609375,
"rewards/exec_out_all_reward": 0.861328125,
"rewards/exec_out_step_reward": 0.9606026802212,
"rewards/format_reward": 0.904296875,
"rewards/keywords_iou_reward": 0.43541459972038865,
"rewards/sql_step_keywords_recall_reward": 0.6552745532244444,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 201.544921875,
"epoch": 0.7593220338983051,
"grad_norm": 0.19208958745002747,
"kl": 0.0334625244140625,
"learning_rate": 2.350589795794156e-06,
"loss": -0.0085,
"num_tokens": 29021143.0,
"reward": 6.086180254817009,
"reward_std": 1.245661067776382,
"rewards/accuracy_reward": 0.439453125,
"rewards/exec_out_all_reward": 0.869140625,
"rewards/exec_out_step_reward": 0.9663248769938946,
"rewards/format_reward": 0.900390625,
"rewards/keywords_iou_reward": 0.45154744386672974,
"rewards/sql_step_keywords_recall_reward": 0.6894168108701706,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 201.060546875,
"epoch": 0.768361581920904,
"grad_norm": 0.1888093501329422,
"kl": 0.03191375732421875,
"learning_rate": 2.3308800957991657e-06,
"loss": 0.0122,
"num_tokens": 29529626.0,
"reward": 6.194907002151012,
"reward_std": 1.2625772105529904,
"rewards/accuracy_reward": 0.470703125,
"rewards/exec_out_all_reward": 0.85546875,
"rewards/exec_out_step_reward": 0.9580496698617935,
"rewards/format_reward": 0.9140625,
"rewards/keywords_iou_reward": 0.45335739478468895,
"rewards/sql_step_keywords_recall_reward": 0.6777987945824862,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 195.92578125,
"epoch": 0.7774011299435029,
"grad_norm": 0.18627431988716125,
"kl": 0.031524658203125,
"learning_rate": 2.3109612261833968e-06,
"loss": -0.0039,
"num_tokens": 30036392.0,
"reward": 6.356723390519619,
"reward_std": 1.5176555626094341,
"rewards/accuracy_reward": 0.474609375,
"rewards/exec_out_all_reward": 0.869140625,
"rewards/exec_out_step_reward": 0.9662566669285297,
"rewards/format_reward": 0.919921875,
"rewards/keywords_iou_reward": 0.4986234325915575,
"rewards/sql_step_keywords_recall_reward": 0.7057198826223612,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 191.9296875,
"epoch": 0.7864406779661017,
"grad_norm": 0.19546450674533844,
"kl": 0.031040191650390625,
"learning_rate": 2.2908382014157536e-06,
"loss": 0.0014,
"num_tokens": 30540172.0,
"reward": 6.007154896855354,
"reward_std": 1.4377120230346918,
"rewards/accuracy_reward": 0.44140625,
"rewards/exec_out_all_reward": 0.85546875,
"rewards/exec_out_step_reward": 0.9677215088158846,
"rewards/format_reward": 0.908203125,
"rewards/keywords_iou_reward": 0.4238502769730985,
"rewards/sql_step_keywords_recall_reward": 0.6624359153211117,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 191.09765625,
"epoch": 0.7954802259887006,
"grad_norm": 0.19553017616271973,
"kl": 0.0312347412109375,
"learning_rate": 2.27051608736011e-06,
"loss": -0.0024,
"num_tokens": 31042954.0,
"reward": 6.573052808642387,
"reward_std": 1.3831378351897001,
"rewards/accuracy_reward": 0.52734375,
"rewards/exec_out_all_reward": 0.873046875,
"rewards/exec_out_step_reward": 0.9695428721606731,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.4803972856607288,
"rewards/sql_step_keywords_recall_reward": 0.7208402901887894,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 193.73828125,
"epoch": 0.8045197740112995,
"grad_norm": 0.19286681711673737,
"kl": 0.031497955322265625,
"learning_rate": 2.25e-06,
"loss": 0.0024,
"num_tokens": 31547488.0,
"reward": 6.184370666742325,
"reward_std": 1.2657314036041498,
"rewards/accuracy_reward": 0.451171875,
"rewards/exec_out_all_reward": 0.8828125,
"rewards/exec_out_step_reward": 0.9704876635223627,
"rewards/format_reward": 0.9140625,
"rewards/keywords_iou_reward": 0.4544413227122277,
"rewards/sql_step_keywords_recall_reward": 0.7034378284588456,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 187.453125,
"epoch": 0.8135593220338984,
"grad_norm": 0.19443422555923462,
"kl": 0.030490875244140625,
"learning_rate": 2.229295104150703e-06,
"loss": -0.0012,
"num_tokens": 32049256.0,
"reward": 6.299180343747139,
"reward_std": 1.3650079052895308,
"rewards/accuracy_reward": 0.494140625,
"rewards/exec_out_all_reward": 0.859375,
"rewards/exec_out_step_reward": 0.9578698594123125,
"rewards/format_reward": 0.93359375,
"rewards/keywords_iou_reward": 0.4481339924968779,
"rewards/sql_step_keywords_recall_reward": 0.6755113024264574,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 186.705078125,
"epoch": 0.8225988700564971,
"grad_norm": 0.19431209564208984,
"kl": 0.03226470947265625,
"learning_rate": 2.2084066121590242e-06,
"loss": 0.0028,
"num_tokens": 32550041.0,
"reward": 6.25195187330246,
"reward_std": 1.1578238443471491,
"rewards/accuracy_reward": 0.484375,
"rewards/exec_out_all_reward": 0.859375,
"rewards/exec_out_step_reward": 0.958809994161129,
"rewards/format_reward": 0.943359375,
"rewards/keywords_iou_reward": 0.4414861728437245,
"rewards/sql_step_keywords_recall_reward": 0.6699351165443659,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 183.091796875,
"epoch": 0.831638418079096,
"grad_norm": 0.1891166865825653,
"kl": 0.031597137451171875,
"learning_rate": 2.187339782591116e-06,
"loss": -0.0113,
"num_tokens": 33048284.0,
"reward": 6.556719660758972,
"reward_std": 1.3454436883330345,
"rewards/accuracy_reward": 0.529296875,
"rewards/exec_out_all_reward": 0.875,
"rewards/exec_out_step_reward": 0.9694142211228609,
"rewards/format_reward": 0.94921875,
"rewards/keywords_iou_reward": 0.4681888446211815,
"rewards/sql_step_keywords_recall_reward": 0.7095215003937483,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 189.263671875,
"epoch": 0.8406779661016949,
"grad_norm": 0.19213782250881195,
"kl": 0.032009124755859375,
"learning_rate": 2.166099918908661e-06,
"loss": 0.0056,
"num_tokens": 33552367.0,
"reward": 6.123005196452141,
"reward_std": 1.2814611946232617,
"rewards/accuracy_reward": 0.439453125,
"rewards/exec_out_all_reward": 0.849609375,
"rewards/exec_out_step_reward": 0.9598136860877275,
"rewards/format_reward": 0.955078125,
"rewards/keywords_iou_reward": 0.4669042509049177,
"rewards/sql_step_keywords_recall_reward": 0.6668829349800944,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 187.416015625,
"epoch": 0.8497175141242937,
"grad_norm": 0.19189083576202393,
"kl": 0.033023834228515625,
"learning_rate": 2.1446923681337578e-06,
"loss": 0.0026,
"num_tokens": 34052664.0,
"reward": 6.199526712298393,
"reward_std": 1.2074398496188223,
"rewards/accuracy_reward": 0.4375,
"rewards/exec_out_all_reward": 0.8828125,
"rewards/exec_out_step_reward": 0.9721904434263706,
"rewards/format_reward": 0.947265625,
"rewards/keywords_iou_reward": 0.4780406136997044,
"rewards/sql_step_keywords_recall_reward": 0.6911769825965166,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 179.875,
"epoch": 0.8587570621468926,
"grad_norm": 0.20052167773246765,
"kl": 0.031097412109375,
"learning_rate": 2.1231225195028298e-06,
"loss": 0.0007,
"num_tokens": 34549540.0,
"reward": 6.093083538115025,
"reward_std": 1.3363224570639431,
"rewards/accuracy_reward": 0.421875,
"rewards/exec_out_all_reward": 0.86328125,
"rewards/exec_out_step_reward": 0.9643709696829319,
"rewards/format_reward": 0.9453125,
"rewards/keywords_iou_reward": 0.45903572149109095,
"rewards/sql_step_keywords_recall_reward": 0.7145474180579185,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 178.525390625,
"epoch": 0.8677966101694915,
"grad_norm": 0.19684137403964996,
"kl": 0.033634185791015625,
"learning_rate": 2.1013958031099208e-06,
"loss": 0.0089,
"num_tokens": 35046073.0,
"reward": 6.270583778619766,
"reward_std": 1.2627136316150427,
"rewards/accuracy_reward": 0.501953125,
"rewards/exec_out_all_reward": 0.88671875,
"rewards/exec_out_step_reward": 0.9700528588145971,
"rewards/format_reward": 0.931640625,
"rewards/keywords_iou_reward": 0.40578509494662285,
"rewards/sql_step_keywords_recall_reward": 0.6627888614311814,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 180.716796875,
"epoch": 0.8768361581920904,
"grad_norm": 0.19163627922534943,
"kl": 0.0335235595703125,
"learning_rate": 2.079517688539693e-06,
"loss": 0.0145,
"num_tokens": 35545980.0,
"reward": 6.588066384196281,
"reward_std": 1.1638533752411604,
"rewards/accuracy_reward": 0.546875,
"rewards/exec_out_all_reward": 0.888671875,
"rewards/exec_out_step_reward": 0.9688662607222795,
"rewards/format_reward": 0.958984375,
"rewards/keywords_iou_reward": 0.4430620293132961,
"rewards/sql_step_keywords_recall_reward": 0.6979197897017002,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 184.3359375,
"epoch": 0.8858757062146893,
"grad_norm": 0.20076608657836914,
"kl": 0.03387451171875,
"learning_rate": 2.0574936834904912e-06,
"loss": 0.0007,
"num_tokens": 36044740.0,
"reward": 6.348625332117081,
"reward_std": 1.502235893625766,
"rewards/accuracy_reward": 0.478515625,
"rewards/exec_out_all_reward": 0.869140625,
"rewards/exec_out_step_reward": 0.9674339685589075,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.4792054174467921,
"rewards/sql_step_keywords_recall_reward": 0.7001243568956852,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 183.564453125,
"epoch": 0.8949152542372881,
"grad_norm": 0.1972309947013855,
"kl": 0.036647796630859375,
"learning_rate": 2.0353293323878076e-06,
"loss": -0.001,
"num_tokens": 36544293.0,
"reward": 5.943858131766319,
"reward_std": 1.3974212240427732,
"rewards/accuracy_reward": 0.431640625,
"rewards/exec_out_all_reward": 0.841796875,
"rewards/exec_out_step_reward": 0.9585201255977154,
"rewards/format_reward": 0.947265625,
"rewards/keywords_iou_reward": 0.39970124512910843,
"rewards/sql_step_keywords_recall_reward": 0.6703105177730322,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 179.31640625,
"epoch": 0.903954802259887,
"grad_norm": 0.1914178431034088,
"kl": 0.035003662109375,
"learning_rate": 2.0130302149885033e-06,
"loss": 0.008,
"num_tokens": 37040831.0,
"reward": 6.4351161271333694,
"reward_std": 1.3373866842593998,
"rewards/accuracy_reward": 0.482421875,
"rewards/exec_out_all_reward": 0.90234375,
"rewards/exec_out_step_reward": 0.9746520053595304,
"rewards/format_reward": 0.953125,
"rewards/keywords_iou_reward": 0.481153879314661,
"rewards/sql_step_keywords_recall_reward": 0.713000101968646,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 182.529296875,
"epoch": 0.9129943502824859,
"grad_norm": 0.19319666922092438,
"kl": 0.03475189208984375,
"learning_rate": 1.990601944976133e-06,
"loss": 0.0012,
"num_tokens": 37538390.0,
"reward": 6.169027402997017,
"reward_std": 1.2931091291829944,
"rewards/accuracy_reward": 0.423828125,
"rewards/exec_out_all_reward": 0.916015625,
"rewards/exec_out_step_reward": 0.980399776250124,
"rewards/format_reward": 0.9453125,
"rewards/keywords_iou_reward": 0.4757719023618847,
"rewards/sql_step_keywords_recall_reward": 0.6804432403296232,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 181.212890625,
"epoch": 0.9220338983050848,
"grad_norm": 0.1960325837135315,
"kl": 0.03401947021484375,
"learning_rate": 1.9680501685477304e-06,
"loss": 0.0151,
"num_tokens": 38036931.0,
"reward": 6.41617426276207,
"reward_std": 1.365414334461093,
"rewards/accuracy_reward": 0.5,
"rewards/exec_out_all_reward": 0.85546875,
"rewards/exec_out_step_reward": 0.96528010815382,
"rewards/format_reward": 0.943359375,
"rewards/keywords_iou_reward": 0.4880142016336322,
"rewards/sql_step_keywords_recall_reward": 0.6760376645252109,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 177.46484375,
"epoch": 0.9310734463276836,
"grad_norm": 0.1870131492614746,
"kl": 0.035228729248046875,
"learning_rate": 1.9453805629924126e-06,
"loss": -0.0004,
"num_tokens": 38533177.0,
"reward": 6.086783587932587,
"reward_std": 1.2497869406361133,
"rewards/accuracy_reward": 0.44921875,
"rewards/exec_out_all_reward": 0.880859375,
"rewards/exec_out_step_reward": 0.968900365754962,
"rewards/format_reward": 0.9375,
"rewards/keywords_iou_reward": 0.4101551335770637,
"rewards/sql_step_keywords_recall_reward": 0.6823387397453189,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 178.97265625,
"epoch": 0.9401129943502825,
"grad_norm": 0.20031407475471497,
"kl": 0.035129547119140625,
"learning_rate": 1.9225988352621446e-06,
"loss": -0.0078,
"num_tokens": 39029707.0,
"reward": 6.0163338631391525,
"reward_std": 1.0426889704540372,
"rewards/accuracy_reward": 0.439453125,
"rewards/exec_out_all_reward": 0.83203125,
"rewards/exec_out_step_reward": 0.959713701158762,
"rewards/format_reward": 0.931640625,
"rewards/keywords_iou_reward": 0.4327788045629859,
"rewards/sql_step_keywords_recall_reward": 0.6695781610906124,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 180.3046875,
"epoch": 0.9491525423728814,
"grad_norm": 0.19868069887161255,
"kl": 0.0337371826171875,
"learning_rate": 1.8997107205350524e-06,
"loss": 0.0245,
"num_tokens": 39526947.0,
"reward": 6.0550860315561295,
"reward_std": 1.201542696915567,
"rewards/accuracy_reward": 0.416015625,
"rewards/exec_out_all_reward": 0.84765625,
"rewards/exec_out_step_reward": 0.9624209459871054,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.48174334689974785,
"rewards/sql_step_keywords_recall_reward": 0.6760533768683672,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 175.94140625,
"epoch": 0.9581920903954803,
"grad_norm": 0.19075711071491241,
"kl": 0.035160064697265625,
"learning_rate": 1.8767219807716187e-06,
"loss": 0.0152,
"num_tokens": 40023281.0,
"reward": 6.078598067164421,
"reward_std": 1.1788357459008694,
"rewards/accuracy_reward": 0.4140625,
"rewards/exec_out_all_reward": 0.880859375,
"rewards/exec_out_step_reward": 0.9745574481785297,
"rewards/format_reward": 0.951171875,
"rewards/keywords_iou_reward": 0.4620191561989486,
"rewards/sql_step_keywords_recall_reward": 0.6917209886014462,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 174.169921875,
"epoch": 0.9672316384180791,
"grad_norm": 0.19707335531711578,
"kl": 0.036041259765625,
"learning_rate": 1.853638403264141e-06,
"loss": 0.0039,
"num_tokens": 40516220.0,
"reward": 6.2247384339571,
"reward_std": 1.1692724945023656,
"rewards/accuracy_reward": 0.46875,
"rewards/exec_out_all_reward": 0.87109375,
"rewards/exec_out_step_reward": 0.9709844719618559,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.43461011815816164,
"rewards/sql_step_keywords_recall_reward": 0.6970336530357599,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 179.353515625,
"epoch": 0.976271186440678,
"grad_norm": 0.19845238327980042,
"kl": 0.036739349365234375,
"learning_rate": 1.8304657991798111e-06,
"loss": 0.0253,
"num_tokens": 41014509.0,
"reward": 6.089982569217682,
"reward_std": 1.1940758088603616,
"rewards/accuracy_reward": 0.4140625,
"rewards/exec_out_all_reward": 0.880859375,
"rewards/exec_out_step_reward": 0.9712944850325584,
"rewards/format_reward": 0.951171875,
"rewards/keywords_iou_reward": 0.4622031897306442,
"rewards/sql_step_keywords_recall_reward": 0.7060004426166415,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 181.298828125,
"epoch": 0.9853107344632769,
"grad_norm": 0.20742465555667877,
"kl": 0.03838348388671875,
"learning_rate": 1.8072100020977862e-06,
"loss": 0.0088,
"num_tokens": 41514946.0,
"reward": 5.96857476234436,
"reward_std": 1.265411582775414,
"rewards/accuracy_reward": 0.39453125,
"rewards/exec_out_all_reward": 0.86328125,
"rewards/exec_out_step_reward": 0.9691251274198294,
"rewards/format_reward": 0.953125,
"rewards/keywords_iou_reward": 0.47014313703402877,
"rewards/sql_step_keywords_recall_reward": 0.6646321276202798,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 174.021484375,
"epoch": 0.9943502824858758,
"grad_norm": 0.19735410809516907,
"kl": 0.03719329833984375,
"learning_rate": 1.7838768665406153e-06,
"loss": -0.0014,
"num_tokens": 42009789.0,
"reward": 6.292138174176216,
"reward_std": 1.134878752520308,
"rewards/accuracy_reward": 0.474609375,
"rewards/exec_out_all_reward": 0.890625,
"rewards/exec_out_step_reward": 0.9741862006485462,
"rewards/format_reward": 0.962890625,
"rewards/keywords_iou_reward": 0.44052915135398507,
"rewards/sql_step_keywords_recall_reward": 0.6849405262619257,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 174.87109375,
"epoch": 1.0090395480225989,
"grad_norm": 0.20000207424163818,
"kl": 0.04029083251953125,
"learning_rate": 1.7604722665003958e-06,
"loss": 0.0104,
"num_tokens": 42504659.0,
"reward": 6.249677374958992,
"reward_std": 1.4170940481126308,
"rewards/accuracy_reward": 0.4375,
"rewards/exec_out_all_reward": 0.884765625,
"rewards/exec_out_step_reward": 0.971117002889514,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.49388469848781824,
"rewards/sql_step_keywords_recall_reward": 0.7165721878409386,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 174.515625,
"epoch": 1.0180790960451978,
"grad_norm": 0.1991143524646759,
"kl": 0.038921356201171875,
"learning_rate": 1.737002093960025e-06,
"loss": 0.0071,
"num_tokens": 43000247.0,
"reward": 6.2897831201553345,
"reward_std": 1.209823683835566,
"rewards/accuracy_reward": 0.44921875,
"rewards/exec_out_all_reward": 0.900390625,
"rewards/exec_out_step_reward": 0.9761315789073706,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.48860851069912314,
"rewards/sql_step_keywords_recall_reward": 0.699715806171298,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 176.083984375,
"epoch": 1.0271186440677966,
"grad_norm": 0.20260195434093475,
"kl": 0.037258148193359375,
"learning_rate": 1.713472257409928e-06,
"loss": -0.0071,
"num_tokens": 43496746.0,
"reward": 6.175719887018204,
"reward_std": 1.3017593873664737,
"rewards/accuracy_reward": 0.453125,
"rewards/exec_out_all_reward": 0.86328125,
"rewards/exec_out_step_reward": 0.9692282117903233,
"rewards/format_reward": 0.94921875,
"rewards/keywords_iou_reward": 0.4611970195546746,
"rewards/sql_step_keywords_recall_reward": 0.6590976314619184,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 171.19921875,
"epoch": 1.0361581920903955,
"grad_norm": 0.20994015038013458,
"kl": 0.038059234619140625,
"learning_rate": 1.689888680360624e-06,
"loss": 0.0009,
"num_tokens": 43989932.0,
"reward": 6.245173625648022,
"reward_std": 1.1906684855930507,
"rewards/accuracy_reward": 0.447265625,
"rewards/exec_out_all_reward": 0.904296875,
"rewards/exec_out_step_reward": 0.9748775381594896,
"rewards/format_reward": 0.9453125,
"rewards/keywords_iou_reward": 0.4764430886134505,
"rewards/sql_step_keywords_recall_reward": 0.6787380147725344,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 177.447265625,
"epoch": 1.0451977401129944,
"grad_norm": 0.22471484541893005,
"kl": 0.039340972900390625,
"learning_rate": 1.6662572998515165e-06,
"loss": 0.0046,
"num_tokens": 44485501.0,
"reward": 6.439530774950981,
"reward_std": 1.2442573299631476,
"rewards/accuracy_reward": 0.5,
"rewards/exec_out_all_reward": 0.88671875,
"rewards/exec_out_step_reward": 0.9739366378635168,
"rewards/format_reward": 0.951171875,
"rewards/keywords_iou_reward": 0.46715445443987846,
"rewards/sql_step_keywords_recall_reward": 0.6933945845812559,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 179.21484375,
"epoch": 1.0542372881355933,
"grad_norm": 0.20036683976650238,
"kl": 0.03893280029296875,
"learning_rate": 1.6425840649562737e-06,
"loss": 0.0051,
"num_tokens": 44984123.0,
"reward": 6.33234478533268,
"reward_std": 1.2393232183530927,
"rewards/accuracy_reward": 0.48046875,
"rewards/exec_out_all_reward": 0.8828125,
"rewards/exec_out_step_reward": 0.9720695428550243,
"rewards/format_reward": 0.9453125,
"rewards/keywords_iou_reward": 0.45927969785407186,
"rewards/sql_step_keywords_recall_reward": 0.6917158551514149,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 181.154296875,
"epoch": 1.0632768361581921,
"grad_norm": 0.19345538318157196,
"kl": 0.03975677490234375,
"learning_rate": 1.6188749352851825e-06,
"loss": 0.0073,
"num_tokens": 45483218.0,
"reward": 6.538501590490341,
"reward_std": 1.1121491650119424,
"rewards/accuracy_reward": 0.501953125,
"rewards/exec_out_all_reward": 0.865234375,
"rewards/exec_out_step_reward": 0.9685004372149706,
"rewards/format_reward": 0.951171875,
"rewards/keywords_iou_reward": 0.5210402370430529,
"rewards/sql_step_keywords_recall_reward": 0.7037018835544586,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 180.533203125,
"epoch": 1.072316384180791,
"grad_norm": 0.20600463449954987,
"kl": 0.036891937255859375,
"learning_rate": 1.5951358794848467e-06,
"loss": -0.002,
"num_tokens": 45981975.0,
"reward": 6.367153495550156,
"reward_std": 1.4412866719067097,
"rewards/accuracy_reward": 0.482421875,
"rewards/exec_out_all_reward": 0.87109375,
"rewards/exec_out_step_reward": 0.9675254262983799,
"rewards/format_reward": 0.947265625,
"rewards/keywords_iou_reward": 0.48966031754389405,
"rewards/sql_step_keywords_recall_reward": 0.6722605032846332,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 180.583984375,
"epoch": 1.0813559322033899,
"grad_norm": 0.1875942498445511,
"kl": 0.036533355712890625,
"learning_rate": 1.5713728737356139e-06,
"loss": -0.013,
"num_tokens": 46481262.0,
"reward": 5.682912960648537,
"reward_std": 1.2121786596253514,
"rewards/accuracy_reward": 0.36328125,
"rewards/exec_out_all_reward": 0.8359375,
"rewards/exec_out_step_reward": 0.9655343275517225,
"rewards/format_reward": 0.951171875,
"rewards/keywords_iou_reward": 0.41401433339342475,
"rewards/sql_step_keywords_recall_reward": 0.6491155764088035,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 184.494140625,
"epoch": 1.0903954802259888,
"grad_norm": 0.2242497056722641,
"kl": 0.041225433349609375,
"learning_rate": 1.5475919002471018e-06,
"loss": 0.0018,
"num_tokens": 46983563.0,
"reward": 6.413750275969505,
"reward_std": 1.4098568577319384,
"rewards/accuracy_reward": 0.505859375,
"rewards/exec_out_all_reward": 0.875,
"rewards/exec_out_step_reward": 0.9675168935209513,
"rewards/format_reward": 0.943359375,
"rewards/keywords_iou_reward": 0.45125941652804613,
"rewards/sql_step_keywords_recall_reward": 0.7019176911562681,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 181.625,
"epoch": 1.0994350282485876,
"grad_norm": 0.1926642209291458,
"kl": 0.035884857177734375,
"learning_rate": 1.523798945752212e-06,
"loss": 0.0016,
"num_tokens": 47480911.0,
"reward": 6.660622417926788,
"reward_std": 1.1989344246685505,
"rewards/accuracy_reward": 0.52734375,
"rewards/exec_out_all_reward": 0.927734375,
"rewards/exec_out_step_reward": 0.9832945894449949,
"rewards/format_reward": 0.958984375,
"rewards/keywords_iou_reward": 0.480492593254894,
"rewards/sql_step_keywords_recall_reward": 0.7202489655464888,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 190.0234375,
"epoch": 1.1084745762711865,
"grad_norm": 0.18677794933319092,
"kl": 0.03594970703125,
"learning_rate": 1.5e-06,
"loss": -0.0015,
"num_tokens": 47983023.0,
"reward": 6.145782947540283,
"reward_std": 1.3262191619724035,
"rewards/accuracy_reward": 0.447265625,
"rewards/exec_out_all_reward": 0.876953125,
"rewards/exec_out_step_reward": 0.9680919889360666,
"rewards/format_reward": 0.951171875,
"rewards/keywords_iou_reward": 0.4536509499885142,
"rewards/sql_step_keywords_recall_reward": 0.6532015362754464,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 190.458984375,
"epoch": 1.1175141242937854,
"grad_norm": 0.19311833381652832,
"kl": 0.034397125244140625,
"learning_rate": 1.476201054247788e-06,
"loss": 0.0084,
"num_tokens": 48485038.0,
"reward": 6.124564379453659,
"reward_std": 1.1108355158939958,
"rewards/accuracy_reward": 0.44921875,
"rewards/exec_out_all_reward": 0.857421875,
"rewards/exec_out_step_reward": 0.9658536426723003,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.4373150817118585,
"rewards/sql_step_keywords_recall_reward": 0.6903305593878031,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 194.169921875,
"epoch": 1.1265536723163843,
"grad_norm": 0.19611912965774536,
"kl": 0.03643798828125,
"learning_rate": 1.452408099752899e-06,
"loss": -0.0002,
"num_tokens": 48990821.0,
"reward": 6.194984808564186,
"reward_std": 1.3247142443433404,
"rewards/accuracy_reward": 0.4375,
"rewards/exec_out_all_reward": 0.8515625,
"rewards/exec_out_step_reward": 0.9632626511156559,
"rewards/format_reward": 0.943359375,
"rewards/keywords_iou_reward": 0.4958450337871909,
"rewards/sql_step_keywords_recall_reward": 0.6951102269813418,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 191.966796875,
"epoch": 1.1355932203389831,
"grad_norm": 0.1942368596792221,
"kl": 0.034610748291015625,
"learning_rate": 1.4286271262643866e-06,
"loss": 0.011,
"num_tokens": 49493892.0,
"reward": 6.56321893632412,
"reward_std": 1.2955252706306055,
"rewards/accuracy_reward": 0.51171875,
"rewards/exec_out_all_reward": 0.888671875,
"rewards/exec_out_step_reward": 0.9716486856341362,
"rewards/format_reward": 0.95703125,
"rewards/keywords_iou_reward": 0.5003192345611751,
"rewards/sql_step_keywords_recall_reward": 0.6983536276966333,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 192.224609375,
"epoch": 1.144632768361582,
"grad_norm": 0.20695985853672028,
"kl": 0.03424835205078125,
"learning_rate": 1.4048641205151533e-06,
"loss": -0.0047,
"num_tokens": 49996803.0,
"reward": 6.204301163554192,
"reward_std": 1.1253142580389977,
"rewards/accuracy_reward": 0.451171875,
"rewards/exec_out_all_reward": 0.8984375,
"rewards/exec_out_step_reward": 0.9757238961756229,
"rewards/format_reward": 0.947265625,
"rewards/keywords_iou_reward": 0.4387336834333837,
"rewards/sql_step_keywords_recall_reward": 0.7007192308083177,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 194.66015625,
"epoch": 1.1536723163841809,
"grad_norm": 0.19700393080711365,
"kl": 0.03594970703125,
"learning_rate": 1.3811250647148171e-06,
"loss": 0.0124,
"num_tokens": 50504301.0,
"reward": 6.500779703259468,
"reward_std": 1.2046514563262463,
"rewards/accuracy_reward": 0.517578125,
"rewards/exec_out_all_reward": 0.859375,
"rewards/exec_out_step_reward": 0.9675091523677111,
"rewards/format_reward": 0.935546875,
"rewards/keywords_iou_reward": 0.4829273517243564,
"rewards/sql_step_keywords_recall_reward": 0.7021815236657858,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 190.998046875,
"epoch": 1.1627118644067798,
"grad_norm": 0.19084982573986053,
"kl": 0.034423828125,
"learning_rate": 1.3574159350437264e-06,
"loss": 0.003,
"num_tokens": 51006412.0,
"reward": 6.172577649354935,
"reward_std": 1.3462738115340471,
"rewards/accuracy_reward": 0.458984375,
"rewards/exec_out_all_reward": 0.869140625,
"rewards/exec_out_step_reward": 0.9645081553608179,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.4471881305798888,
"rewards/sql_step_keywords_recall_reward": 0.6691619791090488,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 197.43359375,
"epoch": 1.1717514124293786,
"grad_norm": 0.1826663464307785,
"kl": 0.035003662109375,
"learning_rate": 1.3337427001484835e-06,
"loss": 0.0024,
"num_tokens": 51513734.0,
"reward": 6.046234875917435,
"reward_std": 1.4189167954027653,
"rewards/accuracy_reward": 0.44140625,
"rewards/exec_out_all_reward": 0.86328125,
"rewards/exec_out_step_reward": 0.9655668716877699,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.42813110165297985,
"rewards/sql_step_keywords_recall_reward": 0.656046318821609,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 196.70703125,
"epoch": 1.1807909604519775,
"grad_norm": 0.1818644106388092,
"kl": 0.033786773681640625,
"learning_rate": 1.3101113196393759e-06,
"loss": 0.0028,
"num_tokens": 52020596.0,
"reward": 5.911285370588303,
"reward_std": 1.1000383193604648,
"rewards/accuracy_reward": 0.408203125,
"rewards/exec_out_all_reward": 0.85546875,
"rewards/exec_out_step_reward": 0.9638741612434387,
"rewards/format_reward": 0.95703125,
"rewards/keywords_iou_reward": 0.4286212190054357,
"rewards/sql_step_keywords_recall_reward": 0.6448562629520893,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 192.52734375,
"epoch": 1.1898305084745764,
"grad_norm": 0.19093115627765656,
"kl": 0.0337066650390625,
"learning_rate": 1.2865277425900725e-06,
"loss": 0.0096,
"num_tokens": 52524122.0,
"reward": 6.245767995715141,
"reward_std": 1.3251913916319609,
"rewards/accuracy_reward": 0.4921875,
"rewards/exec_out_all_reward": 0.8671875,
"rewards/exec_out_step_reward": 0.966382997110486,
"rewards/format_reward": 0.9453125,
"rewards/keywords_iou_reward": 0.4250256856903434,
"rewards/sql_step_keywords_recall_reward": 0.6480836141854525,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 190.916015625,
"epoch": 1.1988700564971753,
"grad_norm": 0.19361349940299988,
"kl": 0.033050537109375,
"learning_rate": 1.2629979060399751e-06,
"loss": -0.0008,
"num_tokens": 53027339.0,
"reward": 6.7118589878082275,
"reward_std": 1.4029255080968142,
"rewards/accuracy_reward": 0.560546875,
"rewards/exec_out_all_reward": 0.9140625,
"rewards/exec_out_step_reward": 0.9767283629626036,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.477216730825603,
"rewards/sql_step_keywords_recall_reward": 0.6849940754473209,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 189.78125,
"epoch": 1.207909604519774,
"grad_norm": 0.19163502752780914,
"kl": 0.033824920654296875,
"learning_rate": 1.2395277334996047e-06,
"loss": -0.002,
"num_tokens": 53530615.0,
"reward": 6.5633436143398285,
"reward_std": 1.354396466165781,
"rewards/accuracy_reward": 0.529296875,
"rewards/exec_out_all_reward": 0.876953125,
"rewards/exec_out_step_reward": 0.969381669536233,
"rewards/format_reward": 0.953125,
"rewards/keywords_iou_reward": 0.4778866241686046,
"rewards/sql_step_keywords_recall_reward": 0.6909231022000313,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 188.806640625,
"epoch": 1.2169491525423728,
"grad_norm": 0.19524553418159485,
"kl": 0.0322418212890625,
"learning_rate": 1.2161231334593852e-06,
"loss": 0.0042,
"num_tokens": 54033328.0,
"reward": 6.4206047505140305,
"reward_std": 1.3125502597540617,
"rewards/accuracy_reward": 0.494140625,
"rewards/exec_out_all_reward": 0.8984375,
"rewards/exec_out_step_reward": 0.9737436473369598,
"rewards/format_reward": 0.9375,
"rewards/keywords_iou_reward": 0.47170518431812525,
"rewards/sql_step_keywords_recall_reward": 0.6909506395459175,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 189.267578125,
"epoch": 1.2259887005649717,
"grad_norm": 0.18766768276691437,
"kl": 0.034023284912109375,
"learning_rate": 1.1927899979022142e-06,
"loss": -0.0056,
"num_tokens": 54535337.0,
"reward": 6.526236951351166,
"reward_std": 1.2283624270930886,
"rewards/accuracy_reward": 0.515625,
"rewards/exec_out_all_reward": 0.865234375,
"rewards/exec_out_step_reward": 0.9680036306381226,
"rewards/format_reward": 0.943359375,
"rewards/keywords_iou_reward": 0.5038230996578932,
"rewards/sql_step_keywords_recall_reward": 0.6794933304190636,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 192.39453125,
"epoch": 1.2350282485875705,
"grad_norm": 0.19446401298046112,
"kl": 0.0313568115234375,
"learning_rate": 1.169534200820189e-06,
"loss": 0.0045,
"num_tokens": 55037983.0,
"reward": 6.247622415423393,
"reward_std": 1.3995716699864715,
"rewards/accuracy_reward": 0.443359375,
"rewards/exec_out_all_reward": 0.85546875,
"rewards/exec_out_step_reward": 0.962557353079319,
"rewards/format_reward": 0.94921875,
"rewards/keywords_iou_reward": 0.4969401224516332,
"rewards/sql_step_keywords_recall_reward": 0.7130598044022918,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 188.71484375,
"epoch": 1.2440677966101694,
"grad_norm": 0.1841951161623001,
"kl": 0.03244781494140625,
"learning_rate": 1.146361596735859e-06,
"loss": 0.0073,
"num_tokens": 55542381.0,
"reward": 6.34030369669199,
"reward_std": 1.3015205739066005,
"rewards/accuracy_reward": 0.48046875,
"rewards/exec_out_all_reward": 0.8984375,
"rewards/exec_out_step_reward": 0.9747984893620014,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.46549498522654176,
"rewards/sql_step_keywords_recall_reward": 0.6747496416792274,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 190.142578125,
"epoch": 1.2531073446327683,
"grad_norm": 0.19216689467430115,
"kl": 0.030513763427734375,
"learning_rate": 1.1232780192283814e-06,
"loss": 0.0089,
"num_tokens": 56045178.0,
"reward": 6.406587705016136,
"reward_std": 1.128734229831025,
"rewards/accuracy_reward": 0.486328125,
"rewards/exec_out_all_reward": 0.88671875,
"rewards/exec_out_step_reward": 0.9704783633351326,
"rewards/format_reward": 0.96484375,
"rewards/keywords_iou_reward": 0.4696982908062637,
"rewards/sql_step_keywords_recall_reward": 0.6998377349227667,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 186.853515625,
"epoch": 1.2621468926553672,
"grad_norm": 0.19616030156612396,
"kl": 0.032970428466796875,
"learning_rate": 1.1002892794649477e-06,
"loss": 0.0007,
"num_tokens": 56547795.0,
"reward": 6.060941353440285,
"reward_std": 1.3276239773258567,
"rewards/accuracy_reward": 0.451171875,
"rewards/exec_out_all_reward": 0.853515625,
"rewards/exec_out_step_reward": 0.9571653380990028,
"rewards/format_reward": 0.931640625,
"rewards/keywords_iou_reward": 0.4333818582817912,
"rewards/sql_step_keywords_recall_reward": 0.6471685189753771,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 185.00390625,
"epoch": 1.271186440677966,
"grad_norm": 0.18865719437599182,
"kl": 0.030315399169921875,
"learning_rate": 1.0774011647378555e-06,
"loss": 0.0,
"num_tokens": 57049073.0,
"reward": 6.1720483005046844,
"reward_std": 1.3698342852294445,
"rewards/accuracy_reward": 0.45703125,
"rewards/exec_out_all_reward": 0.83984375,
"rewards/exec_out_step_reward": 0.9541589226573706,
"rewards/format_reward": 0.955078125,
"rewards/keywords_iou_reward": 0.46421836549416184,
"rewards/sql_step_keywords_recall_reward": 0.6664058230817318,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 185.212890625,
"epoch": 1.280225988700565,
"grad_norm": 0.19604600965976715,
"kl": 0.03199005126953125,
"learning_rate": 1.0546194370075883e-06,
"loss": -0.0021,
"num_tokens": 57548486.0,
"reward": 6.491792589426041,
"reward_std": 1.2623751778155565,
"rewards/accuracy_reward": 0.501953125,
"rewards/exec_out_all_reward": 0.8828125,
"rewards/exec_out_step_reward": 0.9714409783482552,
"rewards/format_reward": 0.953125,
"rewards/keywords_iou_reward": 0.5014809351414442,
"rewards/sql_step_keywords_recall_reward": 0.6736397361382842,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 183.6796875,
"epoch": 1.2892655367231638,
"grad_norm": 0.1924704760313034,
"kl": 0.03244781494140625,
"learning_rate": 1.0319498314522695e-06,
"loss": 0.0019,
"num_tokens": 58047710.0,
"reward": 6.139053791761398,
"reward_std": 1.1230434579774737,
"rewards/accuracy_reward": 0.4453125,
"rewards/exec_out_all_reward": 0.8828125,
"rewards/exec_out_step_reward": 0.9705496653914452,
"rewards/format_reward": 0.9296875,
"rewards/keywords_iou_reward": 0.4448565673374105,
"rewards/sql_step_keywords_recall_reward": 0.68504096288234,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 188.365234375,
"epoch": 1.2983050847457627,
"grad_norm": 0.1963050365447998,
"kl": 0.03145599365234375,
"learning_rate": 1.0093980550238675e-06,
"loss": 0.0036,
"num_tokens": 58549501.0,
"reward": 6.090264290571213,
"reward_std": 1.4014679677784443,
"rewards/accuracy_reward": 0.431640625,
"rewards/exec_out_all_reward": 0.857421875,
"rewards/exec_out_step_reward": 0.9621853325515985,
"rewards/format_reward": 0.916015625,
"rewards/keywords_iou_reward": 0.47523164842277765,
"rewards/sql_step_keywords_recall_reward": 0.6776156453415751,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 181.072265625,
"epoch": 1.3073446327683615,
"grad_norm": 0.2053123414516449,
"kl": 0.031829833984375,
"learning_rate": 9.86969785011497e-07,
"loss": 0.0054,
"num_tokens": 59046626.0,
"reward": 6.3448584377765656,
"reward_std": 1.176757472101599,
"rewards/accuracy_reward": 0.458984375,
"rewards/exec_out_all_reward": 0.884765625,
"rewards/exec_out_step_reward": 0.9675664994865656,
"rewards/format_reward": 0.947265625,
"rewards/keywords_iou_reward": 0.49913227930665016,
"rewards/sql_step_keywords_recall_reward": 0.7110585309565067,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 185.021484375,
"epoch": 1.3163841807909604,
"grad_norm": 0.19113275408744812,
"kl": 0.031230926513671875,
"learning_rate": 9.646706676121923e-07,
"loss": -0.0098,
"num_tokens": 59546101.0,
"reward": 6.435375913977623,
"reward_std": 1.407385234721005,
"rewards/accuracy_reward": 0.4921875,
"rewards/exec_out_all_reward": 0.880859375,
"rewards/exec_out_step_reward": 0.9711650554090738,
"rewards/format_reward": 0.953125,
"rewards/keywords_iou_reward": 0.48695238353684545,
"rewards/sql_step_keywords_recall_reward": 0.6875717546790838,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 182.16015625,
"epoch": 1.3254237288135593,
"grad_norm": 0.2129882425069809,
"kl": 0.03119659423828125,
"learning_rate": 9.425063165095089e-07,
"loss": -0.0039,
"num_tokens": 60046491.0,
"reward": 6.0068028047680855,
"reward_std": 1.2135074082762003,
"rewards/accuracy_reward": 0.4140625,
"rewards/exec_out_all_reward": 0.87890625,
"rewards/exec_out_step_reward": 0.9671890567988157,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.44391668657772243,
"rewards/sql_step_keywords_recall_reward": 0.6752177719026804,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 183.91796875,
"epoch": 1.3344632768361582,
"grad_norm": 0.21050292253494263,
"kl": 0.032924652099609375,
"learning_rate": 9.204823114603069e-07,
"loss": 0.0047,
"num_tokens": 60546385.0,
"reward": 6.227329030632973,
"reward_std": 1.179283824749291,
"rewards/accuracy_reward": 0.458984375,
"rewards/exec_out_all_reward": 0.8515625,
"rewards/exec_out_step_reward": 0.9637028854340315,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.4683625341858715,
"rewards/sql_step_keywords_recall_reward": 0.6999479737132788,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 185.40234375,
"epoch": 1.343502824858757,
"grad_norm": 0.21124009788036346,
"kl": 0.032260894775390625,
"learning_rate": 8.986041968900797e-07,
"loss": 0.0088,
"num_tokens": 61049115.0,
"reward": 6.444768786430359,
"reward_std": 1.4197536138817668,
"rewards/accuracy_reward": 0.50390625,
"rewards/exec_out_all_reward": 0.884765625,
"rewards/exec_out_step_reward": 0.9735870882868767,
"rewards/format_reward": 0.93359375,
"rewards/keywords_iou_reward": 0.4634511903859675,
"rewards/sql_step_keywords_recall_reward": 0.7102948874235153,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 186.373046875,
"epoch": 1.352542372881356,
"grad_norm": 0.21280112862586975,
"kl": 0.02997589111328125,
"learning_rate": 8.768774804971705e-07,
"loss": 0.0098,
"num_tokens": 61551534.0,
"reward": 6.14260359108448,
"reward_std": 1.3231439045630395,
"rewards/accuracy_reward": 0.443359375,
"rewards/exec_out_all_reward": 0.849609375,
"rewards/exec_out_step_reward": 0.9615505710244179,
"rewards/format_reward": 0.947265625,
"rewards/keywords_iou_reward": 0.46615127846598625,
"rewards/sql_step_keywords_recall_reward": 0.6784378979355097,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 187.35546875,
"epoch": 1.3615819209039548,
"grad_norm": 0.20350147783756256,
"kl": 0.031444549560546875,
"learning_rate": 8.553076318662425e-07,
"loss": 0.0024,
"num_tokens": 62054616.0,
"reward": 5.876114495098591,
"reward_std": 1.1356665641069412,
"rewards/accuracy_reward": 0.373046875,
"rewards/exec_out_all_reward": 0.857421875,
"rewards/exec_out_step_reward": 0.9639787971973419,
"rewards/format_reward": 0.9375,
"rewards/keywords_iou_reward": 0.4735513115301728,
"rewards/sql_step_keywords_recall_reward": 0.6779237259179354,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 182.556640625,
"epoch": 1.3706214689265537,
"grad_norm": 0.18335995078086853,
"kl": 0.030361175537109375,
"learning_rate": 8.339000810913388e-07,
"loss": -0.0031,
"num_tokens": 62553909.0,
"reward": 6.188477337360382,
"reward_std": 1.1688060224987566,
"rewards/accuracy_reward": 0.466796875,
"rewards/exec_out_all_reward": 0.8828125,
"rewards/exec_out_step_reward": 0.9724152106791735,
"rewards/format_reward": 0.951171875,
"rewards/keywords_iou_reward": 0.421648170100525,
"rewards/sql_step_keywords_recall_reward": 0.6715939035639167,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 178.1953125,
"epoch": 1.3796610169491526,
"grad_norm": 0.18019497394561768,
"kl": 0.032196044921875,
"learning_rate": 8.126602174088844e-07,
"loss": -0.0063,
"num_tokens": 63051989.0,
"reward": 6.597941100597382,
"reward_std": 1.2893404318019748,
"rewards/accuracy_reward": 0.533203125,
"rewards/exec_out_all_reward": 0.890625,
"rewards/exec_out_step_reward": 0.9766291547566652,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.4863300649449229,
"rewards/sql_step_keywords_recall_reward": 0.6838080808520317,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 181.888671875,
"epoch": 1.3887005649717514,
"grad_norm": 0.20189358294010162,
"kl": 0.031101226806640625,
"learning_rate": 7.915933878409761e-07,
"loss": -0.0082,
"num_tokens": 63552568.0,
"reward": 6.299270272254944,
"reward_std": 1.1446837144903839,
"rewards/accuracy_reward": 0.484375,
"rewards/exec_out_all_reward": 0.873046875,
"rewards/exec_out_step_reward": 0.9710131492465734,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.45233285753056407,
"rewards/sql_step_keywords_recall_reward": 0.6716383351013064,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 179.68359375,
"epoch": 1.3977401129943503,
"grad_norm": 0.1866413652896881,
"kl": 0.03018951416015625,
"learning_rate": 7.707048958492972e-07,
"loss": 0.0052,
"num_tokens": 64051946.0,
"reward": 6.494629591703415,
"reward_std": 1.1752266022376716,
"rewards/accuracy_reward": 0.50390625,
"rewards/exec_out_all_reward": 0.876953125,
"rewards/exec_out_step_reward": 0.9707899298518896,
"rewards/format_reward": 0.935546875,
"rewards/keywords_iou_reward": 0.4983974387869239,
"rewards/sql_step_keywords_recall_reward": 0.6989197302609682,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 179.88671875,
"epoch": 1.4067796610169492,
"grad_norm": 0.18069399893283844,
"kl": 0.031505584716796875,
"learning_rate": 7.500000000000003e-07,
"loss": 0.0009,
"num_tokens": 64549212.0,
"reward": 6.292890816926956,
"reward_std": 1.408079206943512,
"rewards/accuracy_reward": 0.4921875,
"rewards/exec_out_all_reward": 0.859375,
"rewards/exec_out_step_reward": 0.9635618217289448,
"rewards/format_reward": 0.93359375,
"rewards/keywords_iou_reward": 0.44063833844847977,
"rewards/sql_step_keywords_recall_reward": 0.6863335473462939,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 180.708984375,
"epoch": 1.415819209039548,
"grad_norm": 0.19954738020896912,
"kl": 0.03003692626953125,
"learning_rate": 7.294839126398909e-07,
"loss": 0.0072,
"num_tokens": 65046991.0,
"reward": 6.40887725353241,
"reward_std": 1.1249554408714175,
"rewards/accuracy_reward": 0.482421875,
"rewards/exec_out_all_reward": 0.90234375,
"rewards/exec_out_step_reward": 0.9757773783057928,
"rewards/format_reward": 0.943359375,
"rewards/keywords_iou_reward": 0.48421067791059613,
"rewards/sql_step_keywords_recall_reward": 0.6892878729850054,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 180.326171875,
"epoch": 1.424858757062147,
"grad_norm": 0.1969391107559204,
"kl": 0.0305023193359375,
"learning_rate": 7.091617985842463e-07,
"loss": 0.0018,
"num_tokens": 65544562.0,
"reward": 6.413661152124405,
"reward_std": 1.2689514786470681,
"rewards/accuracy_reward": 0.478515625,
"rewards/exec_out_all_reward": 0.890625,
"rewards/exec_out_step_reward": 0.9725539479404688,
"rewards/format_reward": 0.951171875,
"rewards/keywords_iou_reward": 0.49607263831421733,
"rewards/sql_step_keywords_recall_reward": 0.693102465942502,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 178.669921875,
"epoch": 1.4338983050847458,
"grad_norm": 0.19801102578639984,
"kl": 0.03037261962890625,
"learning_rate": 6.890387738166042e-07,
"loss": -0.0004,
"num_tokens": 66041689.0,
"reward": 6.508177891373634,
"reward_std": 1.3759017111733556,
"rewards/accuracy_reward": 0.5234375,
"rewards/exec_out_all_reward": 0.923828125,
"rewards/exec_out_step_reward": 0.9815886970609426,
"rewards/format_reward": 0.9453125,
"rewards/keywords_iou_reward": 0.4463528748601675,
"rewards/sql_step_keywords_recall_reward": 0.6709927897900343,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 178.77734375,
"epoch": 1.4429378531073447,
"grad_norm": 0.19369132816791534,
"kl": 0.0314788818359375,
"learning_rate": 6.691199042008347e-07,
"loss": 0.003,
"num_tokens": 66538115.0,
"reward": 6.160938322544098,
"reward_std": 1.2979237555991858,
"rewards/accuracy_reward": 0.46875,
"rewards/exec_out_all_reward": 0.88671875,
"rewards/exec_out_step_reward": 0.9724400117993355,
"rewards/format_reward": 0.9453125,
"rewards/keywords_iou_reward": 0.412393850274384,
"rewards/sql_step_keywords_recall_reward": 0.6566793192178011,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 176.439453125,
"epoch": 1.4519774011299436,
"grad_norm": 0.1850394755601883,
"kl": 0.02997589111328125,
"learning_rate": 6.494102042058441e-07,
"loss": 0.002,
"num_tokens": 67035676.0,
"reward": 6.159577623009682,
"reward_std": 1.2864303840324283,
"rewards/accuracy_reward": 0.462890625,
"rewards/exec_out_all_reward": 0.873046875,
"rewards/exec_out_step_reward": 0.9678982235491276,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.4201263478025794,
"rewards/sql_step_keywords_recall_reward": 0.6854111216962337,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 180.287109375,
"epoch": 1.4610169491525424,
"grad_norm": 0.20010262727737427,
"kl": 0.030330657958984375,
"learning_rate": 6.29914635643203e-07,
"loss": -0.0048,
"num_tokens": 67531747.0,
"reward": 6.243592485785484,
"reward_std": 1.3317115511745214,
"rewards/accuracy_reward": 0.455078125,
"rewards/exec_out_all_reward": 0.888671875,
"rewards/exec_out_step_reward": 0.9729833193123341,
"rewards/format_reward": 0.966796875,
"rewards/keywords_iou_reward": 0.46371736377477646,
"rewards/sql_step_keywords_recall_reward": 0.6673932354897261,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 181.583984375,
"epoch": 1.4700564971751413,
"grad_norm": 0.19057103991508484,
"kl": 0.032482147216796875,
"learning_rate": 6.106381064180395e-07,
"loss": 0.0051,
"num_tokens": 68029614.0,
"reward": 6.253777638077736,
"reward_std": 1.371273732278496,
"rewards/accuracy_reward": 0.478515625,
"rewards/exec_out_all_reward": 0.87890625,
"rewards/exec_out_step_reward": 0.9656637534499168,
"rewards/format_reward": 0.916015625,
"rewards/keywords_iou_reward": 0.450145754031837,
"rewards/sql_step_keywords_recall_reward": 0.6788380099460483,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 181.328125,
"epoch": 1.4790960451977402,
"grad_norm": 0.1943560391664505,
"kl": 0.02936553955078125,
"learning_rate": 5.915854692935003e-07,
"loss": 0.0001,
"num_tokens": 68527730.0,
"reward": 6.239083915948868,
"reward_std": 1.3497515600174665,
"rewards/accuracy_reward": 0.48046875,
"rewards/exec_out_all_reward": 0.84375,
"rewards/exec_out_step_reward": 0.9633672833442688,
"rewards/format_reward": 0.935546875,
"rewards/keywords_iou_reward": 0.45536057371646166,
"rewards/sql_step_keywords_recall_reward": 0.6638235626742244,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 177.58203125,
"epoch": 1.488135593220339,
"grad_norm": 0.19292642176151276,
"kl": 0.029979705810546875,
"learning_rate": 5.727615206690921e-07,
"loss": 0.0093,
"num_tokens": 69024424.0,
"reward": 6.809370994567871,
"reward_std": 1.231651745736599,
"rewards/accuracy_reward": 0.5703125,
"rewards/exec_out_all_reward": 0.880859375,
"rewards/exec_out_step_reward": 0.9722276534885168,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.4978084210306406,
"rewards/sql_step_keywords_recall_reward": 0.7380108721554279,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 176.873046875,
"epoch": 1.497175141242938,
"grad_norm": 0.19874171912670135,
"kl": 0.02878570556640625,
"learning_rate": 5.541709993732168e-07,
"loss": 0.0052,
"num_tokens": 69519951.0,
"reward": 6.364575162529945,
"reward_std": 1.2380520347505808,
"rewards/accuracy_reward": 0.482421875,
"rewards/exec_out_all_reward": 0.912109375,
"rewards/exec_out_step_reward": 0.9779986720532179,
"rewards/format_reward": 0.947265625,
"rewards/keywords_iou_reward": 0.4500721860677004,
"rewards/sql_step_keywords_recall_reward": 0.6973696993663907,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 182.763671875,
"epoch": 1.5062146892655366,
"grad_norm": 0.19222836196422577,
"kl": 0.031803131103515625,
"learning_rate": 5.358185854701909e-07,
"loss": 0.016,
"num_tokens": 70020190.0,
"reward": 5.9986598044633865,
"reward_std": 1.2878384962677956,
"rewards/accuracy_reward": 0.392578125,
"rewards/exec_out_all_reward": 0.90234375,
"rewards/exec_out_step_reward": 0.9768469464033842,
"rewards/format_reward": 0.943359375,
"rewards/keywords_iou_reward": 0.46051234856713563,
"rewards/sql_step_keywords_recall_reward": 0.6847725082188845,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 179.427734375,
"epoch": 1.5152542372881355,
"grad_norm": 0.18806356191635132,
"kl": 0.02924346923828125,
"learning_rate": 5.177088990820725e-07,
"loss": 0.0156,
"num_tokens": 70519589.0,
"reward": 6.586422994732857,
"reward_std": 1.445882560685277,
"rewards/accuracy_reward": 0.525390625,
"rewards/exec_out_all_reward": 0.873046875,
"rewards/exec_out_step_reward": 0.9663845468312502,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.48613627161830664,
"rewards/sql_step_keywords_recall_reward": 0.7317502833902836,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 182.669921875,
"epoch": 1.5242937853107343,
"grad_norm": 0.19503186643123627,
"kl": 0.027843475341796875,
"learning_rate": 4.998464992255627e-07,
"loss": -0.0017,
"num_tokens": 71019964.0,
"reward": 5.967584699392319,
"reward_std": 1.1278308150358498,
"rewards/accuracy_reward": 0.404296875,
"rewards/exec_out_all_reward": 0.876953125,
"rewards/exec_out_step_reward": 0.9704303070902824,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.4382792445831001,
"rewards/sql_step_keywords_recall_reward": 0.6870021214708686,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 179.84375,
"epoch": 1.5333333333333332,
"grad_norm": 0.18861566483974457,
"kl": 0.027347564697265625,
"learning_rate": 4.82235882664302e-07,
"loss": -0.0001,
"num_tokens": 71518200.0,
"reward": 6.40145568549633,
"reward_std": 1.264804814942181,
"rewards/accuracy_reward": 0.486328125,
"rewards/exec_out_all_reward": 0.876953125,
"rewards/exec_out_step_reward": 0.9703760556876659,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.47830750420689583,
"rewards/sql_step_keywords_recall_reward": 0.7127459226176143,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 179.998046875,
"epoch": 1.542372881355932,
"grad_norm": 0.19780074059963226,
"kl": 0.027751922607421875,
"learning_rate": 4.648814827768323e-07,
"loss": 0.0012,
"num_tokens": 72016087.0,
"reward": 6.078360304236412,
"reward_std": 1.233950492925942,
"rewards/accuracy_reward": 0.4296875,
"rewards/exec_out_all_reward": 0.869140625,
"rewards/exec_out_step_reward": 0.9696281347423792,
"rewards/format_reward": 0.947265625,
"rewards/keywords_iou_reward": 0.45836541755124927,
"rewards/sql_step_keywords_recall_reward": 0.6568450266495347,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 184.81640625,
"epoch": 1.551412429378531,
"grad_norm": 0.20185421407222748,
"kl": 0.028972625732421875,
"learning_rate": 4.4778766844051793e-07,
"loss": -0.0002,
"num_tokens": 72515641.0,
"reward": 6.029541537165642,
"reward_std": 1.3070494611747563,
"rewards/accuracy_reward": 0.416015625,
"rewards/exec_out_all_reward": 0.85546875,
"rewards/exec_out_step_reward": 0.9650747179985046,
"rewards/format_reward": 0.951171875,
"rewards/keywords_iou_reward": 0.4725735238753259,
"rewards/sql_step_keywords_recall_reward": 0.6486166473478079,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 180.265625,
"epoch": 1.5604519774011298,
"grad_norm": 0.18522052466869354,
"kl": 0.028156280517578125,
"learning_rate": 4.309587429317061e-07,
"loss": -0.0069,
"num_tokens": 73014473.0,
"reward": 6.007387965917587,
"reward_std": 1.1997167933732271,
"rewards/accuracy_reward": 0.4140625,
"rewards/exec_out_all_reward": 0.857421875,
"rewards/exec_out_step_reward": 0.9654715433716774,
"rewards/format_reward": 0.955078125,
"rewards/keywords_iou_reward": 0.441091364948079,
"rewards/sql_step_keywords_recall_reward": 0.690983671694994,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 186.296875,
"epoch": 1.5694915254237287,
"grad_norm": 0.18709787726402283,
"kl": 0.02730560302734375,
"learning_rate": 4.1439894284239473e-07,
"loss": 0.0048,
"num_tokens": 73516757.0,
"reward": 5.854448825120926,
"reward_std": 1.0955259250476956,
"rewards/accuracy_reward": 0.396484375,
"rewards/exec_out_all_reward": 0.86328125,
"rewards/exec_out_step_reward": 0.9658908490091562,
"rewards/format_reward": 0.919921875,
"rewards/keywords_iou_reward": 0.419542781310156,
"rewards/sql_step_keywords_recall_reward": 0.6803318522870541,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 184.041015625,
"epoch": 1.5785310734463276,
"grad_norm": 0.18421481549739838,
"kl": 0.02740478515625,
"learning_rate": 3.981124370137002e-07,
"loss": 0.0015,
"num_tokens": 74016222.0,
"reward": 6.491113051772118,
"reward_std": 1.2589517189189792,
"rewards/accuracy_reward": 0.478515625,
"rewards/exec_out_all_reward": 0.87890625,
"rewards/exec_out_step_reward": 0.969364620745182,
"rewards/format_reward": 0.9453125,
"rewards/keywords_iou_reward": 0.5320240047294647,
"rewards/sql_step_keywords_recall_reward": 0.7194192241877317,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 185.123046875,
"epoch": 1.5875706214689265,
"grad_norm": 0.20407724380493164,
"kl": 0.027408599853515625,
"learning_rate": 3.82103325486368e-07,
"loss": 0.0167,
"num_tokens": 74517801.0,
"reward": 6.3000208735466,
"reward_std": 1.357117084786296,
"rewards/accuracy_reward": 0.4765625,
"rewards/exec_out_all_reward": 0.876953125,
"rewards/exec_out_step_reward": 0.9679687526077032,
"rewards/format_reward": 0.94921875,
"rewards/keywords_iou_reward": 0.44746885914355516,
"rewards/sql_step_keywords_recall_reward": 0.704692529514432,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 180.05078125,
"epoch": 1.5966101694915253,
"grad_norm": 0.19040866196155548,
"kl": 0.02652740478515625,
"learning_rate": 3.6637563846861275e-07,
"loss": -0.0012,
"num_tokens": 75013259.0,
"reward": 6.4401615858078,
"reward_std": 1.168332906672731,
"rewards/accuracy_reward": 0.505859375,
"rewards/exec_out_all_reward": 0.890625,
"rewards/exec_out_step_reward": 0.974679134786129,
"rewards/format_reward": 0.9609375,
"rewards/keywords_iou_reward": 0.43378148321062326,
"rewards/sql_step_keywords_recall_reward": 0.7229194287210703,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 183.84375,
"epoch": 1.6056497175141242,
"grad_norm": 0.19518616795539856,
"kl": 0.027690887451171875,
"learning_rate": 3.5093333532153313e-07,
"loss": 0.0007,
"num_tokens": 75511563.0,
"reward": 6.2283158749341965,
"reward_std": 1.2062615705654025,
"rewards/accuracy_reward": 0.466796875,
"rewards/exec_out_all_reward": 0.8828125,
"rewards/exec_out_step_reward": 0.9733979757875204,
"rewards/format_reward": 0.93359375,
"rewards/keywords_iou_reward": 0.4363416051492095,
"rewards/sql_step_keywords_recall_reward": 0.698640950024128,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 180.58203125,
"epoch": 1.614689265536723,
"grad_norm": 0.19140039384365082,
"kl": 0.03049468994140625,
"learning_rate": 3.357803035623646e-07,
"loss": 0.0114,
"num_tokens": 76009449.0,
"reward": 6.212793804705143,
"reward_std": 1.3535035271197557,
"rewards/accuracy_reward": 0.4296875,
"rewards/exec_out_all_reward": 0.88671875,
"rewards/exec_out_step_reward": 0.9737645741552114,
"rewards/format_reward": 0.94921875,
"rewards/keywords_iou_reward": 0.502739230170846,
"rewards/sql_step_keywords_recall_reward": 0.6788633242249489,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 184.478515625,
"epoch": 1.623728813559322,
"grad_norm": 0.20329172909259796,
"kl": 0.02678680419921875,
"learning_rate": 3.209203578858191e-07,
"loss": 0.0015,
"num_tokens": 76512090.0,
"reward": 5.99811252951622,
"reward_std": 1.3144248933531344,
"rewards/accuracy_reward": 0.427734375,
"rewards/exec_out_all_reward": 0.890625,
"rewards/exec_out_step_reward": 0.9720734115689993,
"rewards/format_reward": 0.90625,
"rewards/keywords_iou_reward": 0.43393061752431095,
"rewards/sql_step_keywords_recall_reward": 0.6503653433173895,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 182.916015625,
"epoch": 1.6327683615819208,
"grad_norm": 0.20274612307548523,
"kl": 0.027004241943359375,
"learning_rate": 3.063572392037517e-07,
"loss": -0.0061,
"num_tokens": 77010299.0,
"reward": 6.448321744799614,
"reward_std": 1.2834087014198303,
"rewards/accuracy_reward": 0.490234375,
"rewards/exec_out_all_reward": 0.888671875,
"rewards/exec_out_step_reward": 0.9712286107242107,
"rewards/format_reward": 0.9609375,
"rewards/keywords_iou_reward": 0.4910502852872014,
"rewards/sql_step_keywords_recall_reward": 0.6844456251710653,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 178.615234375,
"epoch": 1.6418079096045197,
"grad_norm": 0.20460090041160583,
"kl": 0.02655029296875,
"learning_rate": 2.920946137034121e-07,
"loss": 0.0016,
"num_tokens": 77506942.0,
"reward": 6.275775626301765,
"reward_std": 1.3149840263649821,
"rewards/accuracy_reward": 0.458984375,
"rewards/exec_out_all_reward": 0.89453125,
"rewards/exec_out_step_reward": 0.9711751285940409,
"rewards/format_reward": 0.92578125,
"rewards/keywords_iou_reward": 0.4703782368451357,
"rewards/sql_step_keywords_recall_reward": 0.7075940538197756,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 181.2421875,
"epoch": 1.6508474576271186,
"grad_norm": 0.1951521635055542,
"kl": 0.02664947509765625,
"learning_rate": 2.781360719244964e-07,
"loss": 0.0085,
"num_tokens": 78006654.0,
"reward": 6.269969627261162,
"reward_std": 1.5092957746237516,
"rewards/accuracy_reward": 0.482421875,
"rewards/exec_out_all_reward": 0.8671875,
"rewards/exec_out_step_reward": 0.9697536900639534,
"rewards/format_reward": 0.923828125,
"rewards/keywords_iou_reward": 0.43865267653018236,
"rewards/sql_step_keywords_recall_reward": 0.7022074311971664,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 181.6171875,
"epoch": 1.6598870056497175,
"grad_norm": 0.18546244502067566,
"kl": 0.02535247802734375,
"learning_rate": 2.64485127855251e-07,
"loss": 0.0072,
"num_tokens": 78504702.0,
"reward": 6.320208579301834,
"reward_std": 1.431410001590848,
"rewards/accuracy_reward": 0.474609375,
"rewards/exec_out_all_reward": 0.859375,
"rewards/exec_out_step_reward": 0.9669557642191648,
"rewards/format_reward": 0.9375,
"rewards/keywords_iou_reward": 0.4761747941374779,
"rewards/sql_step_keywords_recall_reward": 0.7055906923487782,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 179.552734375,
"epoch": 1.6689265536723163,
"grad_norm": 0.20000162720680237,
"kl": 0.026123046875,
"learning_rate": 2.5114521804784305e-07,
"loss": -0.0022,
"num_tokens": 79001101.0,
"reward": 6.44641749560833,
"reward_std": 1.3242136964108795,
"rewards/accuracy_reward": 0.498046875,
"rewards/exec_out_all_reward": 0.904296875,
"rewards/exec_out_step_reward": 0.9747899696230888,
"rewards/format_reward": 0.943359375,
"rewards/keywords_iou_reward": 0.46967238979414105,
"rewards/sql_step_keywords_recall_reward": 0.6924389712512493,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 179.234375,
"epoch": 1.6779661016949152,
"grad_norm": 0.18595600128173828,
"kl": 0.026111602783203125,
"learning_rate": 2.3811970075322803e-07,
"loss": 0.0056,
"num_tokens": 79497289.0,
"reward": 6.366844519972801,
"reward_std": 1.3081835759803653,
"rewards/accuracy_reward": 0.490234375,
"rewards/exec_out_all_reward": 0.904296875,
"rewards/exec_out_step_reward": 0.9748759996145964,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.43892133235931396,
"rewards/sql_step_keywords_recall_reward": 0.7094383966177702,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 182.5234375,
"epoch": 1.687005649717514,
"grad_norm": 0.20412081480026245,
"kl": 0.02643585205078125,
"learning_rate": 2.254118550757286e-07,
"loss": 0.0128,
"num_tokens": 79997557.0,
"reward": 6.095862299203873,
"reward_std": 1.4136508908122778,
"rewards/accuracy_reward": 0.453125,
"rewards/exec_out_all_reward": 0.837890625,
"rewards/exec_out_step_reward": 0.9592912942171097,
"rewards/format_reward": 0.9375,
"rewards/keywords_iou_reward": 0.44104228960350156,
"rewards/sql_step_keywords_recall_reward": 0.6665958110243082,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 185.845703125,
"epoch": 1.696045197740113,
"grad_norm": 0.19426719844341278,
"kl": 0.026729583740234375,
"learning_rate": 2.130248801475344e-07,
"loss": -0.0023,
"num_tokens": 80499130.0,
"reward": 6.401883035898209,
"reward_std": 1.3080633180215955,
"rewards/accuracy_reward": 0.501953125,
"rewards/exec_out_all_reward": 0.8828125,
"rewards/exec_out_step_reward": 0.9698521215468645,
"rewards/format_reward": 0.93359375,
"rewards/keywords_iou_reward": 0.45063989935442805,
"rewards/sql_step_keywords_recall_reward": 0.7065323041751981,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 184.69921875,
"epoch": 1.7050847457627119,
"grad_norm": 0.18957343697547913,
"kl": 0.027210235595703125,
"learning_rate": 2.0096189432334195e-07,
"loss": -0.0014,
"num_tokens": 80999876.0,
"reward": 6.019113600254059,
"reward_std": 1.2530112564563751,
"rewards/accuracy_reward": 0.423828125,
"rewards/exec_out_all_reward": 0.884765625,
"rewards/exec_out_step_reward": 0.9723625108599663,
"rewards/format_reward": 0.935546875,
"rewards/keywords_iou_reward": 0.4438905091956258,
"rewards/sql_step_keywords_recall_reward": 0.6433451026678085,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 182.720703125,
"epoch": 1.7141242937853107,
"grad_norm": 0.19690978527069092,
"kl": 0.026309967041015625,
"learning_rate": 1.892259343953226e-07,
"loss": 0.0181,
"num_tokens": 81498693.0,
"reward": 6.560971170663834,
"reward_std": 1.3874189644120634,
"rewards/accuracy_reward": 0.5078125,
"rewards/exec_out_all_reward": 0.9296875,
"rewards/exec_out_step_reward": 0.9844036791473627,
"rewards/format_reward": 0.931640625,
"rewards/keywords_iou_reward": 0.47905752109363675,
"rewards/sql_step_keywords_recall_reward": 0.7258742917329073,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 182.2578125,
"epoch": 1.7231638418079096,
"grad_norm": 0.2019815295934677,
"kl": 0.02889251708984375,
"learning_rate": 1.7781995482862706e-07,
"loss": -0.0001,
"num_tokens": 81997317.0,
"reward": 6.342395722866058,
"reward_std": 1.1282042702659965,
"rewards/accuracy_reward": 0.4609375,
"rewards/exec_out_all_reward": 0.9375,
"rewards/exec_out_step_reward": 0.985164001584053,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.46832023840397596,
"rewards/sql_step_keywords_recall_reward": 0.6979350317269564,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 182.46875,
"epoch": 1.7322033898305085,
"grad_norm": 0.19806276261806488,
"kl": 0.027133941650390625,
"learning_rate": 1.6674682701761496e-07,
"loss": 0.0079,
"num_tokens": 82498165.0,
"reward": 6.360767655074596,
"reward_std": 1.2937721209600568,
"rewards/accuracy_reward": 0.49609375,
"rewards/exec_out_all_reward": 0.8828125,
"rewards/exec_out_step_reward": 0.9698614254593849,
"rewards/format_reward": 0.935546875,
"rewards/keywords_iou_reward": 0.44721768144518137,
"rewards/sql_step_keywords_recall_reward": 0.6937365289777517,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 186.044921875,
"epoch": 1.7412429378531074,
"grad_norm": 0.1942872703075409,
"kl": 0.025554656982421875,
"learning_rate": 1.5600933856299637e-07,
"loss": 0.0038,
"num_tokens": 83000064.0,
"reward": 6.068183168768883,
"reward_std": 1.4636581200174987,
"rewards/accuracy_reward": 0.451171875,
"rewards/exec_out_all_reward": 0.833984375,
"rewards/exec_out_step_reward": 0.9575916156172752,
"rewards/format_reward": 0.912109375,
"rewards/keywords_iou_reward": 0.44754891796037555,
"rewards/sql_step_keywords_recall_reward": 0.6647125380113721,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 184.869140625,
"epoch": 1.7502824858757062,
"grad_norm": 0.19219300150871277,
"kl": 0.0272674560546875,
"learning_rate": 1.4561019257006842e-07,
"loss": 0.0018,
"num_tokens": 83500337.0,
"reward": 6.50420406460762,
"reward_std": 1.2782420022413135,
"rewards/accuracy_reward": 0.53125,
"rewards/exec_out_all_reward": 0.857421875,
"rewards/exec_out_step_reward": 0.9676587302237749,
"rewards/format_reward": 0.927734375,
"rewards/keywords_iou_reward": 0.46198446361813694,
"rewards/sql_step_keywords_recall_reward": 0.7024201266467571,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 187.498046875,
"epoch": 1.759322033898305,
"grad_norm": 0.1927022784948349,
"kl": 0.02826690673828125,
"learning_rate": 1.3555200696822234e-07,
"loss": 0.0006,
"num_tokens": 84003200.0,
"reward": 6.2347564697265625,
"reward_std": 1.3537420043721795,
"rewards/accuracy_reward": 0.478515625,
"rewards/exec_out_all_reward": 0.88671875,
"rewards/exec_out_step_reward": 0.9668782595545053,
"rewards/format_reward": 0.92578125,
"rewards/keywords_iou_reward": 0.43135018879547715,
"rewards/sql_step_keywords_recall_reward": 0.6786152720451355,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 181.091796875,
"epoch": 1.768361581920904,
"grad_norm": 0.1913248747587204,
"kl": 0.026073455810546875,
"learning_rate": 1.2583731385189562e-07,
"loss": -0.0044,
"num_tokens": 84499811.0,
"reward": 6.492121763527393,
"reward_std": 1.319369402481243,
"rewards/accuracy_reward": 0.509765625,
"rewards/exec_out_all_reward": 0.876953125,
"rewards/exec_out_step_reward": 0.9670100193470716,
"rewards/format_reward": 0.947265625,
"rewards/keywords_iou_reward": 0.47826224053278565,
"rewards/sql_step_keywords_recall_reward": 0.7053060494363308,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 182.16015625,
"epoch": 1.7774011299435029,
"grad_norm": 0.18614481389522552,
"kl": 0.02542877197265625,
"learning_rate": 1.1646855884312813e-07,
"loss": -0.0025,
"num_tokens": 84998661.0,
"reward": 6.048209026455879,
"reward_std": 1.3966924250125885,
"rewards/accuracy_reward": 0.44921875,
"rewards/exec_out_all_reward": 0.833984375,
"rewards/exec_out_step_reward": 0.9590921085327864,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.42350240517407656,
"rewards/sql_step_keywords_recall_reward": 0.6717996271327138,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 183.830078125,
"epoch": 1.7864406779661017,
"grad_norm": 0.20049569010734558,
"kl": 0.02552032470703125,
"learning_rate": 1.0744810047589116e-07,
"loss": 0.0085,
"num_tokens": 85499318.0,
"reward": 6.379006251692772,
"reward_std": 1.3671105708926916,
"rewards/accuracy_reward": 0.482421875,
"rewards/exec_out_all_reward": 0.88671875,
"rewards/exec_out_step_reward": 0.9740528911352158,
"rewards/format_reward": 0.921875,
"rewards/keywords_iou_reward": 0.48139098659157753,
"rewards/sql_step_keywords_recall_reward": 0.7038900572806597,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 186.349609375,
"epoch": 1.7954802259887006,
"grad_norm": 0.1779472380876541,
"kl": 0.02698516845703125,
"learning_rate": 9.877820960234002e-08,
"loss": -0.0,
"num_tokens": 85999137.0,
"reward": 6.308243364095688,
"reward_std": 1.220099939033389,
"rewards/accuracy_reward": 0.46875,
"rewards/exec_out_all_reward": 0.888671875,
"rewards/exec_out_step_reward": 0.9750612266361713,
"rewards/format_reward": 0.958984375,
"rewards/keywords_iou_reward": 0.4597327196970582,
"rewards/sql_step_keywords_recall_reward": 0.6910604787990451,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 181.806640625,
"epoch": 1.8045197740112995,
"grad_norm": 0.1846829503774643,
"kl": 0.025753021240234375,
"learning_rate": 9.046106882113752e-08,
"loss": 0.0034,
"num_tokens": 86497530.0,
"reward": 6.5371609181165695,
"reward_std": 1.2568824323825538,
"rewards/accuracy_reward": 0.537109375,
"rewards/exec_out_all_reward": 0.87109375,
"rewards/exec_out_step_reward": 0.9686531201004982,
"rewards/format_reward": 0.927734375,
"rewards/keywords_iou_reward": 0.4614989855326712,
"rewards/sql_step_keywords_recall_reward": 0.698244234547019,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 181.83984375,
"epoch": 1.8135593220338984,
"grad_norm": 0.2010311335325241,
"kl": 0.026294708251953125,
"learning_rate": 8.249877192799731e-08,
"loss": 0.0117,
"num_tokens": 86996804.0,
"reward": 6.099536940455437,
"reward_std": 1.1783363316208124,
"rewards/accuracy_reward": 0.42578125,
"rewards/exec_out_all_reward": 0.869140625,
"rewards/exec_out_step_reward": 0.9668596535921097,
"rewards/format_reward": 0.919921875,
"rewards/keywords_iou_reward": 0.4779109531082213,
"rewards/sql_step_keywords_recall_reward": 0.6846678359434009,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 176.578125,
"epoch": 1.8225988700564972,
"grad_norm": 0.19091665744781494,
"kl": 0.0277862548828125,
"learning_rate": 7.489332338858202e-08,
"loss": 0.0051,
"num_tokens": 87493076.0,
"reward": 6.273200556635857,
"reward_std": 1.2643384067341685,
"rewards/accuracy_reward": 0.4609375,
"rewards/exec_out_all_reward": 0.876953125,
"rewards/exec_out_step_reward": 0.9696537107229233,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.474200002849102,
"rewards/sql_step_keywords_recall_reward": 0.6930374698713422,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 193.27734375,
"epoch": 1.831638418079096,
"grad_norm": 0.1949450820684433,
"kl": 0.02576446533203125,
"learning_rate": 6.76466378338892e-08,
"loss": -0.003,
"num_tokens": 88001494.0,
"reward": 6.114458784461021,
"reward_std": 1.2841259008273482,
"rewards/accuracy_reward": 0.4609375,
"rewards/exec_out_all_reward": 0.833984375,
"rewards/exec_out_step_reward": 0.9544363897293806,
"rewards/format_reward": 0.935546875,
"rewards/keywords_iou_reward": 0.45011676382273436,
"rewards/sql_step_keywords_recall_reward": 0.6465076114982367,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 183.59375,
"epoch": 1.840677966101695,
"grad_norm": 0.1869850605726242,
"kl": 0.026782989501953125,
"learning_rate": 6.076053957825411e-08,
"loss": -0.0073,
"num_tokens": 88501102.0,
"reward": 6.228741064667702,
"reward_std": 1.307523036841303,
"rewards/accuracy_reward": 0.4609375,
"rewards/exec_out_all_reward": 0.89453125,
"rewards/exec_out_step_reward": 0.9735940638929605,
"rewards/format_reward": 0.951171875,
"rewards/keywords_iou_reward": 0.4400957697071135,
"rewards/sql_step_keywords_recall_reward": 0.685502259992063,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 184.279296875,
"epoch": 1.8497175141242939,
"grad_norm": 0.19799074530601501,
"kl": 0.026813507080078125,
"learning_rate": 5.423676216008694e-08,
"loss": 0.0047,
"num_tokens": 89002113.0,
"reward": 6.288302145898342,
"reward_std": 1.3452563788741827,
"rewards/accuracy_reward": 0.443359375,
"rewards/exec_out_all_reward": 0.896484375,
"rewards/exec_out_step_reward": 0.973502604290843,
"rewards/format_reward": 0.931640625,
"rewards/keywords_iou_reward": 0.496027123183012,
"rewards/sql_step_keywords_recall_reward": 0.7211827598512173,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 181.740234375,
"epoch": 1.8587570621468927,
"grad_norm": 0.1976655274629593,
"kl": 0.027782440185546875,
"learning_rate": 4.807694790546563e-08,
"loss": 0.0013,
"num_tokens": 89500732.0,
"reward": 6.4734716564416885,
"reward_std": 1.2421362679451704,
"rewards/accuracy_reward": 0.494140625,
"rewards/exec_out_all_reward": 0.88671875,
"rewards/exec_out_step_reward": 0.9743815138936043,
"rewards/format_reward": 0.935546875,
"rewards/keywords_iou_reward": 0.49230897752568126,
"rewards/sql_step_keywords_recall_reward": 0.7156439917162061,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 183.732421875,
"epoch": 1.8677966101694916,
"grad_norm": 0.21844159066677094,
"kl": 0.026279449462890625,
"learning_rate": 4.2282647514687525e-08,
"loss": 0.0087,
"num_tokens": 90000723.0,
"reward": 6.1350885555148125,
"reward_std": 1.201456573791802,
"rewards/accuracy_reward": 0.427734375,
"rewards/exec_out_all_reward": 0.87109375,
"rewards/exec_out_step_reward": 0.9642880447208881,
"rewards/format_reward": 0.9453125,
"rewards/keywords_iou_reward": 0.47726981807500124,
"rewards/sql_step_keywords_recall_reward": 0.6889170501381159,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 185.2109375,
"epoch": 1.8768361581920905,
"grad_norm": 0.19886131584644318,
"kl": 0.026386260986328125,
"learning_rate": 3.685531967188943e-08,
"loss": 0.0084,
"num_tokens": 90502179.0,
"reward": 6.224872663617134,
"reward_std": 1.0856527155265212,
"rewards/accuracy_reward": 0.451171875,
"rewards/exec_out_all_reward": 0.892578125,
"rewards/exec_out_step_reward": 0.9733282178640366,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.4684679554775357,
"rewards/sql_step_keywords_recall_reward": 0.6759366653859615,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 181.986328125,
"epoch": 1.8858757062146894,
"grad_norm": 0.1973237842321396,
"kl": 0.028614044189453125,
"learning_rate": 3.1796330677832056e-08,
"loss": 0.0041,
"num_tokens": 91001508.0,
"reward": 6.379835411906242,
"reward_std": 1.199483459815383,
"rewards/accuracy_reward": 0.478515625,
"rewards/exec_out_all_reward": 0.892578125,
"rewards/exec_out_step_reward": 0.973174761980772,
"rewards/format_reward": 0.9453125,
"rewards/keywords_iou_reward": 0.4832291747443378,
"rewards/sql_step_keywords_recall_reward": 0.6882491651922464,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 182.765625,
"epoch": 1.8949152542372882,
"grad_norm": 0.20336341857910156,
"kl": 0.025714874267578125,
"learning_rate": 2.710695410593994e-08,
"loss": 0.0105,
"num_tokens": 91500808.0,
"reward": 6.037193328142166,
"reward_std": 1.2909285621717572,
"rewards/accuracy_reward": 0.431640625,
"rewards/exec_out_all_reward": 0.8828125,
"rewards/exec_out_step_reward": 0.9731863792985678,
"rewards/format_reward": 0.943359375,
"rewards/keywords_iou_reward": 0.4269423745572567,
"rewards/sql_step_keywords_recall_reward": 0.6573878172785044,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 185.685546875,
"epoch": 1.9039548022598871,
"grad_norm": 0.19327940046787262,
"kl": 0.02794647216796875,
"learning_rate": 2.278837048168797e-08,
"loss": -0.0007,
"num_tokens": 92000983.0,
"reward": 6.355218142271042,
"reward_std": 1.3059450194705278,
"rewards/accuracy_reward": 0.48046875,
"rewards/exec_out_all_reward": 0.919921875,
"rewards/exec_out_step_reward": 0.980823727324605,
"rewards/format_reward": 0.939453125,
"rewards/keywords_iou_reward": 0.4438144704326987,
"rewards/sql_step_keywords_recall_reward": 0.7055154535919428,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 187.62890625,
"epoch": 1.912994350282486,
"grad_norm": 0.19996102154254913,
"kl": 0.02562713623046875,
"learning_rate": 1.8841666985408568e-08,
"loss": 0.0026,
"num_tokens": 92501125.0,
"reward": 6.144110098481178,
"reward_std": 1.3673810623586178,
"rewards/accuracy_reward": 0.419921875,
"rewards/exec_out_all_reward": 0.85546875,
"rewards/exec_out_step_reward": 0.9624224957078695,
"rewards/format_reward": 0.935546875,
"rewards/keywords_iou_reward": 0.5000754236243665,
"rewards/sql_step_keywords_recall_reward": 0.7108335876837373,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 178.302734375,
"epoch": 1.9220338983050849,
"grad_norm": 0.19664518535137177,
"kl": 0.025630950927734375,
"learning_rate": 1.5267837178600972e-08,
"loss": 0.0021,
"num_tokens": 92998136.0,
"reward": 6.30431304872036,
"reward_std": 1.3761892821639776,
"rewards/accuracy_reward": 0.474609375,
"rewards/exec_out_all_reward": 0.89453125,
"rewards/exec_out_step_reward": 0.9729538708925247,
"rewards/format_reward": 0.943359375,
"rewards/keywords_iou_reward": 0.454624411650002,
"rewards/sql_step_keywords_recall_reward": 0.6857822621241212,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 182.158203125,
"epoch": 1.9310734463276837,
"grad_norm": 0.18419690430164337,
"kl": 0.02671051025390625,
"learning_rate": 1.206778075380699e-08,
"loss": 0.0046,
"num_tokens": 93497689.0,
"reward": 6.260894909501076,
"reward_std": 1.3260251162573695,
"rewards/accuracy_reward": 0.466796875,
"rewards/exec_out_all_reward": 0.8984375,
"rewards/exec_out_step_reward": 0.9784419946372509,
"rewards/format_reward": 0.94140625,
"rewards/keywords_iou_reward": 0.45361618138849735,
"rewards/sql_step_keywords_recall_reward": 0.6681892573833466,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 183.88671875,
"epoch": 1.9401129943502826,
"grad_norm": 0.2128608375787735,
"kl": 0.027370452880859375,
"learning_rate": 9.242303308118816e-09,
"loss": -0.0028,
"num_tokens": 93996519.0,
"reward": 6.385763391852379,
"reward_std": 1.3881093207746744,
"rewards/accuracy_reward": 0.49609375,
"rewards/exec_out_all_reward": 0.89453125,
"rewards/exec_out_step_reward": 0.975878132507205,
"rewards/format_reward": 0.923828125,
"rewards/keywords_iou_reward": 0.44957458041608334,
"rewards/sql_step_keywords_recall_reward": 0.7080017421394587,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 183.0703125,
"epoch": 1.9491525423728815,
"grad_norm": 0.185265451669693,
"kl": 0.026287078857421875,
"learning_rate": 6.792116140373117e-09,
"loss": -0.0056,
"num_tokens": 94496719.0,
"reward": 6.287876293063164,
"reward_std": 1.1061802469193935,
"rewards/accuracy_reward": 0.47265625,
"rewards/exec_out_all_reward": 0.880859375,
"rewards/exec_out_step_reward": 0.9741544220596552,
"rewards/format_reward": 0.94921875,
"rewards/keywords_iou_reward": 0.4433903433382511,
"rewards/sql_step_keywords_recall_reward": 0.7062380816787481,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 182.119140625,
"epoch": 1.9581920903954804,
"grad_norm": 0.19649042189121246,
"kl": 0.02834320068359375,
"learning_rate": 4.7178360720865895e-09,
"loss": 0.0082,
"num_tokens": 94996296.0,
"reward": 6.1123000383377075,
"reward_std": 1.1606702040880919,
"rewards/accuracy_reward": 0.4375,
"rewards/exec_out_all_reward": 0.90625,
"rewards/exec_out_step_reward": 0.9780420735478401,
"rewards/format_reward": 0.9296875,
"rewards/keywords_iou_reward": 0.44118882389739156,
"rewards/sql_step_keywords_recall_reward": 0.665942832827568,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 180.314453125,
"epoch": 1.9672316384180792,
"grad_norm": 0.20041655004024506,
"kl": 0.025310516357421875,
"learning_rate": 3.0199852921735105e-09,
"loss": -0.002,
"num_tokens": 95493249.0,
"reward": 6.628199502825737,
"reward_std": 1.2294201632030308,
"rewards/accuracy_reward": 0.521484375,
"rewards/exec_out_all_reward": 0.88671875,
"rewards/exec_out_step_reward": 0.9736746642738581,
"rewards/format_reward": 0.951171875,
"rewards/keywords_iou_reward": 0.5000922083854675,
"rewards/sql_step_keywords_recall_reward": 0.730512335896492,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 182.69140625,
"epoch": 1.9762711864406781,
"grad_norm": 0.2012374997138977,
"kl": 0.0261688232421875,
"learning_rate": 1.6989912254880557e-09,
"loss": 0.0159,
"num_tokens": 95991843.0,
"reward": 6.3724522441625595,
"reward_std": 1.4343089256435633,
"rewards/accuracy_reward": 0.509765625,
"rewards/exec_out_all_reward": 0.875,
"rewards/exec_out_step_reward": 0.9689127672463655,
"rewards/format_reward": 0.9140625,
"rewards/keywords_iou_reward": 0.4413177212700248,
"rewards/sql_step_keywords_recall_reward": 0.6927789896726608,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 182.826171875,
"epoch": 1.985310734463277,
"grad_norm": 0.18999001383781433,
"kl": 0.0272064208984375,
"learning_rate": 7.551864252223761e-10,
"loss": 0.0057,
"num_tokens": 96489986.0,
"reward": 6.2077417075634,
"reward_std": 1.2748773116618395,
"rewards/accuracy_reward": 0.4453125,
"rewards/exec_out_all_reward": 0.892578125,
"rewards/exec_out_step_reward": 0.9720354303717613,
"rewards/format_reward": 0.935546875,
"rewards/keywords_iou_reward": 0.4684856841340661,
"rewards/sql_step_keywords_recall_reward": 0.6893599443137646,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 183.3854250907898,
"epoch": 1.9943502824858759,
"grad_norm": 0.18637076020240784,
"kl": 0.0267333984375,
"learning_rate": 1.8880848918739758e-10,
"loss": -0.0015,
"num_tokens": 96990210.0,
"reward": 6.196442812681198,
"reward_std": 1.3213467076420784,
"rewards/accuracy_reward": 0.4765625,
"rewards/exec_out_all_reward": 0.86328125,
"rewards/exec_out_step_reward": 0.969191774725914,
"rewards/format_reward": 0.931640625,
"rewards/keywords_iou_reward": 0.4246487212367356,
"rewards/sql_step_keywords_recall_reward": 0.676781676709652,
"step": 220
},
{
"epoch": 1.9943502824858759,
"step": 220,
"total_flos": 0.0,
"train_loss": 0.002633511937032877,
"train_runtime": 233060.1474,
"train_samples_per_second": 0.122,
"train_steps_per_second": 0.001
}
],
"logging_steps": 1,
"max_steps": 220,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 27,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}