{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9907120743034056, "eval_steps": 500, "global_step": 80, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 222.078125, "epoch": 0.01238390092879257, "grad_norm": 0.5330060720443726, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": 2.1595765352249146, "reward_std": 0.4799410402774811, "rewards/ngram_similarity_reward_func": 0.22074851393699646, "rewards/reasoning_quality_reward_func": 0.7317968606948853, "rewards/soft_format_reward_func": 0.2109375, "rewards/sql_execution_reward_func": 0.5703125, "rewards/xmlcount_reward_func": 0.42578125, "step": 1 }, { "completion_length": 228.609375, "epoch": 0.02476780185758514, "grad_norm": 0.38014766573905945, "kl": 0.0, "learning_rate": 5e-06, "loss": 0.0, "reward": 2.1653581261634827, "reward_std": 0.35478534549474716, "rewards/ngram_similarity_reward_func": 0.2899237535893917, "rewards/reasoning_quality_reward_func": 0.7039499878883362, "rewards/soft_format_reward_func": 0.203125, "rewards/sql_execution_reward_func": 0.5406249971129, "rewards/xmlcount_reward_func": 0.427734375, "step": 2 }, { "completion_length": 249.984375, "epoch": 0.03715170278637771, "grad_norm": 0.4404110908508301, "kl": 0.00019386520580155775, "learning_rate": 1e-05, "loss": 0.0, "reward": 2.2775589525699615, "reward_std": 0.2987724468111992, "rewards/ngram_similarity_reward_func": 0.14325893856585026, "rewards/reasoning_quality_reward_func": 0.7573468536138535, "rewards/soft_format_reward_func": 0.23828125, "rewards/sql_execution_reward_func": 0.66015625, "rewards/xmlcount_reward_func": 0.478515625, "step": 3 }, { "completion_length": 219.28125, "epoch": 0.04953560371517028, "grad_norm": 0.4848824143409729, "kl": 0.00020055885761394165, "learning_rate": 1.5000000000000002e-05, "loss": 0.0, "reward": 1.9259249866008759, "reward_std": 0.49138037860393524, "rewards/ngram_similarity_reward_func": 0.16651681065559387, "rewards/reasoning_quality_reward_func": 0.7145218700170517, "rewards/soft_format_reward_func": 0.18359375, "rewards/sql_execution_reward_func": 0.4745738562196493, "rewards/xmlcount_reward_func": 0.38671875, "step": 4 }, { "completion_length": 240.875, "epoch": 0.06191950464396285, "grad_norm": 0.3829522430896759, "kl": 0.00025230981555068865, "learning_rate": 2e-05, "loss": 0.0, "reward": 2.286948025226593, "reward_std": 0.46134958416223526, "rewards/ngram_similarity_reward_func": 0.23377767577767372, "rewards/reasoning_quality_reward_func": 0.7305140793323517, "rewards/soft_format_reward_func": 0.21484375, "rewards/sql_execution_reward_func": 0.6585937514901161, "rewards/xmlcount_reward_func": 0.44921875, "step": 5 }, { "completion_length": 220.421875, "epoch": 0.07430340557275542, "grad_norm": 0.41778889298439026, "kl": 0.0005900838441448286, "learning_rate": 2.5e-05, "loss": 0.0, "reward": 2.2411254048347473, "reward_std": 0.41372712701559067, "rewards/ngram_similarity_reward_func": 0.21116455644369125, "rewards/reasoning_quality_reward_func": 0.7018359005451202, "rewards/soft_format_reward_func": 0.234375, "rewards/sql_execution_reward_func": 0.625, "rewards/xmlcount_reward_func": 0.46875, "step": 6 }, { "completion_length": 243.671875, "epoch": 0.08668730650154799, "grad_norm": 0.362834632396698, "kl": 0.0015416233654832467, "learning_rate": 3.0000000000000004e-05, "loss": 0.0001, "reward": 2.479892313480377, "reward_std": 0.42981887608766556, "rewards/ngram_similarity_reward_func": 0.31569236889481544, "rewards/reasoning_quality_reward_func": 0.7431062161922455, "rewards/soft_format_reward_func": 0.23046875, "rewards/sql_execution_reward_func": 0.7257812470197678, "rewards/xmlcount_reward_func": 0.46484375, "step": 7 }, { "completion_length": 272.546875, "epoch": 0.09907120743034056, "grad_norm": 0.42216944694519043, "kl": 0.002694789902307093, "learning_rate": 3.5000000000000004e-05, "loss": 0.0001, "reward": 2.703847050666809, "reward_std": 0.29045113176107407, "rewards/ngram_similarity_reward_func": 0.3112502619624138, "rewards/reasoning_quality_reward_func": 0.7433781176805496, "rewards/soft_format_reward_func": 0.2421875, "rewards/sql_execution_reward_func": 0.9187500029802322, "rewards/xmlcount_reward_func": 0.48828125, "step": 8 }, { "completion_length": 265.84375, "epoch": 0.11145510835913312, "grad_norm": 0.34325331449508667, "kl": 0.004968019318766892, "learning_rate": 4e-05, "loss": 0.0002, "reward": 2.6481465101242065, "reward_std": 0.24203401803970337, "rewards/ngram_similarity_reward_func": 0.30987007170915604, "rewards/reasoning_quality_reward_func": 0.7656203359365463, "rewards/soft_format_reward_func": 0.24609375, "rewards/sql_execution_reward_func": 0.8304687440395355, "rewards/xmlcount_reward_func": 0.49609375, "step": 9 }, { "completion_length": 265.515625, "epoch": 0.1238390092879257, "grad_norm": 0.36068692803382874, "kl": 0.005839241901412606, "learning_rate": 3.9982867988473446e-05, "loss": 0.0002, "reward": 2.4154441356658936, "reward_std": 0.2717306688427925, "rewards/ngram_similarity_reward_func": 0.30530440993607044, "rewards/reasoning_quality_reward_func": 0.7639749944210052, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.5961647690273821, "rewards/xmlcount_reward_func": 0.5, "step": 10 }, { "completion_length": 271.234375, "epoch": 0.13622291021671826, "grad_norm": 0.37997934222221375, "kl": 0.018266105791553855, "learning_rate": 3.993150456565143e-05, "loss": 0.0007, "reward": 2.8274221420288086, "reward_std": 0.2109035775065422, "rewards/ngram_similarity_reward_func": 0.3601767495274544, "rewards/reasoning_quality_reward_func": 0.7848234474658966, "rewards/soft_format_reward_func": 0.24609375, "rewards/sql_execution_reward_func": 0.9421875029802322, "rewards/xmlcount_reward_func": 0.494140625, "step": 11 }, { "completion_length": 296.265625, "epoch": 0.14860681114551083, "grad_norm": 0.3279929757118225, "kl": 0.011964517878368497, "learning_rate": 3.9846007504728593e-05, "loss": 0.0005, "reward": 2.734043776988983, "reward_std": 0.16971006244421005, "rewards/ngram_similarity_reward_func": 0.3088625408709049, "rewards/reasoning_quality_reward_func": 0.8142437487840652, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.8609375059604645, "rewards/xmlcount_reward_func": 0.5, "step": 12 }, { "completion_length": 311.25, "epoch": 0.1609907120743034, "grad_norm": 0.31357380747795105, "kl": 0.018573109060525894, "learning_rate": 3.972653955421975e-05, "loss": 0.0007, "reward": 2.4877208471298218, "reward_std": 0.2542732544243336, "rewards/ngram_similarity_reward_func": 0.40954745560884476, "rewards/reasoning_quality_reward_func": 0.8195796459913254, "rewards/soft_format_reward_func": 0.2421875, "rewards/sql_execution_reward_func": 0.5242187529802322, "rewards/xmlcount_reward_func": 0.4921875, "step": 13 }, { "completion_length": 305.96875, "epoch": 0.17337461300309598, "grad_norm": 0.35947704315185547, "kl": 0.02572451764717698, "learning_rate": 3.95733281281588e-05, "loss": 0.001, "reward": 2.84596848487854, "reward_std": 0.16431875061243773, "rewards/ngram_similarity_reward_func": 0.4371529445052147, "rewards/reasoning_quality_reward_func": 0.7978781312704086, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.8609375059604645, "rewards/xmlcount_reward_func": 0.5, "step": 14 }, { "completion_length": 316.8125, "epoch": 0.18575851393188855, "grad_norm": 0.3095603883266449, "kl": 0.031108289025723934, "learning_rate": 3.938666487320323e-05, "loss": 0.0012, "reward": 2.543521463871002, "reward_std": 0.21121197938919067, "rewards/ngram_similarity_reward_func": 0.33944912999868393, "rewards/reasoning_quality_reward_func": 0.7945765405893326, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.6614488586783409, "rewards/xmlcount_reward_func": 0.498046875, "step": 15 }, { "completion_length": 333.65625, "epoch": 0.19814241486068113, "grad_norm": 0.32227563858032227, "kl": 0.027149478904902935, "learning_rate": 3.9166905113468086e-05, "loss": 0.0011, "reward": 2.5172452330589294, "reward_std": 0.23450876772403717, "rewards/ngram_similarity_reward_func": 0.33389993757009506, "rewards/reasoning_quality_reward_func": 0.7915484458208084, "rewards/soft_format_reward_func": 0.24609375, "rewards/sql_execution_reward_func": 0.6476562395691872, "rewards/xmlcount_reward_func": 0.498046875, "step": 16 }, { "completion_length": 370.265625, "epoch": 0.21052631578947367, "grad_norm": 0.31235265731811523, "kl": 0.031742531806230545, "learning_rate": 3.891446717414635e-05, "loss": 0.0013, "reward": 2.688088834285736, "reward_std": 0.11317835189402103, "rewards/ngram_similarity_reward_func": 0.2212217040359974, "rewards/reasoning_quality_reward_func": 0.8551484197378159, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.8617187440395355, "rewards/xmlcount_reward_func": 0.5, "step": 17 }, { "completion_length": 340.03125, "epoch": 0.22291021671826625, "grad_norm": 0.3352001905441284, "kl": 0.043277411721646786, "learning_rate": 3.862983158520316e-05, "loss": 0.0017, "reward": 2.741812765598297, "reward_std": 0.14071671105921268, "rewards/ngram_similarity_reward_func": 0.28818774223327637, "rewards/reasoning_quality_reward_func": 0.8387812525033951, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.8648437410593033, "rewards/xmlcount_reward_func": 0.5, "step": 18 }, { "completion_length": 333.21875, "epoch": 0.23529411764705882, "grad_norm": 0.31716179847717285, "kl": 0.04549229796975851, "learning_rate": 3.83135401666597e-05, "loss": 0.0018, "reward": 2.625436544418335, "reward_std": 0.13139259070158005, "rewards/ngram_similarity_reward_func": 0.35064588487148285, "rewards/reasoning_quality_reward_func": 0.8470562547445297, "rewards/soft_format_reward_func": 0.24609375, "rewards/sql_execution_reward_func": 0.68359375, "rewards/xmlcount_reward_func": 0.498046875, "step": 19 }, { "completion_length": 330.203125, "epoch": 0.2476780185758514, "grad_norm": 0.31764793395996094, "kl": 0.04236258752644062, "learning_rate": 3.796619499720799e-05, "loss": 0.0017, "reward": 2.899237811565399, "reward_std": 0.20555716007947922, "rewards/ngram_similarity_reward_func": 0.4203330874443054, "rewards/reasoning_quality_reward_func": 0.8171859234571457, "rewards/soft_format_reward_func": 0.2421875, "rewards/sql_execution_reward_func": 0.9234375059604645, "rewards/xmlcount_reward_func": 0.49609375, "step": 20 }, { "completion_length": 412.453125, "epoch": 0.26006191950464397, "grad_norm": 0.30428770184516907, "kl": 0.036752122454345226, "learning_rate": 3.75884572681199e-05, "loss": 0.0015, "reward": 2.610611140727997, "reward_std": 0.15251119248569012, "rewards/ngram_similarity_reward_func": 0.32537825778126717, "rewards/reasoning_quality_reward_func": 0.8832796812057495, "rewards/soft_format_reward_func": 0.24609375, "rewards/sql_execution_reward_func": 0.6578125059604645, "rewards/xmlcount_reward_func": 0.498046875, "step": 21 }, { "completion_length": 354.265625, "epoch": 0.2724458204334365, "grad_norm": 0.3025682270526886, "kl": 0.04639334697276354, "learning_rate": 3.718104602463194e-05, "loss": 0.0019, "reward": 2.575563907623291, "reward_std": 0.21309060789644718, "rewards/ngram_similarity_reward_func": 0.42503100633621216, "rewards/reasoning_quality_reward_func": 0.8571734130382538, "rewards/soft_format_reward_func": 0.24609375, "rewards/sql_execution_reward_func": 0.5492187440395355, "rewards/xmlcount_reward_func": 0.498046875, "step": 22 }, { "completion_length": 379.46875, "epoch": 0.2848297213622291, "grad_norm": 0.28553083539009094, "kl": 0.04365707188844681, "learning_rate": 3.6744736797201856e-05, "loss": 0.0017, "reward": 2.4477399587631226, "reward_std": 0.2514076679944992, "rewards/ngram_similarity_reward_func": 0.45473647862672806, "rewards/reasoning_quality_reward_func": 0.8824515789747238, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.36055195331573486, "rewards/xmlcount_reward_func": 0.5, "step": 23 }, { "completion_length": 346.921875, "epoch": 0.29721362229102166, "grad_norm": 0.3103869557380676, "kl": 0.05476447567343712, "learning_rate": 3.6280360125242234e-05, "loss": 0.0022, "reward": 2.7398348450660706, "reward_std": 0.23609711229801178, "rewards/ngram_similarity_reward_func": 0.4382130652666092, "rewards/reasoning_quality_reward_func": 0.8634234368801117, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.6881983578205109, "rewards/xmlcount_reward_func": 0.5, "step": 24 }, { "completion_length": 420.453125, "epoch": 0.30959752321981426, "grad_norm": 0.2726235091686249, "kl": 0.054436798207461834, "learning_rate": 3.578879997614161e-05, "loss": 0.0022, "reward": 2.6918256878852844, "reward_std": 0.14829625003039837, "rewards/ngram_similarity_reward_func": 0.38556934148073196, "rewards/reasoning_quality_reward_func": 0.9156312495470047, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.640625, "rewards/xmlcount_reward_func": 0.5, "step": 25 }, { "completion_length": 385.9375, "epoch": 0.3219814241486068, "grad_norm": 0.31836339831352234, "kl": 0.05234138946980238, "learning_rate": 3.5270992062582236e-05, "loss": 0.0021, "reward": 2.4975169897079468, "reward_std": 0.3400815278291702, "rewards/ngram_similarity_reward_func": 0.35228848457336426, "rewards/reasoning_quality_reward_func": 0.852295309305191, "rewards/soft_format_reward_func": 0.24609375, "rewards/sql_execution_reward_func": 0.5526988655328751, "rewards/xmlcount_reward_func": 0.494140625, "step": 26 }, { "completion_length": 344.109375, "epoch": 0.33436532507739936, "grad_norm": 0.28171250224113464, "kl": 0.06604294572025537, "learning_rate": 3.472792206135786e-05, "loss": 0.0026, "reward": 2.8997097611427307, "reward_std": 0.1521923691034317, "rewards/ngram_similarity_reward_func": 0.5297628864645958, "rewards/reasoning_quality_reward_func": 0.8543218523263931, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.765625, "rewards/xmlcount_reward_func": 0.5, "step": 27 }, { "completion_length": 367.171875, "epoch": 0.34674922600619196, "grad_norm": 0.2663843631744385, "kl": 0.06475986633449793, "learning_rate": 3.4160623737081886e-05, "loss": 0.0026, "reward": 3.0965726375579834, "reward_std": 0.09447081200778484, "rewards/ngram_similarity_reward_func": 0.4422616958618164, "rewards/reasoning_quality_reward_func": 0.9062640517950058, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 1.0, "rewards/xmlcount_reward_func": 0.498046875, "step": 28 }, { "completion_length": 409.859375, "epoch": 0.3591331269349845, "grad_norm": 0.28277286887168884, "kl": 0.047645531594753265, "learning_rate": 3.3570176974357714e-05, "loss": 0.0019, "reward": 2.482310175895691, "reward_std": 0.155114084482193, "rewards/ngram_similarity_reward_func": 0.17045550420880318, "rewards/reasoning_quality_reward_func": 0.874354675412178, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.6875, "rewards/xmlcount_reward_func": 0.5, "step": 29 }, { "completion_length": 359.9375, "epoch": 0.3715170278637771, "grad_norm": 0.2870733439922333, "kl": 0.0618684496730566, "learning_rate": 3.295770572215697e-05, "loss": 0.0025, "reward": 2.976421356201172, "reward_std": 0.17351308092474937, "rewards/ngram_similarity_reward_func": 0.5148354731500149, "rewards/reasoning_quality_reward_func": 0.8990859091281891, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.8125, "rewards/xmlcount_reward_func": 0.5, "step": 30 }, { "completion_length": 364.09375, "epoch": 0.38390092879256965, "grad_norm": 0.3511677086353302, "kl": 0.07197799813002348, "learning_rate": 3.232437585431883e-05, "loss": 0.0029, "reward": 2.9660959243774414, "reward_std": 0.1771723162382841, "rewards/ngram_similarity_reward_func": 0.4719693809747696, "rewards/reasoning_quality_reward_func": 0.8925640434026718, "rewards/soft_format_reward_func": 0.24609375, "rewards/sql_execution_reward_func": 0.859375, "rewards/xmlcount_reward_func": 0.49609375, "step": 31 }, { "completion_length": 388.890625, "epoch": 0.39628482972136225, "grad_norm": 0.27826693654060364, "kl": 0.061267949640750885, "learning_rate": 3.1671392950242836e-05, "loss": 0.0025, "reward": 2.6841952204704285, "reward_std": 0.15280664712190628, "rewards/ngram_similarity_reward_func": 0.3556499555706978, "rewards/reasoning_quality_reward_func": 0.9199515581130981, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.6585937440395355, "rewards/xmlcount_reward_func": 0.5, "step": 32 }, { "completion_length": 366.859375, "epoch": 0.4086687306501548, "grad_norm": 0.2917367219924927, "kl": 0.06815788336098194, "learning_rate": 3.1e-05, "loss": 0.0027, "reward": 2.9397113919258118, "reward_std": 0.1146523468196392, "rewards/ngram_similarity_reward_func": 0.296003520488739, "rewards/reasoning_quality_reward_func": 0.905426561832428, "rewards/soft_format_reward_func": 0.2421875, "rewards/sql_execution_reward_func": 1.0, "rewards/xmlcount_reward_func": 0.49609375, "step": 33 }, { "completion_length": 349.6875, "epoch": 0.42105263157894735, "grad_norm": 0.26540401577949524, "kl": 0.07279603462666273, "learning_rate": 3.0311475038230616e-05, "loss": 0.0029, "reward": 2.89504611492157, "reward_std": 0.19008862972259521, "rewards/ngram_similarity_reward_func": 0.4920961260795593, "rewards/reasoning_quality_reward_func": 0.9310749769210815, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.7218750044703484, "rewards/xmlcount_reward_func": 0.5, "step": 34 }, { "completion_length": 349.3125, "epoch": 0.43343653250773995, "grad_norm": 0.30334243178367615, "kl": 0.06882389821112156, "learning_rate": 2.960712871133259e-05, "loss": 0.0028, "reward": 2.8439746499061584, "reward_std": 0.24623795598745346, "rewards/ngram_similarity_reward_func": 0.5152419656515121, "rewards/reasoning_quality_reward_func": 0.9223406165838242, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.6563920453190804, "rewards/xmlcount_reward_func": 0.5, "step": 35 }, { "completion_length": 385.125, "epoch": 0.4458204334365325, "grad_norm": 0.28430214524269104, "kl": 0.06620587687939405, "learning_rate": 2.8888301782571618e-05, "loss": 0.0026, "reward": 2.7404602766036987, "reward_std": 0.25473709031939507, "rewards/ngram_similarity_reward_func": 0.3008290082216263, "rewards/reasoning_quality_reward_func": 0.934943750500679, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.7546875029802322, "rewards/xmlcount_reward_func": 0.5, "step": 36 }, { "completion_length": 340.421875, "epoch": 0.4582043343653251, "grad_norm": 0.2889171838760376, "kl": 0.07523482665419579, "learning_rate": 2.8156362579862042e-05, "loss": 0.003, "reward": 2.897187650203705, "reward_std": 0.25312334299087524, "rewards/ngram_similarity_reward_func": 0.41222984343767166, "rewards/reasoning_quality_reward_func": 0.925192192196846, "rewards/soft_format_reward_func": 0.24609375, "rewards/sql_execution_reward_func": 0.8156249970197678, "rewards/xmlcount_reward_func": 0.498046875, "step": 37 }, { "completion_length": 320.171875, "epoch": 0.47058823529411764, "grad_norm": 0.2764835059642792, "kl": 0.07314357813447714, "learning_rate": 2.7412704391076914e-05, "loss": 0.0029, "reward": 3.0073145031929016, "reward_std": 0.1429160237312317, "rewards/ngram_similarity_reward_func": 0.4768817350268364, "rewards/reasoning_quality_reward_func": 0.9210578054189682, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.859375, "rewards/xmlcount_reward_func": 0.5, "step": 38 }, { "completion_length": 332.09375, "epoch": 0.48297213622291024, "grad_norm": 0.32125160098075867, "kl": 0.07088590506464243, "learning_rate": 2.6658742811845377e-05, "loss": 0.0028, "reward": 3.05528324842453, "reward_std": 0.1555289849638939, "rewards/ngram_similarity_reward_func": 0.42346765100955963, "rewards/reasoning_quality_reward_func": 0.9286906123161316, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.953125, "rewards/xmlcount_reward_func": 0.5, "step": 39 }, { "completion_length": 329.9375, "epoch": 0.4953560371517028, "grad_norm": 0.29124927520751953, "kl": 0.06565980054438114, "learning_rate": 2.5895913050885853e-05, "loss": 0.0026, "reward": 2.8043496012687683, "reward_std": 0.18914231285452843, "rewards/ngram_similarity_reward_func": 0.39962920919060707, "rewards/reasoning_quality_reward_func": 0.9246421754360199, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.7320312410593033, "rewards/xmlcount_reward_func": 0.498046875, "step": 40 }, { "completion_length": 317.640625, "epoch": 0.5077399380804953, "grad_norm": 0.28270959854125977, "kl": 0.07616368494927883, "learning_rate": 2.512566719800475e-05, "loss": 0.003, "reward": 3.0060057044029236, "reward_std": 0.20819612592458725, "rewards/ngram_similarity_reward_func": 0.46407629549503326, "rewards/reasoning_quality_reward_func": 0.9291453063488007, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.8627840876579285, "rewards/xmlcount_reward_func": 0.5, "step": 41 }, { "completion_length": 320.1875, "epoch": 0.5201238390092879, "grad_norm": 0.29271021485328674, "kl": 0.07423507608473301, "learning_rate": 2.4349471459960935e-05, "loss": 0.003, "reward": 2.8800657987594604, "reward_std": 0.12723891995847225, "rewards/ngram_similarity_reward_func": 0.39660485088825226, "rewards/reasoning_quality_reward_func": 0.902210921049118, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.8312499970197678, "rewards/xmlcount_reward_func": 0.5, "step": 42 }, { "completion_length": 344.484375, "epoch": 0.5325077399380805, "grad_norm": 0.28183987736701965, "kl": 0.07508978061378002, "learning_rate": 2.356880336945785e-05, "loss": 0.003, "reward": 2.802961766719818, "reward_std": 0.15244293212890625, "rewards/ngram_similarity_reward_func": 0.28034142404794693, "rewards/reasoning_quality_reward_func": 0.9257453233003616, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.8468749970197678, "rewards/xmlcount_reward_func": 0.5, "step": 43 }, { "completion_length": 331.1875, "epoch": 0.544891640866873, "grad_norm": 0.2783842086791992, "kl": 0.0693344809114933, "learning_rate": 2.2785148972576052e-05, "loss": 0.0028, "reward": 2.757667899131775, "reward_std": 0.18025352619588375, "rewards/ngram_similarity_reward_func": 0.48256489634513855, "rewards/reasoning_quality_reward_func": 0.9297906160354614, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.5953124985098839, "rewards/xmlcount_reward_func": 0.5, "step": 44 }, { "completion_length": 324.140625, "epoch": 0.5572755417956656, "grad_norm": 0.2986034154891968, "kl": 0.07126997038722038, "learning_rate": 2.2000000000000003e-05, "loss": 0.0029, "reward": 2.700526773929596, "reward_std": 0.1273169182240963, "rewards/ngram_similarity_reward_func": 0.2901314552873373, "rewards/reasoning_quality_reward_func": 0.9182077944278717, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.7421875, "rewards/xmlcount_reward_func": 0.5, "step": 45 }, { "completion_length": 312.859375, "epoch": 0.5696594427244582, "grad_norm": 0.2958148717880249, "kl": 0.07780578825622797, "learning_rate": 2.1214851027423954e-05, "loss": 0.0031, "reward": 2.8383458256721497, "reward_std": 0.14491115603595972, "rewards/ngram_similarity_reward_func": 0.40570230409502983, "rewards/reasoning_quality_reward_func": 0.916734367609024, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.7659090906381607, "rewards/xmlcount_reward_func": 0.5, "step": 46 }, { "completion_length": 311.828125, "epoch": 0.5820433436532507, "grad_norm": 0.2920028567314148, "kl": 0.06001428607851267, "learning_rate": 2.0431196630542152e-05, "loss": 0.0024, "reward": 3.046830952167511, "reward_std": 0.2057624664157629, "rewards/ngram_similarity_reward_func": 0.6009372770786285, "rewards/reasoning_quality_reward_func": 0.9173781126737595, "rewards/soft_format_reward_func": 0.24609375, "rewards/sql_execution_reward_func": 0.7843749970197678, "rewards/xmlcount_reward_func": 0.498046875, "step": 47 }, { "completion_length": 305.65625, "epoch": 0.5944272445820433, "grad_norm": 0.2988949418067932, "kl": 0.06703130528330803, "learning_rate": 1.9650528540039077e-05, "loss": 0.0027, "reward": 2.6817620396614075, "reward_std": 0.12368473783135414, "rewards/ngram_similarity_reward_func": 0.38093067705631256, "rewards/reasoning_quality_reward_func": 0.9133312404155731, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.637499988079071, "rewards/xmlcount_reward_func": 0.5, "step": 48 }, { "completion_length": 312.8125, "epoch": 0.6068111455108359, "grad_norm": 0.27996182441711426, "kl": 0.08176139369606972, "learning_rate": 1.8874332801995258e-05, "loss": 0.0033, "reward": 3.1290236711502075, "reward_std": 0.09270750731229782, "rewards/ngram_similarity_reward_func": 0.44197215139865875, "rewards/reasoning_quality_reward_func": 0.9370515793561935, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 1.0, "rewards/xmlcount_reward_func": 0.5, "step": 49 }, { "completion_length": 344.09375, "epoch": 0.6191950464396285, "grad_norm": 0.27371323108673096, "kl": 0.07049352023750544, "learning_rate": 1.810408694911415e-05, "loss": 0.0028, "reward": 2.818117916584015, "reward_std": 0.21658625453710556, "rewards/ngram_similarity_reward_func": 0.41760701686143875, "rewards/reasoning_quality_reward_func": 0.9512921720743179, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.703125, "rewards/xmlcount_reward_func": 0.49609375, "step": 50 }, { "completion_length": 317.46875, "epoch": 0.631578947368421, "grad_norm": 0.30640900135040283, "kl": 0.0756241325289011, "learning_rate": 1.7341257188154625e-05, "loss": 0.003, "reward": 2.9922556281089783, "reward_std": 0.10434877779334784, "rewards/ngram_similarity_reward_func": 0.4318556822836399, "rewards/reasoning_quality_reward_func": 0.9354000091552734, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.875, "rewards/xmlcount_reward_func": 0.5, "step": 51 }, { "completion_length": 362.546875, "epoch": 0.6439628482972136, "grad_norm": 0.2744141221046448, "kl": 0.062476624734699726, "learning_rate": 1.6587295608923088e-05, "loss": 0.0025, "reward": 2.807952642440796, "reward_std": 0.12682343646883965, "rewards/ngram_similarity_reward_func": 0.25346523337066174, "rewards/reasoning_quality_reward_func": 0.9455031156539917, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.8609375059604645, "rewards/xmlcount_reward_func": 0.498046875, "step": 52 }, { "completion_length": 326.640625, "epoch": 0.6563467492260062, "grad_norm": 0.2845337688922882, "kl": 0.06813267059624195, "learning_rate": 1.5843637420137964e-05, "loss": 0.0027, "reward": 2.9080602526664734, "reward_std": 0.15239747613668442, "rewards/ngram_similarity_reward_func": 0.47624945640563965, "rewards/reasoning_quality_reward_func": 0.9552484601736069, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.7265625, "rewards/xmlcount_reward_func": 0.5, "step": 53 }, { "completion_length": 336.140625, "epoch": 0.6687306501547987, "grad_norm": 0.27873164415359497, "kl": 0.06789861433207989, "learning_rate": 1.5111698217428385e-05, "loss": 0.0027, "reward": 2.9421083331108093, "reward_std": 0.14753830805420876, "rewards/ngram_similarity_reward_func": 0.4633248597383499, "rewards/reasoning_quality_reward_func": 0.9520296901464462, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.7767538130283356, "rewards/xmlcount_reward_func": 0.5, "step": 54 }, { "completion_length": 342.953125, "epoch": 0.6811145510835913, "grad_norm": 0.29634758830070496, "kl": 0.0591394891962409, "learning_rate": 1.4392871288667415e-05, "loss": 0.0024, "reward": 2.8162970542907715, "reward_std": 0.11586539167910814, "rewards/ngram_similarity_reward_func": 0.28963764756917953, "rewards/reasoning_quality_reward_func": 0.92978435754776, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.8468749970197678, "rewards/xmlcount_reward_func": 0.5, "step": 55 }, { "completion_length": 326.34375, "epoch": 0.6934984520123839, "grad_norm": 0.3019636273384094, "kl": 0.07066548988223076, "learning_rate": 1.3688524961769396e-05, "loss": 0.0028, "reward": 2.952807366847992, "reward_std": 0.11193438991904259, "rewards/ngram_similarity_reward_func": 0.4892011173069477, "rewards/reasoning_quality_reward_func": 0.9323562532663345, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.78125, "rewards/xmlcount_reward_func": 0.5, "step": 56 }, { "completion_length": 341.09375, "epoch": 0.7058823529411765, "grad_norm": 0.27949875593185425, "kl": 0.08039782661944628, "learning_rate": 1.3000000000000006e-05, "loss": 0.0032, "reward": 3.017184257507324, "reward_std": 0.12472959142178297, "rewards/ngram_similarity_reward_func": 0.4284828044474125, "rewards/reasoning_quality_reward_func": 0.9480765461921692, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.890625, "rewards/xmlcount_reward_func": 0.5, "step": 57 }, { "completion_length": 348.234375, "epoch": 0.718266253869969, "grad_norm": 0.26645585894584656, "kl": 0.06729870289564133, "learning_rate": 1.232860704975717e-05, "loss": 0.0027, "reward": 2.8991669416427612, "reward_std": 0.04610138572752476, "rewards/ngram_similarity_reward_func": 0.29805589467287064, "rewards/reasoning_quality_reward_func": 0.9761109352111816, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.875, "rewards/xmlcount_reward_func": 0.5, "step": 58 }, { "completion_length": 322.765625, "epoch": 0.7306501547987616, "grad_norm": 0.2988031804561615, "kl": 0.07192371133714914, "learning_rate": 1.1675624145681177e-05, "loss": 0.0029, "reward": 2.894783914089203, "reward_std": 0.19001621287316084, "rewards/ngram_similarity_reward_func": 0.5530954301357269, "rewards/reasoning_quality_reward_func": 0.9504609107971191, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.6412276178598404, "rewards/xmlcount_reward_func": 0.5, "step": 59 }, { "completion_length": 355.75, "epoch": 0.7430340557275542, "grad_norm": 0.2641834020614624, "kl": 0.06368108931928873, "learning_rate": 1.1042294277843029e-05, "loss": 0.0025, "reward": 2.89058655500412, "reward_std": 0.12556276191025972, "rewards/ngram_similarity_reward_func": 0.40262240916490555, "rewards/reasoning_quality_reward_func": 0.9567140638828278, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.78125, "rewards/xmlcount_reward_func": 0.5, "step": 60 }, { "completion_length": 300.3125, "epoch": 0.7554179566563467, "grad_norm": 0.29292401671409607, "kl": 0.08026566356420517, "learning_rate": 1.0429823025642292e-05, "loss": 0.0032, "reward": 3.160749673843384, "reward_std": 0.2943864706903696, "rewards/ngram_similarity_reward_func": 0.5859435126185417, "rewards/reasoning_quality_reward_func": 0.9517593681812286, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.875, "rewards/xmlcount_reward_func": 0.498046875, "step": 61 }, { "completion_length": 307.1875, "epoch": 0.7678018575851393, "grad_norm": 0.29157254099845886, "kl": 0.07550233788788319, "learning_rate": 9.839376262918117e-06, "loss": 0.003, "reward": 2.749490201473236, "reward_std": 0.1510023418813944, "rewards/ngram_similarity_reward_func": 0.4469105303287506, "rewards/reasoning_quality_reward_func": 0.9275796860456467, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.625, "rewards/xmlcount_reward_func": 0.5, "step": 62 }, { "completion_length": 320.5625, "epoch": 0.7801857585139319, "grad_norm": 0.26841044425964355, "kl": 0.07417850941419601, "learning_rate": 9.272077938642147e-06, "loss": 0.003, "reward": 3.029486298561096, "reward_std": 0.14038624800741673, "rewards/ngram_similarity_reward_func": 0.5038420185446739, "rewards/reasoning_quality_reward_func": 0.9472562223672867, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.828388050198555, "rewards/xmlcount_reward_func": 0.5, "step": 63 }, { "completion_length": 347.9375, "epoch": 0.7925696594427245, "grad_norm": 0.2752821743488312, "kl": 0.0751224858686328, "learning_rate": 8.72900793741777e-06, "loss": 0.003, "reward": 3.038633167743683, "reward_std": 0.08897098805755377, "rewards/ngram_similarity_reward_func": 0.4439566656947136, "rewards/reasoning_quality_reward_func": 0.9696765691041946, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.875, "rewards/xmlcount_reward_func": 0.5, "step": 64 }, { "completion_length": 339.515625, "epoch": 0.804953560371517, "grad_norm": 0.31212714314460754, "kl": 0.08936411328613758, "learning_rate": 8.2112000238584e-06, "loss": 0.0036, "reward": 3.021269977092743, "reward_std": 0.10588092915713787, "rewards/ngram_similarity_reward_func": 0.5628981739282608, "rewards/reasoning_quality_reward_func": 0.9739968627691269, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.734375, "rewards/xmlcount_reward_func": 0.5, "step": 65 }, { "completion_length": 376.796875, "epoch": 0.8173374613003096, "grad_norm": 0.26753705739974976, "kl": 0.0706013347953558, "learning_rate": 7.71963987475777e-06, "loss": 0.0028, "reward": 2.8900545239448547, "reward_std": 0.11279613338410854, "rewards/ngram_similarity_reward_func": 0.31394050642848015, "rewards/reasoning_quality_reward_func": 0.9636140614748001, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.8624999970197678, "rewards/xmlcount_reward_func": 0.5, "step": 66 }, { "completion_length": 325.171875, "epoch": 0.8297213622291022, "grad_norm": 0.2971738278865814, "kl": 0.07812906242907047, "learning_rate": 7.255263202798146e-06, "loss": 0.0031, "reward": 2.962576150894165, "reward_std": 0.0649077408015728, "rewards/ngram_similarity_reward_func": 0.3916136734187603, "rewards/reasoning_quality_reward_func": 0.9459624886512756, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.875, "rewards/xmlcount_reward_func": 0.5, "step": 67 }, { "completion_length": 339.921875, "epoch": 0.8421052631578947, "grad_norm": 0.2827884256839752, "kl": 0.08069668803364038, "learning_rate": 6.818953975368061e-06, "loss": 0.0032, "reward": 3.235768675804138, "reward_std": 0.1414957493543625, "rewards/ngram_similarity_reward_func": 0.5445858836174011, "rewards/reasoning_quality_reward_func": 0.9724328070878983, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.96875, "rewards/xmlcount_reward_func": 0.5, "step": 68 }, { "completion_length": 374.375, "epoch": 0.8544891640866873, "grad_norm": 0.2604800760746002, "kl": 0.07065233774483204, "learning_rate": 6.411542731880104e-06, "loss": 0.0028, "reward": 3.021795392036438, "reward_std": 0.15352355409413576, "rewards/ngram_similarity_reward_func": 0.34035639837384224, "rewards/reasoning_quality_reward_func": 0.9814390540122986, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.9500000029802322, "rewards/xmlcount_reward_func": 0.5, "step": 69 }, { "completion_length": 346.640625, "epoch": 0.8668730650154799, "grad_norm": 0.2746836543083191, "kl": 0.06909866165369749, "learning_rate": 6.03380500279201e-06, "loss": 0.0028, "reward": 2.642494022846222, "reward_std": 0.19972242414951324, "rewards/ngram_similarity_reward_func": 0.4592840112745762, "rewards/reasoning_quality_reward_func": 0.9604828059673309, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.4727272689342499, "rewards/xmlcount_reward_func": 0.5, "step": 70 }, { "completion_length": 345.78125, "epoch": 0.8792569659442725, "grad_norm": 0.3155825734138489, "kl": 0.07083407044410706, "learning_rate": 5.686459833340302e-06, "loss": 0.0028, "reward": 2.6602479815483093, "reward_std": 0.1187733905389905, "rewards/ngram_similarity_reward_func": 0.4456479325890541, "rewards/reasoning_quality_reward_func": 0.9665531367063522, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.498046875, "step": 71 }, { "completion_length": 347.640625, "epoch": 0.891640866873065, "grad_norm": 0.2810722589492798, "kl": 0.0792484674602747, "learning_rate": 5.370168414796839e-06, "loss": 0.0032, "reward": 2.9334104657173157, "reward_std": 0.15719584189355373, "rewards/ngram_similarity_reward_func": 0.4541776664555073, "rewards/reasoning_quality_reward_func": 0.9636077880859375, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.765625, "rewards/xmlcount_reward_func": 0.5, "step": 72 }, { "completion_length": 331.265625, "epoch": 0.9040247678018576, "grad_norm": 0.2836628556251526, "kl": 0.08229007571935654, "learning_rate": 5.085532825853651e-06, "loss": 0.0033, "reward": 3.0361828804016113, "reward_std": 0.19561532977968454, "rewards/ngram_similarity_reward_func": 0.40529073029756546, "rewards/reasoning_quality_reward_func": 0.9590171575546265, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.921875, "rewards/xmlcount_reward_func": 0.5, "step": 73 }, { "completion_length": 345.25, "epoch": 0.9164086687306502, "grad_norm": 0.2591722309589386, "kl": 0.0776594839990139, "learning_rate": 4.833094886531918e-06, "loss": 0.0031, "reward": 2.752366602420807, "reward_std": 0.1025167815387249, "rewards/ngram_similarity_reward_func": 0.4143087863922119, "rewards/reasoning_quality_reward_func": 0.9630578011274338, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.625, "rewards/xmlcount_reward_func": 0.5, "step": 74 }, { "completion_length": 377.71875, "epoch": 0.9287925696594427, "grad_norm": 0.25510552525520325, "kl": 0.07739250548183918, "learning_rate": 4.613335126796773e-06, "loss": 0.0031, "reward": 3.047813057899475, "reward_std": 0.18943855166435242, "rewards/ngram_similarity_reward_func": 0.36462873220443726, "rewards/reasoning_quality_reward_func": 0.9753718972206116, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.9578125029802322, "rewards/xmlcount_reward_func": 0.5, "step": 75 }, { "completion_length": 353.359375, "epoch": 0.9411764705882353, "grad_norm": 0.26207593083381653, "kl": 0.0766323795542121, "learning_rate": 4.4266718718412e-06, "loss": 0.0031, "reward": 3.032430052757263, "reward_std": 0.06077156774699688, "rewards/ngram_similarity_reward_func": 0.4302191510796547, "rewards/reasoning_quality_reward_func": 0.9772109389305115, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.875, "rewards/xmlcount_reward_func": 0.5, "step": 76 }, { "completion_length": 330.796875, "epoch": 0.9535603715170279, "grad_norm": 0.26912447810173035, "kl": 0.07224935106933117, "learning_rate": 4.2734604457802565e-06, "loss": 0.0029, "reward": 2.9969308972358704, "reward_std": 0.11895978637039661, "rewards/ngram_similarity_reward_func": 0.5269326269626617, "rewards/reasoning_quality_reward_func": 0.9542312771081924, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.7657670378684998, "rewards/xmlcount_reward_func": 0.5, "step": 77 }, { "completion_length": 359.5625, "epoch": 0.9659442724458205, "grad_norm": 0.25813353061676025, "kl": 0.07646154798567295, "learning_rate": 4.153992495271414e-06, "loss": 0.0031, "reward": 3.1742658615112305, "reward_std": 0.15984882693737745, "rewards/ngram_similarity_reward_func": 0.4850096367299557, "rewards/reasoning_quality_reward_func": 0.9642562717199326, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.9749999940395355, "rewards/xmlcount_reward_func": 0.5, "step": 78 }, { "completion_length": 348.515625, "epoch": 0.978328173374613, "grad_norm": 0.269113689661026, "kl": 0.07896707952022552, "learning_rate": 4.0684954343485806e-06, "loss": 0.0032, "reward": 2.8017610907554626, "reward_std": 0.23625551722943783, "rewards/ngram_similarity_reward_func": 0.47032520920038223, "rewards/reasoning_quality_reward_func": 0.9892484545707703, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.5921874865889549, "rewards/xmlcount_reward_func": 0.5, "step": 79 }, { "completion_length": 375.296875, "epoch": 0.9907120743034056, "grad_norm": 0.2751859724521637, "kl": 0.06904378905892372, "learning_rate": 4.01713201152656e-06, "loss": 0.0028, "reward": 2.791142702102661, "reward_std": 0.14453750429674983, "rewards/ngram_similarity_reward_func": 0.31529772467911243, "rewards/reasoning_quality_reward_func": 0.9427484273910522, "rewards/soft_format_reward_func": 0.25, "rewards/sql_execution_reward_func": 0.783096581697464, "rewards/xmlcount_reward_func": 0.5, "step": 80 } ], "logging_steps": 1, "max_steps": 80, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }