| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9907120743034056, | |
| "eval_steps": 500, | |
| "global_step": 80, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 222.078125, | |
| "epoch": 0.01238390092879257, | |
| "grad_norm": 0.5330060720443726, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": -0.0, | |
| "reward": 2.1595765352249146, | |
| "reward_std": 0.4799410402774811, | |
| "rewards/ngram_similarity_reward_func": 0.22074851393699646, | |
| "rewards/reasoning_quality_reward_func": 0.7317968606948853, | |
| "rewards/soft_format_reward_func": 0.2109375, | |
| "rewards/sql_execution_reward_func": 0.5703125, | |
| "rewards/xmlcount_reward_func": 0.42578125, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 228.609375, | |
| "epoch": 0.02476780185758514, | |
| "grad_norm": 0.38014766573905945, | |
| "kl": 0.0, | |
| "learning_rate": 5e-06, | |
| "loss": 0.0, | |
| "reward": 2.1653581261634827, | |
| "reward_std": 0.35478534549474716, | |
| "rewards/ngram_similarity_reward_func": 0.2899237535893917, | |
| "rewards/reasoning_quality_reward_func": 0.7039499878883362, | |
| "rewards/soft_format_reward_func": 0.203125, | |
| "rewards/sql_execution_reward_func": 0.5406249971129, | |
| "rewards/xmlcount_reward_func": 0.427734375, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 249.984375, | |
| "epoch": 0.03715170278637771, | |
| "grad_norm": 0.4404110908508301, | |
| "kl": 0.00019386520580155775, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 2.2775589525699615, | |
| "reward_std": 0.2987724468111992, | |
| "rewards/ngram_similarity_reward_func": 0.14325893856585026, | |
| "rewards/reasoning_quality_reward_func": 0.7573468536138535, | |
| "rewards/soft_format_reward_func": 0.23828125, | |
| "rewards/sql_execution_reward_func": 0.66015625, | |
| "rewards/xmlcount_reward_func": 0.478515625, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 219.28125, | |
| "epoch": 0.04953560371517028, | |
| "grad_norm": 0.4848824143409729, | |
| "kl": 0.00020055885761394165, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 0.0, | |
| "reward": 1.9259249866008759, | |
| "reward_std": 0.49138037860393524, | |
| "rewards/ngram_similarity_reward_func": 0.16651681065559387, | |
| "rewards/reasoning_quality_reward_func": 0.7145218700170517, | |
| "rewards/soft_format_reward_func": 0.18359375, | |
| "rewards/sql_execution_reward_func": 0.4745738562196493, | |
| "rewards/xmlcount_reward_func": 0.38671875, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 240.875, | |
| "epoch": 0.06191950464396285, | |
| "grad_norm": 0.3829522430896759, | |
| "kl": 0.00025230981555068865, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0, | |
| "reward": 2.286948025226593, | |
| "reward_std": 0.46134958416223526, | |
| "rewards/ngram_similarity_reward_func": 0.23377767577767372, | |
| "rewards/reasoning_quality_reward_func": 0.7305140793323517, | |
| "rewards/soft_format_reward_func": 0.21484375, | |
| "rewards/sql_execution_reward_func": 0.6585937514901161, | |
| "rewards/xmlcount_reward_func": 0.44921875, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 220.421875, | |
| "epoch": 0.07430340557275542, | |
| "grad_norm": 0.41778889298439026, | |
| "kl": 0.0005900838441448286, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.0, | |
| "reward": 2.2411254048347473, | |
| "reward_std": 0.41372712701559067, | |
| "rewards/ngram_similarity_reward_func": 0.21116455644369125, | |
| "rewards/reasoning_quality_reward_func": 0.7018359005451202, | |
| "rewards/soft_format_reward_func": 0.234375, | |
| "rewards/sql_execution_reward_func": 0.625, | |
| "rewards/xmlcount_reward_func": 0.46875, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 243.671875, | |
| "epoch": 0.08668730650154799, | |
| "grad_norm": 0.362834632396698, | |
| "kl": 0.0015416233654832467, | |
| "learning_rate": 3.0000000000000004e-05, | |
| "loss": 0.0001, | |
| "reward": 2.479892313480377, | |
| "reward_std": 0.42981887608766556, | |
| "rewards/ngram_similarity_reward_func": 0.31569236889481544, | |
| "rewards/reasoning_quality_reward_func": 0.7431062161922455, | |
| "rewards/soft_format_reward_func": 0.23046875, | |
| "rewards/sql_execution_reward_func": 0.7257812470197678, | |
| "rewards/xmlcount_reward_func": 0.46484375, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 272.546875, | |
| "epoch": 0.09907120743034056, | |
| "grad_norm": 0.42216944694519043, | |
| "kl": 0.002694789902307093, | |
| "learning_rate": 3.5000000000000004e-05, | |
| "loss": 0.0001, | |
| "reward": 2.703847050666809, | |
| "reward_std": 0.29045113176107407, | |
| "rewards/ngram_similarity_reward_func": 0.3112502619624138, | |
| "rewards/reasoning_quality_reward_func": 0.7433781176805496, | |
| "rewards/soft_format_reward_func": 0.2421875, | |
| "rewards/sql_execution_reward_func": 0.9187500029802322, | |
| "rewards/xmlcount_reward_func": 0.48828125, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 265.84375, | |
| "epoch": 0.11145510835913312, | |
| "grad_norm": 0.34325331449508667, | |
| "kl": 0.004968019318766892, | |
| "learning_rate": 4e-05, | |
| "loss": 0.0002, | |
| "reward": 2.6481465101242065, | |
| "reward_std": 0.24203401803970337, | |
| "rewards/ngram_similarity_reward_func": 0.30987007170915604, | |
| "rewards/reasoning_quality_reward_func": 0.7656203359365463, | |
| "rewards/soft_format_reward_func": 0.24609375, | |
| "rewards/sql_execution_reward_func": 0.8304687440395355, | |
| "rewards/xmlcount_reward_func": 0.49609375, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 265.515625, | |
| "epoch": 0.1238390092879257, | |
| "grad_norm": 0.36068692803382874, | |
| "kl": 0.005839241901412606, | |
| "learning_rate": 3.9982867988473446e-05, | |
| "loss": 0.0002, | |
| "reward": 2.4154441356658936, | |
| "reward_std": 0.2717306688427925, | |
| "rewards/ngram_similarity_reward_func": 0.30530440993607044, | |
| "rewards/reasoning_quality_reward_func": 0.7639749944210052, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.5961647690273821, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 271.234375, | |
| "epoch": 0.13622291021671826, | |
| "grad_norm": 0.37997934222221375, | |
| "kl": 0.018266105791553855, | |
| "learning_rate": 3.993150456565143e-05, | |
| "loss": 0.0007, | |
| "reward": 2.8274221420288086, | |
| "reward_std": 0.2109035775065422, | |
| "rewards/ngram_similarity_reward_func": 0.3601767495274544, | |
| "rewards/reasoning_quality_reward_func": 0.7848234474658966, | |
| "rewards/soft_format_reward_func": 0.24609375, | |
| "rewards/sql_execution_reward_func": 0.9421875029802322, | |
| "rewards/xmlcount_reward_func": 0.494140625, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 296.265625, | |
| "epoch": 0.14860681114551083, | |
| "grad_norm": 0.3279929757118225, | |
| "kl": 0.011964517878368497, | |
| "learning_rate": 3.9846007504728593e-05, | |
| "loss": 0.0005, | |
| "reward": 2.734043776988983, | |
| "reward_std": 0.16971006244421005, | |
| "rewards/ngram_similarity_reward_func": 0.3088625408709049, | |
| "rewards/reasoning_quality_reward_func": 0.8142437487840652, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.8609375059604645, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 311.25, | |
| "epoch": 0.1609907120743034, | |
| "grad_norm": 0.31357380747795105, | |
| "kl": 0.018573109060525894, | |
| "learning_rate": 3.972653955421975e-05, | |
| "loss": 0.0007, | |
| "reward": 2.4877208471298218, | |
| "reward_std": 0.2542732544243336, | |
| "rewards/ngram_similarity_reward_func": 0.40954745560884476, | |
| "rewards/reasoning_quality_reward_func": 0.8195796459913254, | |
| "rewards/soft_format_reward_func": 0.2421875, | |
| "rewards/sql_execution_reward_func": 0.5242187529802322, | |
| "rewards/xmlcount_reward_func": 0.4921875, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 305.96875, | |
| "epoch": 0.17337461300309598, | |
| "grad_norm": 0.35947704315185547, | |
| "kl": 0.02572451764717698, | |
| "learning_rate": 3.95733281281588e-05, | |
| "loss": 0.001, | |
| "reward": 2.84596848487854, | |
| "reward_std": 0.16431875061243773, | |
| "rewards/ngram_similarity_reward_func": 0.4371529445052147, | |
| "rewards/reasoning_quality_reward_func": 0.7978781312704086, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.8609375059604645, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 316.8125, | |
| "epoch": 0.18575851393188855, | |
| "grad_norm": 0.3095603883266449, | |
| "kl": 0.031108289025723934, | |
| "learning_rate": 3.938666487320323e-05, | |
| "loss": 0.0012, | |
| "reward": 2.543521463871002, | |
| "reward_std": 0.21121197938919067, | |
| "rewards/ngram_similarity_reward_func": 0.33944912999868393, | |
| "rewards/reasoning_quality_reward_func": 0.7945765405893326, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.6614488586783409, | |
| "rewards/xmlcount_reward_func": 0.498046875, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 333.65625, | |
| "epoch": 0.19814241486068113, | |
| "grad_norm": 0.32227563858032227, | |
| "kl": 0.027149478904902935, | |
| "learning_rate": 3.9166905113468086e-05, | |
| "loss": 0.0011, | |
| "reward": 2.5172452330589294, | |
| "reward_std": 0.23450876772403717, | |
| "rewards/ngram_similarity_reward_func": 0.33389993757009506, | |
| "rewards/reasoning_quality_reward_func": 0.7915484458208084, | |
| "rewards/soft_format_reward_func": 0.24609375, | |
| "rewards/sql_execution_reward_func": 0.6476562395691872, | |
| "rewards/xmlcount_reward_func": 0.498046875, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 370.265625, | |
| "epoch": 0.21052631578947367, | |
| "grad_norm": 0.31235265731811523, | |
| "kl": 0.031742531806230545, | |
| "learning_rate": 3.891446717414635e-05, | |
| "loss": 0.0013, | |
| "reward": 2.688088834285736, | |
| "reward_std": 0.11317835189402103, | |
| "rewards/ngram_similarity_reward_func": 0.2212217040359974, | |
| "rewards/reasoning_quality_reward_func": 0.8551484197378159, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.8617187440395355, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 340.03125, | |
| "epoch": 0.22291021671826625, | |
| "grad_norm": 0.3352001905441284, | |
| "kl": 0.043277411721646786, | |
| "learning_rate": 3.862983158520316e-05, | |
| "loss": 0.0017, | |
| "reward": 2.741812765598297, | |
| "reward_std": 0.14071671105921268, | |
| "rewards/ngram_similarity_reward_func": 0.28818774223327637, | |
| "rewards/reasoning_quality_reward_func": 0.8387812525033951, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.8648437410593033, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 333.21875, | |
| "epoch": 0.23529411764705882, | |
| "grad_norm": 0.31716179847717285, | |
| "kl": 0.04549229796975851, | |
| "learning_rate": 3.83135401666597e-05, | |
| "loss": 0.0018, | |
| "reward": 2.625436544418335, | |
| "reward_std": 0.13139259070158005, | |
| "rewards/ngram_similarity_reward_func": 0.35064588487148285, | |
| "rewards/reasoning_quality_reward_func": 0.8470562547445297, | |
| "rewards/soft_format_reward_func": 0.24609375, | |
| "rewards/sql_execution_reward_func": 0.68359375, | |
| "rewards/xmlcount_reward_func": 0.498046875, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 330.203125, | |
| "epoch": 0.2476780185758514, | |
| "grad_norm": 0.31764793395996094, | |
| "kl": 0.04236258752644062, | |
| "learning_rate": 3.796619499720799e-05, | |
| "loss": 0.0017, | |
| "reward": 2.899237811565399, | |
| "reward_std": 0.20555716007947922, | |
| "rewards/ngram_similarity_reward_func": 0.4203330874443054, | |
| "rewards/reasoning_quality_reward_func": 0.8171859234571457, | |
| "rewards/soft_format_reward_func": 0.2421875, | |
| "rewards/sql_execution_reward_func": 0.9234375059604645, | |
| "rewards/xmlcount_reward_func": 0.49609375, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 412.453125, | |
| "epoch": 0.26006191950464397, | |
| "grad_norm": 0.30428770184516907, | |
| "kl": 0.036752122454345226, | |
| "learning_rate": 3.75884572681199e-05, | |
| "loss": 0.0015, | |
| "reward": 2.610611140727997, | |
| "reward_std": 0.15251119248569012, | |
| "rewards/ngram_similarity_reward_func": 0.32537825778126717, | |
| "rewards/reasoning_quality_reward_func": 0.8832796812057495, | |
| "rewards/soft_format_reward_func": 0.24609375, | |
| "rewards/sql_execution_reward_func": 0.6578125059604645, | |
| "rewards/xmlcount_reward_func": 0.498046875, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 354.265625, | |
| "epoch": 0.2724458204334365, | |
| "grad_norm": 0.3025682270526886, | |
| "kl": 0.04639334697276354, | |
| "learning_rate": 3.718104602463194e-05, | |
| "loss": 0.0019, | |
| "reward": 2.575563907623291, | |
| "reward_std": 0.21309060789644718, | |
| "rewards/ngram_similarity_reward_func": 0.42503100633621216, | |
| "rewards/reasoning_quality_reward_func": 0.8571734130382538, | |
| "rewards/soft_format_reward_func": 0.24609375, | |
| "rewards/sql_execution_reward_func": 0.5492187440395355, | |
| "rewards/xmlcount_reward_func": 0.498046875, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 379.46875, | |
| "epoch": 0.2848297213622291, | |
| "grad_norm": 0.28553083539009094, | |
| "kl": 0.04365707188844681, | |
| "learning_rate": 3.6744736797201856e-05, | |
| "loss": 0.0017, | |
| "reward": 2.4477399587631226, | |
| "reward_std": 0.2514076679944992, | |
| "rewards/ngram_similarity_reward_func": 0.45473647862672806, | |
| "rewards/reasoning_quality_reward_func": 0.8824515789747238, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.36055195331573486, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 346.921875, | |
| "epoch": 0.29721362229102166, | |
| "grad_norm": 0.3103869557380676, | |
| "kl": 0.05476447567343712, | |
| "learning_rate": 3.6280360125242234e-05, | |
| "loss": 0.0022, | |
| "reward": 2.7398348450660706, | |
| "reward_std": 0.23609711229801178, | |
| "rewards/ngram_similarity_reward_func": 0.4382130652666092, | |
| "rewards/reasoning_quality_reward_func": 0.8634234368801117, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.6881983578205109, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 420.453125, | |
| "epoch": 0.30959752321981426, | |
| "grad_norm": 0.2726235091686249, | |
| "kl": 0.054436798207461834, | |
| "learning_rate": 3.578879997614161e-05, | |
| "loss": 0.0022, | |
| "reward": 2.6918256878852844, | |
| "reward_std": 0.14829625003039837, | |
| "rewards/ngram_similarity_reward_func": 0.38556934148073196, | |
| "rewards/reasoning_quality_reward_func": 0.9156312495470047, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.640625, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 385.9375, | |
| "epoch": 0.3219814241486068, | |
| "grad_norm": 0.31836339831352234, | |
| "kl": 0.05234138946980238, | |
| "learning_rate": 3.5270992062582236e-05, | |
| "loss": 0.0021, | |
| "reward": 2.4975169897079468, | |
| "reward_std": 0.3400815278291702, | |
| "rewards/ngram_similarity_reward_func": 0.35228848457336426, | |
| "rewards/reasoning_quality_reward_func": 0.852295309305191, | |
| "rewards/soft_format_reward_func": 0.24609375, | |
| "rewards/sql_execution_reward_func": 0.5526988655328751, | |
| "rewards/xmlcount_reward_func": 0.494140625, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 344.109375, | |
| "epoch": 0.33436532507739936, | |
| "grad_norm": 0.28171250224113464, | |
| "kl": 0.06604294572025537, | |
| "learning_rate": 3.472792206135786e-05, | |
| "loss": 0.0026, | |
| "reward": 2.8997097611427307, | |
| "reward_std": 0.1521923691034317, | |
| "rewards/ngram_similarity_reward_func": 0.5297628864645958, | |
| "rewards/reasoning_quality_reward_func": 0.8543218523263931, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.765625, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 367.171875, | |
| "epoch": 0.34674922600619196, | |
| "grad_norm": 0.2663843631744385, | |
| "kl": 0.06475986633449793, | |
| "learning_rate": 3.4160623737081886e-05, | |
| "loss": 0.0026, | |
| "reward": 3.0965726375579834, | |
| "reward_std": 0.09447081200778484, | |
| "rewards/ngram_similarity_reward_func": 0.4422616958618164, | |
| "rewards/reasoning_quality_reward_func": 0.9062640517950058, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 1.0, | |
| "rewards/xmlcount_reward_func": 0.498046875, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 409.859375, | |
| "epoch": 0.3591331269349845, | |
| "grad_norm": 0.28277286887168884, | |
| "kl": 0.047645531594753265, | |
| "learning_rate": 3.3570176974357714e-05, | |
| "loss": 0.0019, | |
| "reward": 2.482310175895691, | |
| "reward_std": 0.155114084482193, | |
| "rewards/ngram_similarity_reward_func": 0.17045550420880318, | |
| "rewards/reasoning_quality_reward_func": 0.874354675412178, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.6875, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 359.9375, | |
| "epoch": 0.3715170278637771, | |
| "grad_norm": 0.2870733439922333, | |
| "kl": 0.0618684496730566, | |
| "learning_rate": 3.295770572215697e-05, | |
| "loss": 0.0025, | |
| "reward": 2.976421356201172, | |
| "reward_std": 0.17351308092474937, | |
| "rewards/ngram_similarity_reward_func": 0.5148354731500149, | |
| "rewards/reasoning_quality_reward_func": 0.8990859091281891, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.8125, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 364.09375, | |
| "epoch": 0.38390092879256965, | |
| "grad_norm": 0.3511677086353302, | |
| "kl": 0.07197799813002348, | |
| "learning_rate": 3.232437585431883e-05, | |
| "loss": 0.0029, | |
| "reward": 2.9660959243774414, | |
| "reward_std": 0.1771723162382841, | |
| "rewards/ngram_similarity_reward_func": 0.4719693809747696, | |
| "rewards/reasoning_quality_reward_func": 0.8925640434026718, | |
| "rewards/soft_format_reward_func": 0.24609375, | |
| "rewards/sql_execution_reward_func": 0.859375, | |
| "rewards/xmlcount_reward_func": 0.49609375, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 388.890625, | |
| "epoch": 0.39628482972136225, | |
| "grad_norm": 0.27826693654060364, | |
| "kl": 0.061267949640750885, | |
| "learning_rate": 3.1671392950242836e-05, | |
| "loss": 0.0025, | |
| "reward": 2.6841952204704285, | |
| "reward_std": 0.15280664712190628, | |
| "rewards/ngram_similarity_reward_func": 0.3556499555706978, | |
| "rewards/reasoning_quality_reward_func": 0.9199515581130981, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.6585937440395355, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 366.859375, | |
| "epoch": 0.4086687306501548, | |
| "grad_norm": 0.2917367219924927, | |
| "kl": 0.06815788336098194, | |
| "learning_rate": 3.1e-05, | |
| "loss": 0.0027, | |
| "reward": 2.9397113919258118, | |
| "reward_std": 0.1146523468196392, | |
| "rewards/ngram_similarity_reward_func": 0.296003520488739, | |
| "rewards/reasoning_quality_reward_func": 0.905426561832428, | |
| "rewards/soft_format_reward_func": 0.2421875, | |
| "rewards/sql_execution_reward_func": 1.0, | |
| "rewards/xmlcount_reward_func": 0.49609375, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 349.6875, | |
| "epoch": 0.42105263157894735, | |
| "grad_norm": 0.26540401577949524, | |
| "kl": 0.07279603462666273, | |
| "learning_rate": 3.0311475038230616e-05, | |
| "loss": 0.0029, | |
| "reward": 2.89504611492157, | |
| "reward_std": 0.19008862972259521, | |
| "rewards/ngram_similarity_reward_func": 0.4920961260795593, | |
| "rewards/reasoning_quality_reward_func": 0.9310749769210815, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.7218750044703484, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 349.3125, | |
| "epoch": 0.43343653250773995, | |
| "grad_norm": 0.30334243178367615, | |
| "kl": 0.06882389821112156, | |
| "learning_rate": 2.960712871133259e-05, | |
| "loss": 0.0028, | |
| "reward": 2.8439746499061584, | |
| "reward_std": 0.24623795598745346, | |
| "rewards/ngram_similarity_reward_func": 0.5152419656515121, | |
| "rewards/reasoning_quality_reward_func": 0.9223406165838242, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.6563920453190804, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 385.125, | |
| "epoch": 0.4458204334365325, | |
| "grad_norm": 0.28430214524269104, | |
| "kl": 0.06620587687939405, | |
| "learning_rate": 2.8888301782571618e-05, | |
| "loss": 0.0026, | |
| "reward": 2.7404602766036987, | |
| "reward_std": 0.25473709031939507, | |
| "rewards/ngram_similarity_reward_func": 0.3008290082216263, | |
| "rewards/reasoning_quality_reward_func": 0.934943750500679, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.7546875029802322, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 340.421875, | |
| "epoch": 0.4582043343653251, | |
| "grad_norm": 0.2889171838760376, | |
| "kl": 0.07523482665419579, | |
| "learning_rate": 2.8156362579862042e-05, | |
| "loss": 0.003, | |
| "reward": 2.897187650203705, | |
| "reward_std": 0.25312334299087524, | |
| "rewards/ngram_similarity_reward_func": 0.41222984343767166, | |
| "rewards/reasoning_quality_reward_func": 0.925192192196846, | |
| "rewards/soft_format_reward_func": 0.24609375, | |
| "rewards/sql_execution_reward_func": 0.8156249970197678, | |
| "rewards/xmlcount_reward_func": 0.498046875, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 320.171875, | |
| "epoch": 0.47058823529411764, | |
| "grad_norm": 0.2764835059642792, | |
| "kl": 0.07314357813447714, | |
| "learning_rate": 2.7412704391076914e-05, | |
| "loss": 0.0029, | |
| "reward": 3.0073145031929016, | |
| "reward_std": 0.1429160237312317, | |
| "rewards/ngram_similarity_reward_func": 0.4768817350268364, | |
| "rewards/reasoning_quality_reward_func": 0.9210578054189682, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.859375, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 332.09375, | |
| "epoch": 0.48297213622291024, | |
| "grad_norm": 0.32125160098075867, | |
| "kl": 0.07088590506464243, | |
| "learning_rate": 2.6658742811845377e-05, | |
| "loss": 0.0028, | |
| "reward": 3.05528324842453, | |
| "reward_std": 0.1555289849638939, | |
| "rewards/ngram_similarity_reward_func": 0.42346765100955963, | |
| "rewards/reasoning_quality_reward_func": 0.9286906123161316, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.953125, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 329.9375, | |
| "epoch": 0.4953560371517028, | |
| "grad_norm": 0.29124927520751953, | |
| "kl": 0.06565980054438114, | |
| "learning_rate": 2.5895913050885853e-05, | |
| "loss": 0.0026, | |
| "reward": 2.8043496012687683, | |
| "reward_std": 0.18914231285452843, | |
| "rewards/ngram_similarity_reward_func": 0.39962920919060707, | |
| "rewards/reasoning_quality_reward_func": 0.9246421754360199, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.7320312410593033, | |
| "rewards/xmlcount_reward_func": 0.498046875, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 317.640625, | |
| "epoch": 0.5077399380804953, | |
| "grad_norm": 0.28270959854125977, | |
| "kl": 0.07616368494927883, | |
| "learning_rate": 2.512566719800475e-05, | |
| "loss": 0.003, | |
| "reward": 3.0060057044029236, | |
| "reward_std": 0.20819612592458725, | |
| "rewards/ngram_similarity_reward_func": 0.46407629549503326, | |
| "rewards/reasoning_quality_reward_func": 0.9291453063488007, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.8627840876579285, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 320.1875, | |
| "epoch": 0.5201238390092879, | |
| "grad_norm": 0.29271021485328674, | |
| "kl": 0.07423507608473301, | |
| "learning_rate": 2.4349471459960935e-05, | |
| "loss": 0.003, | |
| "reward": 2.8800657987594604, | |
| "reward_std": 0.12723891995847225, | |
| "rewards/ngram_similarity_reward_func": 0.39660485088825226, | |
| "rewards/reasoning_quality_reward_func": 0.902210921049118, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.8312499970197678, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 344.484375, | |
| "epoch": 0.5325077399380805, | |
| "grad_norm": 0.28183987736701965, | |
| "kl": 0.07508978061378002, | |
| "learning_rate": 2.356880336945785e-05, | |
| "loss": 0.003, | |
| "reward": 2.802961766719818, | |
| "reward_std": 0.15244293212890625, | |
| "rewards/ngram_similarity_reward_func": 0.28034142404794693, | |
| "rewards/reasoning_quality_reward_func": 0.9257453233003616, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.8468749970197678, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 331.1875, | |
| "epoch": 0.544891640866873, | |
| "grad_norm": 0.2783842086791992, | |
| "kl": 0.0693344809114933, | |
| "learning_rate": 2.2785148972576052e-05, | |
| "loss": 0.0028, | |
| "reward": 2.757667899131775, | |
| "reward_std": 0.18025352619588375, | |
| "rewards/ngram_similarity_reward_func": 0.48256489634513855, | |
| "rewards/reasoning_quality_reward_func": 0.9297906160354614, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.5953124985098839, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 324.140625, | |
| "epoch": 0.5572755417956656, | |
| "grad_norm": 0.2986034154891968, | |
| "kl": 0.07126997038722038, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 0.0029, | |
| "reward": 2.700526773929596, | |
| "reward_std": 0.1273169182240963, | |
| "rewards/ngram_similarity_reward_func": 0.2901314552873373, | |
| "rewards/reasoning_quality_reward_func": 0.9182077944278717, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.7421875, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 312.859375, | |
| "epoch": 0.5696594427244582, | |
| "grad_norm": 0.2958148717880249, | |
| "kl": 0.07780578825622797, | |
| "learning_rate": 2.1214851027423954e-05, | |
| "loss": 0.0031, | |
| "reward": 2.8383458256721497, | |
| "reward_std": 0.14491115603595972, | |
| "rewards/ngram_similarity_reward_func": 0.40570230409502983, | |
| "rewards/reasoning_quality_reward_func": 0.916734367609024, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.7659090906381607, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 311.828125, | |
| "epoch": 0.5820433436532507, | |
| "grad_norm": 0.2920028567314148, | |
| "kl": 0.06001428607851267, | |
| "learning_rate": 2.0431196630542152e-05, | |
| "loss": 0.0024, | |
| "reward": 3.046830952167511, | |
| "reward_std": 0.2057624664157629, | |
| "rewards/ngram_similarity_reward_func": 0.6009372770786285, | |
| "rewards/reasoning_quality_reward_func": 0.9173781126737595, | |
| "rewards/soft_format_reward_func": 0.24609375, | |
| "rewards/sql_execution_reward_func": 0.7843749970197678, | |
| "rewards/xmlcount_reward_func": 0.498046875, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 305.65625, | |
| "epoch": 0.5944272445820433, | |
| "grad_norm": 0.2988949418067932, | |
| "kl": 0.06703130528330803, | |
| "learning_rate": 1.9650528540039077e-05, | |
| "loss": 0.0027, | |
| "reward": 2.6817620396614075, | |
| "reward_std": 0.12368473783135414, | |
| "rewards/ngram_similarity_reward_func": 0.38093067705631256, | |
| "rewards/reasoning_quality_reward_func": 0.9133312404155731, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.637499988079071, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 312.8125, | |
| "epoch": 0.6068111455108359, | |
| "grad_norm": 0.27996182441711426, | |
| "kl": 0.08176139369606972, | |
| "learning_rate": 1.8874332801995258e-05, | |
| "loss": 0.0033, | |
| "reward": 3.1290236711502075, | |
| "reward_std": 0.09270750731229782, | |
| "rewards/ngram_similarity_reward_func": 0.44197215139865875, | |
| "rewards/reasoning_quality_reward_func": 0.9370515793561935, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 1.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 344.09375, | |
| "epoch": 0.6191950464396285, | |
| "grad_norm": 0.27371323108673096, | |
| "kl": 0.07049352023750544, | |
| "learning_rate": 1.810408694911415e-05, | |
| "loss": 0.0028, | |
| "reward": 2.818117916584015, | |
| "reward_std": 0.21658625453710556, | |
| "rewards/ngram_similarity_reward_func": 0.41760701686143875, | |
| "rewards/reasoning_quality_reward_func": 0.9512921720743179, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.703125, | |
| "rewards/xmlcount_reward_func": 0.49609375, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 317.46875, | |
| "epoch": 0.631578947368421, | |
| "grad_norm": 0.30640900135040283, | |
| "kl": 0.0756241325289011, | |
| "learning_rate": 1.7341257188154625e-05, | |
| "loss": 0.003, | |
| "reward": 2.9922556281089783, | |
| "reward_std": 0.10434877779334784, | |
| "rewards/ngram_similarity_reward_func": 0.4318556822836399, | |
| "rewards/reasoning_quality_reward_func": 0.9354000091552734, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.875, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 362.546875, | |
| "epoch": 0.6439628482972136, | |
| "grad_norm": 0.2744141221046448, | |
| "kl": 0.062476624734699726, | |
| "learning_rate": 1.6587295608923088e-05, | |
| "loss": 0.0025, | |
| "reward": 2.807952642440796, | |
| "reward_std": 0.12682343646883965, | |
| "rewards/ngram_similarity_reward_func": 0.25346523337066174, | |
| "rewards/reasoning_quality_reward_func": 0.9455031156539917, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.8609375059604645, | |
| "rewards/xmlcount_reward_func": 0.498046875, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 326.640625, | |
| "epoch": 0.6563467492260062, | |
| "grad_norm": 0.2845337688922882, | |
| "kl": 0.06813267059624195, | |
| "learning_rate": 1.5843637420137964e-05, | |
| "loss": 0.0027, | |
| "reward": 2.9080602526664734, | |
| "reward_std": 0.15239747613668442, | |
| "rewards/ngram_similarity_reward_func": 0.47624945640563965, | |
| "rewards/reasoning_quality_reward_func": 0.9552484601736069, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.7265625, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 336.140625, | |
| "epoch": 0.6687306501547987, | |
| "grad_norm": 0.27873164415359497, | |
| "kl": 0.06789861433207989, | |
| "learning_rate": 1.5111698217428385e-05, | |
| "loss": 0.0027, | |
| "reward": 2.9421083331108093, | |
| "reward_std": 0.14753830805420876, | |
| "rewards/ngram_similarity_reward_func": 0.4633248597383499, | |
| "rewards/reasoning_quality_reward_func": 0.9520296901464462, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.7767538130283356, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 342.953125, | |
| "epoch": 0.6811145510835913, | |
| "grad_norm": 0.29634758830070496, | |
| "kl": 0.0591394891962409, | |
| "learning_rate": 1.4392871288667415e-05, | |
| "loss": 0.0024, | |
| "reward": 2.8162970542907715, | |
| "reward_std": 0.11586539167910814, | |
| "rewards/ngram_similarity_reward_func": 0.28963764756917953, | |
| "rewards/reasoning_quality_reward_func": 0.92978435754776, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.8468749970197678, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 326.34375, | |
| "epoch": 0.6934984520123839, | |
| "grad_norm": 0.3019636273384094, | |
| "kl": 0.07066548988223076, | |
| "learning_rate": 1.3688524961769396e-05, | |
| "loss": 0.0028, | |
| "reward": 2.952807366847992, | |
| "reward_std": 0.11193438991904259, | |
| "rewards/ngram_similarity_reward_func": 0.4892011173069477, | |
| "rewards/reasoning_quality_reward_func": 0.9323562532663345, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.78125, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 341.09375, | |
| "epoch": 0.7058823529411765, | |
| "grad_norm": 0.27949875593185425, | |
| "kl": 0.08039782661944628, | |
| "learning_rate": 1.3000000000000006e-05, | |
| "loss": 0.0032, | |
| "reward": 3.017184257507324, | |
| "reward_std": 0.12472959142178297, | |
| "rewards/ngram_similarity_reward_func": 0.4284828044474125, | |
| "rewards/reasoning_quality_reward_func": 0.9480765461921692, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.890625, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 348.234375, | |
| "epoch": 0.718266253869969, | |
| "grad_norm": 0.26645585894584656, | |
| "kl": 0.06729870289564133, | |
| "learning_rate": 1.232860704975717e-05, | |
| "loss": 0.0027, | |
| "reward": 2.8991669416427612, | |
| "reward_std": 0.04610138572752476, | |
| "rewards/ngram_similarity_reward_func": 0.29805589467287064, | |
| "rewards/reasoning_quality_reward_func": 0.9761109352111816, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.875, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 322.765625, | |
| "epoch": 0.7306501547987616, | |
| "grad_norm": 0.2988031804561615, | |
| "kl": 0.07192371133714914, | |
| "learning_rate": 1.1675624145681177e-05, | |
| "loss": 0.0029, | |
| "reward": 2.894783914089203, | |
| "reward_std": 0.19001621287316084, | |
| "rewards/ngram_similarity_reward_func": 0.5530954301357269, | |
| "rewards/reasoning_quality_reward_func": 0.9504609107971191, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.6412276178598404, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 355.75, | |
| "epoch": 0.7430340557275542, | |
| "grad_norm": 0.2641834020614624, | |
| "kl": 0.06368108931928873, | |
| "learning_rate": 1.1042294277843029e-05, | |
| "loss": 0.0025, | |
| "reward": 2.89058655500412, | |
| "reward_std": 0.12556276191025972, | |
| "rewards/ngram_similarity_reward_func": 0.40262240916490555, | |
| "rewards/reasoning_quality_reward_func": 0.9567140638828278, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.78125, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 300.3125, | |
| "epoch": 0.7554179566563467, | |
| "grad_norm": 0.29292401671409607, | |
| "kl": 0.08026566356420517, | |
| "learning_rate": 1.0429823025642292e-05, | |
| "loss": 0.0032, | |
| "reward": 3.160749673843384, | |
| "reward_std": 0.2943864706903696, | |
| "rewards/ngram_similarity_reward_func": 0.5859435126185417, | |
| "rewards/reasoning_quality_reward_func": 0.9517593681812286, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.875, | |
| "rewards/xmlcount_reward_func": 0.498046875, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 307.1875, | |
| "epoch": 0.7678018575851393, | |
| "grad_norm": 0.29157254099845886, | |
| "kl": 0.07550233788788319, | |
| "learning_rate": 9.839376262918117e-06, | |
| "loss": 0.003, | |
| "reward": 2.749490201473236, | |
| "reward_std": 0.1510023418813944, | |
| "rewards/ngram_similarity_reward_func": 0.4469105303287506, | |
| "rewards/reasoning_quality_reward_func": 0.9275796860456467, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.625, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 320.5625, | |
| "epoch": 0.7801857585139319, | |
| "grad_norm": 0.26841044425964355, | |
| "kl": 0.07417850941419601, | |
| "learning_rate": 9.272077938642147e-06, | |
| "loss": 0.003, | |
| "reward": 3.029486298561096, | |
| "reward_std": 0.14038624800741673, | |
| "rewards/ngram_similarity_reward_func": 0.5038420185446739, | |
| "rewards/reasoning_quality_reward_func": 0.9472562223672867, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.828388050198555, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 347.9375, | |
| "epoch": 0.7925696594427245, | |
| "grad_norm": 0.2752821743488312, | |
| "kl": 0.0751224858686328, | |
| "learning_rate": 8.72900793741777e-06, | |
| "loss": 0.003, | |
| "reward": 3.038633167743683, | |
| "reward_std": 0.08897098805755377, | |
| "rewards/ngram_similarity_reward_func": 0.4439566656947136, | |
| "rewards/reasoning_quality_reward_func": 0.9696765691041946, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.875, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 339.515625, | |
| "epoch": 0.804953560371517, | |
| "grad_norm": 0.31212714314460754, | |
| "kl": 0.08936411328613758, | |
| "learning_rate": 8.2112000238584e-06, | |
| "loss": 0.0036, | |
| "reward": 3.021269977092743, | |
| "reward_std": 0.10588092915713787, | |
| "rewards/ngram_similarity_reward_func": 0.5628981739282608, | |
| "rewards/reasoning_quality_reward_func": 0.9739968627691269, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.734375, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 376.796875, | |
| "epoch": 0.8173374613003096, | |
| "grad_norm": 0.26753705739974976, | |
| "kl": 0.0706013347953558, | |
| "learning_rate": 7.71963987475777e-06, | |
| "loss": 0.0028, | |
| "reward": 2.8900545239448547, | |
| "reward_std": 0.11279613338410854, | |
| "rewards/ngram_similarity_reward_func": 0.31394050642848015, | |
| "rewards/reasoning_quality_reward_func": 0.9636140614748001, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.8624999970197678, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 325.171875, | |
| "epoch": 0.8297213622291022, | |
| "grad_norm": 0.2971738278865814, | |
| "kl": 0.07812906242907047, | |
| "learning_rate": 7.255263202798146e-06, | |
| "loss": 0.0031, | |
| "reward": 2.962576150894165, | |
| "reward_std": 0.0649077408015728, | |
| "rewards/ngram_similarity_reward_func": 0.3916136734187603, | |
| "rewards/reasoning_quality_reward_func": 0.9459624886512756, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.875, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 339.921875, | |
| "epoch": 0.8421052631578947, | |
| "grad_norm": 0.2827884256839752, | |
| "kl": 0.08069668803364038, | |
| "learning_rate": 6.818953975368061e-06, | |
| "loss": 0.0032, | |
| "reward": 3.235768675804138, | |
| "reward_std": 0.1414957493543625, | |
| "rewards/ngram_similarity_reward_func": 0.5445858836174011, | |
| "rewards/reasoning_quality_reward_func": 0.9724328070878983, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.96875, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 374.375, | |
| "epoch": 0.8544891640866873, | |
| "grad_norm": 0.2604800760746002, | |
| "kl": 0.07065233774483204, | |
| "learning_rate": 6.411542731880104e-06, | |
| "loss": 0.0028, | |
| "reward": 3.021795392036438, | |
| "reward_std": 0.15352355409413576, | |
| "rewards/ngram_similarity_reward_func": 0.34035639837384224, | |
| "rewards/reasoning_quality_reward_func": 0.9814390540122986, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.9500000029802322, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 346.640625, | |
| "epoch": 0.8668730650154799, | |
| "grad_norm": 0.2746836543083191, | |
| "kl": 0.06909866165369749, | |
| "learning_rate": 6.03380500279201e-06, | |
| "loss": 0.0028, | |
| "reward": 2.642494022846222, | |
| "reward_std": 0.19972242414951324, | |
| "rewards/ngram_similarity_reward_func": 0.4592840112745762, | |
| "rewards/reasoning_quality_reward_func": 0.9604828059673309, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.4727272689342499, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 345.78125, | |
| "epoch": 0.8792569659442725, | |
| "grad_norm": 0.3155825734138489, | |
| "kl": 0.07083407044410706, | |
| "learning_rate": 5.686459833340302e-06, | |
| "loss": 0.0028, | |
| "reward": 2.6602479815483093, | |
| "reward_std": 0.1187733905389905, | |
| "rewards/ngram_similarity_reward_func": 0.4456479325890541, | |
| "rewards/reasoning_quality_reward_func": 0.9665531367063522, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.5, | |
| "rewards/xmlcount_reward_func": 0.498046875, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 347.640625, | |
| "epoch": 0.891640866873065, | |
| "grad_norm": 0.2810722589492798, | |
| "kl": 0.0792484674602747, | |
| "learning_rate": 5.370168414796839e-06, | |
| "loss": 0.0032, | |
| "reward": 2.9334104657173157, | |
| "reward_std": 0.15719584189355373, | |
| "rewards/ngram_similarity_reward_func": 0.4541776664555073, | |
| "rewards/reasoning_quality_reward_func": 0.9636077880859375, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.765625, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 331.265625, | |
| "epoch": 0.9040247678018576, | |
| "grad_norm": 0.2836628556251526, | |
| "kl": 0.08229007571935654, | |
| "learning_rate": 5.085532825853651e-06, | |
| "loss": 0.0033, | |
| "reward": 3.0361828804016113, | |
| "reward_std": 0.19561532977968454, | |
| "rewards/ngram_similarity_reward_func": 0.40529073029756546, | |
| "rewards/reasoning_quality_reward_func": 0.9590171575546265, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.921875, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 345.25, | |
| "epoch": 0.9164086687306502, | |
| "grad_norm": 0.2591722309589386, | |
| "kl": 0.0776594839990139, | |
| "learning_rate": 4.833094886531918e-06, | |
| "loss": 0.0031, | |
| "reward": 2.752366602420807, | |
| "reward_std": 0.1025167815387249, | |
| "rewards/ngram_similarity_reward_func": 0.4143087863922119, | |
| "rewards/reasoning_quality_reward_func": 0.9630578011274338, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.625, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 377.71875, | |
| "epoch": 0.9287925696594427, | |
| "grad_norm": 0.25510552525520325, | |
| "kl": 0.07739250548183918, | |
| "learning_rate": 4.613335126796773e-06, | |
| "loss": 0.0031, | |
| "reward": 3.047813057899475, | |
| "reward_std": 0.18943855166435242, | |
| "rewards/ngram_similarity_reward_func": 0.36462873220443726, | |
| "rewards/reasoning_quality_reward_func": 0.9753718972206116, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.9578125029802322, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 353.359375, | |
| "epoch": 0.9411764705882353, | |
| "grad_norm": 0.26207593083381653, | |
| "kl": 0.0766323795542121, | |
| "learning_rate": 4.4266718718412e-06, | |
| "loss": 0.0031, | |
| "reward": 3.032430052757263, | |
| "reward_std": 0.06077156774699688, | |
| "rewards/ngram_similarity_reward_func": 0.4302191510796547, | |
| "rewards/reasoning_quality_reward_func": 0.9772109389305115, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.875, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 330.796875, | |
| "epoch": 0.9535603715170279, | |
| "grad_norm": 0.26912447810173035, | |
| "kl": 0.07224935106933117, | |
| "learning_rate": 4.2734604457802565e-06, | |
| "loss": 0.0029, | |
| "reward": 2.9969308972358704, | |
| "reward_std": 0.11895978637039661, | |
| "rewards/ngram_similarity_reward_func": 0.5269326269626617, | |
| "rewards/reasoning_quality_reward_func": 0.9542312771081924, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.7657670378684998, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 359.5625, | |
| "epoch": 0.9659442724458205, | |
| "grad_norm": 0.25813353061676025, | |
| "kl": 0.07646154798567295, | |
| "learning_rate": 4.153992495271414e-06, | |
| "loss": 0.0031, | |
| "reward": 3.1742658615112305, | |
| "reward_std": 0.15984882693737745, | |
| "rewards/ngram_similarity_reward_func": 0.4850096367299557, | |
| "rewards/reasoning_quality_reward_func": 0.9642562717199326, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.9749999940395355, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 348.515625, | |
| "epoch": 0.978328173374613, | |
| "grad_norm": 0.269113689661026, | |
| "kl": 0.07896707952022552, | |
| "learning_rate": 4.0684954343485806e-06, | |
| "loss": 0.0032, | |
| "reward": 2.8017610907554626, | |
| "reward_std": 0.23625551722943783, | |
| "rewards/ngram_similarity_reward_func": 0.47032520920038223, | |
| "rewards/reasoning_quality_reward_func": 0.9892484545707703, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.5921874865889549, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 375.296875, | |
| "epoch": 0.9907120743034056, | |
| "grad_norm": 0.2751859724521637, | |
| "kl": 0.06904378905892372, | |
| "learning_rate": 4.01713201152656e-06, | |
| "loss": 0.0028, | |
| "reward": 2.791142702102661, | |
| "reward_std": 0.14453750429674983, | |
| "rewards/ngram_similarity_reward_func": 0.31529772467911243, | |
| "rewards/reasoning_quality_reward_func": 0.9427484273910522, | |
| "rewards/soft_format_reward_func": 0.25, | |
| "rewards/sql_execution_reward_func": 0.783096581697464, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 80 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 80, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |