{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 443, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011299435028248588, "grad_norm": 1.2254456281661987, "learning_rate": 1.0810810810810812e-06, "loss": 1.3157, "step": 5 }, { "epoch": 0.022598870056497175, "grad_norm": 1.2222959995269775, "learning_rate": 2.4324324324324325e-06, "loss": 1.3831, "step": 10 }, { "epoch": 0.03389830508474576, "grad_norm": 0.7486730813980103, "learning_rate": 3.7837837837837835e-06, "loss": 1.3448, "step": 15 }, { "epoch": 0.04519774011299435, "grad_norm": 0.7303342819213867, "learning_rate": 5.135135135135135e-06, "loss": 1.2883, "step": 20 }, { "epoch": 0.05649717514124294, "grad_norm": 0.6827901005744934, "learning_rate": 6.486486486486487e-06, "loss": 1.2782, "step": 25 }, { "epoch": 0.06779661016949153, "grad_norm": 0.6475429534912109, "learning_rate": 7.837837837837838e-06, "loss": 1.2937, "step": 30 }, { "epoch": 0.07909604519774012, "grad_norm": 0.5939517617225647, "learning_rate": 9.18918918918919e-06, "loss": 1.2553, "step": 35 }, { "epoch": 0.0903954802259887, "grad_norm": 0.5064879655838013, "learning_rate": 1.0540540540540541e-05, "loss": 1.2067, "step": 40 }, { "epoch": 0.1016949152542373, "grad_norm": 0.6584354639053345, "learning_rate": 1.1891891891891893e-05, "loss": 1.2246, "step": 45 }, { "epoch": 0.11299435028248588, "grad_norm": 0.4975661337375641, "learning_rate": 1.3243243243243242e-05, "loss": 1.2558, "step": 50 }, { "epoch": 0.12429378531073447, "grad_norm": 0.5031336545944214, "learning_rate": 1.4594594594594596e-05, "loss": 1.2726, "step": 55 }, { "epoch": 0.13559322033898305, "grad_norm": 0.4471903443336487, "learning_rate": 1.5945945945945947e-05, "loss": 1.2089, "step": 60 }, { "epoch": 0.14689265536723164, "grad_norm": 0.5343024134635925, "learning_rate": 1.72972972972973e-05, "loss": 1.2018, "step": 65 }, { "epoch": 0.15819209039548024, "grad_norm": 0.47138091921806335, "learning_rate": 1.864864864864865e-05, "loss": 1.2826, "step": 70 }, { "epoch": 0.1694915254237288, "grad_norm": 0.490500271320343, "learning_rate": 1.9999999999999998e-05, "loss": 1.2116, "step": 75 }, { "epoch": 0.1807909604519774, "grad_norm": 0.5078205466270447, "learning_rate": 2.135135135135135e-05, "loss": 1.1873, "step": 80 }, { "epoch": 0.192090395480226, "grad_norm": 1.9143681526184082, "learning_rate": 2.2702702702702705e-05, "loss": 1.2159, "step": 85 }, { "epoch": 0.2033898305084746, "grad_norm": 0.5685150027275085, "learning_rate": 2.4054054054054056e-05, "loss": 1.1666, "step": 90 }, { "epoch": 0.21468926553672316, "grad_norm": 0.5490728616714478, "learning_rate": 2.5405405405405408e-05, "loss": 1.1348, "step": 95 }, { "epoch": 0.22598870056497175, "grad_norm": 0.5205124020576477, "learning_rate": 2.6756756756756756e-05, "loss": 1.1213, "step": 100 }, { "epoch": 0.23728813559322035, "grad_norm": 0.4488866925239563, "learning_rate": 2.8108108108108107e-05, "loss": 1.1414, "step": 105 }, { "epoch": 0.24858757062146894, "grad_norm": 0.4913260340690613, "learning_rate": 2.945945945945946e-05, "loss": 1.139, "step": 110 }, { "epoch": 0.2598870056497175, "grad_norm": 0.5297622084617615, "learning_rate": 2.9999849508725208e-05, "loss": 1.134, "step": 115 }, { "epoch": 0.2711864406779661, "grad_norm": 0.6094189882278442, "learning_rate": 2.9998929850759165e-05, "loss": 1.1023, "step": 120 }, { "epoch": 0.2824858757062147, "grad_norm": 0.6322339177131653, "learning_rate": 2.9997174192288006e-05, "loss": 1.1164, "step": 125 }, { "epoch": 0.2937853107344633, "grad_norm": 0.6201522946357727, "learning_rate": 2.999458263116747e-05, "loss": 1.1188, "step": 130 }, { "epoch": 0.3050847457627119, "grad_norm": 0.5387688875198364, "learning_rate": 2.9991155311844292e-05, "loss": 1.0651, "step": 135 }, { "epoch": 0.3163841807909605, "grad_norm": 0.5230715274810791, "learning_rate": 2.998689242534815e-05, "loss": 1.0626, "step": 140 }, { "epoch": 0.327683615819209, "grad_norm": 0.5703102946281433, "learning_rate": 2.9981794209281003e-05, "loss": 1.0626, "step": 145 }, { "epoch": 0.3389830508474576, "grad_norm": 0.5561928153038025, "learning_rate": 2.997586094780388e-05, "loss": 1.0494, "step": 150 }, { "epoch": 0.3502824858757062, "grad_norm": 0.6391568183898926, "learning_rate": 2.9969092971621006e-05, "loss": 1.05, "step": 155 }, { "epoch": 0.3615819209039548, "grad_norm": 0.6165516376495361, "learning_rate": 2.9961490657961404e-05, "loss": 1.0617, "step": 160 }, { "epoch": 0.3728813559322034, "grad_norm": 0.6393241286277771, "learning_rate": 2.9953054430557835e-05, "loss": 1.0036, "step": 165 }, { "epoch": 0.384180790960452, "grad_norm": 0.6466289758682251, "learning_rate": 2.9943784759623205e-05, "loss": 1.0362, "step": 170 }, { "epoch": 0.3954802259887006, "grad_norm": 0.7020736932754517, "learning_rate": 2.993368216182435e-05, "loss": 1.0052, "step": 175 }, { "epoch": 0.4067796610169492, "grad_norm": 0.630094051361084, "learning_rate": 2.992274720025323e-05, "loss": 1.0279, "step": 180 }, { "epoch": 0.4180790960451977, "grad_norm": 0.629647433757782, "learning_rate": 2.9910980484395555e-05, "loss": 1.0309, "step": 185 }, { "epoch": 0.4293785310734463, "grad_norm": 0.6576105952262878, "learning_rate": 2.989838267009681e-05, "loss": 0.9227, "step": 190 }, { "epoch": 0.4406779661016949, "grad_norm": 0.6078037023544312, "learning_rate": 2.988495445952569e-05, "loss": 1.0005, "step": 195 }, { "epoch": 0.4519774011299435, "grad_norm": 0.657957911491394, "learning_rate": 2.9870696601134994e-05, "loss": 0.9715, "step": 200 }, { "epoch": 0.4632768361581921, "grad_norm": 0.6783279180526733, "learning_rate": 2.9855609889619864e-05, "loss": 0.927, "step": 205 }, { "epoch": 0.4745762711864407, "grad_norm": 0.6905434727668762, "learning_rate": 2.983969516587352e-05, "loss": 0.9506, "step": 210 }, { "epoch": 0.4858757062146893, "grad_norm": 0.8056045174598694, "learning_rate": 2.9822953316940393e-05, "loss": 0.9867, "step": 215 }, { "epoch": 0.4971751412429379, "grad_norm": 0.7994968295097351, "learning_rate": 2.9805385275966658e-05, "loss": 0.9489, "step": 220 }, { "epoch": 0.5084745762711864, "grad_norm": 0.7940033078193665, "learning_rate": 2.978699202214826e-05, "loss": 0.9538, "step": 225 }, { "epoch": 0.519774011299435, "grad_norm": 0.7576586604118347, "learning_rate": 2.97677745806763e-05, "loss": 0.9529, "step": 230 }, { "epoch": 0.5310734463276836, "grad_norm": 0.8559620976448059, "learning_rate": 2.9747734022679913e-05, "loss": 0.9188, "step": 235 }, { "epoch": 0.5423728813559322, "grad_norm": 0.9041480422019958, "learning_rate": 2.9726871465166567e-05, "loss": 0.9145, "step": 240 }, { "epoch": 0.5536723163841808, "grad_norm": 0.7606709003448486, "learning_rate": 2.970518807095979e-05, "loss": 0.9179, "step": 245 }, { "epoch": 0.5649717514124294, "grad_norm": 0.904378354549408, "learning_rate": 2.9682685048634384e-05, "loss": 0.8682, "step": 250 }, { "epoch": 0.576271186440678, "grad_norm": 0.8670118451118469, "learning_rate": 2.9659363652449027e-05, "loss": 0.9448, "step": 255 }, { "epoch": 0.5875706214689266, "grad_norm": 0.7508774399757385, "learning_rate": 2.9635225182276387e-05, "loss": 0.9223, "step": 260 }, { "epoch": 0.5988700564971752, "grad_norm": 0.6919876933097839, "learning_rate": 2.961027098353067e-05, "loss": 0.8971, "step": 265 }, { "epoch": 0.6101694915254238, "grad_norm": 0.7949540615081787, "learning_rate": 2.9584502447092623e-05, "loss": 0.91, "step": 270 }, { "epoch": 0.6214689265536724, "grad_norm": 0.7679325938224792, "learning_rate": 2.9557921009232013e-05, "loss": 0.903, "step": 275 }, { "epoch": 0.632768361581921, "grad_norm": 0.8364932537078857, "learning_rate": 2.953052815152757e-05, "loss": 0.8941, "step": 280 }, { "epoch": 0.6440677966101694, "grad_norm": 0.8773370981216431, "learning_rate": 2.950232540078442e-05, "loss": 0.8812, "step": 285 }, { "epoch": 0.655367231638418, "grad_norm": 1.0121004581451416, "learning_rate": 2.947331432894896e-05, "loss": 0.901, "step": 290 }, { "epoch": 0.6666666666666666, "grad_norm": 0.8700821995735168, "learning_rate": 2.9443496553021268e-05, "loss": 0.8385, "step": 295 }, { "epoch": 0.6779661016949152, "grad_norm": 0.8492361903190613, "learning_rate": 2.9412873734964973e-05, "loss": 0.8498, "step": 300 }, { "epoch": 0.6892655367231638, "grad_norm": 0.860744833946228, "learning_rate": 2.938144758161459e-05, "loss": 0.8381, "step": 305 }, { "epoch": 0.7005649717514124, "grad_norm": 0.8146252036094666, "learning_rate": 2.934921984458043e-05, "loss": 0.8505, "step": 310 }, { "epoch": 0.711864406779661, "grad_norm": 0.828873336315155, "learning_rate": 2.931619232015094e-05, "loss": 0.8359, "step": 315 }, { "epoch": 0.7231638418079096, "grad_norm": 0.8928766250610352, "learning_rate": 2.9282366849192596e-05, "loss": 0.8697, "step": 320 }, { "epoch": 0.7344632768361582, "grad_norm": 0.8543341755867004, "learning_rate": 2.924774531704729e-05, "loss": 0.7961, "step": 325 }, { "epoch": 0.7457627118644068, "grad_norm": 0.8456112146377563, "learning_rate": 2.921232965342725e-05, "loss": 0.8298, "step": 330 }, { "epoch": 0.7570621468926554, "grad_norm": 0.8231430053710938, "learning_rate": 2.9176121832307487e-05, "loss": 0.7866, "step": 335 }, { "epoch": 0.768361581920904, "grad_norm": 0.8562153577804565, "learning_rate": 2.9139123871815762e-05, "loss": 0.7833, "step": 340 }, { "epoch": 0.7796610169491526, "grad_norm": 0.8552905917167664, "learning_rate": 2.9101337834120113e-05, "loss": 0.7777, "step": 345 }, { "epoch": 0.7909604519774012, "grad_norm": 0.9859520792961121, "learning_rate": 2.9062765825313887e-05, "loss": 0.8016, "step": 350 }, { "epoch": 0.8022598870056498, "grad_norm": 0.867014467716217, "learning_rate": 2.9023409995298406e-05, "loss": 0.7895, "step": 355 }, { "epoch": 0.8135593220338984, "grad_norm": 0.890496015548706, "learning_rate": 2.8983272537663082e-05, "loss": 0.7795, "step": 360 }, { "epoch": 0.8248587570621468, "grad_norm": 0.9742592573165894, "learning_rate": 2.8942355689563166e-05, "loss": 0.7661, "step": 365 }, { "epoch": 0.8361581920903954, "grad_norm": 0.855208694934845, "learning_rate": 2.890066173159509e-05, "loss": 0.7863, "step": 370 }, { "epoch": 0.847457627118644, "grad_norm": 0.9460350871086121, "learning_rate": 2.8858192987669303e-05, "loss": 0.7475, "step": 375 }, { "epoch": 0.8587570621468926, "grad_norm": 0.9050624966621399, "learning_rate": 2.881495182488077e-05, "loss": 0.739, "step": 380 }, { "epoch": 0.8700564971751412, "grad_norm": 0.8351825475692749, "learning_rate": 2.8770940653377047e-05, "loss": 0.7322, "step": 385 }, { "epoch": 0.8813559322033898, "grad_norm": 0.9222843050956726, "learning_rate": 2.8726161926223904e-05, "loss": 0.7435, "step": 390 }, { "epoch": 0.8926553672316384, "grad_norm": 0.8897649645805359, "learning_rate": 2.8680618139268643e-05, "loss": 0.7229, "step": 395 }, { "epoch": 0.903954802259887, "grad_norm": 1.0437895059585571, "learning_rate": 2.8634311831000966e-05, "loss": 0.7132, "step": 400 }, { "epoch": 0.9152542372881356, "grad_norm": 0.9872046113014221, "learning_rate": 2.8587245582411475e-05, "loss": 0.7257, "step": 405 }, { "epoch": 0.9265536723163842, "grad_norm": 0.9081714749336243, "learning_rate": 2.8539422016847834e-05, "loss": 0.6956, "step": 410 }, { "epoch": 0.9378531073446328, "grad_norm": 0.9939802289009094, "learning_rate": 2.8490843799868556e-05, "loss": 0.723, "step": 415 }, { "epoch": 0.9491525423728814, "grad_norm": 0.8449594974517822, "learning_rate": 2.8441513639094396e-05, "loss": 0.7294, "step": 420 }, { "epoch": 0.96045197740113, "grad_norm": 0.9042068719863892, "learning_rate": 2.8391434284057466e-05, "loss": 0.6565, "step": 425 }, { "epoch": 0.9717514124293786, "grad_norm": 0.9338545203208923, "learning_rate": 2.8340608526047995e-05, "loss": 0.711, "step": 430 }, { "epoch": 0.9830508474576272, "grad_norm": 0.9345859885215759, "learning_rate": 2.82890391979587e-05, "loss": 0.7006, "step": 435 }, { "epoch": 0.9943502824858758, "grad_norm": 0.9804854393005371, "learning_rate": 2.8236729174126948e-05, "loss": 0.7183, "step": 440 } ], "logging_steps": 5, "max_steps": 2215, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.402730716005663e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }