| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 443, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.011299435028248588, | |
| "grad_norm": 1.2254456281661987, | |
| "learning_rate": 1.0810810810810812e-06, | |
| "loss": 1.3157, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.022598870056497175, | |
| "grad_norm": 1.2222959995269775, | |
| "learning_rate": 2.4324324324324325e-06, | |
| "loss": 1.3831, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.03389830508474576, | |
| "grad_norm": 0.7486730813980103, | |
| "learning_rate": 3.7837837837837835e-06, | |
| "loss": 1.3448, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.04519774011299435, | |
| "grad_norm": 0.7303342819213867, | |
| "learning_rate": 5.135135135135135e-06, | |
| "loss": 1.2883, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.05649717514124294, | |
| "grad_norm": 0.6827901005744934, | |
| "learning_rate": 6.486486486486487e-06, | |
| "loss": 1.2782, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.06779661016949153, | |
| "grad_norm": 0.6475429534912109, | |
| "learning_rate": 7.837837837837838e-06, | |
| "loss": 1.2937, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.07909604519774012, | |
| "grad_norm": 0.5939517617225647, | |
| "learning_rate": 9.18918918918919e-06, | |
| "loss": 1.2553, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.0903954802259887, | |
| "grad_norm": 0.5064879655838013, | |
| "learning_rate": 1.0540540540540541e-05, | |
| "loss": 1.2067, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.1016949152542373, | |
| "grad_norm": 0.6584354639053345, | |
| "learning_rate": 1.1891891891891893e-05, | |
| "loss": 1.2246, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.11299435028248588, | |
| "grad_norm": 0.4975661337375641, | |
| "learning_rate": 1.3243243243243242e-05, | |
| "loss": 1.2558, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.12429378531073447, | |
| "grad_norm": 0.5031336545944214, | |
| "learning_rate": 1.4594594594594596e-05, | |
| "loss": 1.2726, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.13559322033898305, | |
| "grad_norm": 0.4471903443336487, | |
| "learning_rate": 1.5945945945945947e-05, | |
| "loss": 1.2089, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.14689265536723164, | |
| "grad_norm": 0.5343024134635925, | |
| "learning_rate": 1.72972972972973e-05, | |
| "loss": 1.2018, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.15819209039548024, | |
| "grad_norm": 0.47138091921806335, | |
| "learning_rate": 1.864864864864865e-05, | |
| "loss": 1.2826, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.1694915254237288, | |
| "grad_norm": 0.490500271320343, | |
| "learning_rate": 1.9999999999999998e-05, | |
| "loss": 1.2116, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.1807909604519774, | |
| "grad_norm": 0.5078205466270447, | |
| "learning_rate": 2.135135135135135e-05, | |
| "loss": 1.1873, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.192090395480226, | |
| "grad_norm": 1.9143681526184082, | |
| "learning_rate": 2.2702702702702705e-05, | |
| "loss": 1.2159, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.2033898305084746, | |
| "grad_norm": 0.5685150027275085, | |
| "learning_rate": 2.4054054054054056e-05, | |
| "loss": 1.1666, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.21468926553672316, | |
| "grad_norm": 0.5490728616714478, | |
| "learning_rate": 2.5405405405405408e-05, | |
| "loss": 1.1348, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.22598870056497175, | |
| "grad_norm": 0.5205124020576477, | |
| "learning_rate": 2.6756756756756756e-05, | |
| "loss": 1.1213, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.23728813559322035, | |
| "grad_norm": 0.4488866925239563, | |
| "learning_rate": 2.8108108108108107e-05, | |
| "loss": 1.1414, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.24858757062146894, | |
| "grad_norm": 0.4913260340690613, | |
| "learning_rate": 2.945945945945946e-05, | |
| "loss": 1.139, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.2598870056497175, | |
| "grad_norm": 0.5297622084617615, | |
| "learning_rate": 2.9999849508725208e-05, | |
| "loss": 1.134, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.2711864406779661, | |
| "grad_norm": 0.6094189882278442, | |
| "learning_rate": 2.9998929850759165e-05, | |
| "loss": 1.1023, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2824858757062147, | |
| "grad_norm": 0.6322339177131653, | |
| "learning_rate": 2.9997174192288006e-05, | |
| "loss": 1.1164, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.2937853107344633, | |
| "grad_norm": 0.6201522946357727, | |
| "learning_rate": 2.999458263116747e-05, | |
| "loss": 1.1188, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.3050847457627119, | |
| "grad_norm": 0.5387688875198364, | |
| "learning_rate": 2.9991155311844292e-05, | |
| "loss": 1.0651, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.3163841807909605, | |
| "grad_norm": 0.5230715274810791, | |
| "learning_rate": 2.998689242534815e-05, | |
| "loss": 1.0626, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.327683615819209, | |
| "grad_norm": 0.5703102946281433, | |
| "learning_rate": 2.9981794209281003e-05, | |
| "loss": 1.0626, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.3389830508474576, | |
| "grad_norm": 0.5561928153038025, | |
| "learning_rate": 2.997586094780388e-05, | |
| "loss": 1.0494, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3502824858757062, | |
| "grad_norm": 0.6391568183898926, | |
| "learning_rate": 2.9969092971621006e-05, | |
| "loss": 1.05, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.3615819209039548, | |
| "grad_norm": 0.6165516376495361, | |
| "learning_rate": 2.9961490657961404e-05, | |
| "loss": 1.0617, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.3728813559322034, | |
| "grad_norm": 0.6393241286277771, | |
| "learning_rate": 2.9953054430557835e-05, | |
| "loss": 1.0036, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.384180790960452, | |
| "grad_norm": 0.6466289758682251, | |
| "learning_rate": 2.9943784759623205e-05, | |
| "loss": 1.0362, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.3954802259887006, | |
| "grad_norm": 0.7020736932754517, | |
| "learning_rate": 2.993368216182435e-05, | |
| "loss": 1.0052, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.4067796610169492, | |
| "grad_norm": 0.630094051361084, | |
| "learning_rate": 2.992274720025323e-05, | |
| "loss": 1.0279, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.4180790960451977, | |
| "grad_norm": 0.629647433757782, | |
| "learning_rate": 2.9910980484395555e-05, | |
| "loss": 1.0309, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.4293785310734463, | |
| "grad_norm": 0.6576105952262878, | |
| "learning_rate": 2.989838267009681e-05, | |
| "loss": 0.9227, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.4406779661016949, | |
| "grad_norm": 0.6078037023544312, | |
| "learning_rate": 2.988495445952569e-05, | |
| "loss": 1.0005, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.4519774011299435, | |
| "grad_norm": 0.657957911491394, | |
| "learning_rate": 2.9870696601134994e-05, | |
| "loss": 0.9715, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.4632768361581921, | |
| "grad_norm": 0.6783279180526733, | |
| "learning_rate": 2.9855609889619864e-05, | |
| "loss": 0.927, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.4745762711864407, | |
| "grad_norm": 0.6905434727668762, | |
| "learning_rate": 2.983969516587352e-05, | |
| "loss": 0.9506, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.4858757062146893, | |
| "grad_norm": 0.8056045174598694, | |
| "learning_rate": 2.9822953316940393e-05, | |
| "loss": 0.9867, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.4971751412429379, | |
| "grad_norm": 0.7994968295097351, | |
| "learning_rate": 2.9805385275966658e-05, | |
| "loss": 0.9489, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.5084745762711864, | |
| "grad_norm": 0.7940033078193665, | |
| "learning_rate": 2.978699202214826e-05, | |
| "loss": 0.9538, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.519774011299435, | |
| "grad_norm": 0.7576586604118347, | |
| "learning_rate": 2.97677745806763e-05, | |
| "loss": 0.9529, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.5310734463276836, | |
| "grad_norm": 0.8559620976448059, | |
| "learning_rate": 2.9747734022679913e-05, | |
| "loss": 0.9188, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.5423728813559322, | |
| "grad_norm": 0.9041480422019958, | |
| "learning_rate": 2.9726871465166567e-05, | |
| "loss": 0.9145, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5536723163841808, | |
| "grad_norm": 0.7606709003448486, | |
| "learning_rate": 2.970518807095979e-05, | |
| "loss": 0.9179, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.5649717514124294, | |
| "grad_norm": 0.904378354549408, | |
| "learning_rate": 2.9682685048634384e-05, | |
| "loss": 0.8682, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.576271186440678, | |
| "grad_norm": 0.8670118451118469, | |
| "learning_rate": 2.9659363652449027e-05, | |
| "loss": 0.9448, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.5875706214689266, | |
| "grad_norm": 0.7508774399757385, | |
| "learning_rate": 2.9635225182276387e-05, | |
| "loss": 0.9223, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.5988700564971752, | |
| "grad_norm": 0.6919876933097839, | |
| "learning_rate": 2.961027098353067e-05, | |
| "loss": 0.8971, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.6101694915254238, | |
| "grad_norm": 0.7949540615081787, | |
| "learning_rate": 2.9584502447092623e-05, | |
| "loss": 0.91, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.6214689265536724, | |
| "grad_norm": 0.7679325938224792, | |
| "learning_rate": 2.9557921009232013e-05, | |
| "loss": 0.903, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.632768361581921, | |
| "grad_norm": 0.8364932537078857, | |
| "learning_rate": 2.953052815152757e-05, | |
| "loss": 0.8941, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.6440677966101694, | |
| "grad_norm": 0.8773370981216431, | |
| "learning_rate": 2.950232540078442e-05, | |
| "loss": 0.8812, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.655367231638418, | |
| "grad_norm": 1.0121004581451416, | |
| "learning_rate": 2.947331432894896e-05, | |
| "loss": 0.901, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 0.8700821995735168, | |
| "learning_rate": 2.9443496553021268e-05, | |
| "loss": 0.8385, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.6779661016949152, | |
| "grad_norm": 0.8492361903190613, | |
| "learning_rate": 2.9412873734964973e-05, | |
| "loss": 0.8498, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6892655367231638, | |
| "grad_norm": 0.860744833946228, | |
| "learning_rate": 2.938144758161459e-05, | |
| "loss": 0.8381, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.7005649717514124, | |
| "grad_norm": 0.8146252036094666, | |
| "learning_rate": 2.934921984458043e-05, | |
| "loss": 0.8505, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.711864406779661, | |
| "grad_norm": 0.828873336315155, | |
| "learning_rate": 2.931619232015094e-05, | |
| "loss": 0.8359, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.7231638418079096, | |
| "grad_norm": 0.8928766250610352, | |
| "learning_rate": 2.9282366849192596e-05, | |
| "loss": 0.8697, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.7344632768361582, | |
| "grad_norm": 0.8543341755867004, | |
| "learning_rate": 2.924774531704729e-05, | |
| "loss": 0.7961, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.7457627118644068, | |
| "grad_norm": 0.8456112146377563, | |
| "learning_rate": 2.921232965342725e-05, | |
| "loss": 0.8298, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.7570621468926554, | |
| "grad_norm": 0.8231430053710938, | |
| "learning_rate": 2.9176121832307487e-05, | |
| "loss": 0.7866, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.768361581920904, | |
| "grad_norm": 0.8562153577804565, | |
| "learning_rate": 2.9139123871815762e-05, | |
| "loss": 0.7833, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.7796610169491526, | |
| "grad_norm": 0.8552905917167664, | |
| "learning_rate": 2.9101337834120113e-05, | |
| "loss": 0.7777, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.7909604519774012, | |
| "grad_norm": 0.9859520792961121, | |
| "learning_rate": 2.9062765825313887e-05, | |
| "loss": 0.8016, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.8022598870056498, | |
| "grad_norm": 0.867014467716217, | |
| "learning_rate": 2.9023409995298406e-05, | |
| "loss": 0.7895, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.8135593220338984, | |
| "grad_norm": 0.890496015548706, | |
| "learning_rate": 2.8983272537663082e-05, | |
| "loss": 0.7795, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.8248587570621468, | |
| "grad_norm": 0.9742592573165894, | |
| "learning_rate": 2.8942355689563166e-05, | |
| "loss": 0.7661, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.8361581920903954, | |
| "grad_norm": 0.855208694934845, | |
| "learning_rate": 2.890066173159509e-05, | |
| "loss": 0.7863, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.847457627118644, | |
| "grad_norm": 0.9460350871086121, | |
| "learning_rate": 2.8858192987669303e-05, | |
| "loss": 0.7475, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.8587570621468926, | |
| "grad_norm": 0.9050624966621399, | |
| "learning_rate": 2.881495182488077e-05, | |
| "loss": 0.739, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.8700564971751412, | |
| "grad_norm": 0.8351825475692749, | |
| "learning_rate": 2.8770940653377047e-05, | |
| "loss": 0.7322, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.8813559322033898, | |
| "grad_norm": 0.9222843050956726, | |
| "learning_rate": 2.8726161926223904e-05, | |
| "loss": 0.7435, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.8926553672316384, | |
| "grad_norm": 0.8897649645805359, | |
| "learning_rate": 2.8680618139268643e-05, | |
| "loss": 0.7229, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.903954802259887, | |
| "grad_norm": 1.0437895059585571, | |
| "learning_rate": 2.8634311831000966e-05, | |
| "loss": 0.7132, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.9152542372881356, | |
| "grad_norm": 0.9872046113014221, | |
| "learning_rate": 2.8587245582411475e-05, | |
| "loss": 0.7257, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.9265536723163842, | |
| "grad_norm": 0.9081714749336243, | |
| "learning_rate": 2.8539422016847834e-05, | |
| "loss": 0.6956, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.9378531073446328, | |
| "grad_norm": 0.9939802289009094, | |
| "learning_rate": 2.8490843799868556e-05, | |
| "loss": 0.723, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.9491525423728814, | |
| "grad_norm": 0.8449594974517822, | |
| "learning_rate": 2.8441513639094396e-05, | |
| "loss": 0.7294, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.96045197740113, | |
| "grad_norm": 0.9042068719863892, | |
| "learning_rate": 2.8391434284057466e-05, | |
| "loss": 0.6565, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.9717514124293786, | |
| "grad_norm": 0.9338545203208923, | |
| "learning_rate": 2.8340608526047995e-05, | |
| "loss": 0.711, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.9830508474576272, | |
| "grad_norm": 0.9345859885215759, | |
| "learning_rate": 2.82890391979587e-05, | |
| "loss": 0.7006, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.9943502824858758, | |
| "grad_norm": 0.9804854393005371, | |
| "learning_rate": 2.8236729174126948e-05, | |
| "loss": 0.7183, | |
| "step": 440 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2215, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.402730716005663e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |