{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 17.376963350785342, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.418848167539267, "grad_norm": 1.1237713098526, "learning_rate": 4.9979013702509664e-05, "loss": 1.4003, "num_input_tokens_seen": 63440, "step": 10 }, { "epoch": 0.837696335078534, "grad_norm": 0.5457362532615662, "learning_rate": 4.9906513710563894e-05, "loss": 0.1699, "num_input_tokens_seen": 127024, "step": 20 }, { "epoch": 1.2931937172774868, "grad_norm": 0.47847944498062134, "learning_rate": 4.9782391165565324e-05, "loss": 0.1719, "num_input_tokens_seen": 194352, "step": 30 }, { "epoch": 1.7120418848167538, "grad_norm": 0.8392484188079834, "learning_rate": 4.960690333044279e-05, "loss": 0.1433, "num_input_tokens_seen": 257776, "step": 40 }, { "epoch": 2.167539267015707, "grad_norm": 0.6877206563949585, "learning_rate": 4.938041393053273e-05, "loss": 0.1352, "num_input_tokens_seen": 325440, "step": 50 }, { "epoch": 2.5863874345549736, "grad_norm": 0.5719232559204102, "learning_rate": 4.910339239970286e-05, "loss": 0.1067, "num_input_tokens_seen": 388576, "step": 60 }, { "epoch": 3.0418848167539267, "grad_norm": 0.9559742212295532, "learning_rate": 4.877641290737884e-05, "loss": 0.1101, "num_input_tokens_seen": 456256, "step": 70 }, { "epoch": 3.4607329842931938, "grad_norm": 0.4461997151374817, "learning_rate": 4.8400153168490414e-05, "loss": 0.0852, "num_input_tokens_seen": 518864, "step": 80 }, { "epoch": 3.8795811518324608, "grad_norm": 0.5407759547233582, "learning_rate": 4.7975393038803754e-05, "loss": 0.0897, "num_input_tokens_seen": 582464, "step": 90 }, { "epoch": 4.335078534031414, "grad_norm": 0.7839125990867615, "learning_rate": 4.750301289855128e-05, "loss": 0.0928, "num_input_tokens_seen": 649664, "step": 100 }, { "epoch": 4.7539267015706805, "grad_norm": 0.40812286734580994, "learning_rate": 4.69839918277092e-05, "loss": 0.0748, "num_input_tokens_seen": 712944, "step": 110 }, { "epoch": 5.209424083769633, "grad_norm": 0.5133504867553711, "learning_rate": 4.641940557670478e-05, "loss": 0.0801, "num_input_tokens_seen": 780128, "step": 120 }, { "epoch": 5.628272251308901, "grad_norm": 0.4961135685443878, "learning_rate": 4.581042433675921e-05, "loss": 0.0702, "num_input_tokens_seen": 844064, "step": 130 }, { "epoch": 6.0837696335078535, "grad_norm": 0.5816908478736877, "learning_rate": 4.5158310314487706e-05, "loss": 0.0735, "num_input_tokens_seen": 911488, "step": 140 }, { "epoch": 6.50261780104712, "grad_norm": 0.4245397448539734, "learning_rate": 4.446441511578351e-05, "loss": 0.0632, "num_input_tokens_seen": 975216, "step": 150 }, { "epoch": 6.9214659685863875, "grad_norm": 0.6995230913162231, "learning_rate": 4.373017694440827e-05, "loss": 0.0665, "num_input_tokens_seen": 1037504, "step": 160 }, { "epoch": 7.37696335078534, "grad_norm": 0.5000202059745789, "learning_rate": 4.295711762109515e-05, "loss": 0.0676, "num_input_tokens_seen": 1104512, "step": 170 }, { "epoch": 7.795811518324607, "grad_norm": 0.5529599785804749, "learning_rate": 4.214683942934291e-05, "loss": 0.0612, "num_input_tokens_seen": 1168112, "step": 180 }, { "epoch": 8.25130890052356, "grad_norm": 0.5237070322036743, "learning_rate": 4.130102179443877e-05, "loss": 0.0642, "num_input_tokens_seen": 1235104, "step": 190 }, { "epoch": 8.670157068062828, "grad_norm": 0.4693180024623871, "learning_rate": 4.042141780259292e-05, "loss": 0.0587, "num_input_tokens_seen": 1298512, "step": 200 }, { "epoch": 9.12565445026178, "grad_norm": 0.32981690764427185, "learning_rate": 3.9509850567399774e-05, "loss": 0.0688, "num_input_tokens_seen": 1365312, "step": 210 }, { "epoch": 9.544502617801047, "grad_norm": 0.5822737812995911, "learning_rate": 3.856820945115655e-05, "loss": 0.0533, "num_input_tokens_seen": 1429344, "step": 220 }, { "epoch": 9.963350785340314, "grad_norm": 0.559769332408905, "learning_rate": 3.759844614887141e-05, "loss": 0.0568, "num_input_tokens_seen": 1492960, "step": 230 }, { "epoch": 10.418848167539267, "grad_norm": 0.504461944103241, "learning_rate": 3.6602570643077556e-05, "loss": 0.0569, "num_input_tokens_seen": 1559488, "step": 240 }, { "epoch": 10.837696335078533, "grad_norm": 0.5406277179718018, "learning_rate": 3.5582647037837445e-05, "loss": 0.052, "num_input_tokens_seen": 1622784, "step": 250 }, { "epoch": 11.293193717277488, "grad_norm": 0.4200195074081421, "learning_rate": 3.454078928057196e-05, "loss": 0.0521, "num_input_tokens_seen": 1690544, "step": 260 }, { "epoch": 11.712041884816754, "grad_norm": 0.8368775844573975, "learning_rate": 3.347915678058152e-05, "loss": 0.047, "num_input_tokens_seen": 1754032, "step": 270 }, { "epoch": 12.167539267015707, "grad_norm": 0.3906252682209015, "learning_rate": 3.239994993334059e-05, "loss": 0.051, "num_input_tokens_seen": 1820944, "step": 280 }, { "epoch": 12.586387434554974, "grad_norm": 0.7284943461418152, "learning_rate": 3.1305405559842016e-05, "loss": 0.0467, "num_input_tokens_seen": 1884656, "step": 290 }, { "epoch": 13.041884816753926, "grad_norm": 0.5412812232971191, "learning_rate": 3.0197792270443982e-05, "loss": 0.0471, "num_input_tokens_seen": 1951488, "step": 300 }, { "epoch": 13.460732984293193, "grad_norm": 0.6882799863815308, "learning_rate": 2.907940576282856e-05, "loss": 0.0395, "num_input_tokens_seen": 2015008, "step": 310 }, { "epoch": 13.879581151832461, "grad_norm": 0.528359591960907, "learning_rate": 2.7952564063817704e-05, "loss": 0.0407, "num_input_tokens_seen": 2077264, "step": 320 }, { "epoch": 14.335078534031414, "grad_norm": 0.5524467825889587, "learning_rate": 2.6819602724908742e-05, "loss": 0.0399, "num_input_tokens_seen": 2143504, "step": 330 }, { "epoch": 14.75392670157068, "grad_norm": 0.5251730680465698, "learning_rate": 2.5682869981487152e-05, "loss": 0.0371, "num_input_tokens_seen": 2206768, "step": 340 }, { "epoch": 15.209424083769633, "grad_norm": 1.0152957439422607, "learning_rate": 2.4544721885750217e-05, "loss": 0.0387, "num_input_tokens_seen": 2275200, "step": 350 }, { "epoch": 15.6282722513089, "grad_norm": 0.7967268824577332, "learning_rate": 2.3407517423429015e-05, "loss": 0.0327, "num_input_tokens_seen": 2337952, "step": 360 }, { "epoch": 16.083769633507853, "grad_norm": 0.6417545676231384, "learning_rate": 2.2273613624430255e-05, "loss": 0.0313, "num_input_tokens_seen": 2405232, "step": 370 }, { "epoch": 16.50261780104712, "grad_norm": 0.6872904896736145, "learning_rate": 2.1145360677531924e-05, "loss": 0.0218, "num_input_tokens_seen": 2468304, "step": 380 }, { "epoch": 16.921465968586386, "grad_norm": 0.6166151165962219, "learning_rate": 2.0025097059258046e-05, "loss": 0.0256, "num_input_tokens_seen": 2531808, "step": 390 }, { "epoch": 17.376963350785342, "grad_norm": 0.50600665807724, "learning_rate": 1.8915144687029106e-05, "loss": 0.0221, "num_input_tokens_seen": 2599280, "step": 400 } ], "logging_steps": 10, "max_steps": 690, "num_input_tokens_seen": 2599280, "num_train_epochs": 30, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1154970987672371e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }