{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 13.041884816753926, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.418848167539267, "grad_norm": 1.1237713098526, "learning_rate": 4.9979013702509664e-05, "loss": 1.4003, "num_input_tokens_seen": 63440, "step": 10 }, { "epoch": 0.837696335078534, "grad_norm": 0.5457362532615662, "learning_rate": 4.9906513710563894e-05, "loss": 0.1699, "num_input_tokens_seen": 127024, "step": 20 }, { "epoch": 1.2931937172774868, "grad_norm": 0.47847944498062134, "learning_rate": 4.9782391165565324e-05, "loss": 0.1719, "num_input_tokens_seen": 194352, "step": 30 }, { "epoch": 1.7120418848167538, "grad_norm": 0.8392484188079834, "learning_rate": 4.960690333044279e-05, "loss": 0.1433, "num_input_tokens_seen": 257776, "step": 40 }, { "epoch": 2.167539267015707, "grad_norm": 0.6877206563949585, "learning_rate": 4.938041393053273e-05, "loss": 0.1352, "num_input_tokens_seen": 325440, "step": 50 }, { "epoch": 2.5863874345549736, "grad_norm": 0.5719232559204102, "learning_rate": 4.910339239970286e-05, "loss": 0.1067, "num_input_tokens_seen": 388576, "step": 60 }, { "epoch": 3.0418848167539267, "grad_norm": 0.9559742212295532, "learning_rate": 4.877641290737884e-05, "loss": 0.1101, "num_input_tokens_seen": 456256, "step": 70 }, { "epoch": 3.4607329842931938, "grad_norm": 0.4461997151374817, "learning_rate": 4.8400153168490414e-05, "loss": 0.0852, "num_input_tokens_seen": 518864, "step": 80 }, { "epoch": 3.8795811518324608, "grad_norm": 0.5407759547233582, "learning_rate": 4.7975393038803754e-05, "loss": 0.0897, "num_input_tokens_seen": 582464, "step": 90 }, { "epoch": 4.335078534031414, "grad_norm": 0.7839125990867615, "learning_rate": 4.750301289855128e-05, "loss": 0.0928, "num_input_tokens_seen": 649664, "step": 100 }, { "epoch": 4.7539267015706805, "grad_norm": 0.41905662417411804, "learning_rate": 4.69839918277092e-05, "loss": 0.0748, "num_input_tokens_seen": 712944, "step": 110 }, { "epoch": 5.209424083769633, "grad_norm": 0.5274420976638794, "learning_rate": 4.641940557670478e-05, "loss": 0.0805, "num_input_tokens_seen": 780128, "step": 120 }, { "epoch": 5.628272251308901, "grad_norm": 0.5005943775177002, "learning_rate": 4.581042433675921e-05, "loss": 0.07, "num_input_tokens_seen": 844064, "step": 130 }, { "epoch": 6.0837696335078535, "grad_norm": 0.5864464044570923, "learning_rate": 4.5158310314487706e-05, "loss": 0.0728, "num_input_tokens_seen": 911488, "step": 140 }, { "epoch": 6.50261780104712, "grad_norm": 0.4119901657104492, "learning_rate": 4.446441511578351e-05, "loss": 0.0641, "num_input_tokens_seen": 975216, "step": 150 }, { "epoch": 6.9214659685863875, "grad_norm": 0.47534871101379395, "learning_rate": 4.373017694440827e-05, "loss": 0.0679, "num_input_tokens_seen": 1037504, "step": 160 }, { "epoch": 7.37696335078534, "grad_norm": 0.5238337516784668, "learning_rate": 4.295711762109515e-05, "loss": 0.0682, "num_input_tokens_seen": 1104512, "step": 170 }, { "epoch": 7.795811518324607, "grad_norm": 0.5485618710517883, "learning_rate": 4.214683942934291e-05, "loss": 0.0596, "num_input_tokens_seen": 1168112, "step": 180 }, { "epoch": 8.25130890052356, "grad_norm": 0.544316291809082, "learning_rate": 4.130102179443877e-05, "loss": 0.0641, "num_input_tokens_seen": 1235104, "step": 190 }, { "epoch": 8.670157068062828, "grad_norm": 0.5107344388961792, "learning_rate": 4.042141780259292e-05, "loss": 0.0631, "num_input_tokens_seen": 1298512, "step": 200 }, { "epoch": 9.12565445026178, "grad_norm": 0.3618321120738983, "learning_rate": 3.9509850567399774e-05, "loss": 0.0687, "num_input_tokens_seen": 1365312, "step": 210 }, { "epoch": 9.544502617801047, "grad_norm": 0.5421671271324158, "learning_rate": 3.856820945115655e-05, "loss": 0.0548, "num_input_tokens_seen": 1429344, "step": 220 }, { "epoch": 9.963350785340314, "grad_norm": 0.5248610973358154, "learning_rate": 3.759844614887141e-05, "loss": 0.057, "num_input_tokens_seen": 1492960, "step": 230 }, { "epoch": 10.418848167539267, "grad_norm": 0.4770837724208832, "learning_rate": 3.6602570643077556e-05, "loss": 0.0586, "num_input_tokens_seen": 1559488, "step": 240 }, { "epoch": 10.837696335078533, "grad_norm": 0.5850111246109009, "learning_rate": 3.5582647037837445e-05, "loss": 0.0529, "num_input_tokens_seen": 1622784, "step": 250 }, { "epoch": 11.293193717277488, "grad_norm": 0.5014153122901917, "learning_rate": 3.454078928057196e-05, "loss": 0.0535, "num_input_tokens_seen": 1690544, "step": 260 }, { "epoch": 11.712041884816754, "grad_norm": 0.5679153203964233, "learning_rate": 3.347915678058152e-05, "loss": 0.0469, "num_input_tokens_seen": 1754032, "step": 270 }, { "epoch": 12.167539267015707, "grad_norm": 0.3811012804508209, "learning_rate": 3.239994993334059e-05, "loss": 0.0531, "num_input_tokens_seen": 1820944, "step": 280 }, { "epoch": 12.586387434554974, "grad_norm": 0.6256040334701538, "learning_rate": 3.1305405559842016e-05, "loss": 0.0475, "num_input_tokens_seen": 1884656, "step": 290 }, { "epoch": 13.041884816753926, "grad_norm": 0.6022323966026306, "learning_rate": 3.0197792270443982e-05, "loss": 0.0472, "num_input_tokens_seen": 1951488, "step": 300 } ], "logging_steps": 10, "max_steps": 690, "num_input_tokens_seen": 1951488, "num_train_epochs": 30, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.374931508795802e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }