{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 271, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018484288354898338, "grad_norm": 1.1482020616531372, "learning_rate": 1.7647058823529412e-06, "loss": 1.4088, "step": 5 }, { "epoch": 0.036968576709796676, "grad_norm": 0.8084781169891357, "learning_rate": 3.970588235294118e-06, "loss": 1.3425, "step": 10 }, { "epoch": 0.05545286506469501, "grad_norm": 0.7077063322067261, "learning_rate": 6.176470588235294e-06, "loss": 1.3099, "step": 15 }, { "epoch": 0.07393715341959335, "grad_norm": 0.6323762536048889, "learning_rate": 8.382352941176472e-06, "loss": 1.3226, "step": 20 }, { "epoch": 0.09242144177449169, "grad_norm": 0.6048110127449036, "learning_rate": 1.0588235294117648e-05, "loss": 1.3228, "step": 25 }, { "epoch": 0.11090573012939002, "grad_norm": 0.5438778400421143, "learning_rate": 1.2794117647058824e-05, "loss": 1.2997, "step": 30 }, { "epoch": 0.12939001848428835, "grad_norm": 0.48258763551712036, "learning_rate": 1.5e-05, "loss": 1.2228, "step": 35 }, { "epoch": 0.1478743068391867, "grad_norm": 0.5334736108779907, "learning_rate": 1.7205882352941175e-05, "loss": 1.252, "step": 40 }, { "epoch": 0.16635859519408502, "grad_norm": 0.4233540892601013, "learning_rate": 1.9411764705882355e-05, "loss": 1.238, "step": 45 }, { "epoch": 0.18484288354898337, "grad_norm": 0.44563496112823486, "learning_rate": 2.161764705882353e-05, "loss": 1.2182, "step": 50 }, { "epoch": 0.2033271719038817, "grad_norm": 0.4069574475288391, "learning_rate": 2.3823529411764704e-05, "loss": 1.2401, "step": 55 }, { "epoch": 0.22181146025878004, "grad_norm": 0.4704144597053528, "learning_rate": 2.6029411764705883e-05, "loss": 1.1739, "step": 60 }, { "epoch": 0.24029574861367836, "grad_norm": 0.5614204406738281, "learning_rate": 2.823529411764706e-05, "loss": 1.2586, "step": 65 }, { "epoch": 0.2587800369685767, "grad_norm": 0.44872087240219116, "learning_rate": 2.9999955310684845e-05, "loss": 1.1245, "step": 70 }, { "epoch": 0.27726432532347506, "grad_norm": 0.49408602714538574, "learning_rate": 2.999839121261416e-05, "loss": 1.2028, "step": 75 }, { "epoch": 0.2957486136783734, "grad_norm": 0.5130935907363892, "learning_rate": 2.999459291506328e-05, "loss": 1.1916, "step": 80 }, { "epoch": 0.3142329020332717, "grad_norm": 0.5284329652786255, "learning_rate": 2.9988560983836527e-05, "loss": 1.152, "step": 85 }, { "epoch": 0.33271719038817005, "grad_norm": 0.5931246876716614, "learning_rate": 2.99802963174661e-05, "loss": 1.1045, "step": 90 }, { "epoch": 0.3512014787430684, "grad_norm": 0.5653843283653259, "learning_rate": 2.9969800147078265e-05, "loss": 1.091, "step": 95 }, { "epoch": 0.36968576709796674, "grad_norm": 0.5574877262115479, "learning_rate": 2.9957074036209947e-05, "loss": 1.0728, "step": 100 }, { "epoch": 0.38817005545286504, "grad_norm": 0.5665938258171082, "learning_rate": 2.994211988057582e-05, "loss": 1.0265, "step": 105 }, { "epoch": 0.4066543438077634, "grad_norm": 0.6560489535331726, "learning_rate": 2.9924939907785906e-05, "loss": 1.066, "step": 110 }, { "epoch": 0.42513863216266173, "grad_norm": 0.6012698411941528, "learning_rate": 2.9905536677013782e-05, "loss": 1.0484, "step": 115 }, { "epoch": 0.4436229205175601, "grad_norm": 0.5735741853713989, "learning_rate": 2.9883913078615306e-05, "loss": 1.0043, "step": 120 }, { "epoch": 0.46210720887245843, "grad_norm": 0.5683181881904602, "learning_rate": 2.9860072333698115e-05, "loss": 1.0437, "step": 125 }, { "epoch": 0.4805914972273567, "grad_norm": 0.668404757976532, "learning_rate": 2.9834017993641756e-05, "loss": 1.0245, "step": 130 }, { "epoch": 0.49907578558225507, "grad_norm": 0.5659777522087097, "learning_rate": 2.980575393956869e-05, "loss": 1.004, "step": 135 }, { "epoch": 0.5175600739371534, "grad_norm": 0.7540903091430664, "learning_rate": 2.977528438176615e-05, "loss": 0.9866, "step": 140 }, { "epoch": 0.5360443622920518, "grad_norm": 0.7011395692825317, "learning_rate": 2.974261385905894e-05, "loss": 0.9349, "step": 145 }, { "epoch": 0.5545286506469501, "grad_norm": 0.7044267654418945, "learning_rate": 2.9707747238133358e-05, "loss": 0.956, "step": 150 }, { "epoch": 0.5730129390018485, "grad_norm": 0.7586350440979004, "learning_rate": 2.9670689712812195e-05, "loss": 0.9521, "step": 155 }, { "epoch": 0.5914972273567468, "grad_norm": 0.8364260792732239, "learning_rate": 2.963144680328111e-05, "loss": 0.9237, "step": 160 }, { "epoch": 0.609981515711645, "grad_norm": 0.7463046908378601, "learning_rate": 2.959002435526626e-05, "loss": 0.9384, "step": 165 }, { "epoch": 0.6284658040665434, "grad_norm": 0.7540715336799622, "learning_rate": 2.9546428539163568e-05, "loss": 0.9072, "step": 170 }, { "epoch": 0.6469500924214417, "grad_norm": 0.8799229860305786, "learning_rate": 2.9500665849119523e-05, "loss": 0.9326, "step": 175 }, { "epoch": 0.6654343807763401, "grad_norm": 0.8631930351257324, "learning_rate": 2.945274310206382e-05, "loss": 0.8823, "step": 180 }, { "epoch": 0.6839186691312384, "grad_norm": 0.7702048420906067, "learning_rate": 2.9402667436693852e-05, "loss": 0.8832, "step": 185 }, { "epoch": 0.7024029574861368, "grad_norm": 0.9274978637695312, "learning_rate": 2.935044631241138e-05, "loss": 0.8156, "step": 190 }, { "epoch": 0.7208872458410351, "grad_norm": 0.9201264381408691, "learning_rate": 2.929608750821129e-05, "loss": 0.8671, "step": 195 }, { "epoch": 0.7393715341959335, "grad_norm": 0.9688053131103516, "learning_rate": 2.923959912152287e-05, "loss": 0.9017, "step": 200 }, { "epoch": 0.7578558225508318, "grad_norm": 0.977288544178009, "learning_rate": 2.9180989567003547e-05, "loss": 0.8301, "step": 205 }, { "epoch": 0.7763401109057301, "grad_norm": 0.966156542301178, "learning_rate": 2.9120267575285458e-05, "loss": 0.8573, "step": 210 }, { "epoch": 0.7948243992606284, "grad_norm": 0.9321315884590149, "learning_rate": 2.905744219167489e-05, "loss": 0.8063, "step": 215 }, { "epoch": 0.8133086876155268, "grad_norm": 0.9106122255325317, "learning_rate": 2.899252277480487e-05, "loss": 0.8396, "step": 220 }, { "epoch": 0.8317929759704251, "grad_norm": 1.001532793045044, "learning_rate": 2.892551899524109e-05, "loss": 0.8269, "step": 225 }, { "epoch": 0.8502772643253235, "grad_norm": 0.9747412800788879, "learning_rate": 2.885644083404134e-05, "loss": 0.8071, "step": 230 }, { "epoch": 0.8687615526802218, "grad_norm": 0.8949458599090576, "learning_rate": 2.8785298581268704e-05, "loss": 0.8024, "step": 235 }, { "epoch": 0.8872458410351202, "grad_norm": 1.0015259981155396, "learning_rate": 2.871210283445875e-05, "loss": 0.8123, "step": 240 }, { "epoch": 0.9057301293900185, "grad_norm": 1.0993518829345703, "learning_rate": 2.8636864497040856e-05, "loss": 0.7778, "step": 245 }, { "epoch": 0.9242144177449169, "grad_norm": 1.0659856796264648, "learning_rate": 2.8559594776714034e-05, "loss": 0.7372, "step": 250 }, { "epoch": 0.9426987060998152, "grad_norm": 1.0356419086456299, "learning_rate": 2.848030518377739e-05, "loss": 0.77, "step": 255 }, { "epoch": 0.9611829944547134, "grad_norm": 1.22687828540802, "learning_rate": 2.8399007529415527e-05, "loss": 0.7909, "step": 260 }, { "epoch": 0.9796672828096118, "grad_norm": 0.8971600532531738, "learning_rate": 2.8315713923939113e-05, "loss": 0.663, "step": 265 }, { "epoch": 0.9981515711645101, "grad_norm": 1.0294069051742554, "learning_rate": 2.82304367749809e-05, "loss": 0.7002, "step": 270 } ], "logging_steps": 5, "max_steps": 1355, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.798461404025979e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }