{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 271,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.018484288354898338,
      "grad_norm": 1.1482020616531372,
      "learning_rate": 1.7647058823529412e-06,
      "loss": 1.4088,
      "step": 5
    },
    {
      "epoch": 0.036968576709796676,
      "grad_norm": 0.8084781169891357,
      "learning_rate": 3.970588235294118e-06,
      "loss": 1.3425,
      "step": 10
    },
    {
      "epoch": 0.05545286506469501,
      "grad_norm": 0.7077063322067261,
      "learning_rate": 6.176470588235294e-06,
      "loss": 1.3099,
      "step": 15
    },
    {
      "epoch": 0.07393715341959335,
      "grad_norm": 0.6323762536048889,
      "learning_rate": 8.382352941176472e-06,
      "loss": 1.3226,
      "step": 20
    },
    {
      "epoch": 0.09242144177449169,
      "grad_norm": 0.6048110127449036,
      "learning_rate": 1.0588235294117648e-05,
      "loss": 1.3228,
      "step": 25
    },
    {
      "epoch": 0.11090573012939002,
      "grad_norm": 0.5438778400421143,
      "learning_rate": 1.2794117647058824e-05,
      "loss": 1.2997,
      "step": 30
    },
    {
      "epoch": 0.12939001848428835,
      "grad_norm": 0.48258763551712036,
      "learning_rate": 1.5e-05,
      "loss": 1.2228,
      "step": 35
    },
    {
      "epoch": 0.1478743068391867,
      "grad_norm": 0.5334736108779907,
      "learning_rate": 1.7205882352941175e-05,
      "loss": 1.252,
      "step": 40
    },
    {
      "epoch": 0.16635859519408502,
      "grad_norm": 0.4233540892601013,
      "learning_rate": 1.9411764705882355e-05,
      "loss": 1.238,
      "step": 45
    },
    {
      "epoch": 0.18484288354898337,
      "grad_norm": 0.44563496112823486,
      "learning_rate": 2.161764705882353e-05,
      "loss": 1.2182,
      "step": 50
    },
    {
      "epoch": 0.2033271719038817,
      "grad_norm": 0.4069574475288391,
      "learning_rate": 2.3823529411764704e-05,
      "loss": 1.2401,
      "step": 55
    },
    {
      "epoch": 0.22181146025878004,
      "grad_norm": 0.4704144597053528,
      "learning_rate": 2.6029411764705883e-05,
      "loss": 1.1739,
      "step": 60
    },
    {
      "epoch": 0.24029574861367836,
      "grad_norm": 0.5614204406738281,
      "learning_rate": 2.823529411764706e-05,
      "loss": 1.2586,
      "step": 65
    },
    {
      "epoch": 0.2587800369685767,
      "grad_norm": 0.44872087240219116,
      "learning_rate": 2.9999955310684845e-05,
      "loss": 1.1245,
      "step": 70
    },
    {
      "epoch": 0.27726432532347506,
      "grad_norm": 0.49408602714538574,
      "learning_rate": 2.999839121261416e-05,
      "loss": 1.2028,
      "step": 75
    },
    {
      "epoch": 0.2957486136783734,
      "grad_norm": 0.5130935907363892,
      "learning_rate": 2.999459291506328e-05,
      "loss": 1.1916,
      "step": 80
    },
    {
      "epoch": 0.3142329020332717,
      "grad_norm": 0.5284329652786255,
      "learning_rate": 2.9988560983836527e-05,
      "loss": 1.152,
      "step": 85
    },
    {
      "epoch": 0.33271719038817005,
      "grad_norm": 0.5931246876716614,
      "learning_rate": 2.99802963174661e-05,
      "loss": 1.1045,
      "step": 90
    },
    {
      "epoch": 0.3512014787430684,
      "grad_norm": 0.5653843283653259,
      "learning_rate": 2.9969800147078265e-05,
      "loss": 1.091,
      "step": 95
    },
    {
      "epoch": 0.36968576709796674,
      "grad_norm": 0.5574877262115479,
      "learning_rate": 2.9957074036209947e-05,
      "loss": 1.0728,
      "step": 100
    },
    {
      "epoch": 0.38817005545286504,
      "grad_norm": 0.5665938258171082,
      "learning_rate": 2.994211988057582e-05,
      "loss": 1.0265,
      "step": 105
    },
    {
      "epoch": 0.4066543438077634,
      "grad_norm": 0.6560489535331726,
      "learning_rate": 2.9924939907785906e-05,
      "loss": 1.066,
      "step": 110
    },
    {
      "epoch": 0.42513863216266173,
      "grad_norm": 0.6012698411941528,
      "learning_rate": 2.9905536677013782e-05,
      "loss": 1.0484,
      "step": 115
    },
    {
      "epoch": 0.4436229205175601,
      "grad_norm": 0.5735741853713989,
      "learning_rate": 2.9883913078615306e-05,
      "loss": 1.0043,
      "step": 120
    },
    {
      "epoch": 0.46210720887245843,
      "grad_norm": 0.5683181881904602,
      "learning_rate": 2.9860072333698115e-05,
      "loss": 1.0437,
      "step": 125
    },
    {
      "epoch": 0.4805914972273567,
      "grad_norm": 0.668404757976532,
      "learning_rate": 2.9834017993641756e-05,
      "loss": 1.0245,
      "step": 130
    },
    {
      "epoch": 0.49907578558225507,
      "grad_norm": 0.5659777522087097,
      "learning_rate": 2.980575393956869e-05,
      "loss": 1.004,
      "step": 135
    },
    {
      "epoch": 0.5175600739371534,
      "grad_norm": 0.7540903091430664,
      "learning_rate": 2.977528438176615e-05,
      "loss": 0.9866,
      "step": 140
    },
    {
      "epoch": 0.5360443622920518,
      "grad_norm": 0.7011395692825317,
      "learning_rate": 2.974261385905894e-05,
      "loss": 0.9349,
      "step": 145
    },
    {
      "epoch": 0.5545286506469501,
      "grad_norm": 0.7044267654418945,
      "learning_rate": 2.9707747238133358e-05,
      "loss": 0.956,
      "step": 150
    },
    {
      "epoch": 0.5730129390018485,
      "grad_norm": 0.7586350440979004,
      "learning_rate": 2.9670689712812195e-05,
      "loss": 0.9521,
      "step": 155
    },
    {
      "epoch": 0.5914972273567468,
      "grad_norm": 0.8364260792732239,
      "learning_rate": 2.963144680328111e-05,
      "loss": 0.9237,
      "step": 160
    },
    {
      "epoch": 0.609981515711645,
      "grad_norm": 0.7463046908378601,
      "learning_rate": 2.959002435526626e-05,
      "loss": 0.9384,
      "step": 165
    },
    {
      "epoch": 0.6284658040665434,
      "grad_norm": 0.7540715336799622,
      "learning_rate": 2.9546428539163568e-05,
      "loss": 0.9072,
      "step": 170
    },
    {
      "epoch": 0.6469500924214417,
      "grad_norm": 0.8799229860305786,
      "learning_rate": 2.9500665849119523e-05,
      "loss": 0.9326,
      "step": 175
    },
    {
      "epoch": 0.6654343807763401,
      "grad_norm": 0.8631930351257324,
      "learning_rate": 2.945274310206382e-05,
      "loss": 0.8823,
      "step": 180
    },
    {
      "epoch": 0.6839186691312384,
      "grad_norm": 0.7702048420906067,
      "learning_rate": 2.9402667436693852e-05,
      "loss": 0.8832,
      "step": 185
    },
    {
      "epoch": 0.7024029574861368,
      "grad_norm": 0.9274978637695312,
      "learning_rate": 2.935044631241138e-05,
      "loss": 0.8156,
      "step": 190
    },
    {
      "epoch": 0.7208872458410351,
      "grad_norm": 0.9201264381408691,
      "learning_rate": 2.929608750821129e-05,
      "loss": 0.8671,
      "step": 195
    },
    {
      "epoch": 0.7393715341959335,
      "grad_norm": 0.9688053131103516,
      "learning_rate": 2.923959912152287e-05,
      "loss": 0.9017,
      "step": 200
    },
    {
      "epoch": 0.7578558225508318,
      "grad_norm": 0.977288544178009,
      "learning_rate": 2.9180989567003547e-05,
      "loss": 0.8301,
      "step": 205
    },
    {
      "epoch": 0.7763401109057301,
      "grad_norm": 0.966156542301178,
      "learning_rate": 2.9120267575285458e-05,
      "loss": 0.8573,
      "step": 210
    },
    {
      "epoch": 0.7948243992606284,
      "grad_norm": 0.9321315884590149,
      "learning_rate": 2.905744219167489e-05,
      "loss": 0.8063,
      "step": 215
    },
    {
      "epoch": 0.8133086876155268,
      "grad_norm": 0.9106122255325317,
      "learning_rate": 2.899252277480487e-05,
      "loss": 0.8396,
      "step": 220
    },
    {
      "epoch": 0.8317929759704251,
      "grad_norm": 1.001532793045044,
      "learning_rate": 2.892551899524109e-05,
      "loss": 0.8269,
      "step": 225
    },
    {
      "epoch": 0.8502772643253235,
      "grad_norm": 0.9747412800788879,
      "learning_rate": 2.885644083404134e-05,
      "loss": 0.8071,
      "step": 230
    },
    {
      "epoch": 0.8687615526802218,
      "grad_norm": 0.8949458599090576,
      "learning_rate": 2.8785298581268704e-05,
      "loss": 0.8024,
      "step": 235
    },
    {
      "epoch": 0.8872458410351202,
      "grad_norm": 1.0015259981155396,
      "learning_rate": 2.871210283445875e-05,
      "loss": 0.8123,
      "step": 240
    },
    {
      "epoch": 0.9057301293900185,
      "grad_norm": 1.0993518829345703,
      "learning_rate": 2.8636864497040856e-05,
      "loss": 0.7778,
      "step": 245
    },
    {
      "epoch": 0.9242144177449169,
      "grad_norm": 1.0659856796264648,
      "learning_rate": 2.8559594776714034e-05,
      "loss": 0.7372,
      "step": 250
    },
    {
      "epoch": 0.9426987060998152,
      "grad_norm": 1.0356419086456299,
      "learning_rate": 2.848030518377739e-05,
      "loss": 0.77,
      "step": 255
    },
    {
      "epoch": 0.9611829944547134,
      "grad_norm": 1.22687828540802,
      "learning_rate": 2.8399007529415527e-05,
      "loss": 0.7909,
      "step": 260
    },
    {
      "epoch": 0.9796672828096118,
      "grad_norm": 0.8971600532531738,
      "learning_rate": 2.8315713923939113e-05,
      "loss": 0.663,
      "step": 265
    },
    {
      "epoch": 0.9981515711645101,
      "grad_norm": 1.0294069051742554,
      "learning_rate": 2.82304367749809e-05,
      "loss": 0.7002,
      "step": 270
    }
  ],
  "logging_steps": 5,
  "max_steps": 1355,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.798461404025979e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}