{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.688172043010753, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.053763440860215055, "grad_norm": 3.0181329250335693, "learning_rate": 3.6e-05, "loss": 0.6001, "step": 10 }, { "epoch": 0.10752688172043011, "grad_norm": 1.7442196607589722, "learning_rate": 7.6e-05, "loss": 0.2891, "step": 20 }, { "epoch": 0.16129032258064516, "grad_norm": 1.5924317836761475, "learning_rate": 9.998250366089848e-05, "loss": 0.2614, "step": 30 }, { "epoch": 0.21505376344086022, "grad_norm": 1.3713581562042236, "learning_rate": 9.97858104436822e-05, "loss": 0.2364, "step": 40 }, { "epoch": 0.26881720430107525, "grad_norm": 1.3055821657180786, "learning_rate": 9.937141654477528e-05, "loss": 0.1779, "step": 50 }, { "epoch": 0.3225806451612903, "grad_norm": 0.9266240000724792, "learning_rate": 9.87411340032603e-05, "loss": 0.1571, "step": 60 }, { "epoch": 0.3763440860215054, "grad_norm": 1.0376759767532349, "learning_rate": 9.789771888432375e-05, "loss": 0.137, "step": 70 }, { "epoch": 0.43010752688172044, "grad_norm": 0.7397834062576294, "learning_rate": 9.684485922768422e-05, "loss": 0.1182, "step": 80 }, { "epoch": 0.4838709677419355, "grad_norm": 0.5904547572135925, "learning_rate": 9.558715892073323e-05, "loss": 0.1046, "step": 90 }, { "epoch": 0.5376344086021505, "grad_norm": 1.022401213645935, "learning_rate": 9.413011756690685e-05, "loss": 0.1056, "step": 100 }, { "epoch": 0.5913978494623656, "grad_norm": 0.7747468948364258, "learning_rate": 9.248010643731935e-05, "loss": 0.1006, "step": 110 }, { "epoch": 0.6451612903225806, "grad_norm": 0.8032405972480774, "learning_rate": 9.064434061081562e-05, "loss": 0.0902, "step": 120 }, { "epoch": 0.6989247311827957, "grad_norm": 0.6915440559387207, "learning_rate": 8.863084742426719e-05, "loss": 0.0861, "step": 130 }, { "epoch": 0.7526881720430108, "grad_norm": 0.7343285083770752, "learning_rate": 8.644843137107059e-05, "loss": 0.0873, "step": 140 }, { "epoch": 0.8064516129032258, "grad_norm": 0.6028986573219299, "learning_rate": 8.410663560133784e-05, "loss": 0.0831, "step": 150 }, { "epoch": 0.8602150537634409, "grad_norm": 0.5441303253173828, "learning_rate": 8.161570019212921e-05, "loss": 0.0772, "step": 160 }, { "epoch": 0.9139784946236559, "grad_norm": 0.5184003114700317, "learning_rate": 7.898651737020166e-05, "loss": 0.0739, "step": 170 }, { "epoch": 0.967741935483871, "grad_norm": 0.38568446040153503, "learning_rate": 7.623058388307269e-05, "loss": 0.0729, "step": 180 }, { "epoch": 1.021505376344086, "grad_norm": 0.5226582288742065, "learning_rate": 7.335995072666848e-05, "loss": 0.0733, "step": 190 }, { "epoch": 1.075268817204301, "grad_norm": 0.6022420525550842, "learning_rate": 7.038717044938519e-05, "loss": 0.0773, "step": 200 }, { "epoch": 1.129032258064516, "grad_norm": 0.6212151646614075, "learning_rate": 6.732524226298841e-05, "loss": 0.0684, "step": 210 }, { "epoch": 1.1827956989247312, "grad_norm": 0.5427640080451965, "learning_rate": 6.418755520036775e-05, "loss": 0.0678, "step": 220 }, { "epoch": 1.2365591397849462, "grad_norm": 0.6625052690505981, "learning_rate": 6.0987829568702656e-05, "loss": 0.073, "step": 230 }, { "epoch": 1.2903225806451613, "grad_norm": 0.37876975536346436, "learning_rate": 5.7740056954050084e-05, "loss": 0.0684, "step": 240 }, { "epoch": 1.3440860215053765, "grad_norm": 0.3789575397968292, "learning_rate": 5.445843903969854e-05, "loss": 0.0573, "step": 250 }, { "epoch": 1.3978494623655915, "grad_norm": 0.380149245262146, "learning_rate": 5.1157325505820694e-05, "loss": 0.0618, "step": 260 }, { "epoch": 1.4516129032258065, "grad_norm": 0.4945651888847351, "learning_rate": 4.785115128197298e-05, "loss": 0.0632, "step": 270 }, { "epoch": 1.5053763440860215, "grad_norm": 0.47128596901893616, "learning_rate": 4.4554373426821374e-05, "loss": 0.0619, "step": 280 }, { "epoch": 1.5591397849462365, "grad_norm": 0.3431663513183594, "learning_rate": 4.1281407911102425e-05, "loss": 0.0605, "step": 290 }, { "epoch": 1.6129032258064515, "grad_norm": 0.3071378171443939, "learning_rate": 3.8046566580251e-05, "loss": 0.0539, "step": 300 }, { "epoch": 1.6666666666666665, "grad_norm": 0.25178495049476624, "learning_rate": 3.4863994572341843e-05, "loss": 0.0547, "step": 310 }, { "epoch": 1.7204301075268817, "grad_norm": 0.3284253478050232, "learning_rate": 3.1747608464999725e-05, "loss": 0.0558, "step": 320 }, { "epoch": 1.7741935483870968, "grad_norm": 0.40466904640197754, "learning_rate": 2.8711035421746367e-05, "loss": 0.0519, "step": 330 }, { "epoch": 1.827956989247312, "grad_norm": 0.3861023485660553, "learning_rate": 2.5767553603881767e-05, "loss": 0.0534, "step": 340 }, { "epoch": 1.881720430107527, "grad_norm": 0.4377838671207428, "learning_rate": 2.29300341084631e-05, "loss": 0.0507, "step": 350 }, { "epoch": 1.935483870967742, "grad_norm": 0.3389015197753906, "learning_rate": 2.0210884686272368e-05, "loss": 0.0533, "step": 360 }, { "epoch": 1.989247311827957, "grad_norm": 0.2521311938762665, "learning_rate": 1.7621995485879062e-05, "loss": 0.0536, "step": 370 }, { "epoch": 2.043010752688172, "grad_norm": 0.30193665623664856, "learning_rate": 1.517468706104589e-05, "loss": 0.0562, "step": 380 }, { "epoch": 2.096774193548387, "grad_norm": 0.3276124596595764, "learning_rate": 1.2879660868827508e-05, "loss": 0.0497, "step": 390 }, { "epoch": 2.150537634408602, "grad_norm": 0.2738921642303467, "learning_rate": 1.0746952474821614e-05, "loss": 0.0505, "step": 400 }, { "epoch": 2.204301075268817, "grad_norm": 0.23493026196956635, "learning_rate": 8.785887670194138e-06, "loss": 0.05, "step": 410 }, { "epoch": 2.258064516129032, "grad_norm": 0.27332600951194763, "learning_rate": 7.005041692367154e-06, "loss": 0.0478, "step": 420 }, { "epoch": 2.3118279569892475, "grad_norm": 0.23733267188072205, "learning_rate": 5.412201727687644e-06, "loss": 0.0485, "step": 430 }, { "epoch": 2.3655913978494625, "grad_norm": 0.3148542046546936, "learning_rate": 4.01433286004283e-06, "loss": 0.0486, "step": 440 }, { "epoch": 2.4193548387096775, "grad_norm": 0.17621102929115295, "learning_rate": 2.817547614320615e-06, "loss": 0.0459, "step": 450 }, { "epoch": 2.4731182795698925, "grad_norm": 0.3627742528915405, "learning_rate": 1.8270792278934302e-06, "loss": 0.0447, "step": 460 }, { "epoch": 2.5268817204301075, "grad_norm": 0.2204674929380417, "learning_rate": 1.0472587670027678e-06, "loss": 0.0432, "step": 470 }, { "epoch": 2.5806451612903225, "grad_norm": 0.2851138114929199, "learning_rate": 4.814961881085045e-07, "loss": 0.049, "step": 480 }, { "epoch": 2.6344086021505375, "grad_norm": 0.22748719155788422, "learning_rate": 1.3226542701689215e-07, "loss": 0.0452, "step": 490 }, { "epoch": 2.688172043010753, "grad_norm": 0.29575690627098083, "learning_rate": 1.0935809887702154e-09, "loss": 0.0484, "step": 500 }, { "epoch": 2.688172043010753, "step": 500, "total_flos": 0.0, "train_loss": 0.09292892861366273, "train_runtime": 1122.3185, "train_samples_per_second": 42.769, "train_steps_per_second": 0.446 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 96, "trial_name": null, "trial_params": null }