{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 303,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01652892561983471,
      "grad_norm": 1.4201081991195679,
      "learning_rate": 1.5789473684210526e-06,
      "loss": 1.3341,
      "step": 5
    },
    {
      "epoch": 0.03305785123966942,
      "grad_norm": 0.8825254440307617,
      "learning_rate": 3.5526315789473683e-06,
      "loss": 1.3559,
      "step": 10
    },
    {
      "epoch": 0.049586776859504134,
      "grad_norm": 0.6610410213470459,
      "learning_rate": 5.526315789473684e-06,
      "loss": 1.3433,
      "step": 15
    },
    {
      "epoch": 0.06611570247933884,
      "grad_norm": 0.6865679025650024,
      "learning_rate": 7.5e-06,
      "loss": 1.2905,
      "step": 20
    },
    {
      "epoch": 0.08264462809917356,
      "grad_norm": 0.5552710294723511,
      "learning_rate": 9.473684210526315e-06,
      "loss": 1.3332,
      "step": 25
    },
    {
      "epoch": 0.09917355371900827,
      "grad_norm": 0.5776343941688538,
      "learning_rate": 1.1447368421052632e-05,
      "loss": 1.2557,
      "step": 30
    },
    {
      "epoch": 0.11570247933884298,
      "grad_norm": 0.4994240701198578,
      "learning_rate": 1.3421052631578948e-05,
      "loss": 1.2805,
      "step": 35
    },
    {
      "epoch": 0.1322314049586777,
      "grad_norm": 0.5451419353485107,
      "learning_rate": 1.5394736842105264e-05,
      "loss": 1.2875,
      "step": 40
    },
    {
      "epoch": 0.1487603305785124,
      "grad_norm": 0.5521426200866699,
      "learning_rate": 1.736842105263158e-05,
      "loss": 1.2729,
      "step": 45
    },
    {
      "epoch": 0.1652892561983471,
      "grad_norm": 0.448354572057724,
      "learning_rate": 1.9342105263157896e-05,
      "loss": 1.2043,
      "step": 50
    },
    {
      "epoch": 0.18181818181818182,
      "grad_norm": 0.5871981978416443,
      "learning_rate": 2.1315789473684212e-05,
      "loss": 1.1982,
      "step": 55
    },
    {
      "epoch": 0.19834710743801653,
      "grad_norm": 0.5270593762397766,
      "learning_rate": 2.3289473684210525e-05,
      "loss": 1.2128,
      "step": 60
    },
    {
      "epoch": 0.21487603305785125,
      "grad_norm": 0.513990044593811,
      "learning_rate": 2.526315789473684e-05,
      "loss": 1.1809,
      "step": 65
    },
    {
      "epoch": 0.23140495867768596,
      "grad_norm": 0.5345117449760437,
      "learning_rate": 2.723684210526316e-05,
      "loss": 1.2193,
      "step": 70
    },
    {
      "epoch": 0.24793388429752067,
      "grad_norm": 0.4606321454048157,
      "learning_rate": 2.9210526315789474e-05,
      "loss": 1.1448,
      "step": 75
    },
    {
      "epoch": 0.2644628099173554,
      "grad_norm": 0.46585503220558167,
      "learning_rate": 2.9999678278282968e-05,
      "loss": 1.1272,
      "step": 80
    },
    {
      "epoch": 0.2809917355371901,
      "grad_norm": 0.5272030234336853,
      "learning_rate": 2.9997712251100747e-05,
      "loss": 1.1208,
      "step": 85
    },
    {
      "epoch": 0.2975206611570248,
      "grad_norm": 0.5860956311225891,
      "learning_rate": 2.9993959165002407e-05,
      "loss": 1.0957,
      "step": 90
    },
    {
      "epoch": 0.3140495867768595,
      "grad_norm": 0.7299471497535706,
      "learning_rate": 2.998841946718855e-05,
      "loss": 1.1043,
      "step": 95
    },
    {
      "epoch": 0.3305785123966942,
      "grad_norm": 0.5312178134918213,
      "learning_rate": 2.9981093817744277e-05,
      "loss": 1.1348,
      "step": 100
    },
    {
      "epoch": 0.34710743801652894,
      "grad_norm": 0.7180105447769165,
      "learning_rate": 2.997198308956052e-05,
      "loss": 1.0544,
      "step": 105
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 0.6310957670211792,
      "learning_rate": 2.9961088368230065e-05,
      "loss": 1.0619,
      "step": 110
    },
    {
      "epoch": 0.38016528925619836,
      "grad_norm": 0.586087703704834,
      "learning_rate": 2.9948410951918154e-05,
      "loss": 1.0515,
      "step": 115
    },
    {
      "epoch": 0.39669421487603307,
      "grad_norm": 0.6914042830467224,
      "learning_rate": 2.993395235120784e-05,
      "loss": 1.0209,
      "step": 120
    },
    {
      "epoch": 0.4132231404958678,
      "grad_norm": 0.6734358668327332,
      "learning_rate": 2.991771428891996e-05,
      "loss": 0.9988,
      "step": 125
    },
    {
      "epoch": 0.4297520661157025,
      "grad_norm": 0.68830806016922,
      "learning_rate": 2.989969869990789e-05,
      "loss": 1.0326,
      "step": 130
    },
    {
      "epoch": 0.4462809917355372,
      "grad_norm": 0.7114909291267395,
      "learning_rate": 2.9879907730826946e-05,
      "loss": 1.0411,
      "step": 135
    },
    {
      "epoch": 0.4628099173553719,
      "grad_norm": 0.6947762370109558,
      "learning_rate": 2.9858343739878657e-05,
      "loss": 0.945,
      "step": 140
    },
    {
      "epoch": 0.4793388429752066,
      "grad_norm": 0.7105550765991211,
      "learning_rate": 2.9835009296529717e-05,
      "loss": 0.9779,
      "step": 145
    },
    {
      "epoch": 0.49586776859504134,
      "grad_norm": 0.9549520015716553,
      "learning_rate": 2.980990718120587e-05,
      "loss": 0.9691,
      "step": 150
    },
    {
      "epoch": 0.512396694214876,
      "grad_norm": 0.7530831694602966,
      "learning_rate": 2.978304038496056e-05,
      "loss": 0.961,
      "step": 155
    },
    {
      "epoch": 0.5289256198347108,
      "grad_norm": 0.7150148153305054,
      "learning_rate": 2.975441210911856e-05,
      "loss": 0.9344,
      "step": 160
    },
    {
      "epoch": 0.5454545454545454,
      "grad_norm": 0.6854411959648132,
      "learning_rate": 2.9724025764894513e-05,
      "loss": 0.9801,
      "step": 165
    },
    {
      "epoch": 0.5619834710743802,
      "grad_norm": 0.8350316286087036,
      "learning_rate": 2.9691884972986458e-05,
      "loss": 0.898,
      "step": 170
    },
    {
      "epoch": 0.5785123966942148,
      "grad_norm": 0.8690942525863647,
      "learning_rate": 2.965799356314441e-05,
      "loss": 0.911,
      "step": 175
    },
    {
      "epoch": 0.5950413223140496,
      "grad_norm": 0.796727180480957,
      "learning_rate": 2.9622355573714036e-05,
      "loss": 0.8893,
      "step": 180
    },
    {
      "epoch": 0.6115702479338843,
      "grad_norm": 0.8369293212890625,
      "learning_rate": 2.9584975251155434e-05,
      "loss": 0.8583,
      "step": 185
    },
    {
      "epoch": 0.628099173553719,
      "grad_norm": 0.9035501480102539,
      "learning_rate": 2.954585704953717e-05,
      "loss": 0.9182,
      "step": 190
    },
    {
      "epoch": 0.6446280991735537,
      "grad_norm": 0.8341655135154724,
      "learning_rate": 2.950500563000555e-05,
      "loss": 0.8838,
      "step": 195
    },
    {
      "epoch": 0.6611570247933884,
      "grad_norm": 0.8167555332183838,
      "learning_rate": 2.946242586022921e-05,
      "loss": 0.8592,
      "step": 200
    },
    {
      "epoch": 0.6776859504132231,
      "grad_norm": 0.958501398563385,
      "learning_rate": 2.94181228138191e-05,
      "loss": 0.8798,
      "step": 205
    },
    {
      "epoch": 0.6942148760330579,
      "grad_norm": 0.8715956807136536,
      "learning_rate": 2.9372101769723958e-05,
      "loss": 0.8307,
      "step": 210
    },
    {
      "epoch": 0.7107438016528925,
      "grad_norm": 0.8931337594985962,
      "learning_rate": 2.9324368211601284e-05,
      "loss": 0.8488,
      "step": 215
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 0.9372528791427612,
      "learning_rate": 2.9274927827163913e-05,
      "loss": 0.8181,
      "step": 220
    },
    {
      "epoch": 0.743801652892562,
      "grad_norm": 0.9352923631668091,
      "learning_rate": 2.9223786507502333e-05,
      "loss": 0.7852,
      "step": 225
    },
    {
      "epoch": 0.7603305785123967,
      "grad_norm": 1.0228369235992432,
      "learning_rate": 2.917095034638269e-05,
      "loss": 0.7731,
      "step": 230
    },
    {
      "epoch": 0.7768595041322314,
      "grad_norm": 1.1870708465576172,
      "learning_rate": 2.9116425639520713e-05,
      "loss": 0.8079,
      "step": 235
    },
    {
      "epoch": 0.7933884297520661,
      "grad_norm": 0.933369517326355,
      "learning_rate": 2.9060218883831517e-05,
      "loss": 0.7646,
      "step": 240
    },
    {
      "epoch": 0.8099173553719008,
      "grad_norm": 1.1031955480575562,
      "learning_rate": 2.9002336776655494e-05,
      "loss": 0.7794,
      "step": 245
    },
    {
      "epoch": 0.8264462809917356,
      "grad_norm": 0.9256858229637146,
      "learning_rate": 2.894278621496025e-05,
      "loss": 0.7541,
      "step": 250
    },
    {
      "epoch": 0.8429752066115702,
      "grad_norm": 1.17303466796875,
      "learning_rate": 2.888157429451883e-05,
      "loss": 0.7456,
      "step": 255
    },
    {
      "epoch": 0.859504132231405,
      "grad_norm": 1.0275311470031738,
      "learning_rate": 2.88187083090642e-05,
      "loss": 0.7497,
      "step": 260
    },
    {
      "epoch": 0.8760330578512396,
      "grad_norm": 0.9848041534423828,
      "learning_rate": 2.8754195749420145e-05,
      "loss": 0.7138,
      "step": 265
    },
    {
      "epoch": 0.8925619834710744,
      "grad_norm": 1.0821123123168945,
      "learning_rate": 2.8688044302608735e-05,
      "loss": 0.7326,
      "step": 270
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 1.0524564981460571,
      "learning_rate": 2.8620261850934338e-05,
      "loss": 0.6945,
      "step": 275
    },
    {
      "epoch": 0.9256198347107438,
      "grad_norm": 1.1582773923873901,
      "learning_rate": 2.8550856471044415e-05,
      "loss": 0.7245,
      "step": 280
    },
    {
      "epoch": 0.9421487603305785,
      "grad_norm": 0.9988569617271423,
      "learning_rate": 2.847983643296715e-05,
      "loss": 0.7613,
      "step": 285
    },
    {
      "epoch": 0.9586776859504132,
      "grad_norm": 1.1030054092407227,
      "learning_rate": 2.840721019912602e-05,
      "loss": 0.6793,
      "step": 290
    },
    {
      "epoch": 0.9752066115702479,
      "grad_norm": 1.0013433694839478,
      "learning_rate": 2.833298642333146e-05,
      "loss": 0.7052,
      "step": 295
    },
    {
      "epoch": 0.9917355371900827,
      "grad_norm": 1.1024919748306274,
      "learning_rate": 2.8257173949749703e-05,
      "loss": 0.7322,
      "step": 300
    }
  ],
  "logging_steps": 5,
  "max_steps": 1515,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.2486090752537395e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}