{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 346,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01447178002894356,
      "grad_norm": 1.2414709329605103,
      "learning_rate": 1.3793103448275862e-06,
      "loss": 1.2575,
      "step": 5
    },
    {
      "epoch": 0.02894356005788712,
      "grad_norm": 0.995053231716156,
      "learning_rate": 3.103448275862069e-06,
      "loss": 1.2964,
      "step": 10
    },
    {
      "epoch": 0.04341534008683068,
      "grad_norm": 0.6870602369308472,
      "learning_rate": 4.827586206896552e-06,
      "loss": 1.2136,
      "step": 15
    },
    {
      "epoch": 0.05788712011577424,
      "grad_norm": 0.7477986216545105,
      "learning_rate": 6.551724137931035e-06,
      "loss": 1.208,
      "step": 20
    },
    {
      "epoch": 0.0723589001447178,
      "grad_norm": 0.503740131855011,
      "learning_rate": 8.275862068965517e-06,
      "loss": 1.262,
      "step": 25
    },
    {
      "epoch": 0.08683068017366136,
      "grad_norm": 0.5463724732398987,
      "learning_rate": 9.999999999999999e-06,
      "loss": 1.1591,
      "step": 30
    },
    {
      "epoch": 0.10130246020260492,
      "grad_norm": 0.5826733112335205,
      "learning_rate": 1.1724137931034483e-05,
      "loss": 1.1859,
      "step": 35
    },
    {
      "epoch": 0.11577424023154848,
      "grad_norm": 0.478679895401001,
      "learning_rate": 1.3448275862068966e-05,
      "loss": 1.1185,
      "step": 40
    },
    {
      "epoch": 0.13024602026049203,
      "grad_norm": 0.45855218172073364,
      "learning_rate": 1.517241379310345e-05,
      "loss": 1.0916,
      "step": 45
    },
    {
      "epoch": 0.1447178002894356,
      "grad_norm": 0.4334068298339844,
      "learning_rate": 1.6896551724137932e-05,
      "loss": 1.1103,
      "step": 50
    },
    {
      "epoch": 0.15918958031837915,
      "grad_norm": 0.49706199765205383,
      "learning_rate": 1.8620689655172415e-05,
      "loss": 1.1351,
      "step": 55
    },
    {
      "epoch": 0.1736613603473227,
      "grad_norm": 0.5013290643692017,
      "learning_rate": 2.0344827586206894e-05,
      "loss": 1.1267,
      "step": 60
    },
    {
      "epoch": 0.18813314037626627,
      "grad_norm": 0.4919586181640625,
      "learning_rate": 2.206896551724138e-05,
      "loss": 1.0824,
      "step": 65
    },
    {
      "epoch": 0.20260492040520983,
      "grad_norm": 0.4955993890762329,
      "learning_rate": 2.3793103448275862e-05,
      "loss": 1.088,
      "step": 70
    },
    {
      "epoch": 0.2170767004341534,
      "grad_norm": 0.47736915946006775,
      "learning_rate": 2.5517241379310345e-05,
      "loss": 1.097,
      "step": 75
    },
    {
      "epoch": 0.23154848046309695,
      "grad_norm": 0.4829305112361908,
      "learning_rate": 2.7241379310344827e-05,
      "loss": 1.0582,
      "step": 80
    },
    {
      "epoch": 0.2460202604920405,
      "grad_norm": 0.5232416391372681,
      "learning_rate": 2.8965517241379313e-05,
      "loss": 1.0447,
      "step": 85
    },
    {
      "epoch": 0.26049204052098407,
      "grad_norm": 0.5268356800079346,
      "learning_rate": 2.999989031547876e-05,
      "loss": 1.0425,
      "step": 90
    },
    {
      "epoch": 0.27496382054992763,
      "grad_norm": 0.6044050455093384,
      "learning_rate": 2.9998656383036702e-05,
      "loss": 0.9967,
      "step": 95
    },
    {
      "epoch": 0.2894356005788712,
      "grad_norm": 0.5722519755363464,
      "learning_rate": 2.9996051525662343e-05,
      "loss": 0.9925,
      "step": 100
    },
    {
      "epoch": 0.30390738060781475,
      "grad_norm": 0.5845656991004944,
      "learning_rate": 2.9992075981447947e-05,
      "loss": 0.9589,
      "step": 105
    },
    {
      "epoch": 0.3183791606367583,
      "grad_norm": 0.6687687635421753,
      "learning_rate": 2.9986730113770898e-05,
      "loss": 0.9607,
      "step": 110
    },
    {
      "epoch": 0.33285094066570187,
      "grad_norm": 0.5653572678565979,
      "learning_rate": 2.9980014411260523e-05,
      "loss": 0.9765,
      "step": 115
    },
    {
      "epoch": 0.3473227206946454,
      "grad_norm": 0.6110051274299622,
      "learning_rate": 2.9971929487753402e-05,
      "loss": 0.9347,
      "step": 120
    },
    {
      "epoch": 0.361794500723589,
      "grad_norm": 0.6042832136154175,
      "learning_rate": 2.9962476082237285e-05,
      "loss": 0.8882,
      "step": 125
    },
    {
      "epoch": 0.37626628075253254,
      "grad_norm": 0.6558710932731628,
      "learning_rate": 2.9951655058783517e-05,
      "loss": 0.9611,
      "step": 130
    },
    {
      "epoch": 0.3907380607814761,
      "grad_norm": 0.6648174524307251,
      "learning_rate": 2.993946740646809e-05,
      "loss": 0.9514,
      "step": 135
    },
    {
      "epoch": 0.40520984081041966,
      "grad_norm": 0.6010047793388367,
      "learning_rate": 2.992591423928121e-05,
      "loss": 0.8977,
      "step": 140
    },
    {
      "epoch": 0.4196816208393632,
      "grad_norm": 0.594023585319519,
      "learning_rate": 2.99109967960255e-05,
      "loss": 0.9169,
      "step": 145
    },
    {
      "epoch": 0.4341534008683068,
      "grad_norm": 0.6303651928901672,
      "learning_rate": 2.9894716440202756e-05,
      "loss": 0.8779,
      "step": 150
    },
    {
      "epoch": 0.44862518089725034,
      "grad_norm": 0.6532771587371826,
      "learning_rate": 2.9877074659889316e-05,
      "loss": 0.8661,
      "step": 155
    },
    {
      "epoch": 0.4630969609261939,
      "grad_norm": 0.7434287071228027,
      "learning_rate": 2.9858073067600054e-05,
      "loss": 0.8627,
      "step": 160
    },
    {
      "epoch": 0.47756874095513746,
      "grad_norm": 0.7219240069389343,
      "learning_rate": 2.983771340014098e-05,
      "loss": 0.8855,
      "step": 165
    },
    {
      "epoch": 0.492040520984081,
      "grad_norm": 0.7225666046142578,
      "learning_rate": 2.981599751845051e-05,
      "loss": 0.8531,
      "step": 170
    },
    {
      "epoch": 0.5065123010130246,
      "grad_norm": 0.7195663452148438,
      "learning_rate": 2.9792927407429344e-05,
      "loss": 0.8174,
      "step": 175
    },
    {
      "epoch": 0.5209840810419681,
      "grad_norm": 0.7346758842468262,
      "learning_rate": 2.976850517575906e-05,
      "loss": 0.8491,
      "step": 180
    },
    {
      "epoch": 0.5354558610709117,
      "grad_norm": 0.7614021897315979,
      "learning_rate": 2.9742733055709366e-05,
      "loss": 0.8102,
      "step": 185
    },
    {
      "epoch": 0.5499276410998553,
      "grad_norm": 0.7299443483352661,
      "learning_rate": 2.9715613402934064e-05,
      "loss": 0.7749,
      "step": 190
    },
    {
      "epoch": 0.5643994211287988,
      "grad_norm": 0.7258509993553162,
      "learning_rate": 2.9687148696255737e-05,
      "loss": 0.8408,
      "step": 195
    },
    {
      "epoch": 0.5788712011577424,
      "grad_norm": 0.8355885148048401,
      "learning_rate": 2.9657341537439176e-05,
      "loss": 0.7793,
      "step": 200
    },
    {
      "epoch": 0.5933429811866859,
      "grad_norm": 0.7875196933746338,
      "learning_rate": 2.9626194650953563e-05,
      "loss": 0.8168,
      "step": 205
    },
    {
      "epoch": 0.6078147612156295,
      "grad_norm": 1.2102893590927124,
      "learning_rate": 2.9593710883723465e-05,
      "loss": 0.7554,
      "step": 210
    },
    {
      "epoch": 0.622286541244573,
      "grad_norm": 0.9075624346733093,
      "learning_rate": 2.95598932048686e-05,
      "loss": 0.7782,
      "step": 215
    },
    {
      "epoch": 0.6367583212735166,
      "grad_norm": 0.9814792275428772,
      "learning_rate": 2.9524744705432446e-05,
      "loss": 0.7679,
      "step": 220
    },
    {
      "epoch": 0.6512301013024602,
      "grad_norm": 0.8673927187919617,
      "learning_rate": 2.9488268598099735e-05,
      "loss": 0.7131,
      "step": 225
    },
    {
      "epoch": 0.6657018813314037,
      "grad_norm": 1.2335319519042969,
      "learning_rate": 2.945046821690277e-05,
      "loss": 0.7368,
      "step": 230
    },
    {
      "epoch": 0.6801736613603473,
      "grad_norm": 1.0025559663772583,
      "learning_rate": 2.9411347016916704e-05,
      "loss": 0.7447,
      "step": 235
    },
    {
      "epoch": 0.6946454413892909,
      "grad_norm": 0.8002074360847473,
      "learning_rate": 2.9370908573943737e-05,
      "loss": 0.7396,
      "step": 240
    },
    {
      "epoch": 0.7091172214182344,
      "grad_norm": 0.7924606800079346,
      "learning_rate": 2.9329156584186267e-05,
      "loss": 0.7821,
      "step": 245
    },
    {
      "epoch": 0.723589001447178,
      "grad_norm": 0.8294292092323303,
      "learning_rate": 2.9286094863909053e-05,
      "loss": 0.7546,
      "step": 250
    },
    {
      "epoch": 0.7380607814761215,
      "grad_norm": 0.8875476121902466,
      "learning_rate": 2.924172734909038e-05,
      "loss": 0.7541,
      "step": 255
    },
    {
      "epoch": 0.7525325615050651,
      "grad_norm": 1.0516374111175537,
      "learning_rate": 2.919605809506233e-05,
      "loss": 0.7339,
      "step": 260
    },
    {
      "epoch": 0.7670043415340086,
      "grad_norm": 0.9758723378181458,
      "learning_rate": 2.9149091276140066e-05,
      "loss": 0.7286,
      "step": 265
    },
    {
      "epoch": 0.7814761215629522,
      "grad_norm": 0.8721650838851929,
      "learning_rate": 2.910083118524034e-05,
      "loss": 0.7224,
      "step": 270
    },
    {
      "epoch": 0.7959479015918958,
      "grad_norm": 0.8282304406166077,
      "learning_rate": 2.9051282233489065e-05,
      "loss": 0.6857,
      "step": 275
    },
    {
      "epoch": 0.8104196816208393,
      "grad_norm": 0.9779431223869324,
      "learning_rate": 2.900044894981813e-05,
      "loss": 0.6938,
      "step": 280
    },
    {
      "epoch": 0.8248914616497829,
      "grad_norm": 0.842149019241333,
      "learning_rate": 2.894833598055147e-05,
      "loss": 0.6996,
      "step": 285
    },
    {
      "epoch": 0.8393632416787264,
      "grad_norm": 0.8657100200653076,
      "learning_rate": 2.8894948088980338e-05,
      "loss": 0.6343,
      "step": 290
    },
    {
      "epoch": 0.85383502170767,
      "grad_norm": 0.8805454969406128,
      "learning_rate": 2.8840290154927955e-05,
      "loss": 0.6506,
      "step": 295
    },
    {
      "epoch": 0.8683068017366136,
      "grad_norm": 0.8671079874038696,
      "learning_rate": 2.878436717430346e-05,
      "loss": 0.7051,
      "step": 300
    },
    {
      "epoch": 0.8827785817655571,
      "grad_norm": 0.9610555768013,
      "learning_rate": 2.8727184258645276e-05,
      "loss": 0.6888,
      "step": 305
    },
    {
      "epoch": 0.8972503617945007,
      "grad_norm": 0.9625582695007324,
      "learning_rate": 2.8668746634653908e-05,
      "loss": 0.6669,
      "step": 310
    },
    {
      "epoch": 0.9117221418234442,
      "grad_norm": 0.8685387372970581,
      "learning_rate": 2.860905964371418e-05,
      "loss": 0.645,
      "step": 315
    },
    {
      "epoch": 0.9261939218523878,
      "grad_norm": 1.1596406698226929,
      "learning_rate": 2.8548128741407043e-05,
      "loss": 0.6432,
      "step": 320
    },
    {
      "epoch": 0.9406657018813314,
      "grad_norm": 0.9292768239974976,
      "learning_rate": 2.8485959497010906e-05,
      "loss": 0.6599,
      "step": 325
    },
    {
      "epoch": 0.9551374819102749,
      "grad_norm": 1.0150357484817505,
      "learning_rate": 2.8422557592992584e-05,
      "loss": 0.5829,
      "step": 330
    },
    {
      "epoch": 0.9696092619392185,
      "grad_norm": 1.0730253458023071,
      "learning_rate": 2.83579288244879e-05,
      "loss": 0.615,
      "step": 335
    },
    {
      "epoch": 0.984081041968162,
      "grad_norm": 1.0182080268859863,
      "learning_rate": 2.829207909877201e-05,
      "loss": 0.6109,
      "step": 340
    },
    {
      "epoch": 0.9985528219971056,
      "grad_norm": 0.9661368131637573,
      "learning_rate": 2.8225014434719423e-05,
      "loss": 0.5798,
      "step": 345
    }
  ],
  "logging_steps": 5,
  "max_steps": 1730,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.213914221804257e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}