{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 311,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01610305958132045,
      "grad_norm": 1.2303273677825928,
      "learning_rate": 1.5384615384615385e-06,
      "loss": 1.2575,
      "step": 5
    },
    {
      "epoch": 0.0322061191626409,
      "grad_norm": 0.9952624440193176,
      "learning_rate": 3.4615384615384617e-06,
      "loss": 1.1872,
      "step": 10
    },
    {
      "epoch": 0.04830917874396135,
      "grad_norm": 0.7314412593841553,
      "learning_rate": 5.384615384615385e-06,
      "loss": 1.2217,
      "step": 15
    },
    {
      "epoch": 0.0644122383252818,
      "grad_norm": 0.6402362585067749,
      "learning_rate": 7.307692307692308e-06,
      "loss": 1.2104,
      "step": 20
    },
    {
      "epoch": 0.08051529790660225,
      "grad_norm": 0.577061116695404,
      "learning_rate": 9.230769230769232e-06,
      "loss": 1.1606,
      "step": 25
    },
    {
      "epoch": 0.0966183574879227,
      "grad_norm": 0.6211334466934204,
      "learning_rate": 1.1153846153846154e-05,
      "loss": 1.176,
      "step": 30
    },
    {
      "epoch": 0.11272141706924316,
      "grad_norm": 0.5643324255943298,
      "learning_rate": 1.3076923076923078e-05,
      "loss": 1.1054,
      "step": 35
    },
    {
      "epoch": 0.1288244766505636,
      "grad_norm": 0.4905526638031006,
      "learning_rate": 1.5e-05,
      "loss": 1.1263,
      "step": 40
    },
    {
      "epoch": 0.14492753623188406,
      "grad_norm": 0.5200614333152771,
      "learning_rate": 1.6923076923076924e-05,
      "loss": 1.1117,
      "step": 45
    },
    {
      "epoch": 0.1610305958132045,
      "grad_norm": 0.4680725634098053,
      "learning_rate": 1.8846153846153846e-05,
      "loss": 1.0819,
      "step": 50
    },
    {
      "epoch": 0.17713365539452497,
      "grad_norm": 0.39172106981277466,
      "learning_rate": 2.076923076923077e-05,
      "loss": 1.1024,
      "step": 55
    },
    {
      "epoch": 0.1932367149758454,
      "grad_norm": 0.5212093591690063,
      "learning_rate": 2.269230769230769e-05,
      "loss": 1.0433,
      "step": 60
    },
    {
      "epoch": 0.20933977455716588,
      "grad_norm": 0.46433719992637634,
      "learning_rate": 2.4615384615384616e-05,
      "loss": 1.1014,
      "step": 65
    },
    {
      "epoch": 0.22544283413848631,
      "grad_norm": 0.4807704985141754,
      "learning_rate": 2.6538461538461538e-05,
      "loss": 1.0802,
      "step": 70
    },
    {
      "epoch": 0.24154589371980675,
      "grad_norm": 0.5271221995353699,
      "learning_rate": 2.846153846153846e-05,
      "loss": 1.0513,
      "step": 75
    },
    {
      "epoch": 0.2576489533011272,
      "grad_norm": 0.5992240905761719,
      "learning_rate": 2.999996606875036e-05,
      "loss": 1.0418,
      "step": 80
    },
    {
      "epoch": 0.27375201288244766,
      "grad_norm": 0.47334596514701843,
      "learning_rate": 2.9998778491131415e-05,
      "loss": 1.1015,
      "step": 85
    },
    {
      "epoch": 0.2898550724637681,
      "grad_norm": 0.4986027479171753,
      "learning_rate": 2.9995894504537867e-05,
      "loss": 1.0006,
      "step": 90
    },
    {
      "epoch": 0.3059581320450886,
      "grad_norm": 0.522322952747345,
      "learning_rate": 2.999131443515766e-05,
      "loss": 0.9928,
      "step": 95
    },
    {
      "epoch": 0.322061191626409,
      "grad_norm": 0.6326313614845276,
      "learning_rate": 2.998503880101102e-05,
      "loss": 0.9708,
      "step": 100
    },
    {
      "epoch": 0.33816425120772947,
      "grad_norm": 0.5072181820869446,
      "learning_rate": 2.99770683118919e-05,
      "loss": 0.9663,
      "step": 105
    },
    {
      "epoch": 0.35426731078904994,
      "grad_norm": 0.4815085232257843,
      "learning_rate": 2.996740386928766e-05,
      "loss": 0.9348,
      "step": 110
    },
    {
      "epoch": 0.37037037037037035,
      "grad_norm": 0.5493502616882324,
      "learning_rate": 2.9956046566277126e-05,
      "loss": 0.9747,
      "step": 115
    },
    {
      "epoch": 0.3864734299516908,
      "grad_norm": 0.5534172654151917,
      "learning_rate": 2.994299768740695e-05,
      "loss": 0.9313,
      "step": 120
    },
    {
      "epoch": 0.4025764895330113,
      "grad_norm": 0.6558434963226318,
      "learning_rate": 2.9928258708546335e-05,
      "loss": 0.9263,
      "step": 125
    },
    {
      "epoch": 0.41867954911433175,
      "grad_norm": 0.6211656332015991,
      "learning_rate": 2.99118312967201e-05,
      "loss": 0.9015,
      "step": 130
    },
    {
      "epoch": 0.43478260869565216,
      "grad_norm": 0.580075204372406,
      "learning_rate": 2.9893717309920134e-05,
      "loss": 0.942,
      "step": 135
    },
    {
      "epoch": 0.45088566827697263,
      "grad_norm": 0.7895562648773193,
      "learning_rate": 2.9873918796895273e-05,
      "loss": 0.8512,
      "step": 140
    },
    {
      "epoch": 0.4669887278582931,
      "grad_norm": 0.7165409326553345,
      "learning_rate": 2.9852437996919537e-05,
      "loss": 0.8618,
      "step": 145
    },
    {
      "epoch": 0.4830917874396135,
      "grad_norm": 0.6827163100242615,
      "learning_rate": 2.9829277339538903e-05,
      "loss": 0.87,
      "step": 150
    },
    {
      "epoch": 0.499194847020934,
      "grad_norm": 0.7102004289627075,
      "learning_rate": 2.9804439444296495e-05,
      "loss": 0.8927,
      "step": 155
    },
    {
      "epoch": 0.5152979066022544,
      "grad_norm": 0.6652743816375732,
      "learning_rate": 2.9777927120436293e-05,
      "loss": 0.8125,
      "step": 160
    },
    {
      "epoch": 0.5314009661835749,
      "grad_norm": 0.8374409675598145,
      "learning_rate": 2.974974336658545e-05,
      "loss": 0.7972,
      "step": 165
    },
    {
      "epoch": 0.5475040257648953,
      "grad_norm": 0.9754908084869385,
      "learning_rate": 2.9719891370415072e-05,
      "loss": 0.7772,
      "step": 170
    },
    {
      "epoch": 0.5636070853462157,
      "grad_norm": 0.7553638219833374,
      "learning_rate": 2.9688374508279715e-05,
      "loss": 0.8185,
      "step": 175
    },
    {
      "epoch": 0.5797101449275363,
      "grad_norm": 0.7818605899810791,
      "learning_rate": 2.9655196344835528e-05,
      "loss": 0.7852,
      "step": 180
    },
    {
      "epoch": 0.5958132045088567,
      "grad_norm": 0.7234418392181396,
      "learning_rate": 2.962036063263704e-05,
      "loss": 0.7731,
      "step": 185
    },
    {
      "epoch": 0.6119162640901772,
      "grad_norm": 0.7485138773918152,
      "learning_rate": 2.9583871311712753e-05,
      "loss": 0.7524,
      "step": 190
    },
    {
      "epoch": 0.6280193236714976,
      "grad_norm": 0.7542211413383484,
      "learning_rate": 2.9545732509119526e-05,
      "loss": 0.7658,
      "step": 195
    },
    {
      "epoch": 0.644122383252818,
      "grad_norm": 0.7797560095787048,
      "learning_rate": 2.9505948538475754e-05,
      "loss": 0.7299,
      "step": 200
    },
    {
      "epoch": 0.6602254428341385,
      "grad_norm": 0.7286772727966309,
      "learning_rate": 2.946452389947353e-05,
      "loss": 0.7136,
      "step": 205
    },
    {
      "epoch": 0.6763285024154589,
      "grad_norm": 0.8011156916618347,
      "learning_rate": 2.9421463277369678e-05,
      "loss": 0.6947,
      "step": 210
    },
    {
      "epoch": 0.6924315619967794,
      "grad_norm": 0.768643856048584,
      "learning_rate": 2.9376771542455868e-05,
      "loss": 0.7684,
      "step": 215
    },
    {
      "epoch": 0.7085346215780999,
      "grad_norm": 0.8555277585983276,
      "learning_rate": 2.9330453749507733e-05,
      "loss": 0.713,
      "step": 220
    },
    {
      "epoch": 0.7246376811594203,
      "grad_norm": 0.7823854684829712,
      "learning_rate": 2.9282515137213192e-05,
      "loss": 0.7267,
      "step": 225
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 0.8770233392715454,
      "learning_rate": 2.9232961127579922e-05,
      "loss": 0.6951,
      "step": 230
    },
    {
      "epoch": 0.7568438003220612,
      "grad_norm": 0.8260562419891357,
      "learning_rate": 2.9181797325322123e-05,
      "loss": 0.7074,
      "step": 235
    },
    {
      "epoch": 0.7729468599033816,
      "grad_norm": 1.1942952871322632,
      "learning_rate": 2.912902951722658e-05,
      "loss": 0.6977,
      "step": 240
    },
    {
      "epoch": 0.789049919484702,
      "grad_norm": 1.013655424118042,
      "learning_rate": 2.907466367149819e-05,
      "loss": 0.6778,
      "step": 245
    },
    {
      "epoch": 0.8051529790660226,
      "grad_norm": 1.917531967163086,
      "learning_rate": 2.901870593708493e-05,
      "loss": 0.6634,
      "step": 250
    },
    {
      "epoch": 0.821256038647343,
      "grad_norm": 0.979008138179779,
      "learning_rate": 2.896116264298239e-05,
      "loss": 0.7098,
      "step": 255
    },
    {
      "epoch": 0.8373590982286635,
      "grad_norm": 0.982454776763916,
      "learning_rate": 2.890204029751793e-05,
      "loss": 0.6709,
      "step": 260
    },
    {
      "epoch": 0.8534621578099839,
      "grad_norm": 0.8853998780250549,
      "learning_rate": 2.8841345587614587e-05,
      "loss": 0.6853,
      "step": 265
    },
    {
      "epoch": 0.8695652173913043,
      "grad_norm": 0.9082754850387573,
      "learning_rate": 2.8779085378034768e-05,
      "loss": 0.6747,
      "step": 270
    },
    {
      "epoch": 0.8856682769726248,
      "grad_norm": 0.9272617697715759,
      "learning_rate": 2.87152667106038e-05,
      "loss": 0.6811,
      "step": 275
    },
    {
      "epoch": 0.9017713365539453,
      "grad_norm": 0.9023019075393677,
      "learning_rate": 2.864989680341348e-05,
      "loss": 0.6504,
      "step": 280
    },
    {
      "epoch": 0.9178743961352657,
      "grad_norm": 0.9130707383155823,
      "learning_rate": 2.858298305000573e-05,
      "loss": 0.6506,
      "step": 285
    },
    {
      "epoch": 0.9339774557165862,
      "grad_norm": 0.8529173135757446,
      "learning_rate": 2.8514533018536286e-05,
      "loss": 0.6151,
      "step": 290
    },
    {
      "epoch": 0.9500805152979066,
      "grad_norm": 0.9180151224136353,
      "learning_rate": 2.84445544509188e-05,
      "loss": 0.6526,
      "step": 295
    },
    {
      "epoch": 0.966183574879227,
      "grad_norm": 1.0019443035125732,
      "learning_rate": 2.8373055261949138e-05,
      "loss": 0.6054,
      "step": 300
    },
    {
      "epoch": 0.9822866344605475,
      "grad_norm": 1.0061304569244385,
      "learning_rate": 2.830004353841023e-05,
      "loss": 0.6013,
      "step": 305
    },
    {
      "epoch": 0.998389694041868,
      "grad_norm": 0.9755994081497192,
      "learning_rate": 2.8225527538157413e-05,
      "loss": 0.5871,
      "step": 310
    }
  ],
  "logging_steps": 5,
  "max_steps": 1555,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.556253433996247e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}