{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 311, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01610305958132045, "grad_norm": 1.2303273677825928, "learning_rate": 1.5384615384615385e-06, "loss": 1.2575, "step": 5 }, { "epoch": 0.0322061191626409, "grad_norm": 0.9952624440193176, "learning_rate": 3.4615384615384617e-06, "loss": 1.1872, "step": 10 }, { "epoch": 0.04830917874396135, "grad_norm": 0.7314412593841553, "learning_rate": 5.384615384615385e-06, "loss": 1.2217, "step": 15 }, { "epoch": 0.0644122383252818, "grad_norm": 0.6402362585067749, "learning_rate": 7.307692307692308e-06, "loss": 1.2104, "step": 20 }, { "epoch": 0.08051529790660225, "grad_norm": 0.577061116695404, "learning_rate": 9.230769230769232e-06, "loss": 1.1606, "step": 25 }, { "epoch": 0.0966183574879227, "grad_norm": 0.6211334466934204, "learning_rate": 1.1153846153846154e-05, "loss": 1.176, "step": 30 }, { "epoch": 0.11272141706924316, "grad_norm": 0.5643324255943298, "learning_rate": 1.3076923076923078e-05, "loss": 1.1054, "step": 35 }, { "epoch": 0.1288244766505636, "grad_norm": 0.4905526638031006, "learning_rate": 1.5e-05, "loss": 1.1263, "step": 40 }, { "epoch": 0.14492753623188406, "grad_norm": 0.5200614333152771, "learning_rate": 1.6923076923076924e-05, "loss": 1.1117, "step": 45 }, { "epoch": 0.1610305958132045, "grad_norm": 0.4680725634098053, "learning_rate": 1.8846153846153846e-05, "loss": 1.0819, "step": 50 }, { "epoch": 0.17713365539452497, "grad_norm": 0.39172106981277466, "learning_rate": 2.076923076923077e-05, "loss": 1.1024, "step": 55 }, { "epoch": 0.1932367149758454, "grad_norm": 0.5212093591690063, "learning_rate": 2.269230769230769e-05, "loss": 1.0433, "step": 60 }, { "epoch": 0.20933977455716588, "grad_norm": 0.46433719992637634, "learning_rate": 2.4615384615384616e-05, "loss": 1.1014, "step": 65 }, { "epoch": 0.22544283413848631, "grad_norm": 0.4807704985141754, "learning_rate": 2.6538461538461538e-05, "loss": 1.0802, "step": 70 }, { "epoch": 0.24154589371980675, "grad_norm": 0.5271221995353699, "learning_rate": 2.846153846153846e-05, "loss": 1.0513, "step": 75 }, { "epoch": 0.2576489533011272, "grad_norm": 0.5992240905761719, "learning_rate": 2.999996606875036e-05, "loss": 1.0418, "step": 80 }, { "epoch": 0.27375201288244766, "grad_norm": 0.47334596514701843, "learning_rate": 2.9998778491131415e-05, "loss": 1.1015, "step": 85 }, { "epoch": 0.2898550724637681, "grad_norm": 0.4986027479171753, "learning_rate": 2.9995894504537867e-05, "loss": 1.0006, "step": 90 }, { "epoch": 0.3059581320450886, "grad_norm": 0.522322952747345, "learning_rate": 2.999131443515766e-05, "loss": 0.9928, "step": 95 }, { "epoch": 0.322061191626409, "grad_norm": 0.6326313614845276, "learning_rate": 2.998503880101102e-05, "loss": 0.9708, "step": 100 }, { "epoch": 0.33816425120772947, "grad_norm": 0.5072181820869446, "learning_rate": 2.99770683118919e-05, "loss": 0.9663, "step": 105 }, { "epoch": 0.35426731078904994, "grad_norm": 0.4815085232257843, "learning_rate": 2.996740386928766e-05, "loss": 0.9348, "step": 110 }, { "epoch": 0.37037037037037035, "grad_norm": 0.5493502616882324, "learning_rate": 2.9956046566277126e-05, "loss": 0.9747, "step": 115 }, { "epoch": 0.3864734299516908, "grad_norm": 0.5534172654151917, "learning_rate": 2.994299768740695e-05, "loss": 0.9313, "step": 120 }, { "epoch": 0.4025764895330113, "grad_norm": 0.6558434963226318, "learning_rate": 2.9928258708546335e-05, "loss": 0.9263, "step": 125 }, { "epoch": 0.41867954911433175, "grad_norm": 0.6211656332015991, "learning_rate": 2.99118312967201e-05, "loss": 0.9015, "step": 130 }, { "epoch": 0.43478260869565216, "grad_norm": 0.580075204372406, "learning_rate": 2.9893717309920134e-05, "loss": 0.942, "step": 135 }, { "epoch": 0.45088566827697263, "grad_norm": 0.7895562648773193, "learning_rate": 2.9873918796895273e-05, "loss": 0.8512, "step": 140 }, { "epoch": 0.4669887278582931, "grad_norm": 0.7165409326553345, "learning_rate": 2.9852437996919537e-05, "loss": 0.8618, "step": 145 }, { "epoch": 0.4830917874396135, "grad_norm": 0.6827163100242615, "learning_rate": 2.9829277339538903e-05, "loss": 0.87, "step": 150 }, { "epoch": 0.499194847020934, "grad_norm": 0.7102004289627075, "learning_rate": 2.9804439444296495e-05, "loss": 0.8927, "step": 155 }, { "epoch": 0.5152979066022544, "grad_norm": 0.6652743816375732, "learning_rate": 2.9777927120436293e-05, "loss": 0.8125, "step": 160 }, { "epoch": 0.5314009661835749, "grad_norm": 0.8374409675598145, "learning_rate": 2.974974336658545e-05, "loss": 0.7972, "step": 165 }, { "epoch": 0.5475040257648953, "grad_norm": 0.9754908084869385, "learning_rate": 2.9719891370415072e-05, "loss": 0.7772, "step": 170 }, { "epoch": 0.5636070853462157, "grad_norm": 0.7553638219833374, "learning_rate": 2.9688374508279715e-05, "loss": 0.8185, "step": 175 }, { "epoch": 0.5797101449275363, "grad_norm": 0.7818605899810791, "learning_rate": 2.9655196344835528e-05, "loss": 0.7852, "step": 180 }, { "epoch": 0.5958132045088567, "grad_norm": 0.7234418392181396, "learning_rate": 2.962036063263704e-05, "loss": 0.7731, "step": 185 }, { "epoch": 0.6119162640901772, "grad_norm": 0.7485138773918152, "learning_rate": 2.9583871311712753e-05, "loss": 0.7524, "step": 190 }, { "epoch": 0.6280193236714976, "grad_norm": 0.7542211413383484, "learning_rate": 2.9545732509119526e-05, "loss": 0.7658, "step": 195 }, { "epoch": 0.644122383252818, "grad_norm": 0.7797560095787048, "learning_rate": 2.9505948538475754e-05, "loss": 0.7299, "step": 200 }, { "epoch": 0.6602254428341385, "grad_norm": 0.7286772727966309, "learning_rate": 2.946452389947353e-05, "loss": 0.7136, "step": 205 }, { "epoch": 0.6763285024154589, "grad_norm": 0.8011156916618347, "learning_rate": 2.9421463277369678e-05, "loss": 0.6947, "step": 210 }, { "epoch": 0.6924315619967794, "grad_norm": 0.768643856048584, "learning_rate": 2.9376771542455868e-05, "loss": 0.7684, "step": 215 }, { "epoch": 0.7085346215780999, "grad_norm": 0.8555277585983276, "learning_rate": 2.9330453749507733e-05, "loss": 0.713, "step": 220 }, { "epoch": 0.7246376811594203, "grad_norm": 0.7823854684829712, "learning_rate": 2.9282515137213192e-05, "loss": 0.7267, "step": 225 }, { "epoch": 0.7407407407407407, "grad_norm": 0.8770233392715454, "learning_rate": 2.9232961127579922e-05, "loss": 0.6951, "step": 230 }, { "epoch": 0.7568438003220612, "grad_norm": 0.8260562419891357, "learning_rate": 2.9181797325322123e-05, "loss": 0.7074, "step": 235 }, { "epoch": 0.7729468599033816, "grad_norm": 1.1942952871322632, "learning_rate": 2.912902951722658e-05, "loss": 0.6977, "step": 240 }, { "epoch": 0.789049919484702, "grad_norm": 1.013655424118042, "learning_rate": 2.907466367149819e-05, "loss": 0.6778, "step": 245 }, { "epoch": 0.8051529790660226, "grad_norm": 1.917531967163086, "learning_rate": 2.901870593708493e-05, "loss": 0.6634, "step": 250 }, { "epoch": 0.821256038647343, "grad_norm": 0.979008138179779, "learning_rate": 2.896116264298239e-05, "loss": 0.7098, "step": 255 }, { "epoch": 0.8373590982286635, "grad_norm": 0.982454776763916, "learning_rate": 2.890204029751793e-05, "loss": 0.6709, "step": 260 }, { "epoch": 0.8534621578099839, "grad_norm": 0.8853998780250549, "learning_rate": 2.8841345587614587e-05, "loss": 0.6853, "step": 265 }, { "epoch": 0.8695652173913043, "grad_norm": 0.9082754850387573, "learning_rate": 2.8779085378034768e-05, "loss": 0.6747, "step": 270 }, { "epoch": 0.8856682769726248, "grad_norm": 0.9272617697715759, "learning_rate": 2.87152667106038e-05, "loss": 0.6811, "step": 275 }, { "epoch": 0.9017713365539453, "grad_norm": 0.9023019075393677, "learning_rate": 2.864989680341348e-05, "loss": 0.6504, "step": 280 }, { "epoch": 0.9178743961352657, "grad_norm": 0.9130707383155823, "learning_rate": 2.858298305000573e-05, "loss": 0.6506, "step": 285 }, { "epoch": 0.9339774557165862, "grad_norm": 0.8529173135757446, "learning_rate": 2.8514533018536286e-05, "loss": 0.6151, "step": 290 }, { "epoch": 0.9500805152979066, "grad_norm": 0.9180151224136353, "learning_rate": 2.84445544509188e-05, "loss": 0.6526, "step": 295 }, { "epoch": 0.966183574879227, "grad_norm": 1.0019443035125732, "learning_rate": 2.8373055261949138e-05, "loss": 0.6054, "step": 300 }, { "epoch": 0.9822866344605475, "grad_norm": 1.0061304569244385, "learning_rate": 2.830004353841023e-05, "loss": 0.6013, "step": 305 }, { "epoch": 0.998389694041868, "grad_norm": 0.9755994081497192, "learning_rate": 2.8225527538157413e-05, "loss": 0.5871, "step": 310 } ], "logging_steps": 5, "max_steps": 1555, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.556253433996247e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }