| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 20.0, |
| "eval_steps": 100, |
| "global_step": 1160, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.1724137931034483, |
| "grad_norm": 59.60115051269531, |
| "learning_rate": 5.172413793103448e-06, |
| "loss": 5.4773, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.3448275862068966, |
| "grad_norm": 0.6899381875991821, |
| "learning_rate": 1.3793103448275863e-05, |
| "loss": 0.5795, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.5172413793103449, |
| "grad_norm": 0.37347301840782166, |
| "learning_rate": 2.2413793103448276e-05, |
| "loss": 0.1975, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.6896551724137931, |
| "grad_norm": 0.5068468451499939, |
| "learning_rate": 3.103448275862069e-05, |
| "loss": 0.1843, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.8620689655172413, |
| "grad_norm": 0.4019465148448944, |
| "learning_rate": 3.965517241379311e-05, |
| "loss": 0.1866, |
| "step": 50 |
| }, |
| { |
| "epoch": 1.0344827586206897, |
| "grad_norm": 0.3685491681098938, |
| "learning_rate": 4.827586206896552e-05, |
| "loss": 0.1578, |
| "step": 60 |
| }, |
| { |
| "epoch": 1.206896551724138, |
| "grad_norm": 0.3507291376590729, |
| "learning_rate": 5.689655172413794e-05, |
| "loss": 0.1198, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.3793103448275863, |
| "grad_norm": 0.437311053276062, |
| "learning_rate": 6.551724137931034e-05, |
| "loss": 0.1032, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.5517241379310345, |
| "grad_norm": 0.36062315106391907, |
| "learning_rate": 7.413793103448277e-05, |
| "loss": 0.1172, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.7241379310344827, |
| "grad_norm": 0.4424302279949188, |
| "learning_rate": 8.275862068965517e-05, |
| "loss": 0.1212, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.7241379310344827, |
| "eval_loss": 0.07583244889974594, |
| "eval_runtime": 8.495, |
| "eval_samples_per_second": 55.327, |
| "eval_steps_per_second": 6.945, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.896551724137931, |
| "grad_norm": 0.32817938923835754, |
| "learning_rate": 9.137931034482759e-05, |
| "loss": 0.1186, |
| "step": 110 |
| }, |
| { |
| "epoch": 2.0689655172413794, |
| "grad_norm": 0.27987948060035706, |
| "learning_rate": 0.0001, |
| "loss": 0.1074, |
| "step": 120 |
| }, |
| { |
| "epoch": 2.2413793103448274, |
| "grad_norm": 0.3074031472206116, |
| "learning_rate": 9.997736367166968e-05, |
| "loss": 0.0599, |
| "step": 130 |
| }, |
| { |
| "epoch": 2.413793103448276, |
| "grad_norm": 0.29311925172805786, |
| "learning_rate": 9.990947518281311e-05, |
| "loss": 0.0648, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.586206896551724, |
| "grad_norm": 0.29754340648651123, |
| "learning_rate": 9.979639600327522e-05, |
| "loss": 0.0708, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.7586206896551726, |
| "grad_norm": 0.3316575586795807, |
| "learning_rate": 9.963822852095345e-05, |
| "loss": 0.0684, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.9310344827586206, |
| "grad_norm": 0.25907158851623535, |
| "learning_rate": 9.943511594909023e-05, |
| "loss": 0.0697, |
| "step": 170 |
| }, |
| { |
| "epoch": 3.103448275862069, |
| "grad_norm": 0.22101767361164093, |
| "learning_rate": 9.918724219660013e-05, |
| "loss": 0.0518, |
| "step": 180 |
| }, |
| { |
| "epoch": 3.2758620689655173, |
| "grad_norm": 0.26941612362861633, |
| "learning_rate": 9.889483170154903e-05, |
| "loss": 0.0395, |
| "step": 190 |
| }, |
| { |
| "epoch": 3.4482758620689653, |
| "grad_norm": 0.18463601171970367, |
| "learning_rate": 9.855814922793582e-05, |
| "loss": 0.0439, |
| "step": 200 |
| }, |
| { |
| "epoch": 3.4482758620689653, |
| "eval_loss": 0.03639198839664459, |
| "eval_runtime": 8.4882, |
| "eval_samples_per_second": 55.371, |
| "eval_steps_per_second": 6.951, |
| "step": 200 |
| }, |
| { |
| "epoch": 3.6206896551724137, |
| "grad_norm": 0.22298499941825867, |
| "learning_rate": 9.817749962596115e-05, |
| "loss": 0.0453, |
| "step": 210 |
| }, |
| { |
| "epoch": 3.793103448275862, |
| "grad_norm": 0.18436843156814575, |
| "learning_rate": 9.775322755599978e-05, |
| "loss": 0.0426, |
| "step": 220 |
| }, |
| { |
| "epoch": 3.9655172413793105, |
| "grad_norm": 0.19848646223545074, |
| "learning_rate": 9.728571717652677e-05, |
| "loss": 0.0463, |
| "step": 230 |
| }, |
| { |
| "epoch": 4.137931034482759, |
| "grad_norm": 0.13991042971611023, |
| "learning_rate": 9.677539179628005e-05, |
| "loss": 0.034, |
| "step": 240 |
| }, |
| { |
| "epoch": 4.310344827586207, |
| "grad_norm": 0.15892928838729858, |
| "learning_rate": 9.622271349097411e-05, |
| "loss": 0.0301, |
| "step": 250 |
| }, |
| { |
| "epoch": 4.482758620689655, |
| "grad_norm": 0.27909988164901733, |
| "learning_rate": 9.562818268491216e-05, |
| "loss": 0.0333, |
| "step": 260 |
| }, |
| { |
| "epoch": 4.655172413793103, |
| "grad_norm": 0.19205643236637115, |
| "learning_rate": 9.499233769787535e-05, |
| "loss": 0.0319, |
| "step": 270 |
| }, |
| { |
| "epoch": 4.827586206896552, |
| "grad_norm": 0.19656455516815186, |
| "learning_rate": 9.431575425769938e-05, |
| "loss": 0.032, |
| "step": 280 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.15377789735794067, |
| "learning_rate": 9.359904497898009e-05, |
| "loss": 0.0325, |
| "step": 290 |
| }, |
| { |
| "epoch": 5.172413793103448, |
| "grad_norm": 0.11539698392152786, |
| "learning_rate": 9.284285880837946e-05, |
| "loss": 0.0208, |
| "step": 300 |
| }, |
| { |
| "epoch": 5.172413793103448, |
| "eval_loss": 0.021401584148406982, |
| "eval_runtime": 8.4928, |
| "eval_samples_per_second": 55.341, |
| "eval_steps_per_second": 6.947, |
| "step": 300 |
| }, |
| { |
| "epoch": 5.344827586206897, |
| "grad_norm": 0.09723104536533356, |
| "learning_rate": 9.2047880437035e-05, |
| "loss": 0.024, |
| "step": 310 |
| }, |
| { |
| "epoch": 5.517241379310345, |
| "grad_norm": 0.10288870334625244, |
| "learning_rate": 9.121482968060384e-05, |
| "loss": 0.025, |
| "step": 320 |
| }, |
| { |
| "epoch": 5.689655172413794, |
| "grad_norm": 0.13955926895141602, |
| "learning_rate": 9.034446082750352e-05, |
| "loss": 0.0267, |
| "step": 330 |
| }, |
| { |
| "epoch": 5.862068965517241, |
| "grad_norm": 0.09979069977998734, |
| "learning_rate": 8.943756195593916e-05, |
| "loss": 0.0249, |
| "step": 340 |
| }, |
| { |
| "epoch": 6.0344827586206895, |
| "grad_norm": 0.100404754281044, |
| "learning_rate": 8.849495422033549e-05, |
| "loss": 0.0223, |
| "step": 350 |
| }, |
| { |
| "epoch": 6.206896551724138, |
| "grad_norm": 0.06972885876893997, |
| "learning_rate": 8.751749110782012e-05, |
| "loss": 0.0175, |
| "step": 360 |
| }, |
| { |
| "epoch": 6.379310344827586, |
| "grad_norm": 0.10955193638801575, |
| "learning_rate": 8.650605766543089e-05, |
| "loss": 0.0191, |
| "step": 370 |
| }, |
| { |
| "epoch": 6.551724137931035, |
| "grad_norm": 0.08194795995950699, |
| "learning_rate": 8.546156969874723e-05, |
| "loss": 0.0194, |
| "step": 380 |
| }, |
| { |
| "epoch": 6.724137931034483, |
| "grad_norm": 0.10594207793474197, |
| "learning_rate": 8.438497294267117e-05, |
| "loss": 0.0215, |
| "step": 390 |
| }, |
| { |
| "epoch": 6.896551724137931, |
| "grad_norm": 0.09979037195444107, |
| "learning_rate": 8.327724220510873e-05, |
| "loss": 0.0209, |
| "step": 400 |
| }, |
| { |
| "epoch": 6.896551724137931, |
| "eval_loss": 0.016114579513669014, |
| "eval_runtime": 8.4861, |
| "eval_samples_per_second": 55.385, |
| "eval_steps_per_second": 6.953, |
| "step": 400 |
| }, |
| { |
| "epoch": 7.068965517241379, |
| "grad_norm": 0.06945820152759552, |
| "learning_rate": 8.213938048432697e-05, |
| "loss": 0.0196, |
| "step": 410 |
| }, |
| { |
| "epoch": 7.241379310344827, |
| "grad_norm": 0.07130986452102661, |
| "learning_rate": 8.097241806078615e-05, |
| "loss": 0.0166, |
| "step": 420 |
| }, |
| { |
| "epoch": 7.413793103448276, |
| "grad_norm": 0.07004086673259735, |
| "learning_rate": 7.977741156426901e-05, |
| "loss": 0.0168, |
| "step": 430 |
| }, |
| { |
| "epoch": 7.586206896551724, |
| "grad_norm": 0.08699115365743637, |
| "learning_rate": 7.855544301715203e-05, |
| "loss": 0.0179, |
| "step": 440 |
| }, |
| { |
| "epoch": 7.758620689655173, |
| "grad_norm": 0.11357751488685608, |
| "learning_rate": 7.730761885468485e-05, |
| "loss": 0.0182, |
| "step": 450 |
| }, |
| { |
| "epoch": 7.931034482758621, |
| "grad_norm": 0.08816002309322357, |
| "learning_rate": 7.603506892316512e-05, |
| "loss": 0.0182, |
| "step": 460 |
| }, |
| { |
| "epoch": 8.10344827586207, |
| "grad_norm": 0.07963044941425323, |
| "learning_rate": 7.47389454569155e-05, |
| "loss": 0.0174, |
| "step": 470 |
| }, |
| { |
| "epoch": 8.275862068965518, |
| "grad_norm": 0.07185494154691696, |
| "learning_rate": 7.342042203498951e-05, |
| "loss": 0.0145, |
| "step": 480 |
| }, |
| { |
| "epoch": 8.448275862068966, |
| "grad_norm": 0.0649387463927269, |
| "learning_rate": 7.208069251855078e-05, |
| "loss": 0.016, |
| "step": 490 |
| }, |
| { |
| "epoch": 8.620689655172415, |
| "grad_norm": 0.06952589750289917, |
| "learning_rate": 7.07209699698876e-05, |
| "loss": 0.0161, |
| "step": 500 |
| }, |
| { |
| "epoch": 8.620689655172415, |
| "eval_loss": 0.014497065916657448, |
| "eval_runtime": 8.487, |
| "eval_samples_per_second": 55.379, |
| "eval_steps_per_second": 6.952, |
| "step": 500 |
| }, |
| { |
| "epoch": 8.793103448275861, |
| "grad_norm": 0.07031919807195663, |
| "learning_rate": 6.934248555404198e-05, |
| "loss": 0.0175, |
| "step": 510 |
| }, |
| { |
| "epoch": 8.96551724137931, |
| "grad_norm": 0.08514226227998734, |
| "learning_rate": 6.79464874240473e-05, |
| "loss": 0.0182, |
| "step": 520 |
| }, |
| { |
| "epoch": 9.137931034482758, |
| "grad_norm": 0.07341048121452332, |
| "learning_rate": 6.653423959078436e-05, |
| "loss": 0.0147, |
| "step": 530 |
| }, |
| { |
| "epoch": 9.310344827586206, |
| "grad_norm": 0.06410729140043259, |
| "learning_rate": 6.510702077847863e-05, |
| "loss": 0.0144, |
| "step": 540 |
| }, |
| { |
| "epoch": 9.482758620689655, |
| "grad_norm": 0.06970565766096115, |
| "learning_rate": 6.366612326687554e-05, |
| "loss": 0.0153, |
| "step": 550 |
| }, |
| { |
| "epoch": 9.655172413793103, |
| "grad_norm": 0.0829567089676857, |
| "learning_rate": 6.221285172114157e-05, |
| "loss": 0.0152, |
| "step": 560 |
| }, |
| { |
| "epoch": 9.827586206896552, |
| "grad_norm": 0.06180880591273308, |
| "learning_rate": 6.0748522010551215e-05, |
| "loss": 0.0154, |
| "step": 570 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.07084178179502487, |
| "learning_rate": 5.927446001702899e-05, |
| "loss": 0.0164, |
| "step": 580 |
| }, |
| { |
| "epoch": 10.172413793103448, |
| "grad_norm": 0.05729280412197113, |
| "learning_rate": 5.779200043462549e-05, |
| "loss": 0.0134, |
| "step": 590 |
| }, |
| { |
| "epoch": 10.344827586206897, |
| "grad_norm": 0.05883341655135155, |
| "learning_rate": 5.6302485561014475e-05, |
| "loss": 0.0142, |
| "step": 600 |
| }, |
| { |
| "epoch": 10.344827586206897, |
| "eval_loss": 0.013431085273623466, |
| "eval_runtime": 8.4852, |
| "eval_samples_per_second": 55.391, |
| "eval_steps_per_second": 6.953, |
| "step": 600 |
| }, |
| { |
| "epoch": 10.517241379310345, |
| "grad_norm": 0.05796743184328079, |
| "learning_rate": 5.4807264082105195e-05, |
| "loss": 0.0144, |
| "step": 610 |
| }, |
| { |
| "epoch": 10.689655172413794, |
| "grad_norm": 0.07652316242456436, |
| "learning_rate": 5.330768985087059e-05, |
| "loss": 0.0146, |
| "step": 620 |
| }, |
| { |
| "epoch": 10.862068965517242, |
| "grad_norm": 0.06839638203382492, |
| "learning_rate": 5.180512066149682e-05, |
| "loss": 0.0154, |
| "step": 630 |
| }, |
| { |
| "epoch": 11.03448275862069, |
| "grad_norm": 0.05099385976791382, |
| "learning_rate": 5.030091701996428e-05, |
| "loss": 0.0147, |
| "step": 640 |
| }, |
| { |
| "epoch": 11.206896551724139, |
| "grad_norm": 0.05085451900959015, |
| "learning_rate": 4.879644091217317e-05, |
| "loss": 0.0133, |
| "step": 650 |
| }, |
| { |
| "epoch": 11.379310344827585, |
| "grad_norm": 0.07139307260513306, |
| "learning_rate": 4.729305457072913e-05, |
| "loss": 0.0136, |
| "step": 660 |
| }, |
| { |
| "epoch": 11.551724137931034, |
| "grad_norm": 0.06991631537675858, |
| "learning_rate": 4.579211924150547e-05, |
| "loss": 0.0142, |
| "step": 670 |
| }, |
| { |
| "epoch": 11.724137931034482, |
| "grad_norm": 0.06048361957073212, |
| "learning_rate": 4.429499395109877e-05, |
| "loss": 0.0146, |
| "step": 680 |
| }, |
| { |
| "epoch": 11.89655172413793, |
| "grad_norm": 0.06556624919176102, |
| "learning_rate": 4.280303427629404e-05, |
| "loss": 0.0148, |
| "step": 690 |
| }, |
| { |
| "epoch": 12.068965517241379, |
| "grad_norm": 0.05133816972374916, |
| "learning_rate": 4.131759111665349e-05, |
| "loss": 0.0141, |
| "step": 700 |
| }, |
| { |
| "epoch": 12.068965517241379, |
| "eval_loss": 0.012837924063205719, |
| "eval_runtime": 8.4874, |
| "eval_samples_per_second": 55.376, |
| "eval_steps_per_second": 6.951, |
| "step": 700 |
| }, |
| { |
| "epoch": 12.241379310344827, |
| "grad_norm": 0.05528413876891136, |
| "learning_rate": 3.9840009471340194e-05, |
| "loss": 0.0129, |
| "step": 710 |
| }, |
| { |
| "epoch": 12.413793103448276, |
| "grad_norm": 0.05302777513861656, |
| "learning_rate": 3.8371627221284495e-05, |
| "loss": 0.0136, |
| "step": 720 |
| }, |
| { |
| "epoch": 12.586206896551724, |
| "grad_norm": 0.06363388895988464, |
| "learning_rate": 3.691377391779543e-05, |
| "loss": 0.0139, |
| "step": 730 |
| }, |
| { |
| "epoch": 12.758620689655173, |
| "grad_norm": 0.056679144501686096, |
| "learning_rate": 3.546776957871445e-05, |
| "loss": 0.0141, |
| "step": 740 |
| }, |
| { |
| "epoch": 12.931034482758621, |
| "grad_norm": 0.07062114775180817, |
| "learning_rate": 3.403492349320101e-05, |
| "loss": 0.0143, |
| "step": 750 |
| }, |
| { |
| "epoch": 13.10344827586207, |
| "grad_norm": 0.05683109909296036, |
| "learning_rate": 3.261653303623263e-05, |
| "loss": 0.0134, |
| "step": 760 |
| }, |
| { |
| "epoch": 13.275862068965518, |
| "grad_norm": 0.05229064077138901, |
| "learning_rate": 3.121388249389269e-05, |
| "loss": 0.0128, |
| "step": 770 |
| }, |
| { |
| "epoch": 13.448275862068966, |
| "grad_norm": 0.07219678908586502, |
| "learning_rate": 2.982824190050958e-05, |
| "loss": 0.0133, |
| "step": 780 |
| }, |
| { |
| "epoch": 13.620689655172415, |
| "grad_norm": 0.06031765043735504, |
| "learning_rate": 2.846086588870006e-05, |
| "loss": 0.0136, |
| "step": 790 |
| }, |
| { |
| "epoch": 13.793103448275861, |
| "grad_norm": 0.06178103759884834, |
| "learning_rate": 2.711299255335833e-05, |
| "loss": 0.0138, |
| "step": 800 |
| }, |
| { |
| "epoch": 13.793103448275861, |
| "eval_loss": 0.012654323130846024, |
| "eval_runtime": 8.4929, |
| "eval_samples_per_second": 55.34, |
| "eval_steps_per_second": 6.947, |
| "step": 800 |
| }, |
| { |
| "epoch": 13.96551724137931, |
| "grad_norm": 0.051913850009441376, |
| "learning_rate": 2.5785842330619038e-05, |
| "loss": 0.014, |
| "step": 810 |
| }, |
| { |
| "epoch": 14.137931034482758, |
| "grad_norm": 0.05191033333539963, |
| "learning_rate": 2.4480616892809594e-05, |
| "loss": 0.0129, |
| "step": 820 |
| }, |
| { |
| "epoch": 14.310344827586206, |
| "grad_norm": 0.055176835507154465, |
| "learning_rate": 2.3198498060392232e-05, |
| "loss": 0.0127, |
| "step": 830 |
| }, |
| { |
| "epoch": 14.482758620689655, |
| "grad_norm": 0.0499984435737133, |
| "learning_rate": 2.194064673188089e-05, |
| "loss": 0.013, |
| "step": 840 |
| }, |
| { |
| "epoch": 14.655172413793103, |
| "grad_norm": 0.05339398235082626, |
| "learning_rate": 2.070820183270211e-05, |
| "loss": 0.0134, |
| "step": 850 |
| }, |
| { |
| "epoch": 14.827586206896552, |
| "grad_norm": 0.06339651346206665, |
| "learning_rate": 1.9502279283951364e-05, |
| "loss": 0.0135, |
| "step": 860 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 0.061148300766944885, |
| "learning_rate": 1.832397099197882e-05, |
| "loss": 0.0135, |
| "step": 870 |
| }, |
| { |
| "epoch": 15.172413793103448, |
| "grad_norm": 0.05747521296143532, |
| "learning_rate": 1.7174343859719333e-05, |
| "loss": 0.0125, |
| "step": 880 |
| }, |
| { |
| "epoch": 15.344827586206897, |
| "grad_norm": 0.05627927929162979, |
| "learning_rate": 1.6054438820661854e-05, |
| "loss": 0.0126, |
| "step": 890 |
| }, |
| { |
| "epoch": 15.517241379310345, |
| "grad_norm": 0.0764516219496727, |
| "learning_rate": 1.4965269896332885e-05, |
| "loss": 0.0129, |
| "step": 900 |
| }, |
| { |
| "epoch": 15.517241379310345, |
| "eval_loss": 0.01242972631007433, |
| "eval_runtime": 8.4929, |
| "eval_samples_per_second": 55.34, |
| "eval_steps_per_second": 6.947, |
| "step": 900 |
| }, |
| { |
| "epoch": 15.689655172413794, |
| "grad_norm": 0.0588865764439106, |
| "learning_rate": 1.3907823278147563e-05, |
| "loss": 0.013, |
| "step": 910 |
| }, |
| { |
| "epoch": 15.862068965517242, |
| "grad_norm": 0.06028977409005165, |
| "learning_rate": 1.2883056434459506e-05, |
| "loss": 0.0132, |
| "step": 920 |
| }, |
| { |
| "epoch": 16.03448275862069, |
| "grad_norm": 0.0516447052359581, |
| "learning_rate": 1.1891897243618182e-05, |
| "loss": 0.0132, |
| "step": 930 |
| }, |
| { |
| "epoch": 16.20689655172414, |
| "grad_norm": 0.053294651210308075, |
| "learning_rate": 1.0935243153818436e-05, |
| "loss": 0.0123, |
| "step": 940 |
| }, |
| { |
| "epoch": 16.379310344827587, |
| "grad_norm": 0.059708479791879654, |
| "learning_rate": 1.0013960370503261e-05, |
| "loss": 0.0125, |
| "step": 950 |
| }, |
| { |
| "epoch": 16.551724137931036, |
| "grad_norm": 0.051865532994270325, |
| "learning_rate": 9.12888307205541e-06, |
| "loss": 0.0126, |
| "step": 960 |
| }, |
| { |
| "epoch": 16.724137931034484, |
| "grad_norm": 0.06396190822124481, |
| "learning_rate": 8.280812654487891e-06, |
| "loss": 0.0128, |
| "step": 970 |
| }, |
| { |
| "epoch": 16.896551724137932, |
| "grad_norm": 0.060719434171915054, |
| "learning_rate": 7.470517005817474e-06, |
| "loss": 0.013, |
| "step": 980 |
| }, |
| { |
| "epoch": 17.06896551724138, |
| "grad_norm": 0.05675433203577995, |
| "learning_rate": 6.698729810778065e-06, |
| "loss": 0.0126, |
| "step": 990 |
| }, |
| { |
| "epoch": 17.24137931034483, |
| "grad_norm": 0.06285712867975235, |
| "learning_rate": 5.966149886503614e-06, |
| "loss": 0.0123, |
| "step": 1000 |
| }, |
| { |
| "epoch": 17.24137931034483, |
| "eval_loss": 0.01223958469927311, |
| "eval_runtime": 8.4904, |
| "eval_samples_per_second": 55.357, |
| "eval_steps_per_second": 6.949, |
| "step": 1000 |
| }, |
| { |
| "epoch": 17.413793103448278, |
| "grad_norm": 0.06330039352178574, |
| "learning_rate": 5.27344054978186e-06, |
| "loss": 0.0123, |
| "step": 1010 |
| }, |
| { |
| "epoch": 17.586206896551722, |
| "grad_norm": 0.05098165571689606, |
| "learning_rate": 4.621229016452156e-06, |
| "loss": 0.0124, |
| "step": 1020 |
| }, |
| { |
| "epoch": 17.75862068965517, |
| "grad_norm": 0.06866684556007385, |
| "learning_rate": 4.010105833490857e-06, |
| "loss": 0.0125, |
| "step": 1030 |
| }, |
| { |
| "epoch": 17.93103448275862, |
| "grad_norm": 0.06366662681102753, |
| "learning_rate": 3.4406243442987764e-06, |
| "loss": 0.0126, |
| "step": 1040 |
| }, |
| { |
| "epoch": 18.103448275862068, |
| "grad_norm": 0.06267203390598297, |
| "learning_rate": 2.9133001876746004e-06, |
| "loss": 0.0124, |
| "step": 1050 |
| }, |
| { |
| "epoch": 18.275862068965516, |
| "grad_norm": 0.05903858318924904, |
| "learning_rate": 2.428610830928152e-06, |
| "loss": 0.0122, |
| "step": 1060 |
| }, |
| { |
| "epoch": 18.448275862068964, |
| "grad_norm": 0.05347118899226189, |
| "learning_rate": 1.9869951375561523e-06, |
| "loss": 0.0123, |
| "step": 1070 |
| }, |
| { |
| "epoch": 18.620689655172413, |
| "grad_norm": 0.0637890100479126, |
| "learning_rate": 1.5888529698718346e-06, |
| "loss": 0.0122, |
| "step": 1080 |
| }, |
| { |
| "epoch": 18.79310344827586, |
| "grad_norm": 0.051785457879304886, |
| "learning_rate": 1.2345448269483916e-06, |
| "loss": 0.0124, |
| "step": 1090 |
| }, |
| { |
| "epoch": 18.96551724137931, |
| "grad_norm": 0.06472212821245193, |
| "learning_rate": 9.243915182039431e-07, |
| "loss": 0.0123, |
| "step": 1100 |
| }, |
| { |
| "epoch": 18.96551724137931, |
| "eval_loss": 0.012206222862005234, |
| "eval_runtime": 8.4831, |
| "eval_samples_per_second": 55.404, |
| "eval_steps_per_second": 6.955, |
| "step": 1100 |
| }, |
| { |
| "epoch": 19.137931034482758, |
| "grad_norm": 0.06167494133114815, |
| "learning_rate": 6.58673872923693e-07, |
| "loss": 0.0122, |
| "step": 1110 |
| }, |
| { |
| "epoch": 19.310344827586206, |
| "grad_norm": 0.06309988349676132, |
| "learning_rate": 4.376324859820924e-07, |
| "loss": 0.0122, |
| "step": 1120 |
| }, |
| { |
| "epoch": 19.482758620689655, |
| "grad_norm": 0.05701352283358574, |
| "learning_rate": 2.614674999955269e-07, |
| "loss": 0.0122, |
| "step": 1130 |
| }, |
| { |
| "epoch": 19.655172413793103, |
| "grad_norm": 0.06496866792440414, |
| "learning_rate": 1.3033842410251075e-07, |
| "loss": 0.0122, |
| "step": 1140 |
| }, |
| { |
| "epoch": 19.82758620689655, |
| "grad_norm": 0.05343254283070564, |
| "learning_rate": 4.436398953567289e-08, |
| "loss": 0.0122, |
| "step": 1150 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.06212453171610832, |
| "learning_rate": 3.622042116169233e-09, |
| "loss": 0.0122, |
| "step": 1160 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1160, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 20, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.74429913710592e+16, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|