{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 65, "global_step": 1040, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019230769230769232, "grad_norm": 3.5625, "learning_rate": 0.0, "loss": 1.1154, "step": 1 }, { "epoch": 0.0019230769230769232, "eval_loss": 1.137184739112854, "eval_runtime": 34.305, "eval_samples_per_second": 68.299, "eval_steps_per_second": 17.082, "step": 1 }, { "epoch": 0.0038461538461538464, "grad_norm": 3.5, "learning_rate": 1.9230769230769234e-07, "loss": 1.1559, "step": 2 }, { "epoch": 0.0057692307692307696, "grad_norm": 3.515625, "learning_rate": 3.846153846153847e-07, "loss": 1.1031, "step": 3 }, { "epoch": 0.007692307692307693, "grad_norm": 3.6875, "learning_rate": 5.76923076923077e-07, "loss": 1.1168, "step": 4 }, { "epoch": 0.009615384615384616, "grad_norm": 3.78125, "learning_rate": 7.692307692307694e-07, "loss": 1.0839, "step": 5 }, { "epoch": 0.011538461538461539, "grad_norm": 3.546875, "learning_rate": 9.615384615384617e-07, "loss": 1.0938, "step": 6 }, { "epoch": 0.013461538461538462, "grad_norm": 3.71875, "learning_rate": 1.153846153846154e-06, "loss": 1.1459, "step": 7 }, { "epoch": 0.015384615384615385, "grad_norm": 3.671875, "learning_rate": 1.3461538461538462e-06, "loss": 1.0944, "step": 8 }, { "epoch": 0.01730769230769231, "grad_norm": 3.65625, "learning_rate": 1.5384615384615387e-06, "loss": 1.1185, "step": 9 }, { "epoch": 0.019230769230769232, "grad_norm": 3.734375, "learning_rate": 1.7307692307692308e-06, "loss": 1.0774, "step": 10 }, { "epoch": 0.021153846153846155, "grad_norm": 3.46875, "learning_rate": 1.9230769230769234e-06, "loss": 1.1568, "step": 11 }, { "epoch": 0.023076923076923078, "grad_norm": 3.515625, "learning_rate": 2.1153846153846155e-06, "loss": 1.086, "step": 12 }, { "epoch": 0.025, "grad_norm": 3.3125, "learning_rate": 2.307692307692308e-06, "loss": 1.0978, "step": 13 }, { "epoch": 0.026923076923076925, "grad_norm": 3.359375, "learning_rate": 2.5e-06, "loss": 1.109, "step": 14 }, { "epoch": 0.028846153846153848, "grad_norm": 2.953125, "learning_rate": 2.6923076923076923e-06, "loss": 1.0631, "step": 15 }, { "epoch": 0.03076923076923077, "grad_norm": 2.890625, "learning_rate": 2.8846153846153845e-06, "loss": 1.1241, "step": 16 }, { "epoch": 0.032692307692307694, "grad_norm": 2.6875, "learning_rate": 3.0769230769230774e-06, "loss": 1.0644, "step": 17 }, { "epoch": 0.03461538461538462, "grad_norm": 2.640625, "learning_rate": 3.2692307692307696e-06, "loss": 1.0823, "step": 18 }, { "epoch": 0.03653846153846154, "grad_norm": 2.703125, "learning_rate": 3.4615384615384617e-06, "loss": 1.0494, "step": 19 }, { "epoch": 0.038461538461538464, "grad_norm": 2.375, "learning_rate": 3.653846153846154e-06, "loss": 1.0778, "step": 20 }, { "epoch": 0.04038461538461539, "grad_norm": 2.359375, "learning_rate": 3.846153846153847e-06, "loss": 0.9911, "step": 21 }, { "epoch": 0.04230769230769231, "grad_norm": 1.9296875, "learning_rate": 4.0384615384615385e-06, "loss": 1.1214, "step": 22 }, { "epoch": 0.04423076923076923, "grad_norm": 1.859375, "learning_rate": 4.230769230769231e-06, "loss": 1.0294, "step": 23 }, { "epoch": 0.046153846153846156, "grad_norm": 1.6796875, "learning_rate": 4.423076923076924e-06, "loss": 1.0473, "step": 24 }, { "epoch": 0.04807692307692308, "grad_norm": 1.4765625, "learning_rate": 4.615384615384616e-06, "loss": 1.0562, "step": 25 }, { "epoch": 0.05, "grad_norm": 1.4140625, "learning_rate": 4.807692307692308e-06, "loss": 1.0273, "step": 26 }, { "epoch": 0.051923076923076926, "grad_norm": 1.2890625, "learning_rate": 5e-06, "loss": 1.0713, "step": 27 }, { "epoch": 0.05384615384615385, "grad_norm": 1.2578125, "learning_rate": 5.192307692307693e-06, "loss": 1.0218, "step": 28 }, { "epoch": 0.05576923076923077, "grad_norm": 1.2265625, "learning_rate": 5.384615384615385e-06, "loss": 1.0322, "step": 29 }, { "epoch": 0.057692307692307696, "grad_norm": 1.125, "learning_rate": 5.576923076923077e-06, "loss": 0.9993, "step": 30 }, { "epoch": 0.05961538461538462, "grad_norm": 1.1796875, "learning_rate": 5.769230769230769e-06, "loss": 0.9909, "step": 31 }, { "epoch": 0.06153846153846154, "grad_norm": 1.078125, "learning_rate": 5.961538461538462e-06, "loss": 1.0497, "step": 32 }, { "epoch": 0.06346153846153846, "grad_norm": 1.078125, "learning_rate": 6.153846153846155e-06, "loss": 1.0699, "step": 33 }, { "epoch": 0.06538461538461539, "grad_norm": 1.1171875, "learning_rate": 6.3461538461538466e-06, "loss": 1.0065, "step": 34 }, { "epoch": 0.0673076923076923, "grad_norm": 1.03125, "learning_rate": 6.538461538461539e-06, "loss": 0.9839, "step": 35 }, { "epoch": 0.06923076923076923, "grad_norm": 1.421875, "learning_rate": 6.730769230769232e-06, "loss": 1.0069, "step": 36 }, { "epoch": 0.07115384615384615, "grad_norm": 1.0546875, "learning_rate": 6.923076923076923e-06, "loss": 1.0441, "step": 37 }, { "epoch": 0.07307692307692308, "grad_norm": 1.015625, "learning_rate": 7.115384615384616e-06, "loss": 0.9343, "step": 38 }, { "epoch": 0.075, "grad_norm": 1.0234375, "learning_rate": 7.307692307692308e-06, "loss": 0.9853, "step": 39 }, { "epoch": 0.07692307692307693, "grad_norm": 0.98046875, "learning_rate": 7.500000000000001e-06, "loss": 0.9502, "step": 40 }, { "epoch": 0.07884615384615384, "grad_norm": 0.984375, "learning_rate": 7.692307692307694e-06, "loss": 0.987, "step": 41 }, { "epoch": 0.08076923076923077, "grad_norm": 1.0234375, "learning_rate": 7.884615384615384e-06, "loss": 1.009, "step": 42 }, { "epoch": 0.08269230769230769, "grad_norm": 0.9765625, "learning_rate": 8.076923076923077e-06, "loss": 1.0174, "step": 43 }, { "epoch": 0.08461538461538462, "grad_norm": 0.99609375, "learning_rate": 8.26923076923077e-06, "loss": 1.0515, "step": 44 }, { "epoch": 0.08653846153846154, "grad_norm": 1.0234375, "learning_rate": 8.461538461538462e-06, "loss": 1.0196, "step": 45 }, { "epoch": 0.08846153846153847, "grad_norm": 0.94921875, "learning_rate": 8.653846153846155e-06, "loss": 0.9957, "step": 46 }, { "epoch": 0.09038461538461538, "grad_norm": 1.0078125, "learning_rate": 8.846153846153847e-06, "loss": 0.9422, "step": 47 }, { "epoch": 0.09230769230769231, "grad_norm": 0.97265625, "learning_rate": 9.03846153846154e-06, "loss": 0.9527, "step": 48 }, { "epoch": 0.09423076923076923, "grad_norm": 1.015625, "learning_rate": 9.230769230769232e-06, "loss": 0.9944, "step": 49 }, { "epoch": 0.09615384615384616, "grad_norm": 1.0078125, "learning_rate": 9.423076923076923e-06, "loss": 0.9794, "step": 50 }, { "epoch": 0.09807692307692308, "grad_norm": 1.03125, "learning_rate": 9.615384615384616e-06, "loss": 1.0226, "step": 51 }, { "epoch": 0.1, "grad_norm": 1.0, "learning_rate": 9.807692307692308e-06, "loss": 1.0325, "step": 52 }, { "epoch": 0.10192307692307692, "grad_norm": 1.0234375, "learning_rate": 1e-05, "loss": 0.9893, "step": 53 }, { "epoch": 0.10384615384615385, "grad_norm": 0.98046875, "learning_rate": 9.999974723001716e-06, "loss": 1.0543, "step": 54 }, { "epoch": 0.10576923076923077, "grad_norm": 1.03125, "learning_rate": 9.999898892262433e-06, "loss": 1.0227, "step": 55 }, { "epoch": 0.1076923076923077, "grad_norm": 1.046875, "learning_rate": 9.999772508548863e-06, "loss": 1.067, "step": 56 }, { "epoch": 0.10961538461538461, "grad_norm": 1.015625, "learning_rate": 9.999595573138845e-06, "loss": 0.9794, "step": 57 }, { "epoch": 0.11153846153846154, "grad_norm": 1.0625, "learning_rate": 9.999368087821337e-06, "loss": 1.0166, "step": 58 }, { "epoch": 0.11346153846153846, "grad_norm": 1.0546875, "learning_rate": 9.999090054896397e-06, "loss": 1.0092, "step": 59 }, { "epoch": 0.11538461538461539, "grad_norm": 1.015625, "learning_rate": 9.99876147717516e-06, "loss": 0.9518, "step": 60 }, { "epoch": 0.11730769230769231, "grad_norm": 1.0390625, "learning_rate": 9.99838235797981e-06, "loss": 0.9558, "step": 61 }, { "epoch": 0.11923076923076924, "grad_norm": 1.078125, "learning_rate": 9.997952701143547e-06, "loss": 1.0134, "step": 62 }, { "epoch": 0.12115384615384615, "grad_norm": 1.03125, "learning_rate": 9.997472511010543e-06, "loss": 0.8941, "step": 63 }, { "epoch": 0.12307692307692308, "grad_norm": 1.046875, "learning_rate": 9.996941792435903e-06, "loss": 0.9207, "step": 64 }, { "epoch": 0.125, "grad_norm": 1.0625, "learning_rate": 9.996360550785619e-06, "loss": 0.9351, "step": 65 }, { "epoch": 0.125, "eval_loss": 1.0073611736297607, "eval_runtime": 34.5277, "eval_samples_per_second": 67.859, "eval_steps_per_second": 16.972, "step": 65 }, { "epoch": 0.12692307692307692, "grad_norm": 1.046875, "learning_rate": 9.995728791936505e-06, "loss": 1.0024, "step": 66 }, { "epoch": 0.12884615384615383, "grad_norm": 1.109375, "learning_rate": 9.995046522276152e-06, "loss": 1.0015, "step": 67 }, { "epoch": 0.13076923076923078, "grad_norm": 1.3359375, "learning_rate": 9.994313748702848e-06, "loss": 0.9595, "step": 68 }, { "epoch": 0.1326923076923077, "grad_norm": 1.0625, "learning_rate": 9.993530478625524e-06, "loss": 0.9331, "step": 69 }, { "epoch": 0.1346153846153846, "grad_norm": 1.0390625, "learning_rate": 9.992696719963662e-06, "loss": 0.9174, "step": 70 }, { "epoch": 0.13653846153846153, "grad_norm": 1.0546875, "learning_rate": 9.99181248114723e-06, "loss": 1.009, "step": 71 }, { "epoch": 0.13846153846153847, "grad_norm": 1.0390625, "learning_rate": 9.990877771116588e-06, "loss": 0.9661, "step": 72 }, { "epoch": 0.14038461538461539, "grad_norm": 1.078125, "learning_rate": 9.989892599322404e-06, "loss": 0.9398, "step": 73 }, { "epoch": 0.1423076923076923, "grad_norm": 1.078125, "learning_rate": 9.988856975725551e-06, "loss": 0.9973, "step": 74 }, { "epoch": 0.14423076923076922, "grad_norm": 1.046875, "learning_rate": 9.987770910797014e-06, "loss": 0.8935, "step": 75 }, { "epoch": 0.14615384615384616, "grad_norm": 1.09375, "learning_rate": 9.986634415517774e-06, "loss": 0.958, "step": 76 }, { "epoch": 0.14807692307692308, "grad_norm": 1.0390625, "learning_rate": 9.985447501378706e-06, "loss": 0.9349, "step": 77 }, { "epoch": 0.15, "grad_norm": 1.078125, "learning_rate": 9.984210180380464e-06, "loss": 0.9474, "step": 78 }, { "epoch": 0.1519230769230769, "grad_norm": 1.0234375, "learning_rate": 9.98292246503335e-06, "loss": 0.9704, "step": 79 }, { "epoch": 0.15384615384615385, "grad_norm": 1.0625, "learning_rate": 9.981584368357198e-06, "loss": 0.9745, "step": 80 }, { "epoch": 0.15576923076923077, "grad_norm": 1.0859375, "learning_rate": 9.980195903881231e-06, "loss": 0.9527, "step": 81 }, { "epoch": 0.1576923076923077, "grad_norm": 1.0859375, "learning_rate": 9.978757085643937e-06, "loss": 0.9732, "step": 82 }, { "epoch": 0.1596153846153846, "grad_norm": 1.09375, "learning_rate": 9.97726792819292e-06, "loss": 0.9304, "step": 83 }, { "epoch": 0.16153846153846155, "grad_norm": 1.03125, "learning_rate": 9.975728446584748e-06, "loss": 0.9999, "step": 84 }, { "epoch": 0.16346153846153846, "grad_norm": 1.0625, "learning_rate": 9.974138656384815e-06, "loss": 0.9477, "step": 85 }, { "epoch": 0.16538461538461538, "grad_norm": 1.0234375, "learning_rate": 9.97249857366717e-06, "loss": 0.9661, "step": 86 }, { "epoch": 0.1673076923076923, "grad_norm": 1.078125, "learning_rate": 9.970808215014357e-06, "loss": 0.9763, "step": 87 }, { "epoch": 0.16923076923076924, "grad_norm": 1.0703125, "learning_rate": 9.969067597517255e-06, "loss": 0.9292, "step": 88 }, { "epoch": 0.17115384615384616, "grad_norm": 1.0625, "learning_rate": 9.967276738774897e-06, "loss": 0.9083, "step": 89 }, { "epoch": 0.17307692307692307, "grad_norm": 1.1328125, "learning_rate": 9.9654356568943e-06, "loss": 0.9764, "step": 90 }, { "epoch": 0.175, "grad_norm": 1.109375, "learning_rate": 9.96354437049027e-06, "loss": 0.9924, "step": 91 }, { "epoch": 0.17692307692307693, "grad_norm": 1.09375, "learning_rate": 9.961602898685225e-06, "loss": 0.9551, "step": 92 }, { "epoch": 0.17884615384615385, "grad_norm": 1.109375, "learning_rate": 9.959611261108999e-06, "loss": 1.0074, "step": 93 }, { "epoch": 0.18076923076923077, "grad_norm": 1.09375, "learning_rate": 9.957569477898636e-06, "loss": 1.0348, "step": 94 }, { "epoch": 0.18269230769230768, "grad_norm": 1.0859375, "learning_rate": 9.955477569698197e-06, "loss": 0.9745, "step": 95 }, { "epoch": 0.18461538461538463, "grad_norm": 1.0625, "learning_rate": 9.95333555765855e-06, "loss": 0.9278, "step": 96 }, { "epoch": 0.18653846153846154, "grad_norm": 1.0859375, "learning_rate": 9.951143463437145e-06, "loss": 0.9497, "step": 97 }, { "epoch": 0.18846153846153846, "grad_norm": 1.109375, "learning_rate": 9.948901309197807e-06, "loss": 1.0283, "step": 98 }, { "epoch": 0.19038461538461537, "grad_norm": 1.078125, "learning_rate": 9.946609117610508e-06, "loss": 0.9384, "step": 99 }, { "epoch": 0.19230769230769232, "grad_norm": 1.0546875, "learning_rate": 9.94426691185114e-06, "loss": 1.0316, "step": 100 }, { "epoch": 0.19423076923076923, "grad_norm": 1.1328125, "learning_rate": 9.94187471560127e-06, "loss": 0.9401, "step": 101 }, { "epoch": 0.19615384615384615, "grad_norm": 1.0703125, "learning_rate": 9.939432553047919e-06, "loss": 0.9112, "step": 102 }, { "epoch": 0.19807692307692307, "grad_norm": 1.0859375, "learning_rate": 9.936940448883299e-06, "loss": 0.9085, "step": 103 }, { "epoch": 0.2, "grad_norm": 1.09375, "learning_rate": 9.934398428304577e-06, "loss": 0.9583, "step": 104 }, { "epoch": 0.20192307692307693, "grad_norm": 1.1171875, "learning_rate": 9.931806517013612e-06, "loss": 0.9923, "step": 105 }, { "epoch": 0.20384615384615384, "grad_norm": 1.09375, "learning_rate": 9.929164741216702e-06, "loss": 0.9281, "step": 106 }, { "epoch": 0.20576923076923076, "grad_norm": 2.03125, "learning_rate": 9.926473127624306e-06, "loss": 0.9767, "step": 107 }, { "epoch": 0.2076923076923077, "grad_norm": 1.0625, "learning_rate": 9.923731703450794e-06, "loss": 0.9255, "step": 108 }, { "epoch": 0.20961538461538462, "grad_norm": 1.109375, "learning_rate": 9.920940496414153e-06, "loss": 0.9394, "step": 109 }, { "epoch": 0.21153846153846154, "grad_norm": 1.0703125, "learning_rate": 9.91809953473572e-06, "loss": 0.8881, "step": 110 }, { "epoch": 0.21346153846153845, "grad_norm": 1.0703125, "learning_rate": 9.915208847139883e-06, "loss": 0.9327, "step": 111 }, { "epoch": 0.2153846153846154, "grad_norm": 1.09375, "learning_rate": 9.912268462853811e-06, "loss": 0.9637, "step": 112 }, { "epoch": 0.2173076923076923, "grad_norm": 1.09375, "learning_rate": 9.909278411607134e-06, "loss": 0.904, "step": 113 }, { "epoch": 0.21923076923076923, "grad_norm": 1.0859375, "learning_rate": 9.906238723631662e-06, "loss": 1.0109, "step": 114 }, { "epoch": 0.22115384615384615, "grad_norm": 1.109375, "learning_rate": 9.903149429661072e-06, "loss": 0.9612, "step": 115 }, { "epoch": 0.2230769230769231, "grad_norm": 1.140625, "learning_rate": 9.90001056093059e-06, "loss": 0.966, "step": 116 }, { "epoch": 0.225, "grad_norm": 1.0859375, "learning_rate": 9.896822149176695e-06, "loss": 0.9076, "step": 117 }, { "epoch": 0.22692307692307692, "grad_norm": 1.1328125, "learning_rate": 9.893584226636773e-06, "loss": 0.9435, "step": 118 }, { "epoch": 0.22884615384615384, "grad_norm": 1.109375, "learning_rate": 9.89029682604881e-06, "loss": 0.9738, "step": 119 }, { "epoch": 0.23076923076923078, "grad_norm": 1.09375, "learning_rate": 9.886959980651056e-06, "loss": 1.0009, "step": 120 }, { "epoch": 0.2326923076923077, "grad_norm": 1.046875, "learning_rate": 9.883573724181683e-06, "loss": 0.9864, "step": 121 }, { "epoch": 0.23461538461538461, "grad_norm": 1.125, "learning_rate": 9.880138090878452e-06, "loss": 0.9537, "step": 122 }, { "epoch": 0.23653846153846153, "grad_norm": 1.09375, "learning_rate": 9.87665311547836e-06, "loss": 1.0204, "step": 123 }, { "epoch": 0.23846153846153847, "grad_norm": 1.1328125, "learning_rate": 9.873118833217294e-06, "loss": 0.9623, "step": 124 }, { "epoch": 0.2403846153846154, "grad_norm": 1.078125, "learning_rate": 9.869535279829674e-06, "loss": 0.9458, "step": 125 }, { "epoch": 0.2423076923076923, "grad_norm": 1.0703125, "learning_rate": 9.86590249154809e-06, "loss": 0.9223, "step": 126 }, { "epoch": 0.24423076923076922, "grad_norm": 1.078125, "learning_rate": 9.862220505102933e-06, "loss": 0.9847, "step": 127 }, { "epoch": 0.24615384615384617, "grad_norm": 1.1015625, "learning_rate": 9.858489357722028e-06, "loss": 0.9633, "step": 128 }, { "epoch": 0.24807692307692308, "grad_norm": 1.1328125, "learning_rate": 9.854709087130261e-06, "loss": 0.896, "step": 129 }, { "epoch": 0.25, "grad_norm": 1.078125, "learning_rate": 9.850879731549188e-06, "loss": 0.8884, "step": 130 }, { "epoch": 0.25, "eval_loss": 0.9758404493331909, "eval_runtime": 34.4304, "eval_samples_per_second": 68.05, "eval_steps_per_second": 17.02, "step": 130 }, { "epoch": 0.2519230769230769, "grad_norm": 1.078125, "learning_rate": 9.847001329696653e-06, "loss": 0.9047, "step": 131 }, { "epoch": 0.25384615384615383, "grad_norm": 1.1015625, "learning_rate": 9.843073920786402e-06, "loss": 0.979, "step": 132 }, { "epoch": 0.25576923076923075, "grad_norm": 1.09375, "learning_rate": 9.839097544527674e-06, "loss": 0.9224, "step": 133 }, { "epoch": 0.25769230769230766, "grad_norm": 1.1328125, "learning_rate": 9.835072241124815e-06, "loss": 0.8739, "step": 134 }, { "epoch": 0.25961538461538464, "grad_norm": 1.125, "learning_rate": 9.830998051276858e-06, "loss": 0.9326, "step": 135 }, { "epoch": 0.26153846153846155, "grad_norm": 1.0859375, "learning_rate": 9.82687501617712e-06, "loss": 0.9303, "step": 136 }, { "epoch": 0.26346153846153847, "grad_norm": 1.1171875, "learning_rate": 9.822703177512783e-06, "loss": 1.0002, "step": 137 }, { "epoch": 0.2653846153846154, "grad_norm": 1.125, "learning_rate": 9.818482577464466e-06, "loss": 0.9562, "step": 138 }, { "epoch": 0.2673076923076923, "grad_norm": 1.140625, "learning_rate": 9.814213258705813e-06, "loss": 0.9212, "step": 139 }, { "epoch": 0.2692307692307692, "grad_norm": 1.125, "learning_rate": 9.809895264403046e-06, "loss": 0.9679, "step": 140 }, { "epoch": 0.27115384615384613, "grad_norm": 1.1015625, "learning_rate": 9.805528638214543e-06, "loss": 0.9903, "step": 141 }, { "epoch": 0.27307692307692305, "grad_norm": 1.1171875, "learning_rate": 9.801113424290381e-06, "loss": 0.9754, "step": 142 }, { "epoch": 0.275, "grad_norm": 1.125, "learning_rate": 9.796649667271905e-06, "loss": 0.8977, "step": 143 }, { "epoch": 0.27692307692307694, "grad_norm": 1.109375, "learning_rate": 9.792137412291265e-06, "loss": 0.8798, "step": 144 }, { "epoch": 0.27884615384615385, "grad_norm": 1.15625, "learning_rate": 9.787576704970965e-06, "loss": 0.9382, "step": 145 }, { "epoch": 0.28076923076923077, "grad_norm": 1.1796875, "learning_rate": 9.7829675914234e-06, "loss": 0.9692, "step": 146 }, { "epoch": 0.2826923076923077, "grad_norm": 1.125, "learning_rate": 9.778310118250397e-06, "loss": 0.8939, "step": 147 }, { "epoch": 0.2846153846153846, "grad_norm": 1.1640625, "learning_rate": 9.77360433254273e-06, "loss": 1.0047, "step": 148 }, { "epoch": 0.2865384615384615, "grad_norm": 1.1328125, "learning_rate": 9.768850281879651e-06, "loss": 0.9484, "step": 149 }, { "epoch": 0.28846153846153844, "grad_norm": 1.1171875, "learning_rate": 9.764048014328417e-06, "loss": 0.9004, "step": 150 }, { "epoch": 0.2903846153846154, "grad_norm": 1.1171875, "learning_rate": 9.759197578443787e-06, "loss": 0.9281, "step": 151 }, { "epoch": 0.2923076923076923, "grad_norm": 1.2109375, "learning_rate": 9.754299023267548e-06, "loss": 0.9638, "step": 152 }, { "epoch": 0.29423076923076924, "grad_norm": 1.1640625, "learning_rate": 9.74935239832801e-06, "loss": 0.9366, "step": 153 }, { "epoch": 0.29615384615384616, "grad_norm": 1.1796875, "learning_rate": 9.7443577536395e-06, "loss": 0.9125, "step": 154 }, { "epoch": 0.2980769230769231, "grad_norm": 1.125, "learning_rate": 9.739315139701868e-06, "loss": 0.9226, "step": 155 }, { "epoch": 0.3, "grad_norm": 1.1484375, "learning_rate": 9.734224607499978e-06, "loss": 0.9384, "step": 156 }, { "epoch": 0.3019230769230769, "grad_norm": 1.2890625, "learning_rate": 9.729086208503174e-06, "loss": 0.8788, "step": 157 }, { "epoch": 0.3038461538461538, "grad_norm": 1.15625, "learning_rate": 9.723899994664779e-06, "loss": 0.9672, "step": 158 }, { "epoch": 0.3057692307692308, "grad_norm": 1.125, "learning_rate": 9.71866601842156e-06, "loss": 0.8889, "step": 159 }, { "epoch": 0.3076923076923077, "grad_norm": 1.15625, "learning_rate": 9.713384332693199e-06, "loss": 0.8975, "step": 160 }, { "epoch": 0.3096153846153846, "grad_norm": 1.1015625, "learning_rate": 9.708054990881763e-06, "loss": 0.9614, "step": 161 }, { "epoch": 0.31153846153846154, "grad_norm": 1.125, "learning_rate": 9.702678046871157e-06, "loss": 0.9061, "step": 162 }, { "epoch": 0.31346153846153846, "grad_norm": 1.1484375, "learning_rate": 9.69725355502658e-06, "loss": 0.9322, "step": 163 }, { "epoch": 0.3153846153846154, "grad_norm": 1.15625, "learning_rate": 9.691781570193983e-06, "loss": 0.9512, "step": 164 }, { "epoch": 0.3173076923076923, "grad_norm": 1.1328125, "learning_rate": 9.686262147699507e-06, "loss": 1.0271, "step": 165 }, { "epoch": 0.3192307692307692, "grad_norm": 1.125, "learning_rate": 9.680695343348923e-06, "loss": 0.9529, "step": 166 }, { "epoch": 0.3211538461538462, "grad_norm": 1.203125, "learning_rate": 9.675081213427076e-06, "loss": 0.9636, "step": 167 }, { "epoch": 0.3230769230769231, "grad_norm": 1.109375, "learning_rate": 9.669419814697303e-06, "loss": 0.8879, "step": 168 }, { "epoch": 0.325, "grad_norm": 1.125, "learning_rate": 9.663711204400872e-06, "loss": 0.8889, "step": 169 }, { "epoch": 0.3269230769230769, "grad_norm": 1.0546875, "learning_rate": 9.657955440256396e-06, "loss": 0.9735, "step": 170 }, { "epoch": 0.32884615384615384, "grad_norm": 1.1015625, "learning_rate": 9.65215258045925e-06, "loss": 1.0088, "step": 171 }, { "epoch": 0.33076923076923076, "grad_norm": 1.1015625, "learning_rate": 9.64630268368099e-06, "loss": 0.9318, "step": 172 }, { "epoch": 0.3326923076923077, "grad_norm": 1.09375, "learning_rate": 9.640405809068743e-06, "loss": 0.9902, "step": 173 }, { "epoch": 0.3346153846153846, "grad_norm": 1.109375, "learning_rate": 9.634462016244625e-06, "loss": 0.9315, "step": 174 }, { "epoch": 0.33653846153846156, "grad_norm": 1.0546875, "learning_rate": 9.628471365305134e-06, "loss": 0.931, "step": 175 }, { "epoch": 0.3384615384615385, "grad_norm": 1.1015625, "learning_rate": 9.622433916820539e-06, "loss": 0.9167, "step": 176 }, { "epoch": 0.3403846153846154, "grad_norm": 1.109375, "learning_rate": 9.616349731834271e-06, "loss": 0.9214, "step": 177 }, { "epoch": 0.3423076923076923, "grad_norm": 1.1171875, "learning_rate": 9.610218871862303e-06, "loss": 0.945, "step": 178 }, { "epoch": 0.34423076923076923, "grad_norm": 1.1171875, "learning_rate": 9.604041398892528e-06, "loss": 0.9445, "step": 179 }, { "epoch": 0.34615384615384615, "grad_norm": 1.140625, "learning_rate": 9.597817375384138e-06, "loss": 1.0135, "step": 180 }, { "epoch": 0.34807692307692306, "grad_norm": 1.3203125, "learning_rate": 9.591546864266983e-06, "loss": 0.9393, "step": 181 }, { "epoch": 0.35, "grad_norm": 1.1171875, "learning_rate": 9.585229928940944e-06, "loss": 0.9273, "step": 182 }, { "epoch": 0.35192307692307695, "grad_norm": 1.1171875, "learning_rate": 9.578866633275289e-06, "loss": 1.0091, "step": 183 }, { "epoch": 0.35384615384615387, "grad_norm": 1.0859375, "learning_rate": 9.572457041608018e-06, "loss": 0.9301, "step": 184 }, { "epoch": 0.3557692307692308, "grad_norm": 1.046875, "learning_rate": 9.56600121874523e-06, "loss": 0.991, "step": 185 }, { "epoch": 0.3576923076923077, "grad_norm": 1.125, "learning_rate": 9.55949922996045e-06, "loss": 0.8897, "step": 186 }, { "epoch": 0.3596153846153846, "grad_norm": 1.109375, "learning_rate": 9.55295114099399e-06, "loss": 0.8844, "step": 187 }, { "epoch": 0.36153846153846153, "grad_norm": 1.0703125, "learning_rate": 9.546357018052254e-06, "loss": 0.9226, "step": 188 }, { "epoch": 0.36346153846153845, "grad_norm": 1.078125, "learning_rate": 9.539716927807102e-06, "loss": 0.9273, "step": 189 }, { "epoch": 0.36538461538461536, "grad_norm": 1.109375, "learning_rate": 9.533030937395151e-06, "loss": 0.9116, "step": 190 }, { "epoch": 0.36730769230769234, "grad_norm": 1.0703125, "learning_rate": 9.526299114417108e-06, "loss": 0.9732, "step": 191 }, { "epoch": 0.36923076923076925, "grad_norm": 1.078125, "learning_rate": 9.519521526937087e-06, "loss": 0.918, "step": 192 }, { "epoch": 0.37115384615384617, "grad_norm": 1.03125, "learning_rate": 9.512698243481914e-06, "loss": 0.9045, "step": 193 }, { "epoch": 0.3730769230769231, "grad_norm": 1.078125, "learning_rate": 9.505829333040437e-06, "loss": 0.9189, "step": 194 }, { "epoch": 0.375, "grad_norm": 1.1015625, "learning_rate": 9.498914865062831e-06, "loss": 0.9853, "step": 195 }, { "epoch": 0.375, "eval_loss": 0.9608431458473206, "eval_runtime": 34.5143, "eval_samples_per_second": 67.885, "eval_steps_per_second": 16.978, "step": 195 }, { "epoch": 0.3769230769230769, "grad_norm": 1.046875, "learning_rate": 9.491954909459895e-06, "loss": 0.9078, "step": 196 }, { "epoch": 0.37884615384615383, "grad_norm": 1.0703125, "learning_rate": 9.484949536602343e-06, "loss": 0.8859, "step": 197 }, { "epoch": 0.38076923076923075, "grad_norm": 1.0390625, "learning_rate": 9.477898817320094e-06, "loss": 0.9183, "step": 198 }, { "epoch": 0.38269230769230766, "grad_norm": 1.0390625, "learning_rate": 9.470802822901558e-06, "loss": 0.918, "step": 199 }, { "epoch": 0.38461538461538464, "grad_norm": 1.0625, "learning_rate": 9.463661625092907e-06, "loss": 0.9444, "step": 200 }, { "epoch": 0.38653846153846155, "grad_norm": 1.046875, "learning_rate": 9.45647529609736e-06, "loss": 0.8958, "step": 201 }, { "epoch": 0.38846153846153847, "grad_norm": 1.046875, "learning_rate": 9.44924390857445e-06, "loss": 0.9071, "step": 202 }, { "epoch": 0.3903846153846154, "grad_norm": 1.09375, "learning_rate": 9.44196753563928e-06, "loss": 0.962, "step": 203 }, { "epoch": 0.3923076923076923, "grad_norm": 1.0546875, "learning_rate": 9.434646250861801e-06, "loss": 0.9814, "step": 204 }, { "epoch": 0.3942307692307692, "grad_norm": 1.09375, "learning_rate": 9.427280128266049e-06, "loss": 0.9873, "step": 205 }, { "epoch": 0.39615384615384613, "grad_norm": 1.15625, "learning_rate": 9.419869242329417e-06, "loss": 0.9034, "step": 206 }, { "epoch": 0.39807692307692305, "grad_norm": 1.0703125, "learning_rate": 9.412413667981884e-06, "loss": 0.8953, "step": 207 }, { "epoch": 0.4, "grad_norm": 1.1015625, "learning_rate": 9.404913480605264e-06, "loss": 1.0005, "step": 208 }, { "epoch": 0.40192307692307694, "grad_norm": 1.0625, "learning_rate": 9.397368756032445e-06, "loss": 0.9455, "step": 209 }, { "epoch": 0.40384615384615385, "grad_norm": 1.0234375, "learning_rate": 9.389779570546628e-06, "loss": 0.9336, "step": 210 }, { "epoch": 0.40576923076923077, "grad_norm": 1.1953125, "learning_rate": 9.38214600088054e-06, "loss": 0.8907, "step": 211 }, { "epoch": 0.4076923076923077, "grad_norm": 1.078125, "learning_rate": 9.374468124215676e-06, "loss": 0.8735, "step": 212 }, { "epoch": 0.4096153846153846, "grad_norm": 1.0625, "learning_rate": 9.366746018181503e-06, "loss": 0.9682, "step": 213 }, { "epoch": 0.4115384615384615, "grad_norm": 1.0078125, "learning_rate": 9.358979760854686e-06, "loss": 0.9069, "step": 214 }, { "epoch": 0.41346153846153844, "grad_norm": 0.99609375, "learning_rate": 9.351169430758293e-06, "loss": 0.9371, "step": 215 }, { "epoch": 0.4153846153846154, "grad_norm": 1.0703125, "learning_rate": 9.343315106861008e-06, "loss": 0.9691, "step": 216 }, { "epoch": 0.4173076923076923, "grad_norm": 1.0234375, "learning_rate": 9.33541686857632e-06, "loss": 0.9521, "step": 217 }, { "epoch": 0.41923076923076924, "grad_norm": 1.0078125, "learning_rate": 9.327474795761734e-06, "loss": 0.9503, "step": 218 }, { "epoch": 0.42115384615384616, "grad_norm": 1.015625, "learning_rate": 9.31948896871795e-06, "loss": 0.9311, "step": 219 }, { "epoch": 0.4230769230769231, "grad_norm": 1.046875, "learning_rate": 9.311459468188066e-06, "loss": 0.8998, "step": 220 }, { "epoch": 0.425, "grad_norm": 1.0859375, "learning_rate": 9.303386375356752e-06, "loss": 0.8776, "step": 221 }, { "epoch": 0.4269230769230769, "grad_norm": 1.0703125, "learning_rate": 9.295269771849426e-06, "loss": 0.9003, "step": 222 }, { "epoch": 0.4288461538461538, "grad_norm": 0.98046875, "learning_rate": 9.28710973973144e-06, "loss": 0.9597, "step": 223 }, { "epoch": 0.4307692307692308, "grad_norm": 0.96875, "learning_rate": 9.278906361507238e-06, "loss": 0.8842, "step": 224 }, { "epoch": 0.4326923076923077, "grad_norm": 1.0078125, "learning_rate": 9.270659720119533e-06, "loss": 0.9524, "step": 225 }, { "epoch": 0.4346153846153846, "grad_norm": 1.0625, "learning_rate": 9.262369898948462e-06, "loss": 0.9271, "step": 226 }, { "epoch": 0.43653846153846154, "grad_norm": 1.078125, "learning_rate": 9.254036981810741e-06, "loss": 0.9627, "step": 227 }, { "epoch": 0.43846153846153846, "grad_norm": 1.015625, "learning_rate": 9.245661052958823e-06, "loss": 0.9305, "step": 228 }, { "epoch": 0.4403846153846154, "grad_norm": 1.03125, "learning_rate": 9.237242197080045e-06, "loss": 0.9855, "step": 229 }, { "epoch": 0.4423076923076923, "grad_norm": 1.0390625, "learning_rate": 9.22878049929577e-06, "loss": 0.9237, "step": 230 }, { "epoch": 0.4442307692307692, "grad_norm": 1.0078125, "learning_rate": 9.220276045160524e-06, "loss": 0.9733, "step": 231 }, { "epoch": 0.4461538461538462, "grad_norm": 0.9765625, "learning_rate": 9.211728920661136e-06, "loss": 0.9613, "step": 232 }, { "epoch": 0.4480769230769231, "grad_norm": 0.9609375, "learning_rate": 9.203139212215868e-06, "loss": 0.9317, "step": 233 }, { "epoch": 0.45, "grad_norm": 0.99609375, "learning_rate": 9.19450700667354e-06, "loss": 0.9067, "step": 234 }, { "epoch": 0.4519230769230769, "grad_norm": 0.9296875, "learning_rate": 9.185832391312644e-06, "loss": 0.9088, "step": 235 }, { "epoch": 0.45384615384615384, "grad_norm": 0.953125, "learning_rate": 9.17711545384048e-06, "loss": 0.9359, "step": 236 }, { "epoch": 0.45576923076923076, "grad_norm": 1.015625, "learning_rate": 9.168356282392253e-06, "loss": 0.96, "step": 237 }, { "epoch": 0.4576923076923077, "grad_norm": 0.90234375, "learning_rate": 9.159554965530184e-06, "loss": 0.9193, "step": 238 }, { "epoch": 0.4596153846153846, "grad_norm": 0.93359375, "learning_rate": 9.150711592242627e-06, "loss": 0.9439, "step": 239 }, { "epoch": 0.46153846153846156, "grad_norm": 1.0078125, "learning_rate": 9.14182625194315e-06, "loss": 0.8804, "step": 240 }, { "epoch": 0.4634615384615385, "grad_norm": 0.9375, "learning_rate": 9.132899034469648e-06, "loss": 0.916, "step": 241 }, { "epoch": 0.4653846153846154, "grad_norm": 0.98828125, "learning_rate": 9.123930030083425e-06, "loss": 0.8793, "step": 242 }, { "epoch": 0.4673076923076923, "grad_norm": 0.93359375, "learning_rate": 9.114919329468283e-06, "loss": 0.9162, "step": 243 }, { "epoch": 0.46923076923076923, "grad_norm": 0.98046875, "learning_rate": 9.10586702372961e-06, "loss": 0.9098, "step": 244 }, { "epoch": 0.47115384615384615, "grad_norm": 0.91015625, "learning_rate": 9.09677320439345e-06, "loss": 0.9219, "step": 245 }, { "epoch": 0.47307692307692306, "grad_norm": 0.921875, "learning_rate": 9.087637963405586e-06, "loss": 0.9554, "step": 246 }, { "epoch": 0.475, "grad_norm": 0.9140625, "learning_rate": 9.07846139313061e-06, "loss": 0.9994, "step": 247 }, { "epoch": 0.47692307692307695, "grad_norm": 0.96875, "learning_rate": 9.069243586350976e-06, "loss": 0.9052, "step": 248 }, { "epoch": 0.47884615384615387, "grad_norm": 0.94921875, "learning_rate": 9.059984636266082e-06, "loss": 0.9232, "step": 249 }, { "epoch": 0.4807692307692308, "grad_norm": 0.97265625, "learning_rate": 9.050684636491317e-06, "loss": 0.8964, "step": 250 }, { "epoch": 0.4826923076923077, "grad_norm": 0.91015625, "learning_rate": 9.041343681057106e-06, "loss": 0.9386, "step": 251 }, { "epoch": 0.4846153846153846, "grad_norm": 0.8984375, "learning_rate": 9.03196186440798e-06, "loss": 0.9078, "step": 252 }, { "epoch": 0.48653846153846153, "grad_norm": 0.96875, "learning_rate": 9.022539281401601e-06, "loss": 0.9056, "step": 253 }, { "epoch": 0.48846153846153845, "grad_norm": 0.8671875, "learning_rate": 9.013076027307817e-06, "loss": 0.8973, "step": 254 }, { "epoch": 0.49038461538461536, "grad_norm": 0.921875, "learning_rate": 9.00357219780769e-06, "loss": 0.9663, "step": 255 }, { "epoch": 0.49230769230769234, "grad_norm": 0.90234375, "learning_rate": 8.994027888992533e-06, "loss": 0.8857, "step": 256 }, { "epoch": 0.49423076923076925, "grad_norm": 0.9921875, "learning_rate": 8.984443197362938e-06, "loss": 0.9815, "step": 257 }, { "epoch": 0.49615384615384617, "grad_norm": 0.91015625, "learning_rate": 8.974818219827796e-06, "loss": 0.9693, "step": 258 }, { "epoch": 0.4980769230769231, "grad_norm": 1.2265625, "learning_rate": 8.965153053703325e-06, "loss": 0.8971, "step": 259 }, { "epoch": 0.5, "grad_norm": 0.94140625, "learning_rate": 8.955447796712083e-06, "loss": 0.8998, "step": 260 }, { "epoch": 0.5, "eval_loss": 0.9489682912826538, "eval_runtime": 34.8214, "eval_samples_per_second": 67.286, "eval_steps_per_second": 16.829, "step": 260 }, { "epoch": 0.5019230769230769, "grad_norm": 0.9609375, "learning_rate": 8.94570254698197e-06, "loss": 0.9153, "step": 261 }, { "epoch": 0.5038461538461538, "grad_norm": 0.94140625, "learning_rate": 8.935917403045251e-06, "loss": 0.9449, "step": 262 }, { "epoch": 0.5057692307692307, "grad_norm": 0.91796875, "learning_rate": 8.926092463837557e-06, "loss": 0.9087, "step": 263 }, { "epoch": 0.5076923076923077, "grad_norm": 0.890625, "learning_rate": 8.916227828696873e-06, "loss": 0.8946, "step": 264 }, { "epoch": 0.5096153846153846, "grad_norm": 0.8671875, "learning_rate": 8.906323597362547e-06, "loss": 0.9261, "step": 265 }, { "epoch": 0.5115384615384615, "grad_norm": 0.8828125, "learning_rate": 8.896379869974273e-06, "loss": 0.8826, "step": 266 }, { "epoch": 0.5134615384615384, "grad_norm": 0.9609375, "learning_rate": 8.886396747071085e-06, "loss": 0.9224, "step": 267 }, { "epoch": 0.5153846153846153, "grad_norm": 0.84765625, "learning_rate": 8.876374329590331e-06, "loss": 0.8849, "step": 268 }, { "epoch": 0.5173076923076924, "grad_norm": 0.890625, "learning_rate": 8.866312718866669e-06, "loss": 0.9516, "step": 269 }, { "epoch": 0.5192307692307693, "grad_norm": 0.921875, "learning_rate": 8.85621201663102e-06, "loss": 0.9049, "step": 270 }, { "epoch": 0.5211538461538462, "grad_norm": 0.8125, "learning_rate": 8.846072325009562e-06, "loss": 0.8953, "step": 271 }, { "epoch": 0.5230769230769231, "grad_norm": 0.8046875, "learning_rate": 8.83589374652268e-06, "loss": 0.8507, "step": 272 }, { "epoch": 0.525, "grad_norm": 0.8359375, "learning_rate": 8.825676384083936e-06, "loss": 0.8904, "step": 273 }, { "epoch": 0.5269230769230769, "grad_norm": 0.83203125, "learning_rate": 8.815420340999034e-06, "loss": 0.9469, "step": 274 }, { "epoch": 0.5288461538461539, "grad_norm": 0.90234375, "learning_rate": 8.805125720964766e-06, "loss": 0.9144, "step": 275 }, { "epoch": 0.5307692307692308, "grad_norm": 0.828125, "learning_rate": 8.79479262806797e-06, "loss": 0.8757, "step": 276 }, { "epoch": 0.5326923076923077, "grad_norm": 0.83203125, "learning_rate": 8.784421166784476e-06, "loss": 0.8781, "step": 277 }, { "epoch": 0.5346153846153846, "grad_norm": 0.89453125, "learning_rate": 8.774011441978046e-06, "loss": 0.9348, "step": 278 }, { "epoch": 0.5365384615384615, "grad_norm": 0.9296875, "learning_rate": 8.763563558899317e-06, "loss": 0.9949, "step": 279 }, { "epoch": 0.5384615384615384, "grad_norm": 0.8359375, "learning_rate": 8.75307762318474e-06, "loss": 0.9171, "step": 280 }, { "epoch": 0.5403846153846154, "grad_norm": 0.890625, "learning_rate": 8.742553740855507e-06, "loss": 1.0024, "step": 281 }, { "epoch": 0.5423076923076923, "grad_norm": 0.82421875, "learning_rate": 8.731992018316478e-06, "loss": 0.8619, "step": 282 }, { "epoch": 0.5442307692307692, "grad_norm": 0.7890625, "learning_rate": 8.721392562355113e-06, "loss": 0.955, "step": 283 }, { "epoch": 0.5461538461538461, "grad_norm": 1.046875, "learning_rate": 8.71075548014038e-06, "loss": 0.9189, "step": 284 }, { "epoch": 0.5480769230769231, "grad_norm": 0.9140625, "learning_rate": 8.700080879221689e-06, "loss": 0.9118, "step": 285 }, { "epoch": 0.55, "grad_norm": 0.8203125, "learning_rate": 8.689368867527781e-06, "loss": 0.8916, "step": 286 }, { "epoch": 0.551923076923077, "grad_norm": 0.86328125, "learning_rate": 8.67861955336566e-06, "loss": 0.8934, "step": 287 }, { "epoch": 0.5538461538461539, "grad_norm": 0.875, "learning_rate": 8.667833045419483e-06, "loss": 0.8921, "step": 288 }, { "epoch": 0.5557692307692308, "grad_norm": 0.87890625, "learning_rate": 8.657009452749466e-06, "loss": 1.0005, "step": 289 }, { "epoch": 0.5576923076923077, "grad_norm": 0.85546875, "learning_rate": 8.646148884790786e-06, "loss": 0.8828, "step": 290 }, { "epoch": 0.5596153846153846, "grad_norm": 0.82421875, "learning_rate": 8.635251451352463e-06, "loss": 0.8704, "step": 291 }, { "epoch": 0.5615384615384615, "grad_norm": 0.7890625, "learning_rate": 8.624317262616261e-06, "loss": 0.9182, "step": 292 }, { "epoch": 0.5634615384615385, "grad_norm": 0.86328125, "learning_rate": 8.613346429135567e-06, "loss": 0.9128, "step": 293 }, { "epoch": 0.5653846153846154, "grad_norm": 0.83984375, "learning_rate": 8.602339061834278e-06, "loss": 0.9893, "step": 294 }, { "epoch": 0.5673076923076923, "grad_norm": 0.84375, "learning_rate": 8.591295272005674e-06, "loss": 0.9299, "step": 295 }, { "epoch": 0.5692307692307692, "grad_norm": 0.77734375, "learning_rate": 8.5802151713113e-06, "loss": 0.894, "step": 296 }, { "epoch": 0.5711538461538461, "grad_norm": 0.8359375, "learning_rate": 8.569098871779828e-06, "loss": 0.9472, "step": 297 }, { "epoch": 0.573076923076923, "grad_norm": 0.828125, "learning_rate": 8.557946485805932e-06, "loss": 0.919, "step": 298 }, { "epoch": 0.575, "grad_norm": 0.8046875, "learning_rate": 8.546758126149148e-06, "loss": 0.882, "step": 299 }, { "epoch": 0.5769230769230769, "grad_norm": 0.88671875, "learning_rate": 8.535533905932739e-06, "loss": 0.8685, "step": 300 }, { "epoch": 0.5788461538461539, "grad_norm": 0.85546875, "learning_rate": 8.524273938642539e-06, "loss": 0.8966, "step": 301 }, { "epoch": 0.5807692307692308, "grad_norm": 0.796875, "learning_rate": 8.512978338125818e-06, "loss": 0.9205, "step": 302 }, { "epoch": 0.5826923076923077, "grad_norm": 0.83984375, "learning_rate": 8.501647218590127e-06, "loss": 0.9815, "step": 303 }, { "epoch": 0.5846153846153846, "grad_norm": 0.80859375, "learning_rate": 8.490280694602142e-06, "loss": 0.9317, "step": 304 }, { "epoch": 0.5865384615384616, "grad_norm": 0.7890625, "learning_rate": 8.478878881086505e-06, "loss": 0.9498, "step": 305 }, { "epoch": 0.5884615384615385, "grad_norm": 0.7578125, "learning_rate": 8.467441893324667e-06, "loss": 0.9088, "step": 306 }, { "epoch": 0.5903846153846154, "grad_norm": 0.81640625, "learning_rate": 8.455969846953711e-06, "loss": 0.8728, "step": 307 }, { "epoch": 0.5923076923076923, "grad_norm": 0.8046875, "learning_rate": 8.444462857965198e-06, "loss": 0.9345, "step": 308 }, { "epoch": 0.5942307692307692, "grad_norm": 0.8046875, "learning_rate": 8.432921042703985e-06, "loss": 0.8951, "step": 309 }, { "epoch": 0.5961538461538461, "grad_norm": 0.8046875, "learning_rate": 8.42134451786705e-06, "loss": 1.005, "step": 310 }, { "epoch": 0.5980769230769231, "grad_norm": 0.7734375, "learning_rate": 8.409733400502311e-06, "loss": 0.919, "step": 311 }, { "epoch": 0.6, "grad_norm": 0.81640625, "learning_rate": 8.398087808007447e-06, "loss": 0.882, "step": 312 }, { "epoch": 0.6019230769230769, "grad_norm": 0.84375, "learning_rate": 8.386407858128707e-06, "loss": 0.9254, "step": 313 }, { "epoch": 0.6038461538461538, "grad_norm": 0.77734375, "learning_rate": 8.374693668959717e-06, "loss": 0.9312, "step": 314 }, { "epoch": 0.6057692307692307, "grad_norm": 0.77734375, "learning_rate": 8.362945358940295e-06, "loss": 0.9124, "step": 315 }, { "epoch": 0.6076923076923076, "grad_norm": 0.765625, "learning_rate": 8.351163046855246e-06, "loss": 0.9181, "step": 316 }, { "epoch": 0.6096153846153847, "grad_norm": 0.77734375, "learning_rate": 8.339346851833163e-06, "loss": 0.9124, "step": 317 }, { "epoch": 0.6115384615384616, "grad_norm": 0.78515625, "learning_rate": 8.327496893345223e-06, "loss": 0.9282, "step": 318 }, { "epoch": 0.6134615384615385, "grad_norm": 0.7421875, "learning_rate": 8.315613291203977e-06, "loss": 0.9125, "step": 319 }, { "epoch": 0.6153846153846154, "grad_norm": 0.7734375, "learning_rate": 8.303696165562141e-06, "loss": 0.9366, "step": 320 }, { "epoch": 0.6173076923076923, "grad_norm": 0.78515625, "learning_rate": 8.291745636911382e-06, "loss": 0.9808, "step": 321 }, { "epoch": 0.6192307692307693, "grad_norm": 0.74609375, "learning_rate": 8.279761826081096e-06, "loss": 0.9105, "step": 322 }, { "epoch": 0.6211538461538462, "grad_norm": 0.79296875, "learning_rate": 8.26774485423719e-06, "loss": 0.9444, "step": 323 }, { "epoch": 0.6230769230769231, "grad_norm": 0.79296875, "learning_rate": 8.255694842880854e-06, "loss": 0.8981, "step": 324 }, { "epoch": 0.625, "grad_norm": 0.76171875, "learning_rate": 8.243611913847337e-06, "loss": 0.8919, "step": 325 }, { "epoch": 0.625, "eval_loss": 0.9419646263122559, "eval_runtime": 34.6043, "eval_samples_per_second": 67.708, "eval_steps_per_second": 16.934, "step": 325 }, { "epoch": 0.6269230769230769, "grad_norm": 0.76171875, "learning_rate": 8.231496189304704e-06, "loss": 0.8434, "step": 326 }, { "epoch": 0.6288461538461538, "grad_norm": 0.8671875, "learning_rate": 8.21934779175262e-06, "loss": 0.912, "step": 327 }, { "epoch": 0.6307692307692307, "grad_norm": 0.7890625, "learning_rate": 8.207166844021093e-06, "loss": 0.9085, "step": 328 }, { "epoch": 0.6326923076923077, "grad_norm": 0.77734375, "learning_rate": 8.19495346926924e-06, "loss": 0.9301, "step": 329 }, { "epoch": 0.6346153846153846, "grad_norm": 0.75, "learning_rate": 8.182707790984043e-06, "loss": 0.9023, "step": 330 }, { "epoch": 0.6365384615384615, "grad_norm": 0.76953125, "learning_rate": 8.170429932979097e-06, "loss": 0.9363, "step": 331 }, { "epoch": 0.6384615384615384, "grad_norm": 0.76953125, "learning_rate": 8.15812001939336e-06, "loss": 0.927, "step": 332 }, { "epoch": 0.6403846153846153, "grad_norm": 0.8671875, "learning_rate": 8.145778174689897e-06, "loss": 0.9826, "step": 333 }, { "epoch": 0.6423076923076924, "grad_norm": 0.8125, "learning_rate": 8.133404523654626e-06, "loss": 0.922, "step": 334 }, { "epoch": 0.6442307692307693, "grad_norm": 0.7578125, "learning_rate": 8.120999191395048e-06, "loss": 0.9405, "step": 335 }, { "epoch": 0.6461538461538462, "grad_norm": 0.78515625, "learning_rate": 8.108562303338987e-06, "loss": 0.8947, "step": 336 }, { "epoch": 0.6480769230769231, "grad_norm": 0.74609375, "learning_rate": 8.096093985233323e-06, "loss": 0.9109, "step": 337 }, { "epoch": 0.65, "grad_norm": 0.8125, "learning_rate": 8.083594363142717e-06, "loss": 0.9111, "step": 338 }, { "epoch": 0.6519230769230769, "grad_norm": 0.78515625, "learning_rate": 8.071063563448341e-06, "loss": 0.8957, "step": 339 }, { "epoch": 0.6538461538461539, "grad_norm": 0.74609375, "learning_rate": 8.058501712846594e-06, "loss": 0.9003, "step": 340 }, { "epoch": 0.6557692307692308, "grad_norm": 0.7890625, "learning_rate": 8.045908938347828e-06, "loss": 0.9372, "step": 341 }, { "epoch": 0.6576923076923077, "grad_norm": 0.75390625, "learning_rate": 8.03328536727506e-06, "loss": 0.8854, "step": 342 }, { "epoch": 0.6596153846153846, "grad_norm": 0.77734375, "learning_rate": 8.020631127262681e-06, "loss": 0.9505, "step": 343 }, { "epoch": 0.6615384615384615, "grad_norm": 0.765625, "learning_rate": 8.007946346255176e-06, "loss": 0.9581, "step": 344 }, { "epoch": 0.6634615384615384, "grad_norm": 0.83984375, "learning_rate": 7.995231152505815e-06, "loss": 0.9068, "step": 345 }, { "epoch": 0.6653846153846154, "grad_norm": 0.78125, "learning_rate": 7.982485674575373e-06, "loss": 0.9159, "step": 346 }, { "epoch": 0.6673076923076923, "grad_norm": 0.765625, "learning_rate": 7.96971004133082e-06, "loss": 0.9015, "step": 347 }, { "epoch": 0.6692307692307692, "grad_norm": 0.796875, "learning_rate": 7.95690438194402e-06, "loss": 0.9531, "step": 348 }, { "epoch": 0.6711538461538461, "grad_norm": 0.8046875, "learning_rate": 7.944068825890424e-06, "loss": 0.8971, "step": 349 }, { "epoch": 0.6730769230769231, "grad_norm": 0.83203125, "learning_rate": 7.931203502947762e-06, "loss": 0.868, "step": 350 }, { "epoch": 0.675, "grad_norm": 0.734375, "learning_rate": 7.918308543194735e-06, "loss": 0.9151, "step": 351 }, { "epoch": 0.676923076923077, "grad_norm": 0.77734375, "learning_rate": 7.905384077009693e-06, "loss": 0.9949, "step": 352 }, { "epoch": 0.6788461538461539, "grad_norm": 0.77734375, "learning_rate": 7.892430235069317e-06, "loss": 0.9025, "step": 353 }, { "epoch": 0.6807692307692308, "grad_norm": 0.76171875, "learning_rate": 7.879447148347307e-06, "loss": 0.8969, "step": 354 }, { "epoch": 0.6826923076923077, "grad_norm": 0.8828125, "learning_rate": 7.866434948113046e-06, "loss": 0.9086, "step": 355 }, { "epoch": 0.6846153846153846, "grad_norm": 0.78125, "learning_rate": 7.853393765930279e-06, "loss": 0.865, "step": 356 }, { "epoch": 0.6865384615384615, "grad_norm": 0.78515625, "learning_rate": 7.84032373365578e-06, "loss": 0.9514, "step": 357 }, { "epoch": 0.6884615384615385, "grad_norm": 0.7421875, "learning_rate": 7.827224983438024e-06, "loss": 0.8866, "step": 358 }, { "epoch": 0.6903846153846154, "grad_norm": 0.8671875, "learning_rate": 7.814097647715848e-06, "loss": 0.8856, "step": 359 }, { "epoch": 0.6923076923076923, "grad_norm": 0.78125, "learning_rate": 7.800941859217103e-06, "loss": 0.8864, "step": 360 }, { "epoch": 0.6942307692307692, "grad_norm": 0.75390625, "learning_rate": 7.787757750957335e-06, "loss": 0.9212, "step": 361 }, { "epoch": 0.6961538461538461, "grad_norm": 0.796875, "learning_rate": 7.77454545623841e-06, "loss": 0.9006, "step": 362 }, { "epoch": 0.698076923076923, "grad_norm": 0.7578125, "learning_rate": 7.761305108647188e-06, "loss": 0.9427, "step": 363 }, { "epoch": 0.7, "grad_norm": 0.78125, "learning_rate": 7.74803684205417e-06, "loss": 0.9583, "step": 364 }, { "epoch": 0.7019230769230769, "grad_norm": 0.80078125, "learning_rate": 7.734740790612137e-06, "loss": 0.9303, "step": 365 }, { "epoch": 0.7038461538461539, "grad_norm": 0.796875, "learning_rate": 7.72141708875479e-06, "loss": 0.9017, "step": 366 }, { "epoch": 0.7057692307692308, "grad_norm": 0.7421875, "learning_rate": 7.708065871195413e-06, "loss": 0.9247, "step": 367 }, { "epoch": 0.7076923076923077, "grad_norm": 0.7109375, "learning_rate": 7.694687272925487e-06, "loss": 0.8598, "step": 368 }, { "epoch": 0.7096153846153846, "grad_norm": 0.73046875, "learning_rate": 7.681281429213328e-06, "loss": 0.9719, "step": 369 }, { "epoch": 0.7115384615384616, "grad_norm": 0.76171875, "learning_rate": 7.667848475602735e-06, "loss": 0.9588, "step": 370 }, { "epoch": 0.7134615384615385, "grad_norm": 0.73046875, "learning_rate": 7.654388547911605e-06, "loss": 0.8185, "step": 371 }, { "epoch": 0.7153846153846154, "grad_norm": 0.73046875, "learning_rate": 7.640901782230567e-06, "loss": 0.93, "step": 372 }, { "epoch": 0.7173076923076923, "grad_norm": 0.734375, "learning_rate": 7.627388314921602e-06, "loss": 0.9846, "step": 373 }, { "epoch": 0.7192307692307692, "grad_norm": 0.74609375, "learning_rate": 7.613848282616665e-06, "loss": 0.9807, "step": 374 }, { "epoch": 0.7211538461538461, "grad_norm": 0.75390625, "learning_rate": 7.600281822216307e-06, "loss": 0.9011, "step": 375 }, { "epoch": 0.7230769230769231, "grad_norm": 0.74609375, "learning_rate": 7.586689070888284e-06, "loss": 0.8961, "step": 376 }, { "epoch": 0.725, "grad_norm": 0.7890625, "learning_rate": 7.5730701660661795e-06, "loss": 0.9279, "step": 377 }, { "epoch": 0.7269230769230769, "grad_norm": 0.74609375, "learning_rate": 7.559425245448006e-06, "loss": 0.9177, "step": 378 }, { "epoch": 0.7288461538461538, "grad_norm": 0.734375, "learning_rate": 7.5457544469948164e-06, "loss": 0.9309, "step": 379 }, { "epoch": 0.7307692307692307, "grad_norm": 0.75, "learning_rate": 7.532057908929311e-06, "loss": 0.8937, "step": 380 }, { "epoch": 0.7326923076923076, "grad_norm": 0.78515625, "learning_rate": 7.5183357697344395e-06, "loss": 0.895, "step": 381 }, { "epoch": 0.7346153846153847, "grad_norm": 0.7421875, "learning_rate": 7.504588168151994e-06, "loss": 0.9167, "step": 382 }, { "epoch": 0.7365384615384616, "grad_norm": 0.7421875, "learning_rate": 7.4908152431812175e-06, "loss": 0.921, "step": 383 }, { "epoch": 0.7384615384615385, "grad_norm": 0.75390625, "learning_rate": 7.477017134077389e-06, "loss": 0.8987, "step": 384 }, { "epoch": 0.7403846153846154, "grad_norm": 0.7578125, "learning_rate": 7.4631939803504215e-06, "loss": 0.8866, "step": 385 }, { "epoch": 0.7423076923076923, "grad_norm": 0.73828125, "learning_rate": 7.449345921763449e-06, "loss": 0.8745, "step": 386 }, { "epoch": 0.7442307692307693, "grad_norm": 0.75390625, "learning_rate": 7.435473098331411e-06, "loss": 0.865, "step": 387 }, { "epoch": 0.7461538461538462, "grad_norm": 0.78125, "learning_rate": 7.421575650319641e-06, "loss": 0.8841, "step": 388 }, { "epoch": 0.7480769230769231, "grad_norm": 0.765625, "learning_rate": 7.407653718242449e-06, "loss": 0.9637, "step": 389 }, { "epoch": 0.75, "grad_norm": 0.70703125, "learning_rate": 7.393707442861693e-06, "loss": 0.914, "step": 390 }, { "epoch": 0.75, "eval_loss": 0.9376137256622314, "eval_runtime": 34.5412, "eval_samples_per_second": 67.832, "eval_steps_per_second": 16.965, "step": 390 }, { "epoch": 0.7519230769230769, "grad_norm": 0.74609375, "learning_rate": 7.379736965185369e-06, "loss": 0.9394, "step": 391 }, { "epoch": 0.7538461538461538, "grad_norm": 0.77734375, "learning_rate": 7.365742426466169e-06, "loss": 0.9122, "step": 392 }, { "epoch": 0.7557692307692307, "grad_norm": 0.73046875, "learning_rate": 7.3517239682000675e-06, "loss": 0.9033, "step": 393 }, { "epoch": 0.7576923076923077, "grad_norm": 0.97265625, "learning_rate": 7.337681732124882e-06, "loss": 0.8908, "step": 394 }, { "epoch": 0.7596153846153846, "grad_norm": 0.734375, "learning_rate": 7.323615860218844e-06, "loss": 0.8938, "step": 395 }, { "epoch": 0.7615384615384615, "grad_norm": 0.75390625, "learning_rate": 7.30952649469916e-06, "loss": 0.9013, "step": 396 }, { "epoch": 0.7634615384615384, "grad_norm": 0.75390625, "learning_rate": 7.295413778020579e-06, "loss": 0.9203, "step": 397 }, { "epoch": 0.7653846153846153, "grad_norm": 0.76171875, "learning_rate": 7.281277852873947e-06, "loss": 0.9713, "step": 398 }, { "epoch": 0.7673076923076924, "grad_norm": 0.7890625, "learning_rate": 7.267118862184767e-06, "loss": 0.9376, "step": 399 }, { "epoch": 0.7692307692307693, "grad_norm": 0.765625, "learning_rate": 7.252936949111749e-06, "loss": 0.9329, "step": 400 }, { "epoch": 0.7711538461538462, "grad_norm": 0.734375, "learning_rate": 7.2387322570453724e-06, "loss": 0.8421, "step": 401 }, { "epoch": 0.7730769230769231, "grad_norm": 0.75, "learning_rate": 7.224504929606429e-06, "loss": 0.8929, "step": 402 }, { "epoch": 0.775, "grad_norm": 0.75390625, "learning_rate": 7.210255110644569e-06, "loss": 0.9063, "step": 403 }, { "epoch": 0.7769230769230769, "grad_norm": 0.7265625, "learning_rate": 7.195982944236853e-06, "loss": 0.9735, "step": 404 }, { "epoch": 0.7788461538461539, "grad_norm": 0.7109375, "learning_rate": 7.181688574686292e-06, "loss": 0.8794, "step": 405 }, { "epoch": 0.7807692307692308, "grad_norm": 0.75, "learning_rate": 7.167372146520386e-06, "loss": 0.8891, "step": 406 }, { "epoch": 0.7826923076923077, "grad_norm": 0.71484375, "learning_rate": 7.15303380448967e-06, "loss": 0.8951, "step": 407 }, { "epoch": 0.7846153846153846, "grad_norm": 0.78125, "learning_rate": 7.138673693566241e-06, "loss": 0.897, "step": 408 }, { "epoch": 0.7865384615384615, "grad_norm": 0.796875, "learning_rate": 7.1242919589422974e-06, "loss": 0.9431, "step": 409 }, { "epoch": 0.7884615384615384, "grad_norm": 0.74609375, "learning_rate": 7.1098887460286745e-06, "loss": 0.8704, "step": 410 }, { "epoch": 0.7903846153846154, "grad_norm": 0.765625, "learning_rate": 7.095464200453366e-06, "loss": 0.9813, "step": 411 }, { "epoch": 0.7923076923076923, "grad_norm": 0.734375, "learning_rate": 7.081018468060057e-06, "loss": 0.8657, "step": 412 }, { "epoch": 0.7942307692307692, "grad_norm": 0.7578125, "learning_rate": 7.066551694906651e-06, "loss": 0.9216, "step": 413 }, { "epoch": 0.7961538461538461, "grad_norm": 0.734375, "learning_rate": 7.052064027263785e-06, "loss": 0.9203, "step": 414 }, { "epoch": 0.7980769230769231, "grad_norm": 0.73828125, "learning_rate": 7.0375556116133605e-06, "loss": 0.9002, "step": 415 }, { "epoch": 0.8, "grad_norm": 0.73046875, "learning_rate": 7.023026594647057e-06, "loss": 0.9279, "step": 416 }, { "epoch": 0.801923076923077, "grad_norm": 0.71875, "learning_rate": 7.008477123264849e-06, "loss": 0.8851, "step": 417 }, { "epoch": 0.8038461538461539, "grad_norm": 0.72265625, "learning_rate": 6.9939073445735205e-06, "loss": 0.8718, "step": 418 }, { "epoch": 0.8057692307692308, "grad_norm": 0.703125, "learning_rate": 6.9793174058851805e-06, "loss": 0.8874, "step": 419 }, { "epoch": 0.8076923076923077, "grad_norm": 0.73828125, "learning_rate": 6.964707454715772e-06, "loss": 0.8747, "step": 420 }, { "epoch": 0.8096153846153846, "grad_norm": 0.7578125, "learning_rate": 6.9500776387835785e-06, "loss": 0.9146, "step": 421 }, { "epoch": 0.8115384615384615, "grad_norm": 0.78515625, "learning_rate": 6.935428106007734e-06, "loss": 0.9598, "step": 422 }, { "epoch": 0.8134615384615385, "grad_norm": 0.75390625, "learning_rate": 6.920759004506723e-06, "loss": 0.873, "step": 423 }, { "epoch": 0.8153846153846154, "grad_norm": 0.80859375, "learning_rate": 6.906070482596887e-06, "loss": 0.9395, "step": 424 }, { "epoch": 0.8173076923076923, "grad_norm": 0.71484375, "learning_rate": 6.891362688790925e-06, "loss": 0.8713, "step": 425 }, { "epoch": 0.8192307692307692, "grad_norm": 0.7109375, "learning_rate": 6.876635771796386e-06, "loss": 0.8427, "step": 426 }, { "epoch": 0.8211538461538461, "grad_norm": 0.75, "learning_rate": 6.8618898805141744e-06, "loss": 0.9148, "step": 427 }, { "epoch": 0.823076923076923, "grad_norm": 0.74609375, "learning_rate": 6.847125164037036e-06, "loss": 0.8788, "step": 428 }, { "epoch": 0.825, "grad_norm": 0.72265625, "learning_rate": 6.832341771648057e-06, "loss": 0.8523, "step": 429 }, { "epoch": 0.8269230769230769, "grad_norm": 0.7265625, "learning_rate": 6.817539852819149e-06, "loss": 0.869, "step": 430 }, { "epoch": 0.8288461538461539, "grad_norm": 0.6953125, "learning_rate": 6.802719557209547e-06, "loss": 0.8934, "step": 431 }, { "epoch": 0.8307692307692308, "grad_norm": 0.7265625, "learning_rate": 6.787881034664283e-06, "loss": 0.9127, "step": 432 }, { "epoch": 0.8326923076923077, "grad_norm": 0.7421875, "learning_rate": 6.773024435212678e-06, "loss": 0.9617, "step": 433 }, { "epoch": 0.8346153846153846, "grad_norm": 0.7421875, "learning_rate": 6.758149909066832e-06, "loss": 0.8918, "step": 434 }, { "epoch": 0.8365384615384616, "grad_norm": 0.7734375, "learning_rate": 6.743257606620094e-06, "loss": 0.9721, "step": 435 }, { "epoch": 0.8384615384615385, "grad_norm": 0.734375, "learning_rate": 6.728347678445539e-06, "loss": 0.9183, "step": 436 }, { "epoch": 0.8403846153846154, "grad_norm": 0.7265625, "learning_rate": 6.713420275294467e-06, "loss": 0.8995, "step": 437 }, { "epoch": 0.8423076923076923, "grad_norm": 0.75, "learning_rate": 6.69847554809485e-06, "loss": 0.879, "step": 438 }, { "epoch": 0.8442307692307692, "grad_norm": 0.7421875, "learning_rate": 6.683513647949826e-06, "loss": 0.927, "step": 439 }, { "epoch": 0.8461538461538461, "grad_norm": 0.75390625, "learning_rate": 6.668534726136166e-06, "loss": 0.9, "step": 440 }, { "epoch": 0.8480769230769231, "grad_norm": 0.74609375, "learning_rate": 6.653538934102743e-06, "loss": 0.8526, "step": 441 }, { "epoch": 0.85, "grad_norm": 0.75, "learning_rate": 6.638526423468999e-06, "loss": 0.8354, "step": 442 }, { "epoch": 0.8519230769230769, "grad_norm": 0.75, "learning_rate": 6.6234973460234184e-06, "loss": 0.8852, "step": 443 }, { "epoch": 0.8538461538461538, "grad_norm": 0.75, "learning_rate": 6.608451853721985e-06, "loss": 0.9275, "step": 444 }, { "epoch": 0.8557692307692307, "grad_norm": 0.72265625, "learning_rate": 6.593390098686653e-06, "loss": 0.9023, "step": 445 }, { "epoch": 0.8576923076923076, "grad_norm": 0.75, "learning_rate": 6.578312233203804e-06, "loss": 0.8804, "step": 446 }, { "epoch": 0.8596153846153847, "grad_norm": 0.7109375, "learning_rate": 6.563218409722712e-06, "loss": 0.9229, "step": 447 }, { "epoch": 0.8615384615384616, "grad_norm": 0.71484375, "learning_rate": 6.548108780853995e-06, "loss": 0.8995, "step": 448 }, { "epoch": 0.8634615384615385, "grad_norm": 0.703125, "learning_rate": 6.532983499368078e-06, "loss": 0.8847, "step": 449 }, { "epoch": 0.8653846153846154, "grad_norm": 0.71484375, "learning_rate": 6.5178427181936485e-06, "loss": 0.923, "step": 450 }, { "epoch": 0.8673076923076923, "grad_norm": 0.73828125, "learning_rate": 6.502686590416105e-06, "loss": 0.8987, "step": 451 }, { "epoch": 0.8692307692307693, "grad_norm": 0.7109375, "learning_rate": 6.487515269276015e-06, "loss": 0.9345, "step": 452 }, { "epoch": 0.8711538461538462, "grad_norm": 0.74609375, "learning_rate": 6.472328908167562e-06, "loss": 0.8575, "step": 453 }, { "epoch": 0.8730769230769231, "grad_norm": 0.734375, "learning_rate": 6.457127660636994e-06, "loss": 0.9209, "step": 454 }, { "epoch": 0.875, "grad_norm": 0.74609375, "learning_rate": 6.441911680381074e-06, "loss": 0.8873, "step": 455 }, { "epoch": 0.875, "eval_loss": 0.9346491098403931, "eval_runtime": 34.5947, "eval_samples_per_second": 67.727, "eval_steps_per_second": 16.939, "step": 455 }, { "epoch": 0.8769230769230769, "grad_norm": 0.73046875, "learning_rate": 6.426681121245527e-06, "loss": 0.9187, "step": 456 }, { "epoch": 0.8788461538461538, "grad_norm": 0.77734375, "learning_rate": 6.411436137223479e-06, "loss": 0.9509, "step": 457 }, { "epoch": 0.8807692307692307, "grad_norm": 0.75, "learning_rate": 6.396176882453902e-06, "loss": 0.9401, "step": 458 }, { "epoch": 0.8826923076923077, "grad_norm": 0.73828125, "learning_rate": 6.38090351122006e-06, "loss": 0.8767, "step": 459 }, { "epoch": 0.8846153846153846, "grad_norm": 0.72265625, "learning_rate": 6.365616177947945e-06, "loss": 0.8637, "step": 460 }, { "epoch": 0.8865384615384615, "grad_norm": 0.74609375, "learning_rate": 6.350315037204714e-06, "loss": 0.9081, "step": 461 }, { "epoch": 0.8884615384615384, "grad_norm": 0.73828125, "learning_rate": 6.335000243697134e-06, "loss": 0.9054, "step": 462 }, { "epoch": 0.8903846153846153, "grad_norm": 0.73046875, "learning_rate": 6.319671952270004e-06, "loss": 0.9045, "step": 463 }, { "epoch": 0.8923076923076924, "grad_norm": 0.71484375, "learning_rate": 6.304330317904605e-06, "loss": 0.9227, "step": 464 }, { "epoch": 0.8942307692307693, "grad_norm": 0.69921875, "learning_rate": 6.288975495717124e-06, "loss": 0.8882, "step": 465 }, { "epoch": 0.8961538461538462, "grad_norm": 0.76171875, "learning_rate": 6.273607640957085e-06, "loss": 0.9967, "step": 466 }, { "epoch": 0.8980769230769231, "grad_norm": 0.73046875, "learning_rate": 6.258226909005783e-06, "loss": 0.9474, "step": 467 }, { "epoch": 0.9, "grad_norm": 0.7578125, "learning_rate": 6.2428334553747135e-06, "loss": 0.912, "step": 468 }, { "epoch": 0.9019230769230769, "grad_norm": 0.74609375, "learning_rate": 6.227427435703997e-06, "loss": 0.9355, "step": 469 }, { "epoch": 0.9038461538461539, "grad_norm": 0.75, "learning_rate": 6.212009005760805e-06, "loss": 0.9328, "step": 470 }, { "epoch": 0.9057692307692308, "grad_norm": 0.74609375, "learning_rate": 6.1965783214377895e-06, "loss": 0.9323, "step": 471 }, { "epoch": 0.9076923076923077, "grad_norm": 0.71484375, "learning_rate": 6.181135538751504e-06, "loss": 0.8865, "step": 472 }, { "epoch": 0.9096153846153846, "grad_norm": 0.73046875, "learning_rate": 6.165680813840822e-06, "loss": 0.9123, "step": 473 }, { "epoch": 0.9115384615384615, "grad_norm": 0.73828125, "learning_rate": 6.150214302965368e-06, "loss": 0.9209, "step": 474 }, { "epoch": 0.9134615384615384, "grad_norm": 0.7109375, "learning_rate": 6.134736162503929e-06, "loss": 0.9377, "step": 475 }, { "epoch": 0.9153846153846154, "grad_norm": 0.734375, "learning_rate": 6.119246548952877e-06, "loss": 0.9317, "step": 476 }, { "epoch": 0.9173076923076923, "grad_norm": 0.71484375, "learning_rate": 6.103745618924587e-06, "loss": 0.8839, "step": 477 }, { "epoch": 0.9192307692307692, "grad_norm": 0.71484375, "learning_rate": 6.088233529145849e-06, "loss": 0.8823, "step": 478 }, { "epoch": 0.9211538461538461, "grad_norm": 0.7421875, "learning_rate": 6.072710436456293e-06, "loss": 0.9031, "step": 479 }, { "epoch": 0.9230769230769231, "grad_norm": 0.7109375, "learning_rate": 6.057176497806791e-06, "loss": 0.9132, "step": 480 }, { "epoch": 0.925, "grad_norm": 0.7734375, "learning_rate": 6.041631870257882e-06, "loss": 0.8772, "step": 481 }, { "epoch": 0.926923076923077, "grad_norm": 0.73046875, "learning_rate": 6.026076710978172e-06, "loss": 0.901, "step": 482 }, { "epoch": 0.9288461538461539, "grad_norm": 0.74609375, "learning_rate": 6.010511177242757e-06, "loss": 0.9196, "step": 483 }, { "epoch": 0.9307692307692308, "grad_norm": 0.73046875, "learning_rate": 5.994935426431627e-06, "loss": 0.9718, "step": 484 }, { "epoch": 0.9326923076923077, "grad_norm": 0.76953125, "learning_rate": 5.979349616028067e-06, "loss": 0.963, "step": 485 }, { "epoch": 0.9346153846153846, "grad_norm": 0.70703125, "learning_rate": 5.963753903617084e-06, "loss": 0.9048, "step": 486 }, { "epoch": 0.9365384615384615, "grad_norm": 0.7578125, "learning_rate": 5.948148446883794e-06, "loss": 0.9705, "step": 487 }, { "epoch": 0.9384615384615385, "grad_norm": 0.71484375, "learning_rate": 5.932533403611835e-06, "loss": 0.8878, "step": 488 }, { "epoch": 0.9403846153846154, "grad_norm": 0.73046875, "learning_rate": 5.916908931681781e-06, "loss": 0.9049, "step": 489 }, { "epoch": 0.9423076923076923, "grad_norm": 0.703125, "learning_rate": 5.90127518906953e-06, "loss": 0.8733, "step": 490 }, { "epoch": 0.9442307692307692, "grad_norm": 0.69921875, "learning_rate": 5.885632333844714e-06, "loss": 0.8896, "step": 491 }, { "epoch": 0.9461538461538461, "grad_norm": 0.7109375, "learning_rate": 5.8699805241691065e-06, "loss": 0.9191, "step": 492 }, { "epoch": 0.948076923076923, "grad_norm": 0.69921875, "learning_rate": 5.854319918295012e-06, "loss": 0.8721, "step": 493 }, { "epoch": 0.95, "grad_norm": 0.73828125, "learning_rate": 5.838650674563674e-06, "loss": 0.8746, "step": 494 }, { "epoch": 0.9519230769230769, "grad_norm": 0.74609375, "learning_rate": 5.82297295140367e-06, "loss": 0.9072, "step": 495 }, { "epoch": 0.9538461538461539, "grad_norm": 0.74609375, "learning_rate": 5.807286907329315e-06, "loss": 0.8981, "step": 496 }, { "epoch": 0.9557692307692308, "grad_norm": 0.734375, "learning_rate": 5.79159270093905e-06, "loss": 0.9358, "step": 497 }, { "epoch": 0.9576923076923077, "grad_norm": 0.7109375, "learning_rate": 5.7758904909138495e-06, "loss": 0.915, "step": 498 }, { "epoch": 0.9596153846153846, "grad_norm": 0.72265625, "learning_rate": 5.760180436015604e-06, "loss": 0.8652, "step": 499 }, { "epoch": 0.9615384615384616, "grad_norm": 0.7265625, "learning_rate": 5.74446269508553e-06, "loss": 0.8926, "step": 500 }, { "epoch": 0.9634615384615385, "grad_norm": 0.71484375, "learning_rate": 5.7287374270425475e-06, "loss": 0.8889, "step": 501 }, { "epoch": 0.9653846153846154, "grad_norm": 0.703125, "learning_rate": 5.7130047908816884e-06, "loss": 0.9027, "step": 502 }, { "epoch": 0.9673076923076923, "grad_norm": 0.7421875, "learning_rate": 5.69726494567248e-06, "loss": 0.9419, "step": 503 }, { "epoch": 0.9692307692307692, "grad_norm": 0.73046875, "learning_rate": 5.681518050557336e-06, "loss": 0.9396, "step": 504 }, { "epoch": 0.9711538461538461, "grad_norm": 0.71875, "learning_rate": 5.6657642647499545e-06, "loss": 0.8828, "step": 505 }, { "epoch": 0.9730769230769231, "grad_norm": 0.765625, "learning_rate": 5.650003747533701e-06, "loss": 0.944, "step": 506 }, { "epoch": 0.975, "grad_norm": 0.73046875, "learning_rate": 5.6342366582600035e-06, "loss": 0.9072, "step": 507 }, { "epoch": 0.9769230769230769, "grad_norm": 0.7265625, "learning_rate": 5.61846315634674e-06, "loss": 0.9015, "step": 508 }, { "epoch": 0.9788461538461538, "grad_norm": 0.7265625, "learning_rate": 5.6026834012766155e-06, "loss": 0.9117, "step": 509 }, { "epoch": 0.9807692307692307, "grad_norm": 0.75, "learning_rate": 5.586897552595573e-06, "loss": 0.971, "step": 510 }, { "epoch": 0.9826923076923076, "grad_norm": 0.73828125, "learning_rate": 5.571105769911159e-06, "loss": 0.8729, "step": 511 }, { "epoch": 0.9846153846153847, "grad_norm": 0.72265625, "learning_rate": 5.555308212890917e-06, "loss": 0.9132, "step": 512 }, { "epoch": 0.9865384615384616, "grad_norm": 0.72265625, "learning_rate": 5.539505041260779e-06, "loss": 0.8606, "step": 513 }, { "epoch": 0.9884615384615385, "grad_norm": 0.703125, "learning_rate": 5.523696414803438e-06, "loss": 0.8937, "step": 514 }, { "epoch": 0.9903846153846154, "grad_norm": 0.73046875, "learning_rate": 5.507882493356745e-06, "loss": 0.913, "step": 515 }, { "epoch": 0.9923076923076923, "grad_norm": 4.5625, "learning_rate": 5.49206343681209e-06, "loss": 0.862, "step": 516 }, { "epoch": 0.9942307692307693, "grad_norm": 0.703125, "learning_rate": 5.476239405112775e-06, "loss": 0.8662, "step": 517 }, { "epoch": 0.9961538461538462, "grad_norm": 0.73046875, "learning_rate": 5.460410558252408e-06, "loss": 0.8443, "step": 518 }, { "epoch": 0.9980769230769231, "grad_norm": 0.7578125, "learning_rate": 5.444577056273284e-06, "loss": 0.9569, "step": 519 }, { "epoch": 1.0, "grad_norm": 0.73828125, "learning_rate": 5.428739059264767e-06, "loss": 0.8854, "step": 520 }, { "epoch": 1.0, "eval_loss": 0.9325999617576599, "eval_runtime": 34.5297, "eval_samples_per_second": 67.855, "eval_steps_per_second": 16.971, "step": 520 }, { "epoch": 1.001923076923077, "grad_norm": 0.7265625, "learning_rate": 5.412896727361663e-06, "loss": 0.9419, "step": 521 }, { "epoch": 1.0038461538461538, "grad_norm": 0.7265625, "learning_rate": 5.39705022074261e-06, "loss": 0.843, "step": 522 }, { "epoch": 1.0057692307692307, "grad_norm": 0.73046875, "learning_rate": 5.381199699628459e-06, "loss": 0.8494, "step": 523 }, { "epoch": 1.0076923076923077, "grad_norm": 0.7265625, "learning_rate": 5.365345324280646e-06, "loss": 0.8623, "step": 524 }, { "epoch": 1.0096153846153846, "grad_norm": 0.7421875, "learning_rate": 5.349487254999579e-06, "loss": 0.8716, "step": 525 }, { "epoch": 1.0115384615384615, "grad_norm": 0.68359375, "learning_rate": 5.333625652123014e-06, "loss": 0.9326, "step": 526 }, { "epoch": 1.0134615384615384, "grad_norm": 0.75, "learning_rate": 5.317760676024436e-06, "loss": 0.8741, "step": 527 }, { "epoch": 1.0153846153846153, "grad_norm": 0.71875, "learning_rate": 5.301892487111431e-06, "loss": 0.8816, "step": 528 }, { "epoch": 1.0173076923076922, "grad_norm": 0.71484375, "learning_rate": 5.286021245824075e-06, "loss": 0.8976, "step": 529 }, { "epoch": 1.0192307692307692, "grad_norm": 0.703125, "learning_rate": 5.270147112633304e-06, "loss": 0.8997, "step": 530 }, { "epoch": 1.021153846153846, "grad_norm": 0.74609375, "learning_rate": 5.254270248039291e-06, "loss": 0.9766, "step": 531 }, { "epoch": 1.023076923076923, "grad_norm": 0.71484375, "learning_rate": 5.238390812569828e-06, "loss": 0.8946, "step": 532 }, { "epoch": 1.025, "grad_norm": 0.70703125, "learning_rate": 5.222508966778702e-06, "loss": 0.909, "step": 533 }, { "epoch": 1.0269230769230768, "grad_norm": 0.69140625, "learning_rate": 5.206624871244066e-06, "loss": 0.8777, "step": 534 }, { "epoch": 1.0288461538461537, "grad_norm": 0.7421875, "learning_rate": 5.190738686566826e-06, "loss": 0.8775, "step": 535 }, { "epoch": 1.0307692307692307, "grad_norm": 0.7265625, "learning_rate": 5.1748505733690035e-06, "loss": 0.8591, "step": 536 }, { "epoch": 1.0326923076923078, "grad_norm": 0.76171875, "learning_rate": 5.158960692292122e-06, "loss": 0.942, "step": 537 }, { "epoch": 1.0346153846153847, "grad_norm": 0.734375, "learning_rate": 5.143069203995586e-06, "loss": 0.8728, "step": 538 }, { "epoch": 1.0365384615384616, "grad_norm": 0.74609375, "learning_rate": 5.1271762691550375e-06, "loss": 0.8437, "step": 539 }, { "epoch": 1.0384615384615385, "grad_norm": 0.69921875, "learning_rate": 5.111282048460753e-06, "loss": 0.8885, "step": 540 }, { "epoch": 1.0403846153846155, "grad_norm": 0.69140625, "learning_rate": 5.095386702616012e-06, "loss": 0.9171, "step": 541 }, { "epoch": 1.0423076923076924, "grad_norm": 0.70703125, "learning_rate": 5.079490392335463e-06, "loss": 0.8742, "step": 542 }, { "epoch": 1.0442307692307693, "grad_norm": 0.73046875, "learning_rate": 5.06359327834351e-06, "loss": 0.9521, "step": 543 }, { "epoch": 1.0461538461538462, "grad_norm": 0.70703125, "learning_rate": 5.047695521372681e-06, "loss": 0.913, "step": 544 }, { "epoch": 1.0480769230769231, "grad_norm": 0.75390625, "learning_rate": 5.031797282162007e-06, "loss": 0.9162, "step": 545 }, { "epoch": 1.05, "grad_norm": 0.734375, "learning_rate": 5.015898721455394e-06, "loss": 0.9098, "step": 546 }, { "epoch": 1.051923076923077, "grad_norm": 0.7109375, "learning_rate": 5e-06, "loss": 0.8851, "step": 547 }, { "epoch": 1.0538461538461539, "grad_norm": 0.7734375, "learning_rate": 4.984101278544607e-06, "loss": 0.9895, "step": 548 }, { "epoch": 1.0557692307692308, "grad_norm": 0.7265625, "learning_rate": 4.968202717837996e-06, "loss": 0.8406, "step": 549 }, { "epoch": 1.0576923076923077, "grad_norm": 0.72265625, "learning_rate": 4.9523044786273214e-06, "loss": 0.9372, "step": 550 }, { "epoch": 1.0596153846153846, "grad_norm": 0.69921875, "learning_rate": 4.936406721656492e-06, "loss": 0.8837, "step": 551 }, { "epoch": 1.0615384615384615, "grad_norm": 0.70703125, "learning_rate": 4.92050960766454e-06, "loss": 0.9099, "step": 552 }, { "epoch": 1.0634615384615385, "grad_norm": 0.7265625, "learning_rate": 4.9046132973839895e-06, "loss": 0.8802, "step": 553 }, { "epoch": 1.0653846153846154, "grad_norm": 0.73828125, "learning_rate": 4.8887179515392465e-06, "loss": 0.8892, "step": 554 }, { "epoch": 1.0673076923076923, "grad_norm": 0.73046875, "learning_rate": 4.872823730844966e-06, "loss": 0.9367, "step": 555 }, { "epoch": 1.0692307692307692, "grad_norm": 0.765625, "learning_rate": 4.856930796004417e-06, "loss": 0.999, "step": 556 }, { "epoch": 1.0711538461538461, "grad_norm": 0.671875, "learning_rate": 4.841039307707878e-06, "loss": 0.8129, "step": 557 }, { "epoch": 1.073076923076923, "grad_norm": 0.78515625, "learning_rate": 4.825149426630999e-06, "loss": 0.8832, "step": 558 }, { "epoch": 1.075, "grad_norm": 0.6875, "learning_rate": 4.809261313433176e-06, "loss": 0.9148, "step": 559 }, { "epoch": 1.0769230769230769, "grad_norm": 0.734375, "learning_rate": 4.793375128755934e-06, "loss": 0.9256, "step": 560 }, { "epoch": 1.0788461538461538, "grad_norm": 0.7265625, "learning_rate": 4.7774910332213005e-06, "loss": 0.9012, "step": 561 }, { "epoch": 1.0807692307692307, "grad_norm": 0.7265625, "learning_rate": 4.761609187430174e-06, "loss": 0.85, "step": 562 }, { "epoch": 1.0826923076923076, "grad_norm": 1.1015625, "learning_rate": 4.74572975196071e-06, "loss": 0.9216, "step": 563 }, { "epoch": 1.0846153846153845, "grad_norm": 0.703125, "learning_rate": 4.7298528873666985e-06, "loss": 0.8746, "step": 564 }, { "epoch": 1.0865384615384615, "grad_norm": 0.75, "learning_rate": 4.713978754175926e-06, "loss": 0.8979, "step": 565 }, { "epoch": 1.0884615384615384, "grad_norm": 0.7265625, "learning_rate": 4.69810751288857e-06, "loss": 0.888, "step": 566 }, { "epoch": 1.0903846153846153, "grad_norm": 0.734375, "learning_rate": 4.682239323975566e-06, "loss": 0.9075, "step": 567 }, { "epoch": 1.0923076923076924, "grad_norm": 0.7421875, "learning_rate": 4.666374347876987e-06, "loss": 0.8873, "step": 568 }, { "epoch": 1.0942307692307693, "grad_norm": 0.71484375, "learning_rate": 4.6505127450004216e-06, "loss": 0.8379, "step": 569 }, { "epoch": 1.0961538461538463, "grad_norm": 0.71875, "learning_rate": 4.634654675719355e-06, "loss": 0.9033, "step": 570 }, { "epoch": 1.0980769230769232, "grad_norm": 0.71875, "learning_rate": 4.618800300371543e-06, "loss": 0.896, "step": 571 }, { "epoch": 1.1, "grad_norm": 0.734375, "learning_rate": 4.60294977925739e-06, "loss": 0.8901, "step": 572 }, { "epoch": 1.101923076923077, "grad_norm": 0.7421875, "learning_rate": 4.587103272638339e-06, "loss": 0.9841, "step": 573 }, { "epoch": 1.103846153846154, "grad_norm": 0.72265625, "learning_rate": 4.571260940735235e-06, "loss": 0.8862, "step": 574 }, { "epoch": 1.1057692307692308, "grad_norm": 0.71875, "learning_rate": 4.555422943726715e-06, "loss": 0.9193, "step": 575 }, { "epoch": 1.1076923076923078, "grad_norm": 0.765625, "learning_rate": 4.539589441747595e-06, "loss": 0.9534, "step": 576 }, { "epoch": 1.1096153846153847, "grad_norm": 0.7578125, "learning_rate": 4.523760594887228e-06, "loss": 0.9352, "step": 577 }, { "epoch": 1.1115384615384616, "grad_norm": 0.69921875, "learning_rate": 4.507936563187911e-06, "loss": 0.8709, "step": 578 }, { "epoch": 1.1134615384615385, "grad_norm": 0.76171875, "learning_rate": 4.492117506643256e-06, "loss": 0.9662, "step": 579 }, { "epoch": 1.1153846153846154, "grad_norm": 0.671875, "learning_rate": 4.476303585196563e-06, "loss": 0.8556, "step": 580 }, { "epoch": 1.1173076923076923, "grad_norm": 0.73046875, "learning_rate": 4.460494958739223e-06, "loss": 0.9715, "step": 581 }, { "epoch": 1.1192307692307693, "grad_norm": 0.72265625, "learning_rate": 4.444691787109085e-06, "loss": 0.8285, "step": 582 }, { "epoch": 1.1211538461538462, "grad_norm": 0.72265625, "learning_rate": 4.428894230088842e-06, "loss": 0.908, "step": 583 }, { "epoch": 1.123076923076923, "grad_norm": 0.734375, "learning_rate": 4.413102447404428e-06, "loss": 0.946, "step": 584 }, { "epoch": 1.125, "grad_norm": 0.7265625, "learning_rate": 4.397316598723385e-06, "loss": 0.9365, "step": 585 }, { "epoch": 1.125, "eval_loss": 0.9315983653068542, "eval_runtime": 34.5056, "eval_samples_per_second": 67.902, "eval_steps_per_second": 16.983, "step": 585 }, { "epoch": 1.126923076923077, "grad_norm": 0.75390625, "learning_rate": 4.381536843653262e-06, "loss": 0.9313, "step": 586 }, { "epoch": 1.1288461538461538, "grad_norm": 0.73046875, "learning_rate": 4.365763341739996e-06, "loss": 0.882, "step": 587 }, { "epoch": 1.1307692307692307, "grad_norm": 0.73828125, "learning_rate": 4.3499962524662995e-06, "loss": 0.8806, "step": 588 }, { "epoch": 1.1326923076923077, "grad_norm": 0.74609375, "learning_rate": 4.334235735250047e-06, "loss": 0.8719, "step": 589 }, { "epoch": 1.1346153846153846, "grad_norm": 0.72265625, "learning_rate": 4.318481949442665e-06, "loss": 0.9148, "step": 590 }, { "epoch": 1.1365384615384615, "grad_norm": 0.734375, "learning_rate": 4.302735054327523e-06, "loss": 0.8417, "step": 591 }, { "epoch": 1.1384615384615384, "grad_norm": 0.734375, "learning_rate": 4.286995209118313e-06, "loss": 0.8802, "step": 592 }, { "epoch": 1.1403846153846153, "grad_norm": 0.71875, "learning_rate": 4.271262572957453e-06, "loss": 0.9018, "step": 593 }, { "epoch": 1.1423076923076922, "grad_norm": 0.68359375, "learning_rate": 4.255537304914472e-06, "loss": 0.8499, "step": 594 }, { "epoch": 1.1442307692307692, "grad_norm": 0.74609375, "learning_rate": 4.239819563984397e-06, "loss": 0.948, "step": 595 }, { "epoch": 1.146153846153846, "grad_norm": 0.71484375, "learning_rate": 4.224109509086151e-06, "loss": 0.9229, "step": 596 }, { "epoch": 1.148076923076923, "grad_norm": 0.7109375, "learning_rate": 4.2084072990609505e-06, "loss": 0.8326, "step": 597 }, { "epoch": 1.15, "grad_norm": 0.703125, "learning_rate": 4.192713092670687e-06, "loss": 0.899, "step": 598 }, { "epoch": 1.1519230769230768, "grad_norm": 0.7265625, "learning_rate": 4.17702704859633e-06, "loss": 0.8957, "step": 599 }, { "epoch": 1.1538461538461537, "grad_norm": 0.73828125, "learning_rate": 4.161349325436328e-06, "loss": 0.9516, "step": 600 }, { "epoch": 1.1557692307692307, "grad_norm": 0.734375, "learning_rate": 4.145680081704989e-06, "loss": 0.9012, "step": 601 }, { "epoch": 1.1576923076923076, "grad_norm": 0.6875, "learning_rate": 4.1300194758308935e-06, "loss": 0.8685, "step": 602 }, { "epoch": 1.1596153846153845, "grad_norm": 0.79296875, "learning_rate": 4.1143676661552876e-06, "loss": 0.9122, "step": 603 }, { "epoch": 1.1615384615384616, "grad_norm": 0.71484375, "learning_rate": 4.098724810930472e-06, "loss": 0.9732, "step": 604 }, { "epoch": 1.1634615384615385, "grad_norm": 0.71484375, "learning_rate": 4.08309106831822e-06, "loss": 0.8518, "step": 605 }, { "epoch": 1.1653846153846155, "grad_norm": 0.73046875, "learning_rate": 4.067466596388166e-06, "loss": 0.8326, "step": 606 }, { "epoch": 1.1673076923076924, "grad_norm": 0.71484375, "learning_rate": 4.051851553116208e-06, "loss": 0.8629, "step": 607 }, { "epoch": 1.1692307692307693, "grad_norm": 0.71875, "learning_rate": 4.036246096382916e-06, "loss": 0.9053, "step": 608 }, { "epoch": 1.1711538461538462, "grad_norm": 0.73046875, "learning_rate": 4.0206503839719335e-06, "loss": 0.9191, "step": 609 }, { "epoch": 1.1730769230769231, "grad_norm": 0.7734375, "learning_rate": 4.0050645735683745e-06, "loss": 0.9079, "step": 610 }, { "epoch": 1.175, "grad_norm": 0.72265625, "learning_rate": 3.989488822757244e-06, "loss": 0.9048, "step": 611 }, { "epoch": 1.176923076923077, "grad_norm": 0.71875, "learning_rate": 3.973923289021829e-06, "loss": 0.8945, "step": 612 }, { "epoch": 1.1788461538461539, "grad_norm": 0.71484375, "learning_rate": 3.9583681297421194e-06, "loss": 0.8879, "step": 613 }, { "epoch": 1.1807692307692308, "grad_norm": 0.79296875, "learning_rate": 3.9428235021932104e-06, "loss": 0.982, "step": 614 }, { "epoch": 1.1826923076923077, "grad_norm": 0.73828125, "learning_rate": 3.927289563543709e-06, "loss": 0.8807, "step": 615 }, { "epoch": 1.1846153846153846, "grad_norm": 0.73828125, "learning_rate": 3.911766470854152e-06, "loss": 0.9508, "step": 616 }, { "epoch": 1.1865384615384615, "grad_norm": 0.7265625, "learning_rate": 3.896254381075416e-06, "loss": 0.8701, "step": 617 }, { "epoch": 1.1884615384615385, "grad_norm": 0.71484375, "learning_rate": 3.880753451047124e-06, "loss": 0.9148, "step": 618 }, { "epoch": 1.1903846153846154, "grad_norm": 0.73828125, "learning_rate": 3.865263837496072e-06, "loss": 0.8937, "step": 619 }, { "epoch": 1.1923076923076923, "grad_norm": 0.7265625, "learning_rate": 3.849785697034634e-06, "loss": 0.855, "step": 620 }, { "epoch": 1.1942307692307692, "grad_norm": 0.68359375, "learning_rate": 3.834319186159179e-06, "loss": 0.8574, "step": 621 }, { "epoch": 1.1961538461538461, "grad_norm": 0.7265625, "learning_rate": 3.818864461248498e-06, "loss": 0.9341, "step": 622 }, { "epoch": 1.198076923076923, "grad_norm": 0.69921875, "learning_rate": 3.803421678562213e-06, "loss": 0.9029, "step": 623 }, { "epoch": 1.2, "grad_norm": 0.7265625, "learning_rate": 3.7879909942391963e-06, "loss": 0.9188, "step": 624 }, { "epoch": 1.2019230769230769, "grad_norm": 0.7109375, "learning_rate": 3.7725725642960047e-06, "loss": 0.8459, "step": 625 }, { "epoch": 1.2038461538461538, "grad_norm": 0.6953125, "learning_rate": 3.7571665446252886e-06, "loss": 0.8931, "step": 626 }, { "epoch": 1.2057692307692307, "grad_norm": 0.69921875, "learning_rate": 3.7417730909942184e-06, "loss": 0.8719, "step": 627 }, { "epoch": 1.2076923076923076, "grad_norm": 0.734375, "learning_rate": 3.726392359042917e-06, "loss": 0.922, "step": 628 }, { "epoch": 1.2096153846153845, "grad_norm": 0.77734375, "learning_rate": 3.7110245042828786e-06, "loss": 0.8538, "step": 629 }, { "epoch": 1.2115384615384615, "grad_norm": 0.75, "learning_rate": 3.695669682095397e-06, "loss": 0.9149, "step": 630 }, { "epoch": 1.2134615384615384, "grad_norm": 0.67578125, "learning_rate": 3.6803280477299975e-06, "loss": 0.8769, "step": 631 }, { "epoch": 1.2153846153846155, "grad_norm": 0.69140625, "learning_rate": 3.664999756302869e-06, "loss": 0.8532, "step": 632 }, { "epoch": 1.2173076923076924, "grad_norm": 0.76953125, "learning_rate": 3.6496849627952875e-06, "loss": 0.9334, "step": 633 }, { "epoch": 1.2192307692307693, "grad_norm": 0.73046875, "learning_rate": 3.634383822052057e-06, "loss": 0.9112, "step": 634 }, { "epoch": 1.2211538461538463, "grad_norm": 0.69921875, "learning_rate": 3.6190964887799418e-06, "loss": 0.8462, "step": 635 }, { "epoch": 1.2230769230769232, "grad_norm": 0.76953125, "learning_rate": 3.6038231175461004e-06, "loss": 0.915, "step": 636 }, { "epoch": 1.225, "grad_norm": 0.7109375, "learning_rate": 3.5885638627765228e-06, "loss": 0.9869, "step": 637 }, { "epoch": 1.226923076923077, "grad_norm": 0.6875, "learning_rate": 3.573318878754475e-06, "loss": 0.8105, "step": 638 }, { "epoch": 1.228846153846154, "grad_norm": 0.7421875, "learning_rate": 3.5580883196189265e-06, "loss": 0.929, "step": 639 }, { "epoch": 1.2307692307692308, "grad_norm": 0.70703125, "learning_rate": 3.5428723393630067e-06, "loss": 0.9044, "step": 640 }, { "epoch": 1.2326923076923078, "grad_norm": 0.71875, "learning_rate": 3.52767109183244e-06, "loss": 0.8723, "step": 641 }, { "epoch": 1.2346153846153847, "grad_norm": 0.921875, "learning_rate": 3.5124847307239863e-06, "loss": 0.9235, "step": 642 }, { "epoch": 1.2365384615384616, "grad_norm": 0.73046875, "learning_rate": 3.4973134095838943e-06, "loss": 0.9271, "step": 643 }, { "epoch": 1.2384615384615385, "grad_norm": 0.68359375, "learning_rate": 3.4821572818063544e-06, "loss": 0.8456, "step": 644 }, { "epoch": 1.2403846153846154, "grad_norm": 0.69921875, "learning_rate": 3.4670165006319236e-06, "loss": 0.9204, "step": 645 }, { "epoch": 1.2423076923076923, "grad_norm": 0.73828125, "learning_rate": 3.4518912191460073e-06, "loss": 0.908, "step": 646 }, { "epoch": 1.2442307692307693, "grad_norm": 0.6953125, "learning_rate": 3.4367815902772917e-06, "loss": 0.8835, "step": 647 }, { "epoch": 1.2461538461538462, "grad_norm": 0.734375, "learning_rate": 3.4216877667961975e-06, "loss": 0.8741, "step": 648 }, { "epoch": 1.248076923076923, "grad_norm": 0.6953125, "learning_rate": 3.406609901313349e-06, "loss": 0.9994, "step": 649 }, { "epoch": 1.25, "grad_norm": 0.703125, "learning_rate": 3.3915481462780174e-06, "loss": 0.8865, "step": 650 }, { "epoch": 1.25, "eval_loss": 0.9308112859725952, "eval_runtime": 34.3336, "eval_samples_per_second": 68.242, "eval_steps_per_second": 17.068, "step": 650 }, { "epoch": 1.251923076923077, "grad_norm": 0.671875, "learning_rate": 3.3765026539765832e-06, "loss": 0.8766, "step": 651 }, { "epoch": 1.2538461538461538, "grad_norm": 0.7109375, "learning_rate": 3.3614735765310013e-06, "loss": 0.8693, "step": 652 }, { "epoch": 1.2557692307692307, "grad_norm": 0.7109375, "learning_rate": 3.3464610658972584e-06, "loss": 0.858, "step": 653 }, { "epoch": 1.2576923076923077, "grad_norm": 0.72265625, "learning_rate": 3.331465273863834e-06, "loss": 0.8857, "step": 654 }, { "epoch": 1.2596153846153846, "grad_norm": 0.73828125, "learning_rate": 3.3164863520501744e-06, "loss": 0.8839, "step": 655 }, { "epoch": 1.2615384615384615, "grad_norm": 0.67578125, "learning_rate": 3.3015244519051525e-06, "loss": 0.8891, "step": 656 }, { "epoch": 1.2634615384615384, "grad_norm": 0.7265625, "learning_rate": 3.2865797247055354e-06, "loss": 0.8891, "step": 657 }, { "epoch": 1.2653846153846153, "grad_norm": 0.71875, "learning_rate": 3.2716523215544602e-06, "loss": 0.8977, "step": 658 }, { "epoch": 1.2673076923076922, "grad_norm": 0.71484375, "learning_rate": 3.256742393379909e-06, "loss": 0.9019, "step": 659 }, { "epoch": 1.2692307692307692, "grad_norm": 0.71875, "learning_rate": 3.2418500909331684e-06, "loss": 0.8878, "step": 660 }, { "epoch": 1.271153846153846, "grad_norm": 0.71484375, "learning_rate": 3.226975564787322e-06, "loss": 0.8501, "step": 661 }, { "epoch": 1.273076923076923, "grad_norm": 0.7265625, "learning_rate": 3.21211896533572e-06, "loss": 0.8878, "step": 662 }, { "epoch": 1.275, "grad_norm": 0.7421875, "learning_rate": 3.197280442790455e-06, "loss": 0.9186, "step": 663 }, { "epoch": 1.2769230769230768, "grad_norm": 0.6953125, "learning_rate": 3.1824601471808504e-06, "loss": 0.8288, "step": 664 }, { "epoch": 1.2788461538461537, "grad_norm": 0.71484375, "learning_rate": 3.1676582283519454e-06, "loss": 0.9064, "step": 665 }, { "epoch": 1.2807692307692307, "grad_norm": 0.69921875, "learning_rate": 3.1528748359629657e-06, "loss": 0.8728, "step": 666 }, { "epoch": 1.2826923076923076, "grad_norm": 0.7109375, "learning_rate": 3.1381101194858264e-06, "loss": 0.8885, "step": 667 }, { "epoch": 1.2846153846153845, "grad_norm": 0.69921875, "learning_rate": 3.1233642282036147e-06, "loss": 0.9146, "step": 668 }, { "epoch": 1.2865384615384614, "grad_norm": 0.74609375, "learning_rate": 3.1086373112090762e-06, "loss": 0.897, "step": 669 }, { "epoch": 1.2884615384615383, "grad_norm": 0.73046875, "learning_rate": 3.0939295174031127e-06, "loss": 0.8649, "step": 670 }, { "epoch": 1.2903846153846155, "grad_norm": 0.69140625, "learning_rate": 3.079240995493279e-06, "loss": 0.8316, "step": 671 }, { "epoch": 1.2923076923076924, "grad_norm": 0.71484375, "learning_rate": 3.0645718939922668e-06, "loss": 0.8968, "step": 672 }, { "epoch": 1.2942307692307693, "grad_norm": 0.703125, "learning_rate": 3.049922361216422e-06, "loss": 0.9268, "step": 673 }, { "epoch": 1.2961538461538462, "grad_norm": 0.71875, "learning_rate": 3.03529254528423e-06, "loss": 0.8527, "step": 674 }, { "epoch": 1.2980769230769231, "grad_norm": 0.75390625, "learning_rate": 3.0206825941148203e-06, "loss": 0.8926, "step": 675 }, { "epoch": 1.3, "grad_norm": 0.77734375, "learning_rate": 3.006092655426481e-06, "loss": 0.8785, "step": 676 }, { "epoch": 1.301923076923077, "grad_norm": 0.73046875, "learning_rate": 2.991522876735154e-06, "loss": 0.9372, "step": 677 }, { "epoch": 1.3038461538461539, "grad_norm": 0.74609375, "learning_rate": 2.9769734053529443e-06, "loss": 0.876, "step": 678 }, { "epoch": 1.3057692307692308, "grad_norm": 0.77734375, "learning_rate": 2.9624443883866403e-06, "loss": 0.9531, "step": 679 }, { "epoch": 1.3076923076923077, "grad_norm": 0.71875, "learning_rate": 2.947935972736217e-06, "loss": 0.881, "step": 680 }, { "epoch": 1.3096153846153846, "grad_norm": 0.75, "learning_rate": 2.9334483050933506e-06, "loss": 0.9362, "step": 681 }, { "epoch": 1.3115384615384615, "grad_norm": 0.72265625, "learning_rate": 2.9189815319399422e-06, "loss": 0.9015, "step": 682 }, { "epoch": 1.3134615384615385, "grad_norm": 0.7109375, "learning_rate": 2.904535799546636e-06, "loss": 0.9278, "step": 683 }, { "epoch": 1.3153846153846154, "grad_norm": 0.73046875, "learning_rate": 2.890111253971327e-06, "loss": 0.8761, "step": 684 }, { "epoch": 1.3173076923076923, "grad_norm": 0.69921875, "learning_rate": 2.8757080410577042e-06, "loss": 0.8785, "step": 685 }, { "epoch": 1.3192307692307692, "grad_norm": 0.70703125, "learning_rate": 2.8613263064337617e-06, "loss": 0.8794, "step": 686 }, { "epoch": 1.3211538461538461, "grad_norm": 0.7109375, "learning_rate": 2.846966195510332e-06, "loss": 0.891, "step": 687 }, { "epoch": 1.323076923076923, "grad_norm": 0.72265625, "learning_rate": 2.8326278534796154e-06, "loss": 0.903, "step": 688 }, { "epoch": 1.325, "grad_norm": 0.71484375, "learning_rate": 2.81831142531371e-06, "loss": 0.9126, "step": 689 }, { "epoch": 1.3269230769230769, "grad_norm": 0.69921875, "learning_rate": 2.804017055763149e-06, "loss": 0.8889, "step": 690 }, { "epoch": 1.3288461538461538, "grad_norm": 0.703125, "learning_rate": 2.7897448893554335e-06, "loss": 0.8606, "step": 691 }, { "epoch": 1.3307692307692307, "grad_norm": 0.72265625, "learning_rate": 2.7754950703935735e-06, "loss": 0.8373, "step": 692 }, { "epoch": 1.3326923076923076, "grad_norm": 0.73046875, "learning_rate": 2.761267742954629e-06, "loss": 0.9246, "step": 693 }, { "epoch": 1.3346153846153845, "grad_norm": 0.72265625, "learning_rate": 2.7470630508882525e-06, "loss": 0.9381, "step": 694 }, { "epoch": 1.3365384615384617, "grad_norm": 0.73828125, "learning_rate": 2.7328811378152355e-06, "loss": 0.8936, "step": 695 }, { "epoch": 1.3384615384615386, "grad_norm": 0.7734375, "learning_rate": 2.718722147126054e-06, "loss": 0.9314, "step": 696 }, { "epoch": 1.3403846153846155, "grad_norm": 0.69921875, "learning_rate": 2.704586221979422e-06, "loss": 0.8957, "step": 697 }, { "epoch": 1.3423076923076924, "grad_norm": 0.70703125, "learning_rate": 2.6904735053008405e-06, "loss": 0.8703, "step": 698 }, { "epoch": 1.3442307692307693, "grad_norm": 0.7421875, "learning_rate": 2.6763841397811576e-06, "loss": 0.9244, "step": 699 }, { "epoch": 1.3461538461538463, "grad_norm": 0.71875, "learning_rate": 2.662318267875119e-06, "loss": 0.8768, "step": 700 }, { "epoch": 1.3480769230769232, "grad_norm": 0.71875, "learning_rate": 2.6482760317999338e-06, "loss": 0.9445, "step": 701 }, { "epoch": 1.35, "grad_norm": 0.74609375, "learning_rate": 2.634257573533833e-06, "loss": 0.9024, "step": 702 }, { "epoch": 1.351923076923077, "grad_norm": 0.73046875, "learning_rate": 2.6202630348146323e-06, "loss": 0.8766, "step": 703 }, { "epoch": 1.353846153846154, "grad_norm": 0.70703125, "learning_rate": 2.606292557138307e-06, "loss": 0.889, "step": 704 }, { "epoch": 1.3557692307692308, "grad_norm": 0.72265625, "learning_rate": 2.592346281757552e-06, "loss": 0.9081, "step": 705 }, { "epoch": 1.3576923076923078, "grad_norm": 0.7109375, "learning_rate": 2.5784243496803596e-06, "loss": 0.8632, "step": 706 }, { "epoch": 1.3596153846153847, "grad_norm": 0.6953125, "learning_rate": 2.5645269016685905e-06, "loss": 0.8161, "step": 707 }, { "epoch": 1.3615384615384616, "grad_norm": 0.71484375, "learning_rate": 2.550654078236552e-06, "loss": 0.8689, "step": 708 }, { "epoch": 1.3634615384615385, "grad_norm": 0.7109375, "learning_rate": 2.5368060196495785e-06, "loss": 0.9514, "step": 709 }, { "epoch": 1.3653846153846154, "grad_norm": 0.703125, "learning_rate": 2.5229828659226114e-06, "loss": 0.8675, "step": 710 }, { "epoch": 1.3673076923076923, "grad_norm": 0.72265625, "learning_rate": 2.5091847568187834e-06, "loss": 0.9018, "step": 711 }, { "epoch": 1.3692307692307693, "grad_norm": 0.73828125, "learning_rate": 2.4954118318480063e-06, "loss": 0.9049, "step": 712 }, { "epoch": 1.3711538461538462, "grad_norm": 0.703125, "learning_rate": 2.4816642302655634e-06, "loss": 0.8864, "step": 713 }, { "epoch": 1.373076923076923, "grad_norm": 0.74609375, "learning_rate": 2.4679420910706887e-06, "loss": 0.9208, "step": 714 }, { "epoch": 1.375, "grad_norm": 0.72265625, "learning_rate": 2.454245553005184e-06, "loss": 0.9696, "step": 715 }, { "epoch": 1.375, "eval_loss": 0.9303730130195618, "eval_runtime": 34.4007, "eval_samples_per_second": 68.109, "eval_steps_per_second": 17.035, "step": 715 }, { "epoch": 1.376923076923077, "grad_norm": 0.703125, "learning_rate": 2.4405747545519966e-06, "loss": 0.8986, "step": 716 }, { "epoch": 1.3788461538461538, "grad_norm": 0.73046875, "learning_rate": 2.4269298339338205e-06, "loss": 0.9069, "step": 717 }, { "epoch": 1.3807692307692307, "grad_norm": 0.70703125, "learning_rate": 2.4133109291117156e-06, "loss": 0.8608, "step": 718 }, { "epoch": 1.3826923076923077, "grad_norm": 0.73046875, "learning_rate": 2.3997181777836955e-06, "loss": 0.9137, "step": 719 }, { "epoch": 1.3846153846153846, "grad_norm": 0.73046875, "learning_rate": 2.3861517173833347e-06, "loss": 0.8828, "step": 720 }, { "epoch": 1.3865384615384615, "grad_norm": 0.76953125, "learning_rate": 2.3726116850783987e-06, "loss": 0.9207, "step": 721 }, { "epoch": 1.3884615384615384, "grad_norm": 0.69140625, "learning_rate": 2.3590982177694348e-06, "loss": 0.8221, "step": 722 }, { "epoch": 1.3903846153846153, "grad_norm": 0.7265625, "learning_rate": 2.3456114520883956e-06, "loss": 0.8922, "step": 723 }, { "epoch": 1.3923076923076922, "grad_norm": 0.72265625, "learning_rate": 2.3321515243972663e-06, "loss": 0.8462, "step": 724 }, { "epoch": 1.3942307692307692, "grad_norm": 0.69921875, "learning_rate": 2.318718570786675e-06, "loss": 0.8804, "step": 725 }, { "epoch": 1.396153846153846, "grad_norm": 0.69921875, "learning_rate": 2.3053127270745163e-06, "loss": 0.8969, "step": 726 }, { "epoch": 1.398076923076923, "grad_norm": 0.7265625, "learning_rate": 2.2919341288045853e-06, "loss": 0.9326, "step": 727 }, { "epoch": 1.4, "grad_norm": 0.69921875, "learning_rate": 2.27858291124521e-06, "loss": 0.9407, "step": 728 }, { "epoch": 1.4019230769230768, "grad_norm": 0.6953125, "learning_rate": 2.265259209387867e-06, "loss": 0.8988, "step": 729 }, { "epoch": 1.4038461538461537, "grad_norm": 0.7109375, "learning_rate": 2.25196315794583e-06, "loss": 0.8821, "step": 730 }, { "epoch": 1.4057692307692307, "grad_norm": 0.69921875, "learning_rate": 2.238694891352814e-06, "loss": 0.8893, "step": 731 }, { "epoch": 1.4076923076923076, "grad_norm": 0.6953125, "learning_rate": 2.2254545437615932e-06, "loss": 0.9305, "step": 732 }, { "epoch": 1.4096153846153845, "grad_norm": 0.71875, "learning_rate": 2.2122422490426676e-06, "loss": 0.9039, "step": 733 }, { "epoch": 1.4115384615384614, "grad_norm": 0.7109375, "learning_rate": 2.199058140782897e-06, "loss": 0.8692, "step": 734 }, { "epoch": 1.4134615384615383, "grad_norm": 0.72265625, "learning_rate": 2.1859023522841543e-06, "loss": 0.9083, "step": 735 }, { "epoch": 1.4153846153846155, "grad_norm": 0.6875, "learning_rate": 2.172775016561977e-06, "loss": 0.8858, "step": 736 }, { "epoch": 1.4173076923076924, "grad_norm": 0.73046875, "learning_rate": 2.159676266344222e-06, "loss": 0.8998, "step": 737 }, { "epoch": 1.4192307692307693, "grad_norm": 0.7265625, "learning_rate": 2.1466062340697234e-06, "loss": 0.8965, "step": 738 }, { "epoch": 1.4211538461538462, "grad_norm": 0.6953125, "learning_rate": 2.1335650518869555e-06, "loss": 0.8807, "step": 739 }, { "epoch": 1.4230769230769231, "grad_norm": 0.7109375, "learning_rate": 2.120552851652694e-06, "loss": 0.9295, "step": 740 }, { "epoch": 1.425, "grad_norm": 0.70703125, "learning_rate": 2.1075697649306838e-06, "loss": 0.8947, "step": 741 }, { "epoch": 1.426923076923077, "grad_norm": 0.72265625, "learning_rate": 2.094615922990309e-06, "loss": 0.9056, "step": 742 }, { "epoch": 1.4288461538461539, "grad_norm": 0.76171875, "learning_rate": 2.0816914568052664e-06, "loss": 0.9257, "step": 743 }, { "epoch": 1.4307692307692308, "grad_norm": 0.70703125, "learning_rate": 2.0687964970522394e-06, "loss": 0.9331, "step": 744 }, { "epoch": 1.4326923076923077, "grad_norm": 0.703125, "learning_rate": 2.055931174109579e-06, "loss": 0.9563, "step": 745 }, { "epoch": 1.4346153846153846, "grad_norm": 0.73046875, "learning_rate": 2.043095618055982e-06, "loss": 0.8613, "step": 746 }, { "epoch": 1.4365384615384615, "grad_norm": 0.7109375, "learning_rate": 2.030289958669181e-06, "loss": 0.8563, "step": 747 }, { "epoch": 1.4384615384615385, "grad_norm": 0.6953125, "learning_rate": 2.0175143254246277e-06, "loss": 0.8817, "step": 748 }, { "epoch": 1.4403846153846154, "grad_norm": 0.72265625, "learning_rate": 2.004768847494186e-06, "loss": 0.8463, "step": 749 }, { "epoch": 1.4423076923076923, "grad_norm": 0.71484375, "learning_rate": 1.992053653744826e-06, "loss": 0.9228, "step": 750 }, { "epoch": 1.4442307692307692, "grad_norm": 0.71484375, "learning_rate": 1.979368872737319e-06, "loss": 0.8673, "step": 751 }, { "epoch": 1.4461538461538461, "grad_norm": 0.703125, "learning_rate": 1.966714632724941e-06, "loss": 0.8408, "step": 752 }, { "epoch": 1.448076923076923, "grad_norm": 0.72265625, "learning_rate": 1.954091061652172e-06, "loss": 0.9419, "step": 753 }, { "epoch": 1.45, "grad_norm": 0.71484375, "learning_rate": 1.941498287153409e-06, "loss": 0.9032, "step": 754 }, { "epoch": 1.4519230769230769, "grad_norm": 0.7421875, "learning_rate": 1.928936436551661e-06, "loss": 0.9398, "step": 755 }, { "epoch": 1.4538461538461538, "grad_norm": 0.70703125, "learning_rate": 1.9164056368572847e-06, "loss": 0.8877, "step": 756 }, { "epoch": 1.4557692307692307, "grad_norm": 0.7109375, "learning_rate": 1.903906014766681e-06, "loss": 0.9077, "step": 757 }, { "epoch": 1.4576923076923076, "grad_norm": 0.71484375, "learning_rate": 1.891437696661015e-06, "loss": 0.9012, "step": 758 }, { "epoch": 1.4596153846153845, "grad_norm": 0.703125, "learning_rate": 1.8790008086049534e-06, "loss": 0.8972, "step": 759 }, { "epoch": 1.4615384615384617, "grad_norm": 0.71484375, "learning_rate": 1.8665954763453764e-06, "loss": 0.885, "step": 760 }, { "epoch": 1.4634615384615386, "grad_norm": 0.75390625, "learning_rate": 1.854221825310103e-06, "loss": 0.9224, "step": 761 }, { "epoch": 1.4653846153846155, "grad_norm": 0.6796875, "learning_rate": 1.8418799806066413e-06, "loss": 0.8654, "step": 762 }, { "epoch": 1.4673076923076924, "grad_norm": 0.7109375, "learning_rate": 1.829570067020906e-06, "loss": 0.9112, "step": 763 }, { "epoch": 1.4692307692307693, "grad_norm": 0.7109375, "learning_rate": 1.8172922090159578e-06, "loss": 0.8584, "step": 764 }, { "epoch": 1.4711538461538463, "grad_norm": 0.70703125, "learning_rate": 1.8050465307307602e-06, "loss": 0.8674, "step": 765 }, { "epoch": 1.4730769230769232, "grad_norm": 0.73046875, "learning_rate": 1.7928331559789087e-06, "loss": 0.8901, "step": 766 }, { "epoch": 1.475, "grad_norm": 0.69921875, "learning_rate": 1.7806522082473809e-06, "loss": 0.9509, "step": 767 }, { "epoch": 1.476923076923077, "grad_norm": 0.734375, "learning_rate": 1.7685038106952952e-06, "loss": 0.8837, "step": 768 }, { "epoch": 1.478846153846154, "grad_norm": 0.69921875, "learning_rate": 1.7563880861526656e-06, "loss": 0.9192, "step": 769 }, { "epoch": 1.4807692307692308, "grad_norm": 0.71484375, "learning_rate": 1.7443051571191472e-06, "loss": 0.8741, "step": 770 }, { "epoch": 1.4826923076923078, "grad_norm": 0.69140625, "learning_rate": 1.73225514576281e-06, "loss": 0.9072, "step": 771 }, { "epoch": 1.4846153846153847, "grad_norm": 0.70703125, "learning_rate": 1.7202381739189055e-06, "loss": 0.851, "step": 772 }, { "epoch": 1.4865384615384616, "grad_norm": 0.75, "learning_rate": 1.70825436308862e-06, "loss": 0.9552, "step": 773 }, { "epoch": 1.4884615384615385, "grad_norm": 0.73046875, "learning_rate": 1.696303834437859e-06, "loss": 0.8915, "step": 774 }, { "epoch": 1.4903846153846154, "grad_norm": 0.734375, "learning_rate": 1.6843867087960252e-06, "loss": 0.9092, "step": 775 }, { "epoch": 1.4923076923076923, "grad_norm": 0.71484375, "learning_rate": 1.6725031066547786e-06, "loss": 0.8589, "step": 776 }, { "epoch": 1.4942307692307693, "grad_norm": 0.7265625, "learning_rate": 1.6606531481668364e-06, "loss": 0.8642, "step": 777 }, { "epoch": 1.4961538461538462, "grad_norm": 0.73046875, "learning_rate": 1.648836953144755e-06, "loss": 0.9987, "step": 778 }, { "epoch": 1.498076923076923, "grad_norm": 0.72265625, "learning_rate": 1.6370546410597066e-06, "loss": 0.8802, "step": 779 }, { "epoch": 1.5, "grad_norm": 0.69140625, "learning_rate": 1.6253063310402833e-06, "loss": 0.9119, "step": 780 }, { "epoch": 1.5, "eval_loss": 0.9302220344543457, "eval_runtime": 34.2902, "eval_samples_per_second": 68.329, "eval_steps_per_second": 17.089, "step": 780 }, { "epoch": 1.501923076923077, "grad_norm": 0.72265625, "learning_rate": 1.6135921418712959e-06, "loss": 0.8816, "step": 781 }, { "epoch": 1.5038461538461538, "grad_norm": 0.73046875, "learning_rate": 1.601912191992554e-06, "loss": 0.9097, "step": 782 }, { "epoch": 1.5057692307692307, "grad_norm": 1.1484375, "learning_rate": 1.5902665994976896e-06, "loss": 0.9058, "step": 783 }, { "epoch": 1.5076923076923077, "grad_norm": 0.7109375, "learning_rate": 1.5786554821329515e-06, "loss": 0.8729, "step": 784 }, { "epoch": 1.5096153846153846, "grad_norm": 0.6953125, "learning_rate": 1.567078957296016e-06, "loss": 0.8892, "step": 785 }, { "epoch": 1.5115384615384615, "grad_norm": 0.71484375, "learning_rate": 1.5555371420348031e-06, "loss": 0.8654, "step": 786 }, { "epoch": 1.5134615384615384, "grad_norm": 0.75, "learning_rate": 1.544030153046291e-06, "loss": 0.8445, "step": 787 }, { "epoch": 1.5153846153846153, "grad_norm": 0.69140625, "learning_rate": 1.5325581066753354e-06, "loss": 0.8402, "step": 788 }, { "epoch": 1.5173076923076922, "grad_norm": 0.69140625, "learning_rate": 1.5211211189134955e-06, "loss": 0.8546, "step": 789 }, { "epoch": 1.5192307692307692, "grad_norm": 0.72265625, "learning_rate": 1.5097193053978587e-06, "loss": 0.9212, "step": 790 }, { "epoch": 1.521153846153846, "grad_norm": 0.66796875, "learning_rate": 1.4983527814098736e-06, "loss": 0.8601, "step": 791 }, { "epoch": 1.523076923076923, "grad_norm": 0.73046875, "learning_rate": 1.4870216618741833e-06, "loss": 0.8572, "step": 792 }, { "epoch": 1.525, "grad_norm": 0.90234375, "learning_rate": 1.475726061357463e-06, "loss": 0.9179, "step": 793 }, { "epoch": 1.5269230769230768, "grad_norm": 0.73828125, "learning_rate": 1.4644660940672628e-06, "loss": 0.9296, "step": 794 }, { "epoch": 1.5288461538461537, "grad_norm": 0.6953125, "learning_rate": 1.4532418738508525e-06, "loss": 0.8652, "step": 795 }, { "epoch": 1.5307692307692307, "grad_norm": 1.1640625, "learning_rate": 1.44205351419407e-06, "loss": 0.8567, "step": 796 }, { "epoch": 1.5326923076923076, "grad_norm": 0.7109375, "learning_rate": 1.430901128220174e-06, "loss": 0.9234, "step": 797 }, { "epoch": 1.5346153846153845, "grad_norm": 0.6953125, "learning_rate": 1.4197848286887017e-06, "loss": 0.8839, "step": 798 }, { "epoch": 1.5365384615384614, "grad_norm": 0.7109375, "learning_rate": 1.4087047279943267e-06, "loss": 0.9021, "step": 799 }, { "epoch": 1.5384615384615383, "grad_norm": 0.71875, "learning_rate": 1.397660938165723e-06, "loss": 0.9017, "step": 800 }, { "epoch": 1.5403846153846152, "grad_norm": 0.703125, "learning_rate": 1.3866535708644335e-06, "loss": 0.9315, "step": 801 }, { "epoch": 1.5423076923076922, "grad_norm": 1.046875, "learning_rate": 1.3756827373837396e-06, "loss": 0.8562, "step": 802 }, { "epoch": 1.544230769230769, "grad_norm": 0.71484375, "learning_rate": 1.3647485486475376e-06, "loss": 0.9092, "step": 803 }, { "epoch": 1.546153846153846, "grad_norm": 0.6953125, "learning_rate": 1.353851115209215e-06, "loss": 0.9142, "step": 804 }, { "epoch": 1.5480769230769231, "grad_norm": 0.7109375, "learning_rate": 1.3429905472505344e-06, "loss": 0.8867, "step": 805 }, { "epoch": 1.55, "grad_norm": 0.73046875, "learning_rate": 1.3321669545805188e-06, "loss": 0.9228, "step": 806 }, { "epoch": 1.551923076923077, "grad_norm": 0.6875, "learning_rate": 1.321380446634342e-06, "loss": 0.8635, "step": 807 }, { "epoch": 1.5538461538461539, "grad_norm": 0.6796875, "learning_rate": 1.310631132472222e-06, "loss": 0.8625, "step": 808 }, { "epoch": 1.5557692307692308, "grad_norm": 0.6875, "learning_rate": 1.2999191207783129e-06, "loss": 0.8367, "step": 809 }, { "epoch": 1.5576923076923077, "grad_norm": 0.69921875, "learning_rate": 1.2892445198596198e-06, "loss": 0.9199, "step": 810 }, { "epoch": 1.5596153846153846, "grad_norm": 0.6796875, "learning_rate": 1.27860743764489e-06, "loss": 0.8648, "step": 811 }, { "epoch": 1.5615384615384615, "grad_norm": 0.71875, "learning_rate": 1.2680079816835228e-06, "loss": 0.9395, "step": 812 }, { "epoch": 1.5634615384615385, "grad_norm": 0.6796875, "learning_rate": 1.257446259144494e-06, "loss": 0.8859, "step": 813 }, { "epoch": 1.5653846153846154, "grad_norm": 0.6875, "learning_rate": 1.2469223768152622e-06, "loss": 0.884, "step": 814 }, { "epoch": 1.5673076923076923, "grad_norm": 0.72265625, "learning_rate": 1.2364364411006841e-06, "loss": 0.8908, "step": 815 }, { "epoch": 1.5692307692307692, "grad_norm": 0.6875, "learning_rate": 1.2259885580219555e-06, "loss": 0.871, "step": 816 }, { "epoch": 1.5711538461538461, "grad_norm": 0.6953125, "learning_rate": 1.215578833215526e-06, "loss": 0.9022, "step": 817 }, { "epoch": 1.573076923076923, "grad_norm": 0.6875, "learning_rate": 1.2052073719320296e-06, "loss": 0.9199, "step": 818 }, { "epoch": 1.575, "grad_norm": 0.71484375, "learning_rate": 1.1948742790352342e-06, "loss": 0.9211, "step": 819 }, { "epoch": 1.5769230769230769, "grad_norm": 0.703125, "learning_rate": 1.1845796590009684e-06, "loss": 0.9067, "step": 820 }, { "epoch": 1.578846153846154, "grad_norm": 0.703125, "learning_rate": 1.1743236159160654e-06, "loss": 0.9342, "step": 821 }, { "epoch": 1.580769230769231, "grad_norm": 0.75390625, "learning_rate": 1.1641062534773218e-06, "loss": 0.915, "step": 822 }, { "epoch": 1.5826923076923078, "grad_norm": 0.72265625, "learning_rate": 1.15392767499044e-06, "loss": 0.8886, "step": 823 }, { "epoch": 1.5846153846153848, "grad_norm": 0.7109375, "learning_rate": 1.1437879833689808e-06, "loss": 0.9112, "step": 824 }, { "epoch": 1.5865384615384617, "grad_norm": 0.734375, "learning_rate": 1.133687281133331e-06, "loss": 0.9056, "step": 825 }, { "epoch": 1.5884615384615386, "grad_norm": 0.734375, "learning_rate": 1.1236256704096693e-06, "loss": 0.91, "step": 826 }, { "epoch": 1.5903846153846155, "grad_norm": 0.73046875, "learning_rate": 1.113603252928917e-06, "loss": 0.8725, "step": 827 }, { "epoch": 1.5923076923076924, "grad_norm": 0.72265625, "learning_rate": 1.1036201300257266e-06, "loss": 0.9173, "step": 828 }, { "epoch": 1.5942307692307693, "grad_norm": 0.7109375, "learning_rate": 1.0936764026374547e-06, "loss": 0.8928, "step": 829 }, { "epoch": 1.5961538461538463, "grad_norm": 0.69921875, "learning_rate": 1.083772171303128e-06, "loss": 0.8897, "step": 830 }, { "epoch": 1.5980769230769232, "grad_norm": 0.72265625, "learning_rate": 1.073907536162443e-06, "loss": 0.9053, "step": 831 }, { "epoch": 1.6, "grad_norm": 0.6953125, "learning_rate": 1.0640825969547498e-06, "loss": 0.8655, "step": 832 }, { "epoch": 1.601923076923077, "grad_norm": 0.703125, "learning_rate": 1.0542974530180327e-06, "loss": 0.8698, "step": 833 }, { "epoch": 1.603846153846154, "grad_norm": 0.6953125, "learning_rate": 1.0445522032879184e-06, "loss": 0.8369, "step": 834 }, { "epoch": 1.6057692307692308, "grad_norm": 0.6953125, "learning_rate": 1.0348469462966753e-06, "loss": 0.8818, "step": 835 }, { "epoch": 1.6076923076923078, "grad_norm": 0.703125, "learning_rate": 1.0251817801722047e-06, "loss": 0.837, "step": 836 }, { "epoch": 1.6096153846153847, "grad_norm": 0.69921875, "learning_rate": 1.0155568026370637e-06, "loss": 0.8907, "step": 837 }, { "epoch": 1.6115384615384616, "grad_norm": 0.73046875, "learning_rate": 1.0059721110074678e-06, "loss": 0.9129, "step": 838 }, { "epoch": 1.6134615384615385, "grad_norm": 0.73828125, "learning_rate": 9.964278021923107e-07, "loss": 0.8743, "step": 839 }, { "epoch": 1.6153846153846154, "grad_norm": 1.1015625, "learning_rate": 9.869239726921843e-07, "loss": 0.8883, "step": 840 }, { "epoch": 1.6173076923076923, "grad_norm": 0.6953125, "learning_rate": 9.774607185984004e-07, "loss": 0.904, "step": 841 }, { "epoch": 1.6192307692307693, "grad_norm": 0.70703125, "learning_rate": 9.68038135592022e-07, "loss": 0.9311, "step": 842 }, { "epoch": 1.6211538461538462, "grad_norm": 0.69140625, "learning_rate": 9.586563189428954e-07, "loss": 0.8621, "step": 843 }, { "epoch": 1.623076923076923, "grad_norm": 0.69921875, "learning_rate": 9.493153635086855e-07, "loss": 0.8783, "step": 844 }, { "epoch": 1.625, "grad_norm": 0.69921875, "learning_rate": 9.400153637339182e-07, "loss": 0.8793, "step": 845 }, { "epoch": 1.625, "eval_loss": 0.9300703406333923, "eval_runtime": 34.3126, "eval_samples_per_second": 68.284, "eval_steps_per_second": 17.078, "step": 845 }, { "epoch": 1.626923076923077, "grad_norm": 0.69140625, "learning_rate": 9.307564136490255e-07, "loss": 0.8341, "step": 846 }, { "epoch": 1.6288461538461538, "grad_norm": 0.71484375, "learning_rate": 9.215386068693927e-07, "loss": 0.8605, "step": 847 }, { "epoch": 1.6307692307692307, "grad_norm": 0.72265625, "learning_rate": 9.123620365944147e-07, "loss": 0.8513, "step": 848 }, { "epoch": 1.6326923076923077, "grad_norm": 0.72265625, "learning_rate": 9.032267956065516e-07, "loss": 0.9004, "step": 849 }, { "epoch": 1.6346153846153846, "grad_norm": 0.75, "learning_rate": 8.941329762703921e-07, "loss": 0.9814, "step": 850 }, { "epoch": 1.6365384615384615, "grad_norm": 0.73828125, "learning_rate": 8.850806705317183e-07, "loss": 0.9074, "step": 851 }, { "epoch": 1.6384615384615384, "grad_norm": 0.6953125, "learning_rate": 8.76069969916577e-07, "loss": 0.9301, "step": 852 }, { "epoch": 1.6403846153846153, "grad_norm": 0.7109375, "learning_rate": 8.671009655303531e-07, "loss": 0.9227, "step": 853 }, { "epoch": 1.6423076923076922, "grad_norm": 0.69921875, "learning_rate": 8.581737480568514e-07, "loss": 0.8816, "step": 854 }, { "epoch": 1.6442307692307692, "grad_norm": 0.69140625, "learning_rate": 8.492884077573749e-07, "loss": 0.8869, "step": 855 }, { "epoch": 1.646153846153846, "grad_norm": 0.72265625, "learning_rate": 8.404450344698167e-07, "loss": 0.9363, "step": 856 }, { "epoch": 1.648076923076923, "grad_norm": 0.6875, "learning_rate": 8.316437176077491e-07, "loss": 0.8935, "step": 857 }, { "epoch": 1.65, "grad_norm": 0.71875, "learning_rate": 8.228845461595225e-07, "loss": 0.8805, "step": 858 }, { "epoch": 1.6519230769230768, "grad_norm": 0.765625, "learning_rate": 8.141676086873574e-07, "loss": 0.8784, "step": 859 }, { "epoch": 1.6538461538461537, "grad_norm": 0.70703125, "learning_rate": 8.054929933264626e-07, "loss": 0.9377, "step": 860 }, { "epoch": 1.6557692307692307, "grad_norm": 0.734375, "learning_rate": 7.968607877841333e-07, "loss": 0.8825, "step": 861 }, { "epoch": 1.6576923076923076, "grad_norm": 0.70703125, "learning_rate": 7.882710793388643e-07, "loss": 0.915, "step": 862 }, { "epoch": 1.6596153846153845, "grad_norm": 0.70703125, "learning_rate": 7.79723954839477e-07, "loss": 0.9108, "step": 863 }, { "epoch": 1.6615384615384614, "grad_norm": 0.71875, "learning_rate": 7.712195007042322e-07, "loss": 0.8689, "step": 864 }, { "epoch": 1.6634615384615383, "grad_norm": 0.7109375, "learning_rate": 7.627578029199562e-07, "loss": 0.8841, "step": 865 }, { "epoch": 1.6653846153846152, "grad_norm": 0.71875, "learning_rate": 7.543389470411772e-07, "loss": 0.9736, "step": 866 }, { "epoch": 1.6673076923076922, "grad_norm": 0.6953125, "learning_rate": 7.459630181892608e-07, "loss": 0.9249, "step": 867 }, { "epoch": 1.669230769230769, "grad_norm": 0.7421875, "learning_rate": 7.376301010515397e-07, "loss": 0.9258, "step": 868 }, { "epoch": 1.671153846153846, "grad_norm": 0.7265625, "learning_rate": 7.293402798804667e-07, "loss": 0.909, "step": 869 }, { "epoch": 1.6730769230769231, "grad_norm": 0.69921875, "learning_rate": 7.210936384927631e-07, "loss": 0.888, "step": 870 }, { "epoch": 1.675, "grad_norm": 0.72265625, "learning_rate": 7.128902602685617e-07, "loss": 0.9196, "step": 871 }, { "epoch": 1.676923076923077, "grad_norm": 0.703125, "learning_rate": 7.047302281505735e-07, "loss": 0.943, "step": 872 }, { "epoch": 1.6788461538461539, "grad_norm": 0.74609375, "learning_rate": 6.966136246432492e-07, "loss": 0.9123, "step": 873 }, { "epoch": 1.6807692307692308, "grad_norm": 0.6796875, "learning_rate": 6.885405318119342e-07, "loss": 0.8793, "step": 874 }, { "epoch": 1.6826923076923077, "grad_norm": 0.71484375, "learning_rate": 6.805110312820501e-07, "loss": 0.9478, "step": 875 }, { "epoch": 1.6846153846153846, "grad_norm": 0.69921875, "learning_rate": 6.725252042382691e-07, "loss": 0.8718, "step": 876 }, { "epoch": 1.6865384615384615, "grad_norm": 0.7265625, "learning_rate": 6.645831314236817e-07, "loss": 0.8476, "step": 877 }, { "epoch": 1.6884615384615385, "grad_norm": 0.75390625, "learning_rate": 6.566848931389935e-07, "loss": 0.9334, "step": 878 }, { "epoch": 1.6903846153846154, "grad_norm": 0.71875, "learning_rate": 6.488305692417074e-07, "loss": 0.8902, "step": 879 }, { "epoch": 1.6923076923076923, "grad_norm": 0.6875, "learning_rate": 6.410202391453157e-07, "loss": 0.8728, "step": 880 }, { "epoch": 1.6942307692307692, "grad_norm": 0.7265625, "learning_rate": 6.332539818184985e-07, "loss": 0.8788, "step": 881 }, { "epoch": 1.6961538461538461, "grad_norm": 0.76171875, "learning_rate": 6.255318757843249e-07, "loss": 0.8917, "step": 882 }, { "epoch": 1.698076923076923, "grad_norm": 0.72265625, "learning_rate": 6.178539991194599e-07, "loss": 0.8898, "step": 883 }, { "epoch": 1.7, "grad_norm": 0.6875, "learning_rate": 6.102204294533731e-07, "loss": 0.894, "step": 884 }, { "epoch": 1.7019230769230769, "grad_norm": 0.703125, "learning_rate": 6.026312439675553e-07, "loss": 0.882, "step": 885 }, { "epoch": 1.703846153846154, "grad_norm": 0.7109375, "learning_rate": 5.95086519394738e-07, "loss": 0.917, "step": 886 }, { "epoch": 1.705769230769231, "grad_norm": 0.73046875, "learning_rate": 5.875863320181175e-07, "loss": 0.8931, "step": 887 }, { "epoch": 1.7076923076923078, "grad_norm": 0.703125, "learning_rate": 5.801307576705833e-07, "loss": 0.8623, "step": 888 }, { "epoch": 1.7096153846153848, "grad_norm": 0.6953125, "learning_rate": 5.727198717339511e-07, "loss": 0.8815, "step": 889 }, { "epoch": 1.7115384615384617, "grad_norm": 0.70703125, "learning_rate": 5.653537491382011e-07, "loss": 0.9418, "step": 890 }, { "epoch": 1.7134615384615386, "grad_norm": 0.73828125, "learning_rate": 5.58032464360721e-07, "loss": 0.8381, "step": 891 }, { "epoch": 1.7153846153846155, "grad_norm": 0.73046875, "learning_rate": 5.507560914255516e-07, "loss": 0.9239, "step": 892 }, { "epoch": 1.7173076923076924, "grad_norm": 0.69921875, "learning_rate": 5.435247039026398e-07, "loss": 0.9262, "step": 893 }, { "epoch": 1.7192307692307693, "grad_norm": 0.734375, "learning_rate": 5.363383749070939e-07, "loss": 0.844, "step": 894 }, { "epoch": 1.7211538461538463, "grad_norm": 0.7265625, "learning_rate": 5.291971770984428e-07, "loss": 0.9041, "step": 895 }, { "epoch": 1.7230769230769232, "grad_norm": 0.7109375, "learning_rate": 5.221011826799055e-07, "loss": 0.8342, "step": 896 }, { "epoch": 1.725, "grad_norm": 0.6796875, "learning_rate": 5.150504633976572e-07, "loss": 0.8119, "step": 897 }, { "epoch": 1.726923076923077, "grad_norm": 0.73828125, "learning_rate": 5.080450905401057e-07, "loss": 0.9174, "step": 898 }, { "epoch": 1.728846153846154, "grad_norm": 0.72265625, "learning_rate": 5.010851349371704e-07, "loss": 0.8972, "step": 899 }, { "epoch": 1.7307692307692308, "grad_norm": 0.71875, "learning_rate": 4.941706669595647e-07, "loss": 0.9052, "step": 900 }, { "epoch": 1.7326923076923078, "grad_norm": 0.7109375, "learning_rate": 4.873017565180871e-07, "loss": 0.8823, "step": 901 }, { "epoch": 1.7346153846153847, "grad_norm": 0.69140625, "learning_rate": 4.804784730629131e-07, "loss": 0.8687, "step": 902 }, { "epoch": 1.7365384615384616, "grad_norm": 0.7265625, "learning_rate": 4.7370088558289175e-07, "loss": 0.8865, "step": 903 }, { "epoch": 1.7384615384615385, "grad_norm": 0.69921875, "learning_rate": 4.6696906260485007e-07, "loss": 0.8775, "step": 904 }, { "epoch": 1.7403846153846154, "grad_norm": 0.71875, "learning_rate": 4.602830721928997e-07, "loss": 0.8814, "step": 905 }, { "epoch": 1.7423076923076923, "grad_norm": 0.73828125, "learning_rate": 4.536429819477478e-07, "loss": 0.9094, "step": 906 }, { "epoch": 1.7442307692307693, "grad_norm": 0.72265625, "learning_rate": 4.4704885900601236e-07, "loss": 0.9433, "step": 907 }, { "epoch": 1.7461538461538462, "grad_norm": 0.6796875, "learning_rate": 4.405007700395497e-07, "loss": 0.8625, "step": 908 }, { "epoch": 1.748076923076923, "grad_norm": 0.72265625, "learning_rate": 4.33998781254773e-07, "loss": 0.8546, "step": 909 }, { "epoch": 1.75, "grad_norm": 0.7265625, "learning_rate": 4.2754295839198325e-07, "loss": 0.9265, "step": 910 }, { "epoch": 1.75, "eval_loss": 0.9300666451454163, "eval_runtime": 34.3132, "eval_samples_per_second": 68.283, "eval_steps_per_second": 17.078, "step": 910 }, { "epoch": 1.751923076923077, "grad_norm": 0.69140625, "learning_rate": 4.211333667247125e-07, "loss": 0.8753, "step": 911 }, { "epoch": 1.7538461538461538, "grad_norm": 0.67578125, "learning_rate": 4.147700710590563e-07, "loss": 0.8862, "step": 912 }, { "epoch": 1.7557692307692307, "grad_norm": 0.7109375, "learning_rate": 4.0845313573301736e-07, "loss": 0.9027, "step": 913 }, { "epoch": 1.7576923076923077, "grad_norm": 0.72265625, "learning_rate": 4.021826246158628e-07, "loss": 0.9113, "step": 914 }, { "epoch": 1.7596153846153846, "grad_norm": 0.71484375, "learning_rate": 3.959586011074729e-07, "loss": 0.8939, "step": 915 }, { "epoch": 1.7615384615384615, "grad_norm": 0.7421875, "learning_rate": 3.8978112813769786e-07, "loss": 0.9124, "step": 916 }, { "epoch": 1.7634615384615384, "grad_norm": 0.73046875, "learning_rate": 3.836502681657289e-07, "loss": 0.9074, "step": 917 }, { "epoch": 1.7653846153846153, "grad_norm": 0.6953125, "learning_rate": 3.7756608317946144e-07, "loss": 0.8721, "step": 918 }, { "epoch": 1.7673076923076922, "grad_norm": 0.71875, "learning_rate": 3.715286346948671e-07, "loss": 0.9188, "step": 919 }, { "epoch": 1.7692307692307692, "grad_norm": 0.7109375, "learning_rate": 3.6553798375537574e-07, "loss": 0.8737, "step": 920 }, { "epoch": 1.771153846153846, "grad_norm": 0.69140625, "learning_rate": 3.595941909312595e-07, "loss": 0.9252, "step": 921 }, { "epoch": 1.773076923076923, "grad_norm": 0.78125, "learning_rate": 3.5369731631901214e-07, "loss": 0.921, "step": 922 }, { "epoch": 1.775, "grad_norm": 0.7421875, "learning_rate": 3.4784741954074884e-07, "loss": 0.8851, "step": 923 }, { "epoch": 1.7769230769230768, "grad_norm": 0.69140625, "learning_rate": 3.420445597436056e-07, "loss": 0.8651, "step": 924 }, { "epoch": 1.7788461538461537, "grad_norm": 0.734375, "learning_rate": 3.362887955991301e-07, "loss": 0.902, "step": 925 }, { "epoch": 1.7807692307692307, "grad_norm": 0.6796875, "learning_rate": 3.305801853026985e-07, "loss": 0.8853, "step": 926 }, { "epoch": 1.7826923076923076, "grad_norm": 0.671875, "learning_rate": 3.2491878657292643e-07, "loss": 0.8499, "step": 927 }, { "epoch": 1.7846153846153845, "grad_norm": 0.734375, "learning_rate": 3.193046566510777e-07, "loss": 0.8928, "step": 928 }, { "epoch": 1.7865384615384614, "grad_norm": 0.71875, "learning_rate": 3.1373785230049356e-07, "loss": 0.8892, "step": 929 }, { "epoch": 1.7884615384615383, "grad_norm": 0.703125, "learning_rate": 3.0821842980601756e-07, "loss": 0.8887, "step": 930 }, { "epoch": 1.7903846153846152, "grad_norm": 0.7421875, "learning_rate": 3.0274644497342133e-07, "loss": 0.9069, "step": 931 }, { "epoch": 1.7923076923076922, "grad_norm": 0.72265625, "learning_rate": 2.9732195312884515e-07, "loss": 0.9282, "step": 932 }, { "epoch": 1.794230769230769, "grad_norm": 0.71484375, "learning_rate": 2.91945009118238e-07, "loss": 0.8736, "step": 933 }, { "epoch": 1.796153846153846, "grad_norm": 0.73046875, "learning_rate": 2.866156673068016e-07, "loss": 0.9549, "step": 934 }, { "epoch": 1.7980769230769231, "grad_norm": 0.69921875, "learning_rate": 2.813339815784416e-07, "loss": 0.8407, "step": 935 }, { "epoch": 1.8, "grad_norm": 0.703125, "learning_rate": 2.76100005335222e-07, "loss": 0.9366, "step": 936 }, { "epoch": 1.801923076923077, "grad_norm": 0.7734375, "learning_rate": 2.7091379149682683e-07, "loss": 0.8663, "step": 937 }, { "epoch": 1.8038461538461539, "grad_norm": 0.73828125, "learning_rate": 2.657753925000228e-07, "loss": 0.8676, "step": 938 }, { "epoch": 1.8057692307692308, "grad_norm": 0.6953125, "learning_rate": 2.6068486029813154e-07, "loss": 0.8491, "step": 939 }, { "epoch": 1.8076923076923077, "grad_norm": 0.703125, "learning_rate": 2.556422463605024e-07, "loss": 0.9075, "step": 940 }, { "epoch": 1.8096153846153846, "grad_norm": 0.6796875, "learning_rate": 2.506476016719922e-07, "loss": 0.8678, "step": 941 }, { "epoch": 1.8115384615384615, "grad_norm": 0.7265625, "learning_rate": 2.4570097673245197e-07, "loss": 0.921, "step": 942 }, { "epoch": 1.8134615384615385, "grad_norm": 0.75, "learning_rate": 2.4080242155621327e-07, "loss": 0.9795, "step": 943 }, { "epoch": 1.8153846153846154, "grad_norm": 0.7421875, "learning_rate": 2.3595198567158473e-07, "loss": 0.9078, "step": 944 }, { "epoch": 1.8173076923076923, "grad_norm": 0.71875, "learning_rate": 2.3114971812034981e-07, "loss": 0.8915, "step": 945 }, { "epoch": 1.8192307692307692, "grad_norm": 0.70703125, "learning_rate": 2.2639566745727203e-07, "loss": 0.8709, "step": 946 }, { "epoch": 1.8211538461538461, "grad_norm": 0.7109375, "learning_rate": 2.2168988174960382e-07, "loss": 0.9404, "step": 947 }, { "epoch": 1.823076923076923, "grad_norm": 0.71875, "learning_rate": 2.1703240857659958e-07, "loss": 0.8965, "step": 948 }, { "epoch": 1.825, "grad_norm": 0.70703125, "learning_rate": 2.124232950290367e-07, "loss": 0.8497, "step": 949 }, { "epoch": 1.8269230769230769, "grad_norm": 0.72265625, "learning_rate": 2.0786258770873647e-07, "loss": 0.9354, "step": 950 }, { "epoch": 1.828846153846154, "grad_norm": 0.73046875, "learning_rate": 2.0335033272809612e-07, "loss": 0.8639, "step": 951 }, { "epoch": 1.830769230769231, "grad_norm": 0.70703125, "learning_rate": 1.9888657570961924e-07, "loss": 0.8492, "step": 952 }, { "epoch": 1.8326923076923078, "grad_norm": 0.75390625, "learning_rate": 1.9447136178545766e-07, "loss": 0.929, "step": 953 }, { "epoch": 1.8346153846153848, "grad_norm": 0.69140625, "learning_rate": 1.9010473559695376e-07, "loss": 0.8714, "step": 954 }, { "epoch": 1.8365384615384617, "grad_norm": 0.70703125, "learning_rate": 1.857867412941883e-07, "loss": 0.9262, "step": 955 }, { "epoch": 1.8384615384615386, "grad_norm": 0.7109375, "learning_rate": 1.8151742253553483e-07, "loss": 0.8796, "step": 956 }, { "epoch": 1.8403846153846155, "grad_norm": 0.7421875, "learning_rate": 1.7729682248721848e-07, "loss": 0.8677, "step": 957 }, { "epoch": 1.8423076923076924, "grad_norm": 0.68359375, "learning_rate": 1.731249838228799e-07, "loss": 0.8699, "step": 958 }, { "epoch": 1.8442307692307693, "grad_norm": 0.6875, "learning_rate": 1.69001948723142e-07, "loss": 0.8765, "step": 959 }, { "epoch": 1.8461538461538463, "grad_norm": 0.72265625, "learning_rate": 1.649277588751863e-07, "loss": 0.9203, "step": 960 }, { "epoch": 1.8480769230769232, "grad_norm": 0.73828125, "learning_rate": 1.6090245547232707e-07, "loss": 0.9009, "step": 961 }, { "epoch": 1.85, "grad_norm": 1.4140625, "learning_rate": 1.5692607921360014e-07, "loss": 0.8951, "step": 962 }, { "epoch": 1.851923076923077, "grad_norm": 0.7109375, "learning_rate": 1.5299867030334815e-07, "loss": 0.938, "step": 963 }, { "epoch": 1.853846153846154, "grad_norm": 0.71875, "learning_rate": 1.491202684508136e-07, "loss": 0.9381, "step": 964 }, { "epoch": 1.8557692307692308, "grad_norm": 0.72265625, "learning_rate": 1.4529091286973994e-07, "loss": 0.9324, "step": 965 }, { "epoch": 1.8576923076923078, "grad_norm": 0.7109375, "learning_rate": 1.415106422779733e-07, "loss": 0.881, "step": 966 }, { "epoch": 1.8596153846153847, "grad_norm": 0.7734375, "learning_rate": 1.3777949489706898e-07, "loss": 0.8983, "step": 967 }, { "epoch": 1.8615384615384616, "grad_norm": 0.7109375, "learning_rate": 1.3409750845191138e-07, "loss": 0.9162, "step": 968 }, { "epoch": 1.8634615384615385, "grad_norm": 0.70703125, "learning_rate": 1.3046472017032685e-07, "loss": 0.8893, "step": 969 }, { "epoch": 1.8653846153846154, "grad_norm": 0.68359375, "learning_rate": 1.2688116678270636e-07, "loss": 0.9097, "step": 970 }, { "epoch": 1.8673076923076923, "grad_norm": 0.71484375, "learning_rate": 1.2334688452164122e-07, "loss": 0.9282, "step": 971 }, { "epoch": 1.8692307692307693, "grad_norm": 0.69921875, "learning_rate": 1.198619091215497e-07, "loss": 0.8906, "step": 972 }, { "epoch": 1.8711538461538462, "grad_norm": 0.796875, "learning_rate": 1.1642627581831767e-07, "loss": 0.9165, "step": 973 }, { "epoch": 1.873076923076923, "grad_norm": 0.73828125, "learning_rate": 1.1304001934894393e-07, "loss": 0.9424, "step": 974 }, { "epoch": 1.875, "grad_norm": 0.71875, "learning_rate": 1.0970317395119001e-07, "loss": 0.9375, "step": 975 }, { "epoch": 1.875, "eval_loss": 0.9301103949546814, "eval_runtime": 34.343, "eval_samples_per_second": 68.223, "eval_steps_per_second": 17.063, "step": 975 }, { "epoch": 1.876923076923077, "grad_norm": 0.69921875, "learning_rate": 1.0641577336322761e-07, "loss": 0.8467, "step": 976 }, { "epoch": 1.8788461538461538, "grad_norm": 0.6875, "learning_rate": 1.0317785082330555e-07, "loss": 0.9042, "step": 977 }, { "epoch": 1.8807692307692307, "grad_norm": 0.71484375, "learning_rate": 9.998943906941005e-08, "loss": 0.8585, "step": 978 }, { "epoch": 1.8826923076923077, "grad_norm": 0.703125, "learning_rate": 9.685057033892998e-08, "loss": 0.9187, "step": 979 }, { "epoch": 1.8846153846153846, "grad_norm": 0.72265625, "learning_rate": 9.376127636833876e-08, "loss": 0.8607, "step": 980 }, { "epoch": 1.8865384615384615, "grad_norm": 0.7265625, "learning_rate": 9.072158839286748e-08, "loss": 0.9251, "step": 981 }, { "epoch": 1.8884615384615384, "grad_norm": 0.7265625, "learning_rate": 8.773153714619064e-08, "loss": 0.9172, "step": 982 }, { "epoch": 1.8903846153846153, "grad_norm": 0.734375, "learning_rate": 8.479115286011752e-08, "loss": 0.9146, "step": 983 }, { "epoch": 1.8923076923076922, "grad_norm": 0.69921875, "learning_rate": 8.190046526428241e-08, "loss": 0.8388, "step": 984 }, { "epoch": 1.8942307692307692, "grad_norm": 0.6953125, "learning_rate": 7.905950358584768e-08, "loss": 0.8973, "step": 985 }, { "epoch": 1.896153846153846, "grad_norm": 0.73046875, "learning_rate": 7.626829654920732e-08, "loss": 0.9986, "step": 986 }, { "epoch": 1.898076923076923, "grad_norm": 0.7109375, "learning_rate": 7.352687237569489e-08, "loss": 0.9084, "step": 987 }, { "epoch": 1.9, "grad_norm": 0.70703125, "learning_rate": 7.08352587833e-08, "loss": 0.8867, "step": 988 }, { "epoch": 1.9019230769230768, "grad_norm": 0.6796875, "learning_rate": 6.819348298638839e-08, "loss": 0.9154, "step": 989 }, { "epoch": 1.9038461538461537, "grad_norm": 0.671875, "learning_rate": 6.560157169542391e-08, "loss": 0.8247, "step": 990 }, { "epoch": 1.9057692307692307, "grad_norm": 0.703125, "learning_rate": 6.305955111670204e-08, "loss": 0.9041, "step": 991 }, { "epoch": 1.9076923076923076, "grad_norm": 0.6875, "learning_rate": 6.056744695208283e-08, "loss": 0.9043, "step": 992 }, { "epoch": 1.9096153846153845, "grad_norm": 0.71875, "learning_rate": 5.8125284398730666e-08, "loss": 0.8587, "step": 993 }, { "epoch": 1.9115384615384614, "grad_norm": 0.73828125, "learning_rate": 5.573308814886158e-08, "loss": 0.9092, "step": 994 }, { "epoch": 1.9134615384615383, "grad_norm": 0.69921875, "learning_rate": 5.339088238949186e-08, "loss": 0.8529, "step": 995 }, { "epoch": 1.9153846153846152, "grad_norm": 0.71484375, "learning_rate": 5.109869080219376e-08, "loss": 0.9078, "step": 996 }, { "epoch": 1.9173076923076922, "grad_norm": 0.671875, "learning_rate": 4.885653656285627e-08, "loss": 0.8423, "step": 997 }, { "epoch": 1.919230769230769, "grad_norm": 0.75390625, "learning_rate": 4.666444234145084e-08, "loss": 0.8814, "step": 998 }, { "epoch": 1.921153846153846, "grad_norm": 0.734375, "learning_rate": 4.45224303018027e-08, "loss": 0.8811, "step": 999 }, { "epoch": 1.9230769230769231, "grad_norm": 0.71875, "learning_rate": 4.2430522101364894e-08, "loss": 0.9339, "step": 1000 }, { "epoch": 1.925, "grad_norm": 0.70703125, "learning_rate": 4.038873889100237e-08, "loss": 0.8606, "step": 1001 }, { "epoch": 1.926923076923077, "grad_norm": 0.6953125, "learning_rate": 3.839710131477492e-08, "loss": 0.9095, "step": 1002 }, { "epoch": 1.9288461538461539, "grad_norm": 0.73828125, "learning_rate": 3.645562950973014e-08, "loss": 0.9305, "step": 1003 }, { "epoch": 1.9307692307692308, "grad_norm": 0.72265625, "learning_rate": 3.456434310570023e-08, "loss": 0.9045, "step": 1004 }, { "epoch": 1.9326923076923077, "grad_norm": 0.74609375, "learning_rate": 3.2723261225102164e-08, "loss": 0.8942, "step": 1005 }, { "epoch": 1.9346153846153846, "grad_norm": 0.6953125, "learning_rate": 3.093240248274565e-08, "loss": 0.8703, "step": 1006 }, { "epoch": 1.9365384615384615, "grad_norm": 0.72265625, "learning_rate": 2.9191784985644345e-08, "loss": 0.9141, "step": 1007 }, { "epoch": 1.9384615384615385, "grad_norm": 0.75, "learning_rate": 2.7501426332831594e-08, "loss": 0.8912, "step": 1008 }, { "epoch": 1.9403846153846154, "grad_norm": 0.7109375, "learning_rate": 2.5861343615184997e-08, "loss": 0.8769, "step": 1009 }, { "epoch": 1.9423076923076923, "grad_norm": 0.6953125, "learning_rate": 2.427155341525156e-08, "loss": 0.8312, "step": 1010 }, { "epoch": 1.9442307692307692, "grad_norm": 0.72265625, "learning_rate": 2.2732071807081147e-08, "loss": 0.8963, "step": 1011 }, { "epoch": 1.9461538461538461, "grad_norm": 0.6796875, "learning_rate": 2.1242914356063292e-08, "loss": 0.9095, "step": 1012 }, { "epoch": 1.948076923076923, "grad_norm": 0.71484375, "learning_rate": 1.98040961187701e-08, "loss": 0.9588, "step": 1013 }, { "epoch": 1.95, "grad_norm": 0.78125, "learning_rate": 1.841563164280413e-08, "loss": 0.8922, "step": 1014 }, { "epoch": 1.9519230769230769, "grad_norm": 0.8046875, "learning_rate": 1.7077534966650767e-08, "loss": 0.8975, "step": 1015 }, { "epoch": 1.953846153846154, "grad_norm": 0.74609375, "learning_rate": 1.5789819619537182e-08, "loss": 0.8766, "step": 1016 }, { "epoch": 1.955769230769231, "grad_norm": 0.7109375, "learning_rate": 1.4552498621295264e-08, "loss": 0.9691, "step": 1017 }, { "epoch": 1.9576923076923078, "grad_norm": 0.71484375, "learning_rate": 1.3365584482228356e-08, "loss": 0.8297, "step": 1018 }, { "epoch": 1.9596153846153848, "grad_norm": 0.70703125, "learning_rate": 1.2229089202987487e-08, "loss": 0.9212, "step": 1019 }, { "epoch": 1.9615384615384617, "grad_norm": 0.7265625, "learning_rate": 1.1143024274448689e-08, "loss": 0.9787, "step": 1020 }, { "epoch": 1.9634615384615386, "grad_norm": 0.703125, "learning_rate": 1.0107400677596413e-08, "loss": 0.9338, "step": 1021 }, { "epoch": 1.9653846153846155, "grad_norm": 0.75, "learning_rate": 9.12222888341252e-09, "loss": 0.901, "step": 1022 }, { "epoch": 1.9673076923076924, "grad_norm": 0.67578125, "learning_rate": 8.187518852771914e-09, "loss": 0.8544, "step": 1023 }, { "epoch": 1.9692307692307693, "grad_norm": 0.67578125, "learning_rate": 7.3032800363398435e-09, "loss": 0.892, "step": 1024 }, { "epoch": 1.9711538461538463, "grad_norm": 0.71484375, "learning_rate": 6.469521374477539e-09, "loss": 0.8719, "step": 1025 }, { "epoch": 1.9730769230769232, "grad_norm": 0.73046875, "learning_rate": 5.686251297151724e-09, "loss": 0.9165, "step": 1026 }, { "epoch": 1.975, "grad_norm": 0.73828125, "learning_rate": 4.9534777238485764e-09, "loss": 0.9102, "step": 1027 }, { "epoch": 1.976923076923077, "grad_norm": 0.734375, "learning_rate": 4.2712080634949024e-09, "loss": 0.9305, "step": 1028 }, { "epoch": 1.978846153846154, "grad_norm": 0.6953125, "learning_rate": 3.6394492143820847e-09, "loss": 0.9779, "step": 1029 }, { "epoch": 1.9807692307692308, "grad_norm": 0.73046875, "learning_rate": 3.0582075640972487e-09, "loss": 0.904, "step": 1030 }, { "epoch": 1.9826923076923078, "grad_norm": 0.6953125, "learning_rate": 2.5274889894583156e-09, "loss": 0.9295, "step": 1031 }, { "epoch": 1.9846153846153847, "grad_norm": 0.6640625, "learning_rate": 2.0472988564540496e-09, "loss": 0.8869, "step": 1032 }, { "epoch": 1.9865384615384616, "grad_norm": 0.73046875, "learning_rate": 1.6176420201902132e-09, "loss": 0.9327, "step": 1033 }, { "epoch": 1.9884615384615385, "grad_norm": 0.73046875, "learning_rate": 1.2385228248407155e-09, "loss": 0.9076, "step": 1034 }, { "epoch": 1.9903846153846154, "grad_norm": 0.71875, "learning_rate": 9.099451036048701e-10, "loss": 0.9368, "step": 1035 }, { "epoch": 1.9923076923076923, "grad_norm": 0.73828125, "learning_rate": 6.319121786646509e-10, "loss": 0.8626, "step": 1036 }, { "epoch": 1.9942307692307693, "grad_norm": 0.7421875, "learning_rate": 4.0442686115582665e-10, "loss": 0.9733, "step": 1037 }, { "epoch": 1.9961538461538462, "grad_norm": 0.71875, "learning_rate": 2.274914511374293e-10, "loss": 0.824, "step": 1038 }, { "epoch": 1.998076923076923, "grad_norm": 0.7265625, "learning_rate": 1.011077375662195e-10, "loss": 0.8492, "step": 1039 }, { "epoch": 2.0, "grad_norm": 0.7265625, "learning_rate": 2.5276998284473608e-11, "loss": 0.8473, "step": 1040 }, { "epoch": 2.0, "eval_loss": 0.930040180683136, "eval_runtime": 34.5175, "eval_samples_per_second": 67.879, "eval_steps_per_second": 16.977, "step": 1040 } ], "logging_steps": 1, "max_steps": 1040, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 520, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.943620643240018e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }