{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500.0, "global_step": 171, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017543859649122806, "grad_norm": 10.858891815094267, "learning_rate": 3.4482758620689656e-07, "loss": 0.8595, "step": 1 }, { "epoch": 0.03508771929824561, "grad_norm": 9.921986611909597, "learning_rate": 6.896551724137931e-07, "loss": 0.8199, "step": 2 }, { "epoch": 0.05263157894736842, "grad_norm": 11.486078048946561, "learning_rate": 1.0344827586206898e-06, "loss": 0.9249, "step": 3 }, { "epoch": 0.07017543859649122, "grad_norm": 10.665932008284694, "learning_rate": 1.3793103448275862e-06, "loss": 0.9003, "step": 4 }, { "epoch": 0.08771929824561403, "grad_norm": 12.32633670526995, "learning_rate": 1.724137931034483e-06, "loss": 0.8337, "step": 5 }, { "epoch": 0.10526315789473684, "grad_norm": 12.266330627967024, "learning_rate": 2.0689655172413796e-06, "loss": 0.7852, "step": 6 }, { "epoch": 0.12280701754385964, "grad_norm": 16.196618467447166, "learning_rate": 2.4137931034482762e-06, "loss": 0.6844, "step": 7 }, { "epoch": 0.14035087719298245, "grad_norm": 19.536139648092984, "learning_rate": 2.7586206896551725e-06, "loss": 0.6811, "step": 8 }, { "epoch": 0.15789473684210525, "grad_norm": 21.186657992967227, "learning_rate": 3.103448275862069e-06, "loss": 0.5463, "step": 9 }, { "epoch": 0.17543859649122806, "grad_norm": 15.998035060391652, "learning_rate": 3.448275862068966e-06, "loss": 0.5021, "step": 10 }, { "epoch": 0.19298245614035087, "grad_norm": 8.43835880238007, "learning_rate": 3.793103448275862e-06, "loss": 0.4279, "step": 11 }, { "epoch": 0.21052631578947367, "grad_norm": 7.362934453475211, "learning_rate": 4.137931034482759e-06, "loss": 0.3179, "step": 12 }, { "epoch": 0.22807017543859648, "grad_norm": 3.577221690314773, "learning_rate": 4.482758620689656e-06, "loss": 0.2599, "step": 13 }, { "epoch": 0.24561403508771928, "grad_norm": 3.7932012075004025, "learning_rate": 4.8275862068965525e-06, "loss": 0.2707, "step": 14 }, { "epoch": 0.2631578947368421, "grad_norm": 5.029838865597322, "learning_rate": 5.172413793103449e-06, "loss": 0.2084, "step": 15 }, { "epoch": 0.2807017543859649, "grad_norm": 3.9893917170188398, "learning_rate": 5.517241379310345e-06, "loss": 0.2306, "step": 16 }, { "epoch": 0.2982456140350877, "grad_norm": 2.416715355181999, "learning_rate": 5.862068965517242e-06, "loss": 0.2179, "step": 17 }, { "epoch": 0.3157894736842105, "grad_norm": 2.3492997945282186, "learning_rate": 6.206896551724138e-06, "loss": 0.1928, "step": 18 }, { "epoch": 0.3333333333333333, "grad_norm": 1.910421047206756, "learning_rate": 6.551724137931035e-06, "loss": 0.1749, "step": 19 }, { "epoch": 0.3508771929824561, "grad_norm": 2.1822333335907778, "learning_rate": 6.896551724137932e-06, "loss": 0.1686, "step": 20 }, { "epoch": 0.3684210526315789, "grad_norm": 2.0144168178562474, "learning_rate": 7.241379310344828e-06, "loss": 0.1743, "step": 21 }, { "epoch": 0.38596491228070173, "grad_norm": 2.146685921044344, "learning_rate": 7.586206896551724e-06, "loss": 0.1835, "step": 22 }, { "epoch": 0.40350877192982454, "grad_norm": 2.1004487230521396, "learning_rate": 7.93103448275862e-06, "loss": 0.1428, "step": 23 }, { "epoch": 0.42105263157894735, "grad_norm": 3.442392326691301, "learning_rate": 8.275862068965518e-06, "loss": 0.1945, "step": 24 }, { "epoch": 0.43859649122807015, "grad_norm": 2.3206229431994694, "learning_rate": 8.620689655172414e-06, "loss": 0.1873, "step": 25 }, { "epoch": 0.45614035087719296, "grad_norm": 1.9065540995615944, "learning_rate": 8.965517241379312e-06, "loss": 0.1611, "step": 26 }, { "epoch": 0.47368421052631576, "grad_norm": 2.2678887656021383, "learning_rate": 9.310344827586207e-06, "loss": 0.1545, "step": 27 }, { "epoch": 0.49122807017543857, "grad_norm": 2.042938634642255, "learning_rate": 9.655172413793105e-06, "loss": 0.144, "step": 28 }, { "epoch": 0.5087719298245614, "grad_norm": 1.9368511767985146, "learning_rate": 1e-05, "loss": 0.1508, "step": 29 }, { "epoch": 0.5263157894736842, "grad_norm": 2.0942080441404047, "learning_rate": 9.999623509195724e-06, "loss": 0.1615, "step": 30 }, { "epoch": 0.543859649122807, "grad_norm": 1.9514609324222132, "learning_rate": 9.998494093481022e-06, "loss": 0.1479, "step": 31 }, { "epoch": 0.5614035087719298, "grad_norm": 2.1171183568813077, "learning_rate": 9.996611922941748e-06, "loss": 0.149, "step": 32 }, { "epoch": 0.5789473684210527, "grad_norm": 1.9117324845858215, "learning_rate": 9.993977281025862e-06, "loss": 0.1559, "step": 33 }, { "epoch": 0.5964912280701754, "grad_norm": 2.0353076536422847, "learning_rate": 9.990590564500745e-06, "loss": 0.1716, "step": 34 }, { "epoch": 0.6140350877192983, "grad_norm": 1.8882555798254166, "learning_rate": 9.986452283393452e-06, "loss": 0.1574, "step": 35 }, { "epoch": 0.631578947368421, "grad_norm": 2.3834906677162095, "learning_rate": 9.98156306091389e-06, "loss": 0.1723, "step": 36 }, { "epoch": 0.6491228070175439, "grad_norm": 2.0779048891119536, "learning_rate": 9.975923633360985e-06, "loss": 0.1242, "step": 37 }, { "epoch": 0.6666666666666666, "grad_norm": 1.8059353953983028, "learning_rate": 9.969534850011782e-06, "loss": 0.1407, "step": 38 }, { "epoch": 0.6842105263157895, "grad_norm": 2.0737332886405224, "learning_rate": 9.962397672993552e-06, "loss": 0.1331, "step": 39 }, { "epoch": 0.7017543859649122, "grad_norm": 1.9297626738628924, "learning_rate": 9.9545131771389e-06, "loss": 0.1326, "step": 40 }, { "epoch": 0.7192982456140351, "grad_norm": 1.872161533028589, "learning_rate": 9.945882549823906e-06, "loss": 0.1234, "step": 41 }, { "epoch": 0.7368421052631579, "grad_norm": 1.8210574224684197, "learning_rate": 9.936507090789294e-06, "loss": 0.1347, "step": 42 }, { "epoch": 0.7543859649122807, "grad_norm": 1.692612206991856, "learning_rate": 9.926388211944707e-06, "loss": 0.1196, "step": 43 }, { "epoch": 0.7719298245614035, "grad_norm": 1.782798391389361, "learning_rate": 9.915527437156083e-06, "loss": 0.1135, "step": 44 }, { "epoch": 0.7894736842105263, "grad_norm": 1.8663061704368509, "learning_rate": 9.903926402016153e-06, "loss": 0.1131, "step": 45 }, { "epoch": 0.8070175438596491, "grad_norm": 2.026541513062656, "learning_rate": 9.891586853598139e-06, "loss": 0.1188, "step": 46 }, { "epoch": 0.8245614035087719, "grad_norm": 1.843361250666892, "learning_rate": 9.878510650192644e-06, "loss": 0.1425, "step": 47 }, { "epoch": 0.8421052631578947, "grad_norm": 1.733943578532701, "learning_rate": 9.864699761027801e-06, "loss": 0.0907, "step": 48 }, { "epoch": 0.8596491228070176, "grad_norm": 2.013364795030338, "learning_rate": 9.850156265972722e-06, "loss": 0.1281, "step": 49 }, { "epoch": 0.8771929824561403, "grad_norm": 1.7825403158875694, "learning_rate": 9.834882355224261e-06, "loss": 0.1242, "step": 50 }, { "epoch": 0.8947368421052632, "grad_norm": 1.859620339721197, "learning_rate": 9.8188803289772e-06, "loss": 0.1161, "step": 51 }, { "epoch": 0.9122807017543859, "grad_norm": 1.6351165217318777, "learning_rate": 9.80215259707783e-06, "loss": 0.102, "step": 52 }, { "epoch": 0.9298245614035088, "grad_norm": 1.5445149557127498, "learning_rate": 9.784701678661045e-06, "loss": 0.1003, "step": 53 }, { "epoch": 0.9473684210526315, "grad_norm": 1.7068847707523054, "learning_rate": 9.766530201770969e-06, "loss": 0.1066, "step": 54 }, { "epoch": 0.9649122807017544, "grad_norm": 1.5735672025524092, "learning_rate": 9.747640902965185e-06, "loss": 0.118, "step": 55 }, { "epoch": 0.9824561403508771, "grad_norm": 1.4914631342446407, "learning_rate": 9.728036626902607e-06, "loss": 0.1093, "step": 56 }, { "epoch": 1.0, "grad_norm": 1.4066836670471368, "learning_rate": 9.707720325915105e-06, "loss": 0.1068, "step": 57 }, { "epoch": 1.0175438596491229, "grad_norm": 1.5347203623586319, "learning_rate": 9.686695059562875e-06, "loss": 0.0791, "step": 58 }, { "epoch": 1.0350877192982457, "grad_norm": 1.0428967147543007, "learning_rate": 9.664963994173695e-06, "loss": 0.0723, "step": 59 }, { "epoch": 1.0526315789473684, "grad_norm": 1.1434695045500671, "learning_rate": 9.64253040236608e-06, "loss": 0.0559, "step": 60 }, { "epoch": 1.0701754385964912, "grad_norm": 0.9742940357159334, "learning_rate": 9.619397662556434e-06, "loss": 0.0517, "step": 61 }, { "epoch": 1.087719298245614, "grad_norm": 1.2461364702759137, "learning_rate": 9.59556925845029e-06, "loss": 0.0762, "step": 62 }, { "epoch": 1.1052631578947367, "grad_norm": 1.0049119142623577, "learning_rate": 9.571048778517655e-06, "loss": 0.0574, "step": 63 }, { "epoch": 1.1228070175438596, "grad_norm": 1.0847028026479022, "learning_rate": 9.545839915452612e-06, "loss": 0.0611, "step": 64 }, { "epoch": 1.1403508771929824, "grad_norm": 1.1792867233983542, "learning_rate": 9.519946465617217e-06, "loss": 0.0613, "step": 65 }, { "epoch": 1.1578947368421053, "grad_norm": 0.8106778901681555, "learning_rate": 9.49337232846977e-06, "loss": 0.0505, "step": 66 }, { "epoch": 1.1754385964912282, "grad_norm": 0.849214933232956, "learning_rate": 9.466121505977577e-06, "loss": 0.0482, "step": 67 }, { "epoch": 1.1929824561403508, "grad_norm": 1.1077518605738848, "learning_rate": 9.438198102014271e-06, "loss": 0.0785, "step": 68 }, { "epoch": 1.2105263157894737, "grad_norm": 0.7566984334204481, "learning_rate": 9.409606321741776e-06, "loss": 0.0585, "step": 69 }, { "epoch": 1.2280701754385965, "grad_norm": 0.9085507029981399, "learning_rate": 9.380350470977033e-06, "loss": 0.0639, "step": 70 }, { "epoch": 1.2456140350877192, "grad_norm": 0.928358906164026, "learning_rate": 9.350434955543557e-06, "loss": 0.0474, "step": 71 }, { "epoch": 1.263157894736842, "grad_norm": 0.7880693527593644, "learning_rate": 9.319864280607935e-06, "loss": 0.0491, "step": 72 }, { "epoch": 1.280701754385965, "grad_norm": 0.9148719439651587, "learning_rate": 9.288643050001362e-06, "loss": 0.0673, "step": 73 }, { "epoch": 1.2982456140350878, "grad_norm": 0.8327662341568611, "learning_rate": 9.256775965526327e-06, "loss": 0.0611, "step": 74 }, { "epoch": 1.3157894736842106, "grad_norm": 0.7812237270066599, "learning_rate": 9.224267826248536e-06, "loss": 0.0525, "step": 75 }, { "epoch": 1.3333333333333333, "grad_norm": 0.8459610892111997, "learning_rate": 9.19112352777419e-06, "loss": 0.0696, "step": 76 }, { "epoch": 1.3508771929824561, "grad_norm": 1.0604992549632657, "learning_rate": 9.157348061512728e-06, "loss": 0.071, "step": 77 }, { "epoch": 1.368421052631579, "grad_norm": 1.0056253432583089, "learning_rate": 9.122946513925128e-06, "loss": 0.0853, "step": 78 }, { "epoch": 1.3859649122807016, "grad_norm": 0.9277407408307676, "learning_rate": 9.08792406575792e-06, "loss": 0.0654, "step": 79 }, { "epoch": 1.4035087719298245, "grad_norm": 0.9567847127164941, "learning_rate": 9.052285991262975e-06, "loss": 0.067, "step": 80 }, { "epoch": 1.4210526315789473, "grad_norm": 0.9254067178308217, "learning_rate": 9.016037657403225e-06, "loss": 0.0833, "step": 81 }, { "epoch": 1.4385964912280702, "grad_norm": 1.0691557469032043, "learning_rate": 8.979184523044419e-06, "loss": 0.0614, "step": 82 }, { "epoch": 1.456140350877193, "grad_norm": 1.0561683653640652, "learning_rate": 8.941732138133032e-06, "loss": 0.0651, "step": 83 }, { "epoch": 1.4736842105263157, "grad_norm": 0.8881795831731262, "learning_rate": 8.903686142860473e-06, "loss": 0.0627, "step": 84 }, { "epoch": 1.4912280701754386, "grad_norm": 0.90484500996426, "learning_rate": 8.865052266813686e-06, "loss": 0.0653, "step": 85 }, { "epoch": 1.5087719298245614, "grad_norm": 0.8479688839299309, "learning_rate": 8.825836328112296e-06, "loss": 0.0531, "step": 86 }, { "epoch": 1.526315789473684, "grad_norm": 0.770843061360994, "learning_rate": 8.786044232532423e-06, "loss": 0.0592, "step": 87 }, { "epoch": 1.543859649122807, "grad_norm": 0.8465258036235466, "learning_rate": 8.745681972617298e-06, "loss": 0.062, "step": 88 }, { "epoch": 1.5614035087719298, "grad_norm": 0.8247168238640108, "learning_rate": 8.704755626774796e-06, "loss": 0.0561, "step": 89 }, { "epoch": 1.5789473684210527, "grad_norm": 0.7995751912905001, "learning_rate": 8.663271358362064e-06, "loss": 0.052, "step": 90 }, { "epoch": 1.5964912280701755, "grad_norm": 0.7495685929926141, "learning_rate": 8.621235414757337e-06, "loss": 0.0588, "step": 91 }, { "epoch": 1.6140350877192984, "grad_norm": 0.7573418865411702, "learning_rate": 8.578654126419094e-06, "loss": 0.0467, "step": 92 }, { "epoch": 1.631578947368421, "grad_norm": 0.6820664774921356, "learning_rate": 8.535533905932739e-06, "loss": 0.0341, "step": 93 }, { "epoch": 1.6491228070175439, "grad_norm": 0.869674808243179, "learning_rate": 8.491881247044866e-06, "loss": 0.0656, "step": 94 }, { "epoch": 1.6666666666666665, "grad_norm": 0.8266715793847506, "learning_rate": 8.447702723685335e-06, "loss": 0.0615, "step": 95 }, { "epoch": 1.6842105263157894, "grad_norm": 0.780231544917249, "learning_rate": 8.403004988977267e-06, "loss": 0.048, "step": 96 }, { "epoch": 1.7017543859649122, "grad_norm": 0.7049961336136691, "learning_rate": 8.357794774235094e-06, "loss": 0.05, "step": 97 }, { "epoch": 1.719298245614035, "grad_norm": 0.7004291823114401, "learning_rate": 8.31207888795086e-06, "loss": 0.0412, "step": 98 }, { "epoch": 1.736842105263158, "grad_norm": 0.8491120000949876, "learning_rate": 8.265864214768883e-06, "loss": 0.0583, "step": 99 }, { "epoch": 1.7543859649122808, "grad_norm": 0.904612072300696, "learning_rate": 8.219157714448957e-06, "loss": 0.0561, "step": 100 }, { "epoch": 1.7719298245614035, "grad_norm": 1.2352255572375976, "learning_rate": 8.171966420818227e-06, "loss": 0.0698, "step": 101 }, { "epoch": 1.7894736842105263, "grad_norm": 0.8042638647746813, "learning_rate": 8.124297440711933e-06, "loss": 0.0587, "step": 102 }, { "epoch": 1.807017543859649, "grad_norm": 0.7056507871808961, "learning_rate": 8.076157952903134e-06, "loss": 0.0433, "step": 103 }, { "epoch": 1.8245614035087718, "grad_norm": 0.8069684693324715, "learning_rate": 8.02755520702163e-06, "loss": 0.0577, "step": 104 }, { "epoch": 1.8421052631578947, "grad_norm": 0.8411539344688561, "learning_rate": 7.978496522462167e-06, "loss": 0.0554, "step": 105 }, { "epoch": 1.8596491228070176, "grad_norm": 0.6246099301130649, "learning_rate": 7.928989287282195e-06, "loss": 0.0526, "step": 106 }, { "epoch": 1.8771929824561404, "grad_norm": 0.8134173462976111, "learning_rate": 7.879040957089229e-06, "loss": 0.0489, "step": 107 }, { "epoch": 1.8947368421052633, "grad_norm": 0.8233101495435547, "learning_rate": 7.828659053918067e-06, "loss": 0.0554, "step": 108 }, { "epoch": 1.912280701754386, "grad_norm": 0.8139594364760937, "learning_rate": 7.777851165098012e-06, "loss": 0.0515, "step": 109 }, { "epoch": 1.9298245614035088, "grad_norm": 0.8420135284595219, "learning_rate": 7.726624942110233e-06, "loss": 0.0613, "step": 110 }, { "epoch": 1.9473684210526314, "grad_norm": 0.6914453691259363, "learning_rate": 7.674988099435487e-06, "loss": 0.0578, "step": 111 }, { "epoch": 1.9649122807017543, "grad_norm": 0.7494812262812492, "learning_rate": 7.6229484133923445e-06, "loss": 0.0528, "step": 112 }, { "epoch": 1.9824561403508771, "grad_norm": 0.8971283004688011, "learning_rate": 7.570513720966108e-06, "loss": 0.0678, "step": 113 }, { "epoch": 2.0, "grad_norm": 0.6804071807397585, "learning_rate": 7.517691918628589e-06, "loss": 0.0628, "step": 114 }, { "epoch": 2.017543859649123, "grad_norm": 0.5969942923496542, "learning_rate": 7.464490961148921e-06, "loss": 0.0303, "step": 115 }, { "epoch": 2.0350877192982457, "grad_norm": 0.36972630052276856, "learning_rate": 7.410918860395615e-06, "loss": 0.0197, "step": 116 }, { "epoch": 2.0526315789473686, "grad_norm": 0.5245688642809315, "learning_rate": 7.3569836841299905e-06, "loss": 0.0324, "step": 117 }, { "epoch": 2.0701754385964914, "grad_norm": 0.586886410043569, "learning_rate": 7.3026935547912004e-06, "loss": 0.0322, "step": 118 }, { "epoch": 2.087719298245614, "grad_norm": 0.52688198929108, "learning_rate": 7.248056648273034e-06, "loss": 0.0317, "step": 119 }, { "epoch": 2.1052631578947367, "grad_norm": 0.4845274411404232, "learning_rate": 7.193081192692639e-06, "loss": 0.0232, "step": 120 }, { "epoch": 2.1228070175438596, "grad_norm": 0.37896879688644275, "learning_rate": 7.137775467151411e-06, "loss": 0.023, "step": 121 }, { "epoch": 2.1403508771929824, "grad_norm": 0.48333085052863545, "learning_rate": 7.0821478004881875e-06, "loss": 0.0249, "step": 122 }, { "epoch": 2.1578947368421053, "grad_norm": 0.5087235050027019, "learning_rate": 7.026206570024949e-06, "loss": 0.026, "step": 123 }, { "epoch": 2.175438596491228, "grad_norm": 0.5900796410435952, "learning_rate": 6.969960200305242e-06, "loss": 0.03, "step": 124 }, { "epoch": 2.192982456140351, "grad_norm": 0.5278206883353274, "learning_rate": 6.913417161825449e-06, "loss": 0.0221, "step": 125 }, { "epoch": 2.2105263157894735, "grad_norm": 0.47692494321697626, "learning_rate": 6.856585969759189e-06, "loss": 0.0264, "step": 126 }, { "epoch": 2.2280701754385963, "grad_norm": 0.46053887456028303, "learning_rate": 6.799475182674942e-06, "loss": 0.0205, "step": 127 }, { "epoch": 2.245614035087719, "grad_norm": 0.621184886789942, "learning_rate": 6.742093401247173e-06, "loss": 0.0338, "step": 128 }, { "epoch": 2.263157894736842, "grad_norm": 0.591049215409491, "learning_rate": 6.684449266961101e-06, "loss": 0.0279, "step": 129 }, { "epoch": 2.280701754385965, "grad_norm": 0.7230540299030428, "learning_rate": 6.626551460811316e-06, "loss": 0.0453, "step": 130 }, { "epoch": 2.2982456140350878, "grad_norm": 0.49492484163558687, "learning_rate": 6.568408701994459e-06, "loss": 0.0253, "step": 131 }, { "epoch": 2.3157894736842106, "grad_norm": 0.7317513887580467, "learning_rate": 6.510029746596141e-06, "loss": 0.0266, "step": 132 }, { "epoch": 2.3333333333333335, "grad_norm": 0.6300761851984211, "learning_rate": 6.451423386272312e-06, "loss": 0.0317, "step": 133 }, { "epoch": 2.3508771929824563, "grad_norm": 0.6972395573858822, "learning_rate": 6.392598446925266e-06, "loss": 0.0284, "step": 134 }, { "epoch": 2.3684210526315788, "grad_norm": 0.4472525715518054, "learning_rate": 6.333563787374493e-06, "loss": 0.0134, "step": 135 }, { "epoch": 2.3859649122807016, "grad_norm": 0.5547552937334421, "learning_rate": 6.274328298022574e-06, "loss": 0.0241, "step": 136 }, { "epoch": 2.4035087719298245, "grad_norm": 0.6287088060697468, "learning_rate": 6.21490089951632e-06, "loss": 0.0235, "step": 137 }, { "epoch": 2.4210526315789473, "grad_norm": 0.7592719638455317, "learning_rate": 6.155290541403357e-06, "loss": 0.024, "step": 138 }, { "epoch": 2.43859649122807, "grad_norm": 0.7061314974517516, "learning_rate": 6.095506200784349e-06, "loss": 0.0216, "step": 139 }, { "epoch": 2.456140350877193, "grad_norm": 0.4146934339346621, "learning_rate": 6.035556880961093e-06, "loss": 0.0203, "step": 140 }, { "epoch": 2.473684210526316, "grad_norm": 0.5808662642747494, "learning_rate": 5.975451610080643e-06, "loss": 0.0237, "step": 141 }, { "epoch": 2.4912280701754383, "grad_norm": 0.9156090409068862, "learning_rate": 5.915199439775706e-06, "loss": 0.0342, "step": 142 }, { "epoch": 2.5087719298245617, "grad_norm": 0.8273130210726023, "learning_rate": 5.8548094438015065e-06, "loss": 0.0374, "step": 143 }, { "epoch": 2.526315789473684, "grad_norm": 0.5208985562668808, "learning_rate": 5.794290716669307e-06, "loss": 0.0274, "step": 144 }, { "epoch": 2.543859649122807, "grad_norm": 0.5817399089445716, "learning_rate": 5.733652372276809e-06, "loss": 0.0238, "step": 145 }, { "epoch": 2.56140350877193, "grad_norm": 0.41873508080850685, "learning_rate": 5.672903542535631e-06, "loss": 0.0198, "step": 146 }, { "epoch": 2.5789473684210527, "grad_norm": 0.5139902783555422, "learning_rate": 5.612053375996082e-06, "loss": 0.029, "step": 147 }, { "epoch": 2.5964912280701755, "grad_norm": 0.5210356553605748, "learning_rate": 5.551111036469416e-06, "loss": 0.0181, "step": 148 }, { "epoch": 2.6140350877192984, "grad_norm": 0.46862390876194876, "learning_rate": 5.490085701647805e-06, "loss": 0.026, "step": 149 }, { "epoch": 2.6315789473684212, "grad_norm": 0.405287753073754, "learning_rate": 5.4289865617222005e-06, "loss": 0.0152, "step": 150 }, { "epoch": 2.6491228070175437, "grad_norm": 0.5417504620103765, "learning_rate": 5.367822817998338e-06, "loss": 0.0328, "step": 151 }, { "epoch": 2.6666666666666665, "grad_norm": 0.5889155423711966, "learning_rate": 5.306603681511043e-06, "loss": 0.0223, "step": 152 }, { "epoch": 2.6842105263157894, "grad_norm": 0.6623980663043445, "learning_rate": 5.245338371637091e-06, "loss": 0.0396, "step": 153 }, { "epoch": 2.7017543859649122, "grad_norm": 0.394044500779114, "learning_rate": 5.184036114706795e-06, "loss": 0.0185, "step": 154 }, { "epoch": 2.719298245614035, "grad_norm": 0.6238209593070779, "learning_rate": 5.122706142614562e-06, "loss": 0.0282, "step": 155 }, { "epoch": 2.736842105263158, "grad_norm": 0.4543063288143525, "learning_rate": 5.0613576914286e-06, "loss": 0.0208, "step": 156 }, { "epoch": 2.754385964912281, "grad_norm": 0.8733220433036752, "learning_rate": 5e-06, "loss": 0.0258, "step": 157 }, { "epoch": 2.7719298245614032, "grad_norm": 0.5556283328421064, "learning_rate": 4.938642308571401e-06, "loss": 0.0286, "step": 158 }, { "epoch": 2.7894736842105265, "grad_norm": 0.5917707359836524, "learning_rate": 4.87729385738544e-06, "loss": 0.0351, "step": 159 }, { "epoch": 2.807017543859649, "grad_norm": 0.5766947111963191, "learning_rate": 4.815963885293206e-06, "loss": 0.0423, "step": 160 }, { "epoch": 2.824561403508772, "grad_norm": 0.48545690964635335, "learning_rate": 4.75466162836291e-06, "loss": 0.0207, "step": 161 }, { "epoch": 2.8421052631578947, "grad_norm": 0.6123984743262666, "learning_rate": 4.693396318488958e-06, "loss": 0.03, "step": 162 }, { "epoch": 2.8596491228070176, "grad_norm": 0.8017397383836624, "learning_rate": 4.6321771820016635e-06, "loss": 0.0292, "step": 163 }, { "epoch": 2.8771929824561404, "grad_norm": 0.3923449231624014, "learning_rate": 4.571013438277801e-06, "loss": 0.0169, "step": 164 }, { "epoch": 2.8947368421052633, "grad_norm": 0.3530622647170764, "learning_rate": 4.509914298352197e-06, "loss": 0.0135, "step": 165 }, { "epoch": 2.912280701754386, "grad_norm": 0.4669626843941875, "learning_rate": 4.448888963530585e-06, "loss": 0.0161, "step": 166 }, { "epoch": 2.9298245614035086, "grad_norm": 0.48346557930060835, "learning_rate": 4.38794662400392e-06, "loss": 0.017, "step": 167 }, { "epoch": 2.9473684210526314, "grad_norm": 0.5346644318960719, "learning_rate": 4.3270964574643695e-06, "loss": 0.0182, "step": 168 }, { "epoch": 2.9649122807017543, "grad_norm": 0.46108525432610675, "learning_rate": 4.266347627723192e-06, "loss": 0.0205, "step": 169 }, { "epoch": 2.982456140350877, "grad_norm": 0.5100637866702252, "learning_rate": 4.205709283330694e-06, "loss": 0.0276, "step": 170 }, { "epoch": 3.0, "grad_norm": 0.6346018527302484, "learning_rate": 4.145190556198494e-06, "loss": 0.0305, "step": 171 } ], "logging_steps": 1.0, "max_steps": 285, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 20236215975936.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }