{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500.0,
  "global_step": 171,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017543859649122806,
      "grad_norm": 10.858891815094267,
      "learning_rate": 3.4482758620689656e-07,
      "loss": 0.8595,
      "step": 1
    },
    {
      "epoch": 0.03508771929824561,
      "grad_norm": 9.921986611909597,
      "learning_rate": 6.896551724137931e-07,
      "loss": 0.8199,
      "step": 2
    },
    {
      "epoch": 0.05263157894736842,
      "grad_norm": 11.486078048946561,
      "learning_rate": 1.0344827586206898e-06,
      "loss": 0.9249,
      "step": 3
    },
    {
      "epoch": 0.07017543859649122,
      "grad_norm": 10.665932008284694,
      "learning_rate": 1.3793103448275862e-06,
      "loss": 0.9003,
      "step": 4
    },
    {
      "epoch": 0.08771929824561403,
      "grad_norm": 12.32633670526995,
      "learning_rate": 1.724137931034483e-06,
      "loss": 0.8337,
      "step": 5
    },
    {
      "epoch": 0.10526315789473684,
      "grad_norm": 12.266330627967024,
      "learning_rate": 2.0689655172413796e-06,
      "loss": 0.7852,
      "step": 6
    },
    {
      "epoch": 0.12280701754385964,
      "grad_norm": 16.196618467447166,
      "learning_rate": 2.4137931034482762e-06,
      "loss": 0.6844,
      "step": 7
    },
    {
      "epoch": 0.14035087719298245,
      "grad_norm": 19.536139648092984,
      "learning_rate": 2.7586206896551725e-06,
      "loss": 0.6811,
      "step": 8
    },
    {
      "epoch": 0.15789473684210525,
      "grad_norm": 21.186657992967227,
      "learning_rate": 3.103448275862069e-06,
      "loss": 0.5463,
      "step": 9
    },
    {
      "epoch": 0.17543859649122806,
      "grad_norm": 15.998035060391652,
      "learning_rate": 3.448275862068966e-06,
      "loss": 0.5021,
      "step": 10
    },
    {
      "epoch": 0.19298245614035087,
      "grad_norm": 8.43835880238007,
      "learning_rate": 3.793103448275862e-06,
      "loss": 0.4279,
      "step": 11
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 7.362934453475211,
      "learning_rate": 4.137931034482759e-06,
      "loss": 0.3179,
      "step": 12
    },
    {
      "epoch": 0.22807017543859648,
      "grad_norm": 3.577221690314773,
      "learning_rate": 4.482758620689656e-06,
      "loss": 0.2599,
      "step": 13
    },
    {
      "epoch": 0.24561403508771928,
      "grad_norm": 3.7932012075004025,
      "learning_rate": 4.8275862068965525e-06,
      "loss": 0.2707,
      "step": 14
    },
    {
      "epoch": 0.2631578947368421,
      "grad_norm": 5.029838865597322,
      "learning_rate": 5.172413793103449e-06,
      "loss": 0.2084,
      "step": 15
    },
    {
      "epoch": 0.2807017543859649,
      "grad_norm": 3.9893917170188398,
      "learning_rate": 5.517241379310345e-06,
      "loss": 0.2306,
      "step": 16
    },
    {
      "epoch": 0.2982456140350877,
      "grad_norm": 2.416715355181999,
      "learning_rate": 5.862068965517242e-06,
      "loss": 0.2179,
      "step": 17
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 2.3492997945282186,
      "learning_rate": 6.206896551724138e-06,
      "loss": 0.1928,
      "step": 18
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 1.910421047206756,
      "learning_rate": 6.551724137931035e-06,
      "loss": 0.1749,
      "step": 19
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 2.1822333335907778,
      "learning_rate": 6.896551724137932e-06,
      "loss": 0.1686,
      "step": 20
    },
    {
      "epoch": 0.3684210526315789,
      "grad_norm": 2.0144168178562474,
      "learning_rate": 7.241379310344828e-06,
      "loss": 0.1743,
      "step": 21
    },
    {
      "epoch": 0.38596491228070173,
      "grad_norm": 2.146685921044344,
      "learning_rate": 7.586206896551724e-06,
      "loss": 0.1835,
      "step": 22
    },
    {
      "epoch": 0.40350877192982454,
      "grad_norm": 2.1004487230521396,
      "learning_rate": 7.93103448275862e-06,
      "loss": 0.1428,
      "step": 23
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 3.442392326691301,
      "learning_rate": 8.275862068965518e-06,
      "loss": 0.1945,
      "step": 24
    },
    {
      "epoch": 0.43859649122807015,
      "grad_norm": 2.3206229431994694,
      "learning_rate": 8.620689655172414e-06,
      "loss": 0.1873,
      "step": 25
    },
    {
      "epoch": 0.45614035087719296,
      "grad_norm": 1.9065540995615944,
      "learning_rate": 8.965517241379312e-06,
      "loss": 0.1611,
      "step": 26
    },
    {
      "epoch": 0.47368421052631576,
      "grad_norm": 2.2678887656021383,
      "learning_rate": 9.310344827586207e-06,
      "loss": 0.1545,
      "step": 27
    },
    {
      "epoch": 0.49122807017543857,
      "grad_norm": 2.042938634642255,
      "learning_rate": 9.655172413793105e-06,
      "loss": 0.144,
      "step": 28
    },
    {
      "epoch": 0.5087719298245614,
      "grad_norm": 1.9368511767985146,
      "learning_rate": 1e-05,
      "loss": 0.1508,
      "step": 29
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 2.0942080441404047,
      "learning_rate": 9.999623509195724e-06,
      "loss": 0.1615,
      "step": 30
    },
    {
      "epoch": 0.543859649122807,
      "grad_norm": 1.9514609324222132,
      "learning_rate": 9.998494093481022e-06,
      "loss": 0.1479,
      "step": 31
    },
    {
      "epoch": 0.5614035087719298,
      "grad_norm": 2.1171183568813077,
      "learning_rate": 9.996611922941748e-06,
      "loss": 0.149,
      "step": 32
    },
    {
      "epoch": 0.5789473684210527,
      "grad_norm": 1.9117324845858215,
      "learning_rate": 9.993977281025862e-06,
      "loss": 0.1559,
      "step": 33
    },
    {
      "epoch": 0.5964912280701754,
      "grad_norm": 2.0353076536422847,
      "learning_rate": 9.990590564500745e-06,
      "loss": 0.1716,
      "step": 34
    },
    {
      "epoch": 0.6140350877192983,
      "grad_norm": 1.8882555798254166,
      "learning_rate": 9.986452283393452e-06,
      "loss": 0.1574,
      "step": 35
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 2.3834906677162095,
      "learning_rate": 9.98156306091389e-06,
      "loss": 0.1723,
      "step": 36
    },
    {
      "epoch": 0.6491228070175439,
      "grad_norm": 2.0779048891119536,
      "learning_rate": 9.975923633360985e-06,
      "loss": 0.1242,
      "step": 37
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 1.8059353953983028,
      "learning_rate": 9.969534850011782e-06,
      "loss": 0.1407,
      "step": 38
    },
    {
      "epoch": 0.6842105263157895,
      "grad_norm": 2.0737332886405224,
      "learning_rate": 9.962397672993552e-06,
      "loss": 0.1331,
      "step": 39
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 1.9297626738628924,
      "learning_rate": 9.9545131771389e-06,
      "loss": 0.1326,
      "step": 40
    },
    {
      "epoch": 0.7192982456140351,
      "grad_norm": 1.872161533028589,
      "learning_rate": 9.945882549823906e-06,
      "loss": 0.1234,
      "step": 41
    },
    {
      "epoch": 0.7368421052631579,
      "grad_norm": 1.8210574224684197,
      "learning_rate": 9.936507090789294e-06,
      "loss": 0.1347,
      "step": 42
    },
    {
      "epoch": 0.7543859649122807,
      "grad_norm": 1.692612206991856,
      "learning_rate": 9.926388211944707e-06,
      "loss": 0.1196,
      "step": 43
    },
    {
      "epoch": 0.7719298245614035,
      "grad_norm": 1.782798391389361,
      "learning_rate": 9.915527437156083e-06,
      "loss": 0.1135,
      "step": 44
    },
    {
      "epoch": 0.7894736842105263,
      "grad_norm": 1.8663061704368509,
      "learning_rate": 9.903926402016153e-06,
      "loss": 0.1131,
      "step": 45
    },
    {
      "epoch": 0.8070175438596491,
      "grad_norm": 2.026541513062656,
      "learning_rate": 9.891586853598139e-06,
      "loss": 0.1188,
      "step": 46
    },
    {
      "epoch": 0.8245614035087719,
      "grad_norm": 1.843361250666892,
      "learning_rate": 9.878510650192644e-06,
      "loss": 0.1425,
      "step": 47
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 1.733943578532701,
      "learning_rate": 9.864699761027801e-06,
      "loss": 0.0907,
      "step": 48
    },
    {
      "epoch": 0.8596491228070176,
      "grad_norm": 2.013364795030338,
      "learning_rate": 9.850156265972722e-06,
      "loss": 0.1281,
      "step": 49
    },
    {
      "epoch": 0.8771929824561403,
      "grad_norm": 1.7825403158875694,
      "learning_rate": 9.834882355224261e-06,
      "loss": 0.1242,
      "step": 50
    },
    {
      "epoch": 0.8947368421052632,
      "grad_norm": 1.859620339721197,
      "learning_rate": 9.8188803289772e-06,
      "loss": 0.1161,
      "step": 51
    },
    {
      "epoch": 0.9122807017543859,
      "grad_norm": 1.6351165217318777,
      "learning_rate": 9.80215259707783e-06,
      "loss": 0.102,
      "step": 52
    },
    {
      "epoch": 0.9298245614035088,
      "grad_norm": 1.5445149557127498,
      "learning_rate": 9.784701678661045e-06,
      "loss": 0.1003,
      "step": 53
    },
    {
      "epoch": 0.9473684210526315,
      "grad_norm": 1.7068847707523054,
      "learning_rate": 9.766530201770969e-06,
      "loss": 0.1066,
      "step": 54
    },
    {
      "epoch": 0.9649122807017544,
      "grad_norm": 1.5735672025524092,
      "learning_rate": 9.747640902965185e-06,
      "loss": 0.118,
      "step": 55
    },
    {
      "epoch": 0.9824561403508771,
      "grad_norm": 1.4914631342446407,
      "learning_rate": 9.728036626902607e-06,
      "loss": 0.1093,
      "step": 56
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.4066836670471368,
      "learning_rate": 9.707720325915105e-06,
      "loss": 0.1068,
      "step": 57
    },
    {
      "epoch": 1.0175438596491229,
      "grad_norm": 1.5347203623586319,
      "learning_rate": 9.686695059562875e-06,
      "loss": 0.0791,
      "step": 58
    },
    {
      "epoch": 1.0350877192982457,
      "grad_norm": 1.0428967147543007,
      "learning_rate": 9.664963994173695e-06,
      "loss": 0.0723,
      "step": 59
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 1.1434695045500671,
      "learning_rate": 9.64253040236608e-06,
      "loss": 0.0559,
      "step": 60
    },
    {
      "epoch": 1.0701754385964912,
      "grad_norm": 0.9742940357159334,
      "learning_rate": 9.619397662556434e-06,
      "loss": 0.0517,
      "step": 61
    },
    {
      "epoch": 1.087719298245614,
      "grad_norm": 1.2461364702759137,
      "learning_rate": 9.59556925845029e-06,
      "loss": 0.0762,
      "step": 62
    },
    {
      "epoch": 1.1052631578947367,
      "grad_norm": 1.0049119142623577,
      "learning_rate": 9.571048778517655e-06,
      "loss": 0.0574,
      "step": 63
    },
    {
      "epoch": 1.1228070175438596,
      "grad_norm": 1.0847028026479022,
      "learning_rate": 9.545839915452612e-06,
      "loss": 0.0611,
      "step": 64
    },
    {
      "epoch": 1.1403508771929824,
      "grad_norm": 1.1792867233983542,
      "learning_rate": 9.519946465617217e-06,
      "loss": 0.0613,
      "step": 65
    },
    {
      "epoch": 1.1578947368421053,
      "grad_norm": 0.8106778901681555,
      "learning_rate": 9.49337232846977e-06,
      "loss": 0.0505,
      "step": 66
    },
    {
      "epoch": 1.1754385964912282,
      "grad_norm": 0.849214933232956,
      "learning_rate": 9.466121505977577e-06,
      "loss": 0.0482,
      "step": 67
    },
    {
      "epoch": 1.1929824561403508,
      "grad_norm": 1.1077518605738848,
      "learning_rate": 9.438198102014271e-06,
      "loss": 0.0785,
      "step": 68
    },
    {
      "epoch": 1.2105263157894737,
      "grad_norm": 0.7566984334204481,
      "learning_rate": 9.409606321741776e-06,
      "loss": 0.0585,
      "step": 69
    },
    {
      "epoch": 1.2280701754385965,
      "grad_norm": 0.9085507029981399,
      "learning_rate": 9.380350470977033e-06,
      "loss": 0.0639,
      "step": 70
    },
    {
      "epoch": 1.2456140350877192,
      "grad_norm": 0.928358906164026,
      "learning_rate": 9.350434955543557e-06,
      "loss": 0.0474,
      "step": 71
    },
    {
      "epoch": 1.263157894736842,
      "grad_norm": 0.7880693527593644,
      "learning_rate": 9.319864280607935e-06,
      "loss": 0.0491,
      "step": 72
    },
    {
      "epoch": 1.280701754385965,
      "grad_norm": 0.9148719439651587,
      "learning_rate": 9.288643050001362e-06,
      "loss": 0.0673,
      "step": 73
    },
    {
      "epoch": 1.2982456140350878,
      "grad_norm": 0.8327662341568611,
      "learning_rate": 9.256775965526327e-06,
      "loss": 0.0611,
      "step": 74
    },
    {
      "epoch": 1.3157894736842106,
      "grad_norm": 0.7812237270066599,
      "learning_rate": 9.224267826248536e-06,
      "loss": 0.0525,
      "step": 75
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.8459610892111997,
      "learning_rate": 9.19112352777419e-06,
      "loss": 0.0696,
      "step": 76
    },
    {
      "epoch": 1.3508771929824561,
      "grad_norm": 1.0604992549632657,
      "learning_rate": 9.157348061512728e-06,
      "loss": 0.071,
      "step": 77
    },
    {
      "epoch": 1.368421052631579,
      "grad_norm": 1.0056253432583089,
      "learning_rate": 9.122946513925128e-06,
      "loss": 0.0853,
      "step": 78
    },
    {
      "epoch": 1.3859649122807016,
      "grad_norm": 0.9277407408307676,
      "learning_rate": 9.08792406575792e-06,
      "loss": 0.0654,
      "step": 79
    },
    {
      "epoch": 1.4035087719298245,
      "grad_norm": 0.9567847127164941,
      "learning_rate": 9.052285991262975e-06,
      "loss": 0.067,
      "step": 80
    },
    {
      "epoch": 1.4210526315789473,
      "grad_norm": 0.9254067178308217,
      "learning_rate": 9.016037657403225e-06,
      "loss": 0.0833,
      "step": 81
    },
    {
      "epoch": 1.4385964912280702,
      "grad_norm": 1.0691557469032043,
      "learning_rate": 8.979184523044419e-06,
      "loss": 0.0614,
      "step": 82
    },
    {
      "epoch": 1.456140350877193,
      "grad_norm": 1.0561683653640652,
      "learning_rate": 8.941732138133032e-06,
      "loss": 0.0651,
      "step": 83
    },
    {
      "epoch": 1.4736842105263157,
      "grad_norm": 0.8881795831731262,
      "learning_rate": 8.903686142860473e-06,
      "loss": 0.0627,
      "step": 84
    },
    {
      "epoch": 1.4912280701754386,
      "grad_norm": 0.90484500996426,
      "learning_rate": 8.865052266813686e-06,
      "loss": 0.0653,
      "step": 85
    },
    {
      "epoch": 1.5087719298245614,
      "grad_norm": 0.8479688839299309,
      "learning_rate": 8.825836328112296e-06,
      "loss": 0.0531,
      "step": 86
    },
    {
      "epoch": 1.526315789473684,
      "grad_norm": 0.770843061360994,
      "learning_rate": 8.786044232532423e-06,
      "loss": 0.0592,
      "step": 87
    },
    {
      "epoch": 1.543859649122807,
      "grad_norm": 0.8465258036235466,
      "learning_rate": 8.745681972617298e-06,
      "loss": 0.062,
      "step": 88
    },
    {
      "epoch": 1.5614035087719298,
      "grad_norm": 0.8247168238640108,
      "learning_rate": 8.704755626774796e-06,
      "loss": 0.0561,
      "step": 89
    },
    {
      "epoch": 1.5789473684210527,
      "grad_norm": 0.7995751912905001,
      "learning_rate": 8.663271358362064e-06,
      "loss": 0.052,
      "step": 90
    },
    {
      "epoch": 1.5964912280701755,
      "grad_norm": 0.7495685929926141,
      "learning_rate": 8.621235414757337e-06,
      "loss": 0.0588,
      "step": 91
    },
    {
      "epoch": 1.6140350877192984,
      "grad_norm": 0.7573418865411702,
      "learning_rate": 8.578654126419094e-06,
      "loss": 0.0467,
      "step": 92
    },
    {
      "epoch": 1.631578947368421,
      "grad_norm": 0.6820664774921356,
      "learning_rate": 8.535533905932739e-06,
      "loss": 0.0341,
      "step": 93
    },
    {
      "epoch": 1.6491228070175439,
      "grad_norm": 0.869674808243179,
      "learning_rate": 8.491881247044866e-06,
      "loss": 0.0656,
      "step": 94
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.8266715793847506,
      "learning_rate": 8.447702723685335e-06,
      "loss": 0.0615,
      "step": 95
    },
    {
      "epoch": 1.6842105263157894,
      "grad_norm": 0.780231544917249,
      "learning_rate": 8.403004988977267e-06,
      "loss": 0.048,
      "step": 96
    },
    {
      "epoch": 1.7017543859649122,
      "grad_norm": 0.7049961336136691,
      "learning_rate": 8.357794774235094e-06,
      "loss": 0.05,
      "step": 97
    },
    {
      "epoch": 1.719298245614035,
      "grad_norm": 0.7004291823114401,
      "learning_rate": 8.31207888795086e-06,
      "loss": 0.0412,
      "step": 98
    },
    {
      "epoch": 1.736842105263158,
      "grad_norm": 0.8491120000949876,
      "learning_rate": 8.265864214768883e-06,
      "loss": 0.0583,
      "step": 99
    },
    {
      "epoch": 1.7543859649122808,
      "grad_norm": 0.904612072300696,
      "learning_rate": 8.219157714448957e-06,
      "loss": 0.0561,
      "step": 100
    },
    {
      "epoch": 1.7719298245614035,
      "grad_norm": 1.2352255572375976,
      "learning_rate": 8.171966420818227e-06,
      "loss": 0.0698,
      "step": 101
    },
    {
      "epoch": 1.7894736842105263,
      "grad_norm": 0.8042638647746813,
      "learning_rate": 8.124297440711933e-06,
      "loss": 0.0587,
      "step": 102
    },
    {
      "epoch": 1.807017543859649,
      "grad_norm": 0.7056507871808961,
      "learning_rate": 8.076157952903134e-06,
      "loss": 0.0433,
      "step": 103
    },
    {
      "epoch": 1.8245614035087718,
      "grad_norm": 0.8069684693324715,
      "learning_rate": 8.02755520702163e-06,
      "loss": 0.0577,
      "step": 104
    },
    {
      "epoch": 1.8421052631578947,
      "grad_norm": 0.8411539344688561,
      "learning_rate": 7.978496522462167e-06,
      "loss": 0.0554,
      "step": 105
    },
    {
      "epoch": 1.8596491228070176,
      "grad_norm": 0.6246099301130649,
      "learning_rate": 7.928989287282195e-06,
      "loss": 0.0526,
      "step": 106
    },
    {
      "epoch": 1.8771929824561404,
      "grad_norm": 0.8134173462976111,
      "learning_rate": 7.879040957089229e-06,
      "loss": 0.0489,
      "step": 107
    },
    {
      "epoch": 1.8947368421052633,
      "grad_norm": 0.8233101495435547,
      "learning_rate": 7.828659053918067e-06,
      "loss": 0.0554,
      "step": 108
    },
    {
      "epoch": 1.912280701754386,
      "grad_norm": 0.8139594364760937,
      "learning_rate": 7.777851165098012e-06,
      "loss": 0.0515,
      "step": 109
    },
    {
      "epoch": 1.9298245614035088,
      "grad_norm": 0.8420135284595219,
      "learning_rate": 7.726624942110233e-06,
      "loss": 0.0613,
      "step": 110
    },
    {
      "epoch": 1.9473684210526314,
      "grad_norm": 0.6914453691259363,
      "learning_rate": 7.674988099435487e-06,
      "loss": 0.0578,
      "step": 111
    },
    {
      "epoch": 1.9649122807017543,
      "grad_norm": 0.7494812262812492,
      "learning_rate": 7.6229484133923445e-06,
      "loss": 0.0528,
      "step": 112
    },
    {
      "epoch": 1.9824561403508771,
      "grad_norm": 0.8971283004688011,
      "learning_rate": 7.570513720966108e-06,
      "loss": 0.0678,
      "step": 113
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.6804071807397585,
      "learning_rate": 7.517691918628589e-06,
      "loss": 0.0628,
      "step": 114
    },
    {
      "epoch": 2.017543859649123,
      "grad_norm": 0.5969942923496542,
      "learning_rate": 7.464490961148921e-06,
      "loss": 0.0303,
      "step": 115
    },
    {
      "epoch": 2.0350877192982457,
      "grad_norm": 0.36972630052276856,
      "learning_rate": 7.410918860395615e-06,
      "loss": 0.0197,
      "step": 116
    },
    {
      "epoch": 2.0526315789473686,
      "grad_norm": 0.5245688642809315,
      "learning_rate": 7.3569836841299905e-06,
      "loss": 0.0324,
      "step": 117
    },
    {
      "epoch": 2.0701754385964914,
      "grad_norm": 0.586886410043569,
      "learning_rate": 7.3026935547912004e-06,
      "loss": 0.0322,
      "step": 118
    },
    {
      "epoch": 2.087719298245614,
      "grad_norm": 0.52688198929108,
      "learning_rate": 7.248056648273034e-06,
      "loss": 0.0317,
      "step": 119
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 0.4845274411404232,
      "learning_rate": 7.193081192692639e-06,
      "loss": 0.0232,
      "step": 120
    },
    {
      "epoch": 2.1228070175438596,
      "grad_norm": 0.37896879688644275,
      "learning_rate": 7.137775467151411e-06,
      "loss": 0.023,
      "step": 121
    },
    {
      "epoch": 2.1403508771929824,
      "grad_norm": 0.48333085052863545,
      "learning_rate": 7.0821478004881875e-06,
      "loss": 0.0249,
      "step": 122
    },
    {
      "epoch": 2.1578947368421053,
      "grad_norm": 0.5087235050027019,
      "learning_rate": 7.026206570024949e-06,
      "loss": 0.026,
      "step": 123
    },
    {
      "epoch": 2.175438596491228,
      "grad_norm": 0.5900796410435952,
      "learning_rate": 6.969960200305242e-06,
      "loss": 0.03,
      "step": 124
    },
    {
      "epoch": 2.192982456140351,
      "grad_norm": 0.5278206883353274,
      "learning_rate": 6.913417161825449e-06,
      "loss": 0.0221,
      "step": 125
    },
    {
      "epoch": 2.2105263157894735,
      "grad_norm": 0.47692494321697626,
      "learning_rate": 6.856585969759189e-06,
      "loss": 0.0264,
      "step": 126
    },
    {
      "epoch": 2.2280701754385963,
      "grad_norm": 0.46053887456028303,
      "learning_rate": 6.799475182674942e-06,
      "loss": 0.0205,
      "step": 127
    },
    {
      "epoch": 2.245614035087719,
      "grad_norm": 0.621184886789942,
      "learning_rate": 6.742093401247173e-06,
      "loss": 0.0338,
      "step": 128
    },
    {
      "epoch": 2.263157894736842,
      "grad_norm": 0.591049215409491,
      "learning_rate": 6.684449266961101e-06,
      "loss": 0.0279,
      "step": 129
    },
    {
      "epoch": 2.280701754385965,
      "grad_norm": 0.7230540299030428,
      "learning_rate": 6.626551460811316e-06,
      "loss": 0.0453,
      "step": 130
    },
    {
      "epoch": 2.2982456140350878,
      "grad_norm": 0.49492484163558687,
      "learning_rate": 6.568408701994459e-06,
      "loss": 0.0253,
      "step": 131
    },
    {
      "epoch": 2.3157894736842106,
      "grad_norm": 0.7317513887580467,
      "learning_rate": 6.510029746596141e-06,
      "loss": 0.0266,
      "step": 132
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 0.6300761851984211,
      "learning_rate": 6.451423386272312e-06,
      "loss": 0.0317,
      "step": 133
    },
    {
      "epoch": 2.3508771929824563,
      "grad_norm": 0.6972395573858822,
      "learning_rate": 6.392598446925266e-06,
      "loss": 0.0284,
      "step": 134
    },
    {
      "epoch": 2.3684210526315788,
      "grad_norm": 0.4472525715518054,
      "learning_rate": 6.333563787374493e-06,
      "loss": 0.0134,
      "step": 135
    },
    {
      "epoch": 2.3859649122807016,
      "grad_norm": 0.5547552937334421,
      "learning_rate": 6.274328298022574e-06,
      "loss": 0.0241,
      "step": 136
    },
    {
      "epoch": 2.4035087719298245,
      "grad_norm": 0.6287088060697468,
      "learning_rate": 6.21490089951632e-06,
      "loss": 0.0235,
      "step": 137
    },
    {
      "epoch": 2.4210526315789473,
      "grad_norm": 0.7592719638455317,
      "learning_rate": 6.155290541403357e-06,
      "loss": 0.024,
      "step": 138
    },
    {
      "epoch": 2.43859649122807,
      "grad_norm": 0.7061314974517516,
      "learning_rate": 6.095506200784349e-06,
      "loss": 0.0216,
      "step": 139
    },
    {
      "epoch": 2.456140350877193,
      "grad_norm": 0.4146934339346621,
      "learning_rate": 6.035556880961093e-06,
      "loss": 0.0203,
      "step": 140
    },
    {
      "epoch": 2.473684210526316,
      "grad_norm": 0.5808662642747494,
      "learning_rate": 5.975451610080643e-06,
      "loss": 0.0237,
      "step": 141
    },
    {
      "epoch": 2.4912280701754383,
      "grad_norm": 0.9156090409068862,
      "learning_rate": 5.915199439775706e-06,
      "loss": 0.0342,
      "step": 142
    },
    {
      "epoch": 2.5087719298245617,
      "grad_norm": 0.8273130210726023,
      "learning_rate": 5.8548094438015065e-06,
      "loss": 0.0374,
      "step": 143
    },
    {
      "epoch": 2.526315789473684,
      "grad_norm": 0.5208985562668808,
      "learning_rate": 5.794290716669307e-06,
      "loss": 0.0274,
      "step": 144
    },
    {
      "epoch": 2.543859649122807,
      "grad_norm": 0.5817399089445716,
      "learning_rate": 5.733652372276809e-06,
      "loss": 0.0238,
      "step": 145
    },
    {
      "epoch": 2.56140350877193,
      "grad_norm": 0.41873508080850685,
      "learning_rate": 5.672903542535631e-06,
      "loss": 0.0198,
      "step": 146
    },
    {
      "epoch": 2.5789473684210527,
      "grad_norm": 0.5139902783555422,
      "learning_rate": 5.612053375996082e-06,
      "loss": 0.029,
      "step": 147
    },
    {
      "epoch": 2.5964912280701755,
      "grad_norm": 0.5210356553605748,
      "learning_rate": 5.551111036469416e-06,
      "loss": 0.0181,
      "step": 148
    },
    {
      "epoch": 2.6140350877192984,
      "grad_norm": 0.46862390876194876,
      "learning_rate": 5.490085701647805e-06,
      "loss": 0.026,
      "step": 149
    },
    {
      "epoch": 2.6315789473684212,
      "grad_norm": 0.405287753073754,
      "learning_rate": 5.4289865617222005e-06,
      "loss": 0.0152,
      "step": 150
    },
    {
      "epoch": 2.6491228070175437,
      "grad_norm": 0.5417504620103765,
      "learning_rate": 5.367822817998338e-06,
      "loss": 0.0328,
      "step": 151
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.5889155423711966,
      "learning_rate": 5.306603681511043e-06,
      "loss": 0.0223,
      "step": 152
    },
    {
      "epoch": 2.6842105263157894,
      "grad_norm": 0.6623980663043445,
      "learning_rate": 5.245338371637091e-06,
      "loss": 0.0396,
      "step": 153
    },
    {
      "epoch": 2.7017543859649122,
      "grad_norm": 0.394044500779114,
      "learning_rate": 5.184036114706795e-06,
      "loss": 0.0185,
      "step": 154
    },
    {
      "epoch": 2.719298245614035,
      "grad_norm": 0.6238209593070779,
      "learning_rate": 5.122706142614562e-06,
      "loss": 0.0282,
      "step": 155
    },
    {
      "epoch": 2.736842105263158,
      "grad_norm": 0.4543063288143525,
      "learning_rate": 5.0613576914286e-06,
      "loss": 0.0208,
      "step": 156
    },
    {
      "epoch": 2.754385964912281,
      "grad_norm": 0.8733220433036752,
      "learning_rate": 5e-06,
      "loss": 0.0258,
      "step": 157
    },
    {
      "epoch": 2.7719298245614032,
      "grad_norm": 0.5556283328421064,
      "learning_rate": 4.938642308571401e-06,
      "loss": 0.0286,
      "step": 158
    },
    {
      "epoch": 2.7894736842105265,
      "grad_norm": 0.5917707359836524,
      "learning_rate": 4.87729385738544e-06,
      "loss": 0.0351,
      "step": 159
    },
    {
      "epoch": 2.807017543859649,
      "grad_norm": 0.5766947111963191,
      "learning_rate": 4.815963885293206e-06,
      "loss": 0.0423,
      "step": 160
    },
    {
      "epoch": 2.824561403508772,
      "grad_norm": 0.48545690964635335,
      "learning_rate": 4.75466162836291e-06,
      "loss": 0.0207,
      "step": 161
    },
    {
      "epoch": 2.8421052631578947,
      "grad_norm": 0.6123984743262666,
      "learning_rate": 4.693396318488958e-06,
      "loss": 0.03,
      "step": 162
    },
    {
      "epoch": 2.8596491228070176,
      "grad_norm": 0.8017397383836624,
      "learning_rate": 4.6321771820016635e-06,
      "loss": 0.0292,
      "step": 163
    },
    {
      "epoch": 2.8771929824561404,
      "grad_norm": 0.3923449231624014,
      "learning_rate": 4.571013438277801e-06,
      "loss": 0.0169,
      "step": 164
    },
    {
      "epoch": 2.8947368421052633,
      "grad_norm": 0.3530622647170764,
      "learning_rate": 4.509914298352197e-06,
      "loss": 0.0135,
      "step": 165
    },
    {
      "epoch": 2.912280701754386,
      "grad_norm": 0.4669626843941875,
      "learning_rate": 4.448888963530585e-06,
      "loss": 0.0161,
      "step": 166
    },
    {
      "epoch": 2.9298245614035086,
      "grad_norm": 0.48346557930060835,
      "learning_rate": 4.38794662400392e-06,
      "loss": 0.017,
      "step": 167
    },
    {
      "epoch": 2.9473684210526314,
      "grad_norm": 0.5346644318960719,
      "learning_rate": 4.3270964574643695e-06,
      "loss": 0.0182,
      "step": 168
    },
    {
      "epoch": 2.9649122807017543,
      "grad_norm": 0.46108525432610675,
      "learning_rate": 4.266347627723192e-06,
      "loss": 0.0205,
      "step": 169
    },
    {
      "epoch": 2.982456140350877,
      "grad_norm": 0.5100637866702252,
      "learning_rate": 4.205709283330694e-06,
      "loss": 0.0276,
      "step": 170
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.6346018527302484,
      "learning_rate": 4.145190556198494e-06,
      "loss": 0.0305,
      "step": 171
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 285,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 20236215975936.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}