| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 452, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004424778761061947, | |
| "grad_norm": 1624.0, | |
| "learning_rate": 9.523809523809523e-06, | |
| "loss": 18.8236, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.008849557522123894, | |
| "grad_norm": 1632.0, | |
| "learning_rate": 1.9047619047619046e-05, | |
| "loss": 18.845, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.01327433628318584, | |
| "grad_norm": 1104.0, | |
| "learning_rate": 2.857142857142857e-05, | |
| "loss": 13.897, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.017699115044247787, | |
| "grad_norm": 556.0, | |
| "learning_rate": 3.809523809523809e-05, | |
| "loss": 12.7695, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.022123893805309734, | |
| "grad_norm": 352.0, | |
| "learning_rate": 4.761904761904762e-05, | |
| "loss": 12.0053, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.02654867256637168, | |
| "grad_norm": 121.5, | |
| "learning_rate": 5.714285714285714e-05, | |
| "loss": 9.8363, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.030973451327433628, | |
| "grad_norm": 93.0, | |
| "learning_rate": 6.666666666666667e-05, | |
| "loss": 8.3813, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.035398230088495575, | |
| "grad_norm": 32.75, | |
| "learning_rate": 7.619047619047618e-05, | |
| "loss": 7.5272, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.03982300884955752, | |
| "grad_norm": 54.75, | |
| "learning_rate": 8.571428571428571e-05, | |
| "loss": 7.1281, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.04424778761061947, | |
| "grad_norm": 20.125, | |
| "learning_rate": 9.523809523809524e-05, | |
| "loss": 6.8148, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.048672566371681415, | |
| "grad_norm": 43.25, | |
| "learning_rate": 0.00010476190476190477, | |
| "loss": 6.8122, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.05309734513274336, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 0.00011428571428571428, | |
| "loss": 6.4345, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.05752212389380531, | |
| "grad_norm": 17.75, | |
| "learning_rate": 0.0001238095238095238, | |
| "loss": 6.4061, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.061946902654867256, | |
| "grad_norm": 34.25, | |
| "learning_rate": 0.00013333333333333334, | |
| "loss": 6.4773, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.06637168141592921, | |
| "grad_norm": 11.9375, | |
| "learning_rate": 0.00014285714285714287, | |
| "loss": 6.2721, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.07079646017699115, | |
| "grad_norm": 13.25, | |
| "learning_rate": 0.00015238095238095237, | |
| "loss": 6.1657, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.0752212389380531, | |
| "grad_norm": 10.125, | |
| "learning_rate": 0.00016190476190476192, | |
| "loss": 5.9112, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.07964601769911504, | |
| "grad_norm": 13.875, | |
| "learning_rate": 0.00017142857142857143, | |
| "loss": 5.3251, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.084070796460177, | |
| "grad_norm": 179.0, | |
| "learning_rate": 0.00018095238095238095, | |
| "loss": 7.1923, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.08849557522123894, | |
| "grad_norm": 26.875, | |
| "learning_rate": 0.00019047619047619048, | |
| "loss": 4.1209, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.09292035398230089, | |
| "grad_norm": 43.25, | |
| "learning_rate": 0.0002, | |
| "loss": 3.2117, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.09734513274336283, | |
| "grad_norm": 11.25, | |
| "learning_rate": 0.00019999885675796823, | |
| "loss": 2.2957, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.10176991150442478, | |
| "grad_norm": 9.25, | |
| "learning_rate": 0.00019999542705801296, | |
| "loss": 1.8052, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.10619469026548672, | |
| "grad_norm": 5.75, | |
| "learning_rate": 0.0001999897109785537, | |
| "loss": 1.5344, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.11061946902654868, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 0.00019998170865028774, | |
| "loss": 1.3085, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.11504424778761062, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 0.00019997142025618701, | |
| "loss": 1.2024, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.11946902654867257, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 0.00019995884603149402, | |
| "loss": 1.1342, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.12389380530973451, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 0.00019994398626371643, | |
| "loss": 1.073, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.12831858407079647, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 0.00019992684129262038, | |
| "loss": 0.9815, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.13274336283185842, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 0.00019990741151022301, | |
| "loss": 0.9502, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.13716814159292035, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 0.00019988569736078312, | |
| "loss": 0.8713, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.1415929203539823, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 0.00019986169934079135, | |
| "loss": 0.8431, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.14601769911504425, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.0001998354179989585, | |
| "loss": 0.7957, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.1504424778761062, | |
| "grad_norm": 0.875, | |
| "learning_rate": 0.00019980685393620337, | |
| "loss": 0.8088, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.15486725663716813, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 0.00019977600780563863, | |
| "loss": 0.7827, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.1592920353982301, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.00019974288031255618, | |
| "loss": 0.7153, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.16371681415929204, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 0.00019970747221441083, | |
| "loss": 0.7449, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.168141592920354, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.00019966978432080316, | |
| "loss": 0.734, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.17256637168141592, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.00019962981749346078, | |
| "loss": 0.7157, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.17699115044247787, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 0.0001995875726462189, | |
| "loss": 0.733, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.18141592920353983, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.00019954305074499916, | |
| "loss": 0.6717, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.18584070796460178, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 0.00019949625280778777, | |
| "loss": 0.7242, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.1902654867256637, | |
| "grad_norm": 0.625, | |
| "learning_rate": 0.00019944717990461207, | |
| "loss": 0.6803, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.19469026548672566, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 0.00019939583315751623, | |
| "loss": 0.6824, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.19911504424778761, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 0.0001993422137405354, | |
| "loss": 0.669, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.20353982300884957, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 0.000199286322879669, | |
| "loss": 0.6621, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.2079646017699115, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 0.00019922816185285265, | |
| "loss": 0.6433, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.21238938053097345, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 0.000199167731989929, | |
| "loss": 0.647, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.2168141592920354, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 0.00019910503467261722, | |
| "loss": 0.642, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.22123893805309736, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 0.00019904007133448147, | |
| "loss": 0.6283, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.22566371681415928, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 0.0001989728434608981, | |
| "loss": 0.6374, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.23008849557522124, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 0.00019890335258902178, | |
| "loss": 0.6671, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.2345132743362832, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 0.00019883160030775016, | |
| "loss": 0.6303, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.23893805309734514, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 0.0001987575882576878, | |
| "loss": 0.6423, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.24336283185840707, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 0.00019868131813110832, | |
| "loss": 0.6428, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.24778761061946902, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 0.0001986027916719161, | |
| "loss": 0.6105, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.252212389380531, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 0.00019852201067560606, | |
| "loss": 0.6109, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.25663716814159293, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 0.00019843897698922284, | |
| "loss": 0.5753, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.2610619469026549, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 0.00019835369251131846, | |
| "loss": 0.607, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.26548672566371684, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 0.00019826615919190887, | |
| "loss": 0.6029, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.26991150442477874, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 0.0001981763790324295, | |
| "loss": 0.5668, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.2743362831858407, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 0.00019808435408568938, | |
| "loss": 0.6282, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.27876106194690264, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 0.0001979900864558242, | |
| "loss": 0.5816, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.2831858407079646, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 0.0001978935782982484, | |
| "loss": 0.6071, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.28761061946902655, | |
| "grad_norm": 0.48046875, | |
| "learning_rate": 0.00019779483181960557, | |
| "loss": 0.56, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.2920353982300885, | |
| "grad_norm": 0.498046875, | |
| "learning_rate": 0.0001976938492777182, | |
| "loss": 0.6138, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.29646017699115046, | |
| "grad_norm": 0.486328125, | |
| "learning_rate": 0.00019759063298153598, | |
| "loss": 0.5735, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.3008849557522124, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 0.00019748518529108316, | |
| "loss": 0.5871, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.3053097345132743, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 0.00019737750861740431, | |
| "loss": 0.6027, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.30973451327433627, | |
| "grad_norm": 0.5, | |
| "learning_rate": 0.00019726760542250946, | |
| "loss": 0.6089, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.3141592920353982, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 0.0001971554782193176, | |
| "loss": 0.5682, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.3185840707964602, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 0.0001970411295715994, | |
| "loss": 0.5707, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.3230088495575221, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 0.00019692456209391846, | |
| "loss": 0.5949, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.3274336283185841, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 0.00019680577845157155, | |
| "loss": 0.577, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.33185840707964603, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.00019668478136052774, | |
| "loss": 0.5871, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.336283185840708, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 0.00019656157358736626, | |
| "loss": 0.5763, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.3407079646017699, | |
| "grad_norm": 0.498046875, | |
| "learning_rate": 0.0001964361579492132, | |
| "loss": 0.5875, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.34513274336283184, | |
| "grad_norm": 0.45703125, | |
| "learning_rate": 0.00019630853731367713, | |
| "loss": 0.5811, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.3495575221238938, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 0.0001961787145987835, | |
| "loss": 0.6122, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.35398230088495575, | |
| "grad_norm": 0.4765625, | |
| "learning_rate": 0.00019604669277290806, | |
| "loss": 0.5584, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.3584070796460177, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 0.0001959124748547088, | |
| "loss": 0.5629, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.36283185840707965, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 0.00019577606391305702, | |
| "loss": 0.5758, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.3672566371681416, | |
| "grad_norm": 0.478515625, | |
| "learning_rate": 0.0001956374630669672, | |
| "loss": 0.5383, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.37168141592920356, | |
| "grad_norm": 0.46484375, | |
| "learning_rate": 0.00019549667548552556, | |
| "loss": 0.5869, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.37610619469026546, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 0.00019535370438781766, | |
| "loss": 0.5642, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.3805309734513274, | |
| "grad_norm": 0.498046875, | |
| "learning_rate": 0.000195208553042855, | |
| "loss": 0.5895, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.38495575221238937, | |
| "grad_norm": 0.48828125, | |
| "learning_rate": 0.00019506122476949981, | |
| "loss": 0.5744, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.3893805309734513, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 0.00019491172293638968, | |
| "loss": 0.5537, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.3938053097345133, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 0.00019476005096186015, | |
| "loss": 0.5748, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.39823008849557523, | |
| "grad_norm": 0.486328125, | |
| "learning_rate": 0.00019460621231386676, | |
| "loss": 0.5511, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.4026548672566372, | |
| "grad_norm": 0.470703125, | |
| "learning_rate": 0.00019445021050990571, | |
| "loss": 0.5636, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.40707964601769914, | |
| "grad_norm": 0.482421875, | |
| "learning_rate": 0.00019429204911693333, | |
| "loss": 0.5649, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.41150442477876104, | |
| "grad_norm": 0.49609375, | |
| "learning_rate": 0.00019413173175128473, | |
| "loss": 0.5468, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.415929203539823, | |
| "grad_norm": 0.45703125, | |
| "learning_rate": 0.00019396926207859084, | |
| "loss": 0.5347, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.42035398230088494, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 0.00019380464381369493, | |
| "loss": 0.5667, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.4247787610619469, | |
| "grad_norm": 0.490234375, | |
| "learning_rate": 0.0001936378807205673, | |
| "loss": 0.5644, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.42920353982300885, | |
| "grad_norm": 0.4765625, | |
| "learning_rate": 0.00019346897661221956, | |
| "loss": 0.5384, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.4336283185840708, | |
| "grad_norm": 0.47265625, | |
| "learning_rate": 0.00019329793535061723, | |
| "loss": 0.5569, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.43805309734513276, | |
| "grad_norm": 0.48046875, | |
| "learning_rate": 0.0001931247608465915, | |
| "loss": 0.5494, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.4424778761061947, | |
| "grad_norm": 0.427734375, | |
| "learning_rate": 0.00019294945705974973, | |
| "loss": 0.536, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.4469026548672566, | |
| "grad_norm": 0.447265625, | |
| "learning_rate": 0.00019277202799838518, | |
| "loss": 0.5319, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.45132743362831856, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 0.000192592477719385, | |
| "loss": 0.5649, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.4557522123893805, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 0.00019241081032813772, | |
| "loss": 0.5652, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.46017699115044247, | |
| "grad_norm": 0.462890625, | |
| "learning_rate": 0.00019222702997843927, | |
| "loss": 0.5222, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.4646017699115044, | |
| "grad_norm": 0.4765625, | |
| "learning_rate": 0.00019204114087239806, | |
| "loss": 0.551, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.4690265486725664, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 0.00019185314726033893, | |
| "loss": 0.555, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.47345132743362833, | |
| "grad_norm": 0.470703125, | |
| "learning_rate": 0.00019166305344070578, | |
| "loss": 0.5404, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.4778761061946903, | |
| "grad_norm": 0.494140625, | |
| "learning_rate": 0.0001914708637599636, | |
| "loss": 0.5462, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.4823008849557522, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 0.0001912765826124987, | |
| "loss": 0.5532, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.48672566371681414, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 0.0001910802144405186, | |
| "loss": 0.5417, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.4911504424778761, | |
| "grad_norm": 0.5, | |
| "learning_rate": 0.0001908817637339503, | |
| "loss": 0.5418, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.49557522123893805, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 0.00019068123503033753, | |
| "loss": 0.5362, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.5, | |
| "learning_rate": 0.00019047863291473717, | |
| "loss": 0.5512, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.504424778761062, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 0.0001902739620196143, | |
| "loss": 0.5541, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.5088495575221239, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 0.00019006722702473629, | |
| "loss": 0.5138, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.5132743362831859, | |
| "grad_norm": 0.490234375, | |
| "learning_rate": 0.00018985843265706588, | |
| "loss": 0.5457, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.5176991150442478, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 0.000189647583690653, | |
| "loss": 0.5161, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.5221238938053098, | |
| "grad_norm": 0.48046875, | |
| "learning_rate": 0.0001894346849465257, | |
| "loss": 0.5442, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.5265486725663717, | |
| "grad_norm": 0.4375, | |
| "learning_rate": 0.0001892197412925798, | |
| "loss": 0.5079, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.5309734513274337, | |
| "grad_norm": 0.48828125, | |
| "learning_rate": 0.00018900275764346768, | |
| "loss": 0.4962, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.5353982300884956, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 0.00018878373896048594, | |
| "loss": 0.5248, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.5398230088495575, | |
| "grad_norm": 0.470703125, | |
| "learning_rate": 0.0001885626902514618, | |
| "loss": 0.5247, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.5442477876106194, | |
| "grad_norm": 0.498046875, | |
| "learning_rate": 0.00018833961657063885, | |
| "loss": 0.5397, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.5486725663716814, | |
| "grad_norm": 0.48046875, | |
| "learning_rate": 0.0001881145230185612, | |
| "loss": 0.5146, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.5530973451327433, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 0.00018788741474195706, | |
| "loss": 0.5654, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.5575221238938053, | |
| "grad_norm": 0.45703125, | |
| "learning_rate": 0.00018765829693362095, | |
| "loss": 0.52, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.5619469026548672, | |
| "grad_norm": 0.423828125, | |
| "learning_rate": 0.0001874271748322951, | |
| "loss": 0.5543, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.5663716814159292, | |
| "grad_norm": 0.44140625, | |
| "learning_rate": 0.00018719405372254948, | |
| "loss": 0.5199, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.5707964601769911, | |
| "grad_norm": 0.4765625, | |
| "learning_rate": 0.0001869589389346611, | |
| "loss": 0.5064, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.5752212389380531, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 0.00018672183584449216, | |
| "loss": 0.5137, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.5796460176991151, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 0.00018648274987336704, | |
| "loss": 0.5058, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.584070796460177, | |
| "grad_norm": 0.458984375, | |
| "learning_rate": 0.00018624168648794832, | |
| "loss": 0.5264, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.588495575221239, | |
| "grad_norm": 0.4296875, | |
| "learning_rate": 0.00018599865120011192, | |
| "loss": 0.5229, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.5929203539823009, | |
| "grad_norm": 0.470703125, | |
| "learning_rate": 0.00018575364956682094, | |
| "loss": 0.546, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.5973451327433629, | |
| "grad_norm": 0.48046875, | |
| "learning_rate": 0.00018550668718999872, | |
| "loss": 0.5311, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.6017699115044248, | |
| "grad_norm": 0.47265625, | |
| "learning_rate": 0.00018525776971640062, | |
| "loss": 0.5043, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.6061946902654868, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 0.00018500690283748504, | |
| "loss": 0.4815, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.6106194690265486, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 0.00018475409228928312, | |
| "loss": 0.517, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.6150442477876106, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 0.0001844993438522678, | |
| "loss": 0.513, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.6194690265486725, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 0.00018424266335122152, | |
| "loss": 0.5296, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.6238938053097345, | |
| "grad_norm": 0.5, | |
| "learning_rate": 0.000183984056655103, | |
| "loss": 0.5186, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.6283185840707964, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 0.0001837235296769131, | |
| "loss": 0.5239, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.6327433628318584, | |
| "grad_norm": 0.4765625, | |
| "learning_rate": 0.00018346108837355972, | |
| "loss": 0.5024, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.6371681415929203, | |
| "grad_norm": 0.45703125, | |
| "learning_rate": 0.0001831967387457214, | |
| "loss": 0.5144, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.6415929203539823, | |
| "grad_norm": 0.484375, | |
| "learning_rate": 0.00018293048683771024, | |
| "loss": 0.5342, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.6460176991150443, | |
| "grad_norm": 0.462890625, | |
| "learning_rate": 0.00018266233873733375, | |
| "loss": 0.5143, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.6504424778761062, | |
| "grad_norm": 0.439453125, | |
| "learning_rate": 0.00018239230057575542, | |
| "loss": 0.5271, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.6548672566371682, | |
| "grad_norm": 0.46484375, | |
| "learning_rate": 0.00018212037852735486, | |
| "loss": 0.5185, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.6592920353982301, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 0.00018184657880958635, | |
| "loss": 0.5237, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.6637168141592921, | |
| "grad_norm": 0.494140625, | |
| "learning_rate": 0.00018157090768283678, | |
| "loss": 0.5137, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.668141592920354, | |
| "grad_norm": 0.482421875, | |
| "learning_rate": 0.00018129337145028255, | |
| "loss": 0.5284, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.672566371681416, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 0.00018101397645774539, | |
| "loss": 0.5252, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.6769911504424779, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 0.00018073272909354727, | |
| "loss": 0.5105, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.6814159292035398, | |
| "grad_norm": 0.47265625, | |
| "learning_rate": 0.00018044963578836435, | |
| "loss": 0.5384, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.6858407079646017, | |
| "grad_norm": 0.4140625, | |
| "learning_rate": 0.00018016470301507995, | |
| "loss": 0.5086, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.6902654867256637, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 0.00017987793728863651, | |
| "loss": 0.5227, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.6946902654867256, | |
| "grad_norm": 0.44921875, | |
| "learning_rate": 0.00017958934516588667, | |
| "loss": 0.4982, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.6991150442477876, | |
| "grad_norm": 0.44921875, | |
| "learning_rate": 0.00017929893324544332, | |
| "loss": 0.5109, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.7035398230088495, | |
| "grad_norm": 0.439453125, | |
| "learning_rate": 0.00017900670816752874, | |
| "loss": 0.5242, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.7079646017699115, | |
| "grad_norm": 0.4453125, | |
| "learning_rate": 0.00017871267661382278, | |
| "loss": 0.5019, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.7123893805309734, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 0.00017841684530731005, | |
| "loss": 0.5048, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.7168141592920354, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 0.0001781192210121262, | |
| "loss": 0.4957, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.7212389380530974, | |
| "grad_norm": 0.455078125, | |
| "learning_rate": 0.00017781981053340337, | |
| "loss": 0.5035, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.7256637168141593, | |
| "grad_norm": 0.451171875, | |
| "learning_rate": 0.00017751862071711442, | |
| "loss": 0.5161, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.7300884955752213, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 0.00017721565844991643, | |
| "loss": 0.4838, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.7345132743362832, | |
| "grad_norm": 0.46484375, | |
| "learning_rate": 0.00017691093065899346, | |
| "loss": 0.5135, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.7389380530973452, | |
| "grad_norm": 0.462890625, | |
| "learning_rate": 0.0001766044443118978, | |
| "loss": 0.5174, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.7433628318584071, | |
| "grad_norm": 0.41796875, | |
| "learning_rate": 0.00017629620641639103, | |
| "loss": 0.4982, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.7477876106194691, | |
| "grad_norm": 0.494140625, | |
| "learning_rate": 0.0001759862240202834, | |
| "loss": 0.4902, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.7522123893805309, | |
| "grad_norm": 0.451171875, | |
| "learning_rate": 0.00017567450421127306, | |
| "loss": 0.4858, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.7566371681415929, | |
| "grad_norm": 0.4453125, | |
| "learning_rate": 0.0001753610541167838, | |
| "loss": 0.5178, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.7610619469026548, | |
| "grad_norm": 0.453125, | |
| "learning_rate": 0.00017504588090380197, | |
| "loss": 0.5209, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.7654867256637168, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 0.00017472899177871297, | |
| "loss": 0.5042, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.7699115044247787, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 0.00017441039398713608, | |
| "loss": 0.539, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.7743362831858407, | |
| "grad_norm": 0.423828125, | |
| "learning_rate": 0.00017409009481375904, | |
| "loss": 0.5053, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.7787610619469026, | |
| "grad_norm": 0.439453125, | |
| "learning_rate": 0.0001737681015821714, | |
| "loss": 0.5357, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.7831858407079646, | |
| "grad_norm": 0.44921875, | |
| "learning_rate": 0.00017344442165469714, | |
| "loss": 0.5108, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.7876106194690266, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 0.00017311906243222614, | |
| "loss": 0.4909, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.7920353982300885, | |
| "grad_norm": 0.443359375, | |
| "learning_rate": 0.0001727920313540452, | |
| "loss": 0.506, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.7964601769911505, | |
| "grad_norm": 0.455078125, | |
| "learning_rate": 0.00017246333589766787, | |
| "loss": 0.5186, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.8008849557522124, | |
| "grad_norm": 0.439453125, | |
| "learning_rate": 0.00017213298357866326, | |
| "loss": 0.4941, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.8053097345132744, | |
| "grad_norm": 0.455078125, | |
| "learning_rate": 0.00017180098195048458, | |
| "loss": 0.4811, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.8097345132743363, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 0.00017146733860429612, | |
| "loss": 0.5137, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.8141592920353983, | |
| "grad_norm": 0.455078125, | |
| "learning_rate": 0.00017113206116879982, | |
| "loss": 0.5152, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.8185840707964602, | |
| "grad_norm": 0.427734375, | |
| "learning_rate": 0.00017079515731006085, | |
| "loss": 0.5052, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.8230088495575221, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 0.00017045663473133215, | |
| "loss": 0.5018, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.827433628318584, | |
| "grad_norm": 0.4765625, | |
| "learning_rate": 0.00017011650117287866, | |
| "loss": 0.5055, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.831858407079646, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 0.00016977476441179992, | |
| "loss": 0.5109, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.8362831858407079, | |
| "grad_norm": 0.4140625, | |
| "learning_rate": 0.00016943143226185253, | |
| "loss": 0.4591, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.8407079646017699, | |
| "grad_norm": 0.498046875, | |
| "learning_rate": 0.00016908651257327138, | |
| "loss": 0.5182, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.8451327433628318, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 0.00016874001323259011, | |
| "loss": 0.4982, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.8495575221238938, | |
| "grad_norm": 0.4296875, | |
| "learning_rate": 0.00016839194216246108, | |
| "loss": 0.4869, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.8539823008849557, | |
| "grad_norm": 0.451171875, | |
| "learning_rate": 0.0001680423073214737, | |
| "loss": 0.4933, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.8584070796460177, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 0.00016769111670397296, | |
| "loss": 0.5087, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.8628318584070797, | |
| "grad_norm": 0.455078125, | |
| "learning_rate": 0.00016733837833987633, | |
| "loss": 0.5372, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.8672566371681416, | |
| "grad_norm": 0.43359375, | |
| "learning_rate": 0.0001669841002944903, | |
| "loss": 0.5232, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.8716814159292036, | |
| "grad_norm": 0.390625, | |
| "learning_rate": 0.00016662829066832596, | |
| "loss": 0.5069, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.8761061946902655, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 0.00016627095759691362, | |
| "loss": 0.5029, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.8805309734513275, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 0.0001659121092506171, | |
| "loss": 0.5079, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.8849557522123894, | |
| "grad_norm": 0.419921875, | |
| "learning_rate": 0.00016555175383444655, | |
| "loss": 0.4895, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.8893805309734514, | |
| "grad_norm": 0.443359375, | |
| "learning_rate": 0.00016518989958787126, | |
| "loss": 0.5173, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.8938053097345132, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 0.00016482655478463083, | |
| "loss": 0.4811, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.8982300884955752, | |
| "grad_norm": 0.412109375, | |
| "learning_rate": 0.00016446172773254629, | |
| "loss": 0.5187, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.9026548672566371, | |
| "grad_norm": 0.4296875, | |
| "learning_rate": 0.00016409542677333006, | |
| "loss": 0.5053, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.9070796460176991, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 0.0001637276602823952, | |
| "loss": 0.5353, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.911504424778761, | |
| "grad_norm": 0.4453125, | |
| "learning_rate": 0.00016335843666866388, | |
| "loss": 0.4827, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.915929203539823, | |
| "grad_norm": 0.4140625, | |
| "learning_rate": 0.00016298776437437523, | |
| "loss": 0.4936, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.9203539823008849, | |
| "grad_norm": 0.443359375, | |
| "learning_rate": 0.0001626156518748922, | |
| "loss": 0.5207, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.9247787610619469, | |
| "grad_norm": 0.423828125, | |
| "learning_rate": 0.0001622421076785077, | |
| "loss": 0.4725, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.9292035398230089, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 0.00016186714032625035, | |
| "loss": 0.4816, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.9336283185840708, | |
| "grad_norm": 0.419921875, | |
| "learning_rate": 0.00016149075839168886, | |
| "loss": 0.5515, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.9380530973451328, | |
| "grad_norm": 0.41796875, | |
| "learning_rate": 0.0001611129704807362, | |
| "loss": 0.5059, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.9424778761061947, | |
| "grad_norm": 0.443359375, | |
| "learning_rate": 0.0001607337852314527, | |
| "loss": 0.496, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.9469026548672567, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 0.0001603532113138487, | |
| "loss": 0.484, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.9513274336283186, | |
| "grad_norm": 0.41015625, | |
| "learning_rate": 0.00015997125742968617, | |
| "loss": 0.5204, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.9557522123893806, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 0.0001595879323122798, | |
| "loss": 0.4906, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.9601769911504425, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 0.00015920324472629732, | |
| "loss": 0.492, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.9646017699115044, | |
| "grad_norm": 0.412109375, | |
| "learning_rate": 0.00015881720346755905, | |
| "loss": 0.4574, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.9690265486725663, | |
| "grad_norm": 0.4296875, | |
| "learning_rate": 0.00015842981736283686, | |
| "loss": 0.5018, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.9734513274336283, | |
| "grad_norm": 0.419921875, | |
| "learning_rate": 0.00015804109526965232, | |
| "loss": 0.4766, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.9778761061946902, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 0.0001576510460760741, | |
| "loss": 0.4928, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.9823008849557522, | |
| "grad_norm": 0.400390625, | |
| "learning_rate": 0.0001572596787005149, | |
| "loss": 0.4919, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.9867256637168141, | |
| "grad_norm": 0.41796875, | |
| "learning_rate": 0.00015686700209152738, | |
| "loss": 0.4737, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.9911504424778761, | |
| "grad_norm": 0.423828125, | |
| "learning_rate": 0.00015647302522759962, | |
| "loss": 0.5073, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.995575221238938, | |
| "grad_norm": 0.44140625, | |
| "learning_rate": 0.00015607775711694977, | |
| "loss": 0.4684, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.466796875, | |
| "learning_rate": 0.0001556812067973203, | |
| "loss": 0.4789, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.49126777052879333, | |
| "eval_runtime": 5.6505, | |
| "eval_samples_per_second": 148.306, | |
| "eval_steps_per_second": 18.583, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.0044247787610618, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 0.00015528338333577101, | |
| "loss": 0.4303, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.008849557522124, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 0.00015488429582847192, | |
| "loss": 0.4457, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.0132743362831858, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 0.00015448395340049537, | |
| "loss": 0.4697, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.0176991150442478, | |
| "grad_norm": 0.470703125, | |
| "learning_rate": 0.0001540823652056071, | |
| "loss": 0.4294, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.0221238938053097, | |
| "grad_norm": 0.400390625, | |
| "learning_rate": 0.0001536795404260572, | |
| "loss": 0.4192, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.0265486725663717, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 0.00015327548827237007, | |
| "loss": 0.4449, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.0309734513274336, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 0.0001528702179831338, | |
| "loss": 0.4394, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.0353982300884956, | |
| "grad_norm": 0.41015625, | |
| "learning_rate": 0.00015246373882478898, | |
| "loss": 0.435, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.0398230088495575, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 0.0001520560600914168, | |
| "loss": 0.4531, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.0442477876106195, | |
| "grad_norm": 0.4140625, | |
| "learning_rate": 0.00015164719110452652, | |
| "loss": 0.4443, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.0486725663716814, | |
| "grad_norm": 0.419921875, | |
| "learning_rate": 0.0001512371412128424, | |
| "loss": 0.4459, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.0530973451327434, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 0.00015082591979208976, | |
| "loss": 0.4355, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.0575221238938053, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 0.00015041353624478093, | |
| "loss": 0.443, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.0619469026548674, | |
| "grad_norm": 0.4140625, | |
| "learning_rate": 0.00015000000000000001, | |
| "loss": 0.4295, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.0663716814159292, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 0.0001495853205131873, | |
| "loss": 0.4249, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.0707964601769913, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 0.00014916950726592322, | |
| "loss": 0.4464, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.075221238938053, | |
| "grad_norm": 0.453125, | |
| "learning_rate": 0.00014875256976571135, | |
| "loss": 0.4404, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.079646017699115, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 0.00014833451754576123, | |
| "loss": 0.4294, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.084070796460177, | |
| "grad_norm": 0.427734375, | |
| "learning_rate": 0.00014791536016477022, | |
| "loss": 0.4172, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.0884955752212389, | |
| "grad_norm": 0.43359375, | |
| "learning_rate": 0.00014749510720670506, | |
| "loss": 0.4333, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.092920353982301, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 0.00014707376828058263, | |
| "loss": 0.4522, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.0973451327433628, | |
| "grad_norm": 0.427734375, | |
| "learning_rate": 0.00014665135302025035, | |
| "loss": 0.46, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.1017699115044248, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 0.00014622787108416584, | |
| "loss": 0.42, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.1061946902654867, | |
| "grad_norm": 0.43359375, | |
| "learning_rate": 0.00014580333215517607, | |
| "loss": 0.4367, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.1106194690265487, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 0.000145377745940296, | |
| "loss": 0.4305, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.1150442477876106, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 0.00014495112217048658, | |
| "loss": 0.4518, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.1194690265486726, | |
| "grad_norm": 0.44140625, | |
| "learning_rate": 0.00014452347060043237, | |
| "loss": 0.4227, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.1238938053097345, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 0.00014409480100831834, | |
| "loss": 0.4336, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.1283185840707965, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 0.0001436651231956064, | |
| "loss": 0.4077, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.1327433628318584, | |
| "grad_norm": 0.423828125, | |
| "learning_rate": 0.00014323444698681126, | |
| "loss": 0.426, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.1371681415929205, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 0.0001428027822292758, | |
| "loss": 0.4403, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.1415929203539823, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 0.0001423701387929459, | |
| "loss": 0.443, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.1460176991150441, | |
| "grad_norm": 0.4296875, | |
| "learning_rate": 0.0001419365265701448, | |
| "loss": 0.438, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.1504424778761062, | |
| "grad_norm": 0.419921875, | |
| "learning_rate": 0.00014150195547534686, | |
| "loss": 0.4318, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.154867256637168, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 0.0001410664354449509, | |
| "loss": 0.4413, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.1592920353982301, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 0.00014062997643705306, | |
| "loss": 0.4325, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.163716814159292, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 0.00014019258843121893, | |
| "loss": 0.4164, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.168141592920354, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 0.0001397542814282556, | |
| "loss": 0.4393, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.1725663716814159, | |
| "grad_norm": 0.423828125, | |
| "learning_rate": 0.00013931506544998283, | |
| "loss": 0.4346, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.176991150442478, | |
| "grad_norm": 0.439453125, | |
| "learning_rate": 0.00013887495053900397, | |
| "loss": 0.4294, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.1814159292035398, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 0.00013843394675847634, | |
| "loss": 0.4404, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 1.1858407079646018, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 0.00013799206419188103, | |
| "loss": 0.4198, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 1.1902654867256637, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 0.00013754931294279263, | |
| "loss": 0.4158, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 1.1946902654867257, | |
| "grad_norm": 0.3984375, | |
| "learning_rate": 0.00013710570313464778, | |
| "loss": 0.4234, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.1991150442477876, | |
| "grad_norm": 0.4140625, | |
| "learning_rate": 0.0001366612449105141, | |
| "loss": 0.4233, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 1.2035398230088497, | |
| "grad_norm": 0.439453125, | |
| "learning_rate": 0.00013621594843285802, | |
| "loss": 0.4518, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.2079646017699115, | |
| "grad_norm": 0.412109375, | |
| "learning_rate": 0.0001357698238833126, | |
| "loss": 0.4465, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 1.2123893805309733, | |
| "grad_norm": 0.412109375, | |
| "learning_rate": 0.00013532288146244446, | |
| "loss": 0.4446, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 1.2168141592920354, | |
| "grad_norm": 0.4140625, | |
| "learning_rate": 0.00013487513138952094, | |
| "loss": 0.4384, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.2212389380530975, | |
| "grad_norm": 0.41015625, | |
| "learning_rate": 0.00013442658390227602, | |
| "loss": 0.4286, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 1.2256637168141593, | |
| "grad_norm": 0.390625, | |
| "learning_rate": 0.00013397724925667657, | |
| "loss": 0.43, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.2300884955752212, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 0.00013352713772668765, | |
| "loss": 0.4293, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.2345132743362832, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 0.00013307625960403763, | |
| "loss": 0.4326, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.238938053097345, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 0.00013262462519798293, | |
| "loss": 0.4241, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.2433628318584071, | |
| "grad_norm": 0.3984375, | |
| "learning_rate": 0.00013217224483507228, | |
| "loss": 0.4336, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 1.247787610619469, | |
| "grad_norm": 0.427734375, | |
| "learning_rate": 0.00013171912885891063, | |
| "loss": 0.4418, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 1.252212389380531, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 0.00013126528762992247, | |
| "loss": 0.4258, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 1.2566371681415929, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 0.00013081073152511525, | |
| "loss": 0.438, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 1.261061946902655, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 0.00013035547093784186, | |
| "loss": 0.4364, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.2654867256637168, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 0.00012989951627756304, | |
| "loss": 0.409, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 1.2699115044247788, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 0.0001294428779696095, | |
| "loss": 0.4448, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 1.2743362831858407, | |
| "grad_norm": 0.41015625, | |
| "learning_rate": 0.00012898556645494325, | |
| "loss": 0.4497, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.2787610619469025, | |
| "grad_norm": 0.404296875, | |
| "learning_rate": 0.00012852759218991933, | |
| "loss": 0.4188, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.2831858407079646, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 0.00012806896564604626, | |
| "loss": 0.4226, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.2876106194690267, | |
| "grad_norm": 0.427734375, | |
| "learning_rate": 0.00012760969730974694, | |
| "loss": 0.4281, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.2920353982300885, | |
| "grad_norm": 0.4375, | |
| "learning_rate": 0.00012714979768211853, | |
| "loss": 0.4626, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.2964601769911503, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 0.0001266892772786929, | |
| "loss": 0.4292, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.3008849557522124, | |
| "grad_norm": 0.412109375, | |
| "learning_rate": 0.00012622814662919561, | |
| "loss": 0.4309, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.3053097345132743, | |
| "grad_norm": 0.404296875, | |
| "learning_rate": 0.0001257664162773055, | |
| "loss": 0.4283, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.3097345132743363, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 0.00012530409678041343, | |
| "loss": 0.4431, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.3141592920353982, | |
| "grad_norm": 0.400390625, | |
| "learning_rate": 0.00012484119870938103, | |
| "loss": 0.4419, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.3185840707964602, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 0.00012437773264829897, | |
| "loss": 0.4513, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.323008849557522, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 0.00012391370919424485, | |
| "loss": 0.4389, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.3274336283185841, | |
| "grad_norm": 0.390625, | |
| "learning_rate": 0.00012344913895704097, | |
| "loss": 0.4495, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.331858407079646, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 0.00012298403255901186, | |
| "loss": 0.4169, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 1.336283185840708, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 0.00012251840063474108, | |
| "loss": 0.4368, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 1.3407079646017699, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 0.00012205225383082843, | |
| "loss": 0.4198, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 1.3451327433628317, | |
| "grad_norm": 0.419921875, | |
| "learning_rate": 0.00012158560280564626, | |
| "loss": 0.4386, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 1.3495575221238938, | |
| "grad_norm": 0.412109375, | |
| "learning_rate": 0.00012111845822909596, | |
| "loss": 0.4164, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.3539823008849559, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 0.00012065083078236374, | |
| "loss": 0.4568, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 1.3584070796460177, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 0.00012018273115767673, | |
| "loss": 0.4283, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 1.3628318584070795, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 0.00011971417005805818, | |
| "loss": 0.4254, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 1.3672566371681416, | |
| "grad_norm": 0.4140625, | |
| "learning_rate": 0.000119245158197083, | |
| "loss": 0.434, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 1.3716814159292037, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 0.00011877570629863266, | |
| "loss": 0.4203, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.3761061946902655, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 0.00011830582509664995, | |
| "loss": 0.4373, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 1.3805309734513274, | |
| "grad_norm": 0.41796875, | |
| "learning_rate": 0.00011783552533489372, | |
| "loss": 0.4303, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 1.3849557522123894, | |
| "grad_norm": 0.390625, | |
| "learning_rate": 0.00011736481776669306, | |
| "loss": 0.429, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 1.3893805309734513, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 0.0001168937131547015, | |
| "loss": 0.4122, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 1.3938053097345133, | |
| "grad_norm": 0.412109375, | |
| "learning_rate": 0.00011642222227065089, | |
| "loss": 0.4519, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.3982300884955752, | |
| "grad_norm": 0.43359375, | |
| "learning_rate": 0.00011595035589510522, | |
| "loss": 0.4303, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 1.4026548672566372, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 0.00011547812481721388, | |
| "loss": 0.4215, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 1.407079646017699, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 0.00011500553983446527, | |
| "loss": 0.4276, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 1.411504424778761, | |
| "grad_norm": 0.427734375, | |
| "learning_rate": 0.00011453261175243973, | |
| "loss": 0.4249, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 1.415929203539823, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 0.00011405935138456241, | |
| "loss": 0.4168, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.420353982300885, | |
| "grad_norm": 0.41015625, | |
| "learning_rate": 0.0001135857695518563, | |
| "loss": 0.4194, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 1.424778761061947, | |
| "grad_norm": 0.4296875, | |
| "learning_rate": 0.00011311187708269442, | |
| "loss": 0.4154, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 1.4292035398230087, | |
| "grad_norm": 0.412109375, | |
| "learning_rate": 0.00011263768481255264, | |
| "loss": 0.426, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 1.4336283185840708, | |
| "grad_norm": 0.4140625, | |
| "learning_rate": 0.00011216320358376157, | |
| "loss": 0.4359, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 1.4380530973451329, | |
| "grad_norm": 0.400390625, | |
| "learning_rate": 0.00011168844424525902, | |
| "loss": 0.412, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.4424778761061947, | |
| "grad_norm": 0.3984375, | |
| "learning_rate": 0.00011121341765234146, | |
| "loss": 0.4418, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 1.4469026548672566, | |
| "grad_norm": 0.390625, | |
| "learning_rate": 0.00011073813466641632, | |
| "loss": 0.4193, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 1.4513274336283186, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 0.00011026260615475333, | |
| "loss": 0.4161, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 1.4557522123893805, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 0.00010978684299023607, | |
| "loss": 0.4367, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 1.4601769911504425, | |
| "grad_norm": 0.400390625, | |
| "learning_rate": 0.00010931085605111354, | |
| "loss": 0.4334, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.4646017699115044, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 0.0001088346562207512, | |
| "loss": 0.4069, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 1.4690265486725664, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 0.00010835825438738232, | |
| "loss": 0.4183, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 1.4734513274336283, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 0.00010788166144385888, | |
| "loss": 0.4275, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 1.4778761061946903, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 0.00010740488828740258, | |
| "loss": 0.4342, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 1.4823008849557522, | |
| "grad_norm": 0.375, | |
| "learning_rate": 0.00010692794581935566, | |
| "loss": 0.4199, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.4867256637168142, | |
| "grad_norm": 0.3984375, | |
| "learning_rate": 0.00010645084494493165, | |
| "loss": 0.4134, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.491150442477876, | |
| "grad_norm": 0.404296875, | |
| "learning_rate": 0.00010597359657296602, | |
| "loss": 0.4057, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 1.495575221238938, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 0.0001054962116156667, | |
| "loss": 0.4129, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 0.00010501870098836473, | |
| "loss": 0.4337, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 1.504424778761062, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 0.00010454107560926443, | |
| "loss": 0.4292, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.508849557522124, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 0.00010406334639919403, | |
| "loss": 0.4074, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 1.5132743362831858, | |
| "grad_norm": 0.427734375, | |
| "learning_rate": 0.00010358552428135575, | |
| "loss": 0.4187, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.5176991150442478, | |
| "grad_norm": 0.427734375, | |
| "learning_rate": 0.0001031076201810762, | |
| "loss": 0.4427, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.5221238938053099, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 0.00010262964502555643, | |
| "loss": 0.4249, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.5265486725663717, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 0.00010215160974362223, | |
| "loss": 0.4132, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.5309734513274336, | |
| "grad_norm": 0.4140625, | |
| "learning_rate": 0.00010167352526547416, | |
| "loss": 0.4181, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.5353982300884956, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 0.00010119540252243755, | |
| "loss": 0.4282, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.5398230088495575, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 0.00010071725244671282, | |
| "loss": 0.4314, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.5442477876106193, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 0.00010023908597112514, | |
| "loss": 0.4404, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.5486725663716814, | |
| "grad_norm": 0.4296875, | |
| "learning_rate": 9.976091402887487e-05, | |
| "loss": 0.4394, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.5530973451327434, | |
| "grad_norm": 0.419921875, | |
| "learning_rate": 9.928274755328723e-05, | |
| "loss": 0.4282, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.5575221238938053, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 9.880459747756247e-05, | |
| "loss": 0.4234, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.5619469026548671, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 9.83264747345259e-05, | |
| "loss": 0.412, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 1.5663716814159292, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 9.784839025637778e-05, | |
| "loss": 0.4258, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.5707964601769913, | |
| "grad_norm": 0.453125, | |
| "learning_rate": 9.737035497444361e-05, | |
| "loss": 0.4149, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.575221238938053, | |
| "grad_norm": 0.404296875, | |
| "learning_rate": 9.689237981892382e-05, | |
| "loss": 0.4095, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 1.579646017699115, | |
| "grad_norm": 0.41015625, | |
| "learning_rate": 9.641447571864429e-05, | |
| "loss": 0.3994, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.584070796460177, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 9.593665360080599e-05, | |
| "loss": 0.417, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 1.588495575221239, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 9.545892439073562e-05, | |
| "loss": 0.4328, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 1.592920353982301, | |
| "grad_norm": 0.4296875, | |
| "learning_rate": 9.49812990116353e-05, | |
| "loss": 0.4364, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.5973451327433628, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 9.450378838433331e-05, | |
| "loss": 0.4098, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 1.6017699115044248, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 9.4026403427034e-05, | |
| "loss": 0.4155, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 1.606194690265487, | |
| "grad_norm": 0.375, | |
| "learning_rate": 9.354915505506839e-05, | |
| "loss": 0.4338, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.6106194690265485, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 9.307205418064437e-05, | |
| "loss": 0.4326, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 1.6150442477876106, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 9.259511171259746e-05, | |
| "loss": 0.4167, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.6194690265486726, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 9.211833855614114e-05, | |
| "loss": 0.4413, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 1.6238938053097345, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 9.164174561261771e-05, | |
| "loss": 0.4223, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 1.6283185840707963, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 9.116534377924883e-05, | |
| "loss": 0.4171, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 1.6327433628318584, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 9.06891439488865e-05, | |
| "loss": 0.4042, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 1.6371681415929205, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 9.021315700976395e-05, | |
| "loss": 0.4168, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.6415929203539823, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 8.973739384524674e-05, | |
| "loss": 0.4121, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 1.6460176991150441, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 8.92618653335837e-05, | |
| "loss": 0.4084, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 1.6504424778761062, | |
| "grad_norm": 0.390625, | |
| "learning_rate": 8.878658234765858e-05, | |
| "loss": 0.41, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 1.6548672566371683, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 8.831155575474102e-05, | |
| "loss": 0.4131, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 1.6592920353982301, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 8.783679641623845e-05, | |
| "loss": 0.4277, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.663716814159292, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 8.73623151874474e-05, | |
| "loss": 0.4201, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 1.668141592920354, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 8.688812291730563e-05, | |
| "loss": 0.417, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 1.672566371681416, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 8.641423044814374e-05, | |
| "loss": 0.4111, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 1.676991150442478, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 8.59406486154376e-05, | |
| "loss": 0.404, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 1.6814159292035398, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 8.54673882475603e-05, | |
| "loss": 0.428, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.6858407079646018, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 8.499446016553474e-05, | |
| "loss": 0.4051, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 1.6902654867256637, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 8.452187518278613e-05, | |
| "loss": 0.4164, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 1.6946902654867255, | |
| "grad_norm": 0.375, | |
| "learning_rate": 8.404964410489485e-05, | |
| "loss": 0.3898, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 1.6991150442477876, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 8.357777772934913e-05, | |
| "loss": 0.4264, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 1.7035398230088497, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 8.310628684529856e-05, | |
| "loss": 0.405, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.7079646017699115, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 8.263518223330697e-05, | |
| "loss": 0.4261, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 1.7123893805309733, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 8.216447466510631e-05, | |
| "loss": 0.4001, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 1.7168141592920354, | |
| "grad_norm": 0.404296875, | |
| "learning_rate": 8.169417490335007e-05, | |
| "loss": 0.4212, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 1.7212389380530975, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 8.122429370136739e-05, | |
| "loss": 0.4097, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 1.7256637168141593, | |
| "grad_norm": 0.375, | |
| "learning_rate": 8.075484180291701e-05, | |
| "loss": 0.4133, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.7300884955752212, | |
| "grad_norm": 0.3984375, | |
| "learning_rate": 8.028582994194185e-05, | |
| "loss": 0.4104, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 1.7345132743362832, | |
| "grad_norm": 0.390625, | |
| "learning_rate": 7.981726884232328e-05, | |
| "loss": 0.4147, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 1.7389380530973453, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 7.934916921763628e-05, | |
| "loss": 0.4097, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 1.7433628318584071, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 7.888154177090405e-05, | |
| "loss": 0.4126, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 1.747787610619469, | |
| "grad_norm": 0.390625, | |
| "learning_rate": 7.841439719435377e-05, | |
| "loss": 0.414, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.752212389380531, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 7.79477461691716e-05, | |
| "loss": 0.4348, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 1.7566371681415929, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 7.748159936525896e-05, | |
| "loss": 0.4122, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 1.7610619469026547, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 7.701596744098818e-05, | |
| "loss": 0.4134, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 1.7654867256637168, | |
| "grad_norm": 0.375, | |
| "learning_rate": 7.655086104295904e-05, | |
| "loss": 0.406, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 1.7699115044247788, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 7.608629080575518e-05, | |
| "loss": 0.4179, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.7743362831858407, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 7.562226735170106e-05, | |
| "loss": 0.427, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 1.7787610619469025, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 7.5158801290619e-05, | |
| "loss": 0.4255, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 1.7831858407079646, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 7.469590321958662e-05, | |
| "loss": 0.407, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 1.7876106194690267, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 7.423358372269455e-05, | |
| "loss": 0.4208, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 1.7920353982300885, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 7.377185337080442e-05, | |
| "loss": 0.4167, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.7964601769911503, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 7.331072272130712e-05, | |
| "loss": 0.4207, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 1.8008849557522124, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 7.285020231788149e-05, | |
| "loss": 0.4195, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 1.8053097345132745, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 7.239030269025311e-05, | |
| "loss": 0.4033, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 1.8097345132743363, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 7.193103435395378e-05, | |
| "loss": 0.4012, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 1.8141592920353982, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 7.147240781008068e-05, | |
| "loss": 0.4151, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.8185840707964602, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 7.101443354505678e-05, | |
| "loss": 0.4091, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 1.823008849557522, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 7.055712203039055e-05, | |
| "loss": 0.453, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 1.827433628318584, | |
| "grad_norm": 0.400390625, | |
| "learning_rate": 7.010048372243698e-05, | |
| "loss": 0.4153, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 1.831858407079646, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 6.964452906215815e-05, | |
| "loss": 0.3963, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 1.836283185840708, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 6.918926847488476e-05, | |
| "loss": 0.4217, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.8407079646017699, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 6.873471237007754e-05, | |
| "loss": 0.408, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 1.8451327433628317, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 6.82808711410894e-05, | |
| "loss": 0.4029, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 1.8495575221238938, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 6.782775516492771e-05, | |
| "loss": 0.4386, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 1.8539823008849559, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 6.73753748020171e-05, | |
| "loss": 0.4093, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 1.8584070796460177, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 6.69237403959624e-05, | |
| "loss": 0.4261, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.8628318584070795, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 6.64728622733124e-05, | |
| "loss": 0.3903, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 1.8672566371681416, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 6.602275074332345e-05, | |
| "loss": 0.4108, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 1.8716814159292037, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 6.5573416097724e-05, | |
| "loss": 0.4265, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 1.8761061946902655, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 6.512486861047911e-05, | |
| "loss": 0.4117, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 1.8805309734513274, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 6.467711853755558e-05, | |
| "loss": 0.4202, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.8849557522123894, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 6.423017611668745e-05, | |
| "loss": 0.4054, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 1.8893805309734515, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 6.378405156714202e-05, | |
| "loss": 0.4023, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 1.893805309734513, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 6.333875508948593e-05, | |
| "loss": 0.4138, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 1.8982300884955752, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 6.289429686535226e-05, | |
| "loss": 0.4308, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 1.9026548672566372, | |
| "grad_norm": 0.404296875, | |
| "learning_rate": 6.245068705720739e-05, | |
| "loss": 0.4343, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.907079646017699, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 6.200793580811896e-05, | |
| "loss": 0.4085, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 1.911504424778761, | |
| "grad_norm": 0.41796875, | |
| "learning_rate": 6.15660532415237e-05, | |
| "loss": 0.4365, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 1.915929203539823, | |
| "grad_norm": 0.400390625, | |
| "learning_rate": 6.112504946099604e-05, | |
| "loss": 0.4068, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 1.920353982300885, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 6.0684934550017184e-05, | |
| "loss": 0.4157, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 1.924778761061947, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 6.024571857174443e-05, | |
| "loss": 0.438, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.9292035398230087, | |
| "grad_norm": 0.375, | |
| "learning_rate": 5.980741156878109e-05, | |
| "loss": 0.41, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 1.9336283185840708, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 5.9370023562946986e-05, | |
| "loss": 0.4104, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 1.9380530973451329, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 5.8933564555049105e-05, | |
| "loss": 0.4245, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 1.9424778761061947, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 5.849804452465317e-05, | |
| "loss": 0.4005, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 1.9469026548672566, | |
| "grad_norm": 0.375, | |
| "learning_rate": 5.806347342985521e-05, | |
| "loss": 0.3941, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.9513274336283186, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 5.7629861207054136e-05, | |
| "loss": 0.4167, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 1.9557522123893807, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 5.7197217770724245e-05, | |
| "loss": 0.4032, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 1.9601769911504425, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 5.6765553013188766e-05, | |
| "loss": 0.408, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 1.9646017699115044, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 5.633487680439361e-05, | |
| "loss": 0.4142, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 1.9690265486725664, | |
| "grad_norm": 0.419921875, | |
| "learning_rate": 5.5905198991681695e-05, | |
| "loss": 0.4044, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.9734513274336283, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 5.547652939956764e-05, | |
| "loss": 0.3855, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 1.9778761061946901, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 5.5048877829513424e-05, | |
| "loss": 0.3921, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 1.9823008849557522, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 5.462225405970401e-05, | |
| "loss": 0.4108, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 1.9867256637168142, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 5.419666784482398e-05, | |
| "loss": 0.427, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 1.991150442477876, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 5.3772128915834184e-05, | |
| "loss": 0.404, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.995575221238938, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 5.3348646979749687e-05, | |
| "loss": 0.4006, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.47265625, | |
| "learning_rate": 5.29262317194174e-05, | |
| "loss": 0.4206, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.4563676416873932, | |
| "eval_runtime": 5.7045, | |
| "eval_samples_per_second": 146.901, | |
| "eval_steps_per_second": 18.406, | |
| "step": 452 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 678, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.948804377680282e+16, | |
| "train_batch_size": 60, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |