diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6840 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 29.957001102535834, + "eval_steps": 500, + "global_step": 20400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.030871003307607496, + "grad_norm": 0.7940054535865784, + "learning_rate": 9.990686274509804e-06, + "loss": 3.2723, + "step": 21 + }, + { + "epoch": 0.06174200661521499, + "grad_norm": 0.320931613445282, + "learning_rate": 9.980392156862746e-06, + "loss": 2.3933, + "step": 42 + }, + { + "epoch": 0.0926130099228225, + "grad_norm": 0.36235514283180237, + "learning_rate": 9.970098039215688e-06, + "loss": 2.2741, + "step": 63 + }, + { + "epoch": 0.12348401323042998, + "grad_norm": 0.33555838465690613, + "learning_rate": 9.959803921568629e-06, + "loss": 2.3189, + "step": 84 + }, + { + "epoch": 0.1543550165380375, + "grad_norm": 0.4002174139022827, + "learning_rate": 9.94950980392157e-06, + "loss": 2.3926, + "step": 105 + }, + { + "epoch": 0.185226019845645, + "grad_norm": 0.4205632507801056, + "learning_rate": 9.93921568627451e-06, + "loss": 2.4199, + "step": 126 + }, + { + "epoch": 0.2160970231532525, + "grad_norm": 0.4258420467376709, + "learning_rate": 9.928921568627453e-06, + "loss": 2.2021, + "step": 147 + }, + { + "epoch": 0.24696802646085997, + "grad_norm": 0.45753806829452515, + "learning_rate": 9.918627450980393e-06, + "loss": 2.4466, + "step": 168 + }, + { + "epoch": 0.27783902976846747, + "grad_norm": 0.46691378951072693, + "learning_rate": 9.908333333333335e-06, + "loss": 2.3134, + "step": 189 + }, + { + "epoch": 0.308710033076075, + "grad_norm": 0.32314497232437134, + "learning_rate": 9.898039215686275e-06, + "loss": 2.3165, + "step": 210 + }, + { + "epoch": 0.3395810363836825, + "grad_norm": 0.6973955035209656, + "learning_rate": 9.887745098039217e-06, + "loss": 2.3962, + "step": 231 + }, + { + "epoch": 0.37045203969129, + "grad_norm": 0.3546930253505707, + "learning_rate": 9.877450980392159e-06, + "loss": 2.413, + "step": 252 + }, + { + "epoch": 0.4013230429988975, + "grad_norm": 0.5109633207321167, + "learning_rate": 9.867156862745099e-06, + "loss": 2.292, + "step": 273 + }, + { + "epoch": 0.432194046306505, + "grad_norm": 0.5256220698356628, + "learning_rate": 9.856862745098041e-06, + "loss": 2.285, + "step": 294 + }, + { + "epoch": 0.46306504961411243, + "grad_norm": 0.6690633296966553, + "learning_rate": 9.846568627450981e-06, + "loss": 2.3489, + "step": 315 + }, + { + "epoch": 0.49393605292171994, + "grad_norm": 0.6162332892417908, + "learning_rate": 9.836274509803923e-06, + "loss": 2.3883, + "step": 336 + }, + { + "epoch": 0.5248070562293274, + "grad_norm": 1.9315634965896606, + "learning_rate": 9.825980392156863e-06, + "loss": 2.4159, + "step": 357 + }, + { + "epoch": 0.5556780595369349, + "grad_norm": 0.5455235838890076, + "learning_rate": 9.815686274509805e-06, + "loss": 2.2694, + "step": 378 + }, + { + "epoch": 0.5865490628445424, + "grad_norm": 0.6161318421363831, + "learning_rate": 9.805392156862747e-06, + "loss": 2.4137, + "step": 399 + }, + { + "epoch": 0.61742006615215, + "grad_norm": 0.5634300112724304, + "learning_rate": 9.795098039215687e-06, + "loss": 2.376, + "step": 420 + }, + { + "epoch": 0.6482910694597575, + "grad_norm": 0.9691163897514343, + "learning_rate": 9.78480392156863e-06, + "loss": 2.2671, + "step": 441 + }, + { + "epoch": 0.679162072767365, + "grad_norm": 0.7895241379737854, + "learning_rate": 9.77450980392157e-06, + "loss": 2.3867, + "step": 462 + }, + { + "epoch": 0.7100330760749725, + "grad_norm": 0.6919421553611755, + "learning_rate": 9.764215686274512e-06, + "loss": 2.3087, + "step": 483 + }, + { + "epoch": 0.74090407938258, + "grad_norm": 0.6956549882888794, + "learning_rate": 9.753921568627452e-06, + "loss": 2.343, + "step": 504 + }, + { + "epoch": 0.7717750826901875, + "grad_norm": 0.7738545536994934, + "learning_rate": 9.743627450980394e-06, + "loss": 2.3509, + "step": 525 + }, + { + "epoch": 0.802646085997795, + "grad_norm": 1.1013460159301758, + "learning_rate": 9.733333333333334e-06, + "loss": 2.1828, + "step": 546 + }, + { + "epoch": 0.8335170893054025, + "grad_norm": 1.4819823503494263, + "learning_rate": 9.723039215686276e-06, + "loss": 2.4565, + "step": 567 + }, + { + "epoch": 0.86438809261301, + "grad_norm": 0.46090105175971985, + "learning_rate": 9.712745098039216e-06, + "loss": 2.2541, + "step": 588 + }, + { + "epoch": 0.8952590959206174, + "grad_norm": 0.7013310790061951, + "learning_rate": 9.702450980392158e-06, + "loss": 2.2848, + "step": 609 + }, + { + "epoch": 0.9261300992282249, + "grad_norm": 1.0015063285827637, + "learning_rate": 9.69264705882353e-06, + "loss": 2.2413, + "step": 630 + }, + { + "epoch": 0.9570011025358324, + "grad_norm": 0.8772223591804504, + "learning_rate": 9.682352941176471e-06, + "loss": 2.4909, + "step": 651 + }, + { + "epoch": 0.9878721058434399, + "grad_norm": 1.0654124021530151, + "learning_rate": 9.672058823529413e-06, + "loss": 2.375, + "step": 672 + }, + { + "epoch": 1.017640573318633, + "grad_norm": 1.1609431505203247, + "learning_rate": 9.661764705882353e-06, + "loss": 2.2078, + "step": 693 + }, + { + "epoch": 1.0485115766262403, + "grad_norm": 1.7942206859588623, + "learning_rate": 9.651470588235295e-06, + "loss": 2.3145, + "step": 714 + }, + { + "epoch": 1.079382579933848, + "grad_norm": 1.071953296661377, + "learning_rate": 9.641176470588235e-06, + "loss": 2.3541, + "step": 735 + }, + { + "epoch": 1.1102535832414553, + "grad_norm": 1.6100263595581055, + "learning_rate": 9.630882352941177e-06, + "loss": 2.2648, + "step": 756 + }, + { + "epoch": 1.141124586549063, + "grad_norm": 0.9563078880310059, + "learning_rate": 9.620588235294117e-06, + "loss": 2.3255, + "step": 777 + }, + { + "epoch": 1.1719955898566703, + "grad_norm": 0.9889941215515137, + "learning_rate": 9.61029411764706e-06, + "loss": 2.2734, + "step": 798 + }, + { + "epoch": 1.202866593164278, + "grad_norm": 2.077514886856079, + "learning_rate": 9.600000000000001e-06, + "loss": 2.3709, + "step": 819 + }, + { + "epoch": 1.2337375964718853, + "grad_norm": 0.9159446358680725, + "learning_rate": 9.589705882352941e-06, + "loss": 2.2387, + "step": 840 + }, + { + "epoch": 1.264608599779493, + "grad_norm": 1.461916208267212, + "learning_rate": 9.579411764705883e-06, + "loss": 2.2484, + "step": 861 + }, + { + "epoch": 1.2954796030871003, + "grad_norm": 1.568862795829773, + "learning_rate": 9.569117647058824e-06, + "loss": 2.1579, + "step": 882 + }, + { + "epoch": 1.326350606394708, + "grad_norm": 1.566645622253418, + "learning_rate": 9.558823529411766e-06, + "loss": 2.2645, + "step": 903 + }, + { + "epoch": 1.3572216097023153, + "grad_norm": 1.0653067827224731, + "learning_rate": 9.548529411764706e-06, + "loss": 2.3066, + "step": 924 + }, + { + "epoch": 1.3880926130099227, + "grad_norm": 1.7910544872283936, + "learning_rate": 9.538235294117648e-06, + "loss": 2.4084, + "step": 945 + }, + { + "epoch": 1.4189636163175303, + "grad_norm": 0.7429267764091492, + "learning_rate": 9.527941176470588e-06, + "loss": 2.2003, + "step": 966 + }, + { + "epoch": 1.449834619625138, + "grad_norm": 0.8549733757972717, + "learning_rate": 9.51764705882353e-06, + "loss": 2.3083, + "step": 987 + }, + { + "epoch": 1.4807056229327453, + "grad_norm": 1.5903714895248413, + "learning_rate": 9.50735294117647e-06, + "loss": 2.3847, + "step": 1008 + }, + { + "epoch": 1.5115766262403527, + "grad_norm": 1.0798513889312744, + "learning_rate": 9.497058823529412e-06, + "loss": 2.3322, + "step": 1029 + }, + { + "epoch": 1.5424476295479603, + "grad_norm": 1.6621721982955933, + "learning_rate": 9.486764705882354e-06, + "loss": 2.2987, + "step": 1050 + }, + { + "epoch": 1.573318632855568, + "grad_norm": 1.218017339706421, + "learning_rate": 9.476470588235294e-06, + "loss": 2.3213, + "step": 1071 + }, + { + "epoch": 1.6041896361631753, + "grad_norm": 0.7075687050819397, + "learning_rate": 9.466176470588236e-06, + "loss": 2.2316, + "step": 1092 + }, + { + "epoch": 1.6350606394707827, + "grad_norm": 1.4596519470214844, + "learning_rate": 9.455882352941176e-06, + "loss": 2.2417, + "step": 1113 + }, + { + "epoch": 1.6659316427783903, + "grad_norm": 1.3288954496383667, + "learning_rate": 9.445588235294118e-06, + "loss": 2.4049, + "step": 1134 + }, + { + "epoch": 1.696802646085998, + "grad_norm": 1.7839045524597168, + "learning_rate": 9.435294117647058e-06, + "loss": 2.3342, + "step": 1155 + }, + { + "epoch": 1.7276736493936053, + "grad_norm": 1.3778377771377563, + "learning_rate": 9.425e-06, + "loss": 2.2813, + "step": 1176 + }, + { + "epoch": 1.7585446527012127, + "grad_norm": 1.093371033668518, + "learning_rate": 9.414705882352942e-06, + "loss": 2.3155, + "step": 1197 + }, + { + "epoch": 1.7894156560088201, + "grad_norm": 0.8924500346183777, + "learning_rate": 9.404411764705883e-06, + "loss": 2.2673, + "step": 1218 + }, + { + "epoch": 1.8202866593164277, + "grad_norm": 1.5816951990127563, + "learning_rate": 9.394117647058824e-06, + "loss": 2.3399, + "step": 1239 + }, + { + "epoch": 1.8511576626240354, + "grad_norm": 0.9977998733520508, + "learning_rate": 9.383823529411765e-06, + "loss": 2.3644, + "step": 1260 + }, + { + "epoch": 1.8820286659316428, + "grad_norm": 1.0349079370498657, + "learning_rate": 9.373529411764707e-06, + "loss": 2.2882, + "step": 1281 + }, + { + "epoch": 1.9128996692392501, + "grad_norm": 1.2130367755889893, + "learning_rate": 9.363235294117647e-06, + "loss": 2.3467, + "step": 1302 + }, + { + "epoch": 1.9437706725468578, + "grad_norm": 1.605616807937622, + "learning_rate": 9.352941176470589e-06, + "loss": 2.3506, + "step": 1323 + }, + { + "epoch": 1.9746416758544654, + "grad_norm": 1.339097499847412, + "learning_rate": 9.342647058823529e-06, + "loss": 2.2617, + "step": 1344 + }, + { + "epoch": 2.0044101433296584, + "grad_norm": 1.1007579565048218, + "learning_rate": 9.332352941176471e-06, + "loss": 2.1978, + "step": 1365 + }, + { + "epoch": 2.035281146637266, + "grad_norm": 1.3697082996368408, + "learning_rate": 9.322058823529413e-06, + "loss": 2.2258, + "step": 1386 + }, + { + "epoch": 2.066152149944873, + "grad_norm": 1.9182090759277344, + "learning_rate": 9.311764705882353e-06, + "loss": 2.3566, + "step": 1407 + }, + { + "epoch": 2.0970231532524806, + "grad_norm": 1.0698152780532837, + "learning_rate": 9.301470588235295e-06, + "loss": 2.1923, + "step": 1428 + }, + { + "epoch": 2.1278941565600884, + "grad_norm": 1.8497835397720337, + "learning_rate": 9.291176470588235e-06, + "loss": 2.2372, + "step": 1449 + }, + { + "epoch": 2.158765159867696, + "grad_norm": 1.3342701196670532, + "learning_rate": 9.280882352941177e-06, + "loss": 2.1423, + "step": 1470 + }, + { + "epoch": 2.189636163175303, + "grad_norm": 1.6012579202651978, + "learning_rate": 9.270588235294117e-06, + "loss": 2.2783, + "step": 1491 + }, + { + "epoch": 2.2205071664829106, + "grad_norm": 1.488791584968567, + "learning_rate": 9.26029411764706e-06, + "loss": 2.209, + "step": 1512 + }, + { + "epoch": 2.2513781697905184, + "grad_norm": 2.096177101135254, + "learning_rate": 9.250000000000001e-06, + "loss": 2.2021, + "step": 1533 + }, + { + "epoch": 2.282249173098126, + "grad_norm": 1.4596940279006958, + "learning_rate": 9.239705882352941e-06, + "loss": 2.3285, + "step": 1554 + }, + { + "epoch": 2.313120176405733, + "grad_norm": 1.4805257320404053, + "learning_rate": 9.229411764705883e-06, + "loss": 2.2665, + "step": 1575 + }, + { + "epoch": 2.3439911797133406, + "grad_norm": 1.9005323648452759, + "learning_rate": 9.219117647058824e-06, + "loss": 2.336, + "step": 1596 + }, + { + "epoch": 2.374862183020948, + "grad_norm": 1.7078076601028442, + "learning_rate": 9.208823529411766e-06, + "loss": 2.3066, + "step": 1617 + }, + { + "epoch": 2.405733186328556, + "grad_norm": 1.7854089736938477, + "learning_rate": 9.198529411764706e-06, + "loss": 2.2063, + "step": 1638 + }, + { + "epoch": 2.436604189636163, + "grad_norm": 1.8742430210113525, + "learning_rate": 9.188235294117648e-06, + "loss": 2.2503, + "step": 1659 + }, + { + "epoch": 2.4674751929437706, + "grad_norm": 2.9032325744628906, + "learning_rate": 9.17794117647059e-06, + "loss": 2.1626, + "step": 1680 + }, + { + "epoch": 2.498346196251378, + "grad_norm": 1.505924105644226, + "learning_rate": 9.16764705882353e-06, + "loss": 2.3102, + "step": 1701 + }, + { + "epoch": 2.529217199558986, + "grad_norm": 1.779142141342163, + "learning_rate": 9.157352941176472e-06, + "loss": 2.2732, + "step": 1722 + }, + { + "epoch": 2.560088202866593, + "grad_norm": 1.6594492197036743, + "learning_rate": 9.147058823529412e-06, + "loss": 2.1577, + "step": 1743 + }, + { + "epoch": 2.5909592061742006, + "grad_norm": 1.8729362487792969, + "learning_rate": 9.136764705882354e-06, + "loss": 2.2017, + "step": 1764 + }, + { + "epoch": 2.621830209481808, + "grad_norm": 2.1173110008239746, + "learning_rate": 9.126470588235294e-06, + "loss": 2.3207, + "step": 1785 + }, + { + "epoch": 2.652701212789416, + "grad_norm": 3.0417325496673584, + "learning_rate": 9.116176470588236e-06, + "loss": 2.2706, + "step": 1806 + }, + { + "epoch": 2.6835722160970232, + "grad_norm": 1.361302375793457, + "learning_rate": 9.105882352941178e-06, + "loss": 2.2701, + "step": 1827 + }, + { + "epoch": 2.7144432194046306, + "grad_norm": 1.9176990985870361, + "learning_rate": 9.095588235294118e-06, + "loss": 2.2957, + "step": 1848 + }, + { + "epoch": 2.745314222712238, + "grad_norm": 1.5876405239105225, + "learning_rate": 9.08529411764706e-06, + "loss": 2.389, + "step": 1869 + }, + { + "epoch": 2.7761852260198454, + "grad_norm": 1.3612051010131836, + "learning_rate": 9.075e-06, + "loss": 2.3024, + "step": 1890 + }, + { + "epoch": 2.8070562293274532, + "grad_norm": 2.0122861862182617, + "learning_rate": 9.064705882352942e-06, + "loss": 2.3609, + "step": 1911 + }, + { + "epoch": 2.8379272326350606, + "grad_norm": 1.4486873149871826, + "learning_rate": 9.054411764705883e-06, + "loss": 2.2706, + "step": 1932 + }, + { + "epoch": 2.868798235942668, + "grad_norm": 1.5788092613220215, + "learning_rate": 9.044117647058824e-06, + "loss": 2.1837, + "step": 1953 + }, + { + "epoch": 2.899669239250276, + "grad_norm": 1.0288219451904297, + "learning_rate": 9.033823529411766e-06, + "loss": 2.2825, + "step": 1974 + }, + { + "epoch": 2.9305402425578833, + "grad_norm": 1.6598355770111084, + "learning_rate": 9.023529411764707e-06, + "loss": 2.2774, + "step": 1995 + }, + { + "epoch": 2.9614112458654906, + "grad_norm": 1.5685467720031738, + "learning_rate": 9.013235294117649e-06, + "loss": 2.1978, + "step": 2016 + }, + { + "epoch": 2.992282249173098, + "grad_norm": 2.00368070602417, + "learning_rate": 9.002941176470589e-06, + "loss": 2.2677, + "step": 2037 + }, + { + "epoch": 3.022050716648291, + "grad_norm": 1.7239810228347778, + "learning_rate": 8.99264705882353e-06, + "loss": 2.1242, + "step": 2058 + }, + { + "epoch": 3.0529217199558984, + "grad_norm": 2.0060484409332275, + "learning_rate": 8.982352941176471e-06, + "loss": 2.2234, + "step": 2079 + }, + { + "epoch": 3.083792723263506, + "grad_norm": 2.2679460048675537, + "learning_rate": 8.972058823529413e-06, + "loss": 2.1935, + "step": 2100 + }, + { + "epoch": 3.1146637265711137, + "grad_norm": 2.3541860580444336, + "learning_rate": 8.961764705882353e-06, + "loss": 2.3283, + "step": 2121 + }, + { + "epoch": 3.145534729878721, + "grad_norm": 1.637447714805603, + "learning_rate": 8.951470588235295e-06, + "loss": 2.2094, + "step": 2142 + }, + { + "epoch": 3.1764057331863285, + "grad_norm": 2.7666993141174316, + "learning_rate": 8.941176470588237e-06, + "loss": 2.2272, + "step": 2163 + }, + { + "epoch": 3.207276736493936, + "grad_norm": 2.9229938983917236, + "learning_rate": 8.930882352941177e-06, + "loss": 2.2553, + "step": 2184 + }, + { + "epoch": 3.2381477398015437, + "grad_norm": 2.722456216812134, + "learning_rate": 8.920588235294119e-06, + "loss": 2.3223, + "step": 2205 + }, + { + "epoch": 3.269018743109151, + "grad_norm": 1.965293526649475, + "learning_rate": 8.91029411764706e-06, + "loss": 2.2935, + "step": 2226 + }, + { + "epoch": 3.2998897464167585, + "grad_norm": 1.7254505157470703, + "learning_rate": 8.900000000000001e-06, + "loss": 2.2514, + "step": 2247 + }, + { + "epoch": 3.330760749724366, + "grad_norm": 2.369128942489624, + "learning_rate": 8.889705882352941e-06, + "loss": 2.2861, + "step": 2268 + }, + { + "epoch": 3.3616317530319737, + "grad_norm": 2.42130184173584, + "learning_rate": 8.879411764705883e-06, + "loss": 2.343, + "step": 2289 + }, + { + "epoch": 3.392502756339581, + "grad_norm": 2.222886085510254, + "learning_rate": 8.869117647058825e-06, + "loss": 2.2132, + "step": 2310 + }, + { + "epoch": 3.4233737596471885, + "grad_norm": 2.613894462585449, + "learning_rate": 8.858823529411765e-06, + "loss": 2.2409, + "step": 2331 + }, + { + "epoch": 3.454244762954796, + "grad_norm": 2.4033560752868652, + "learning_rate": 8.848529411764707e-06, + "loss": 2.1564, + "step": 2352 + }, + { + "epoch": 3.4851157662624037, + "grad_norm": 1.3745297193527222, + "learning_rate": 8.838235294117648e-06, + "loss": 2.2523, + "step": 2373 + }, + { + "epoch": 3.515986769570011, + "grad_norm": 1.4414029121398926, + "learning_rate": 8.82794117647059e-06, + "loss": 2.1393, + "step": 2394 + }, + { + "epoch": 3.5468577728776185, + "grad_norm": 2.2887678146362305, + "learning_rate": 8.81764705882353e-06, + "loss": 2.2748, + "step": 2415 + }, + { + "epoch": 3.577728776185226, + "grad_norm": 1.8498324155807495, + "learning_rate": 8.807352941176472e-06, + "loss": 2.1742, + "step": 2436 + }, + { + "epoch": 3.6085997794928337, + "grad_norm": 3.1975903511047363, + "learning_rate": 8.797058823529414e-06, + "loss": 2.3019, + "step": 2457 + }, + { + "epoch": 3.639470782800441, + "grad_norm": 2.0401833057403564, + "learning_rate": 8.786764705882354e-06, + "loss": 2.2753, + "step": 2478 + }, + { + "epoch": 3.6703417861080485, + "grad_norm": 2.023527145385742, + "learning_rate": 8.776470588235296e-06, + "loss": 2.2265, + "step": 2499 + }, + { + "epoch": 3.701212789415656, + "grad_norm": 2.5720412731170654, + "learning_rate": 8.766176470588236e-06, + "loss": 2.2168, + "step": 2520 + }, + { + "epoch": 3.7320837927232633, + "grad_norm": 2.1886861324310303, + "learning_rate": 8.755882352941178e-06, + "loss": 2.2842, + "step": 2541 + }, + { + "epoch": 3.762954796030871, + "grad_norm": 1.7179489135742188, + "learning_rate": 8.745588235294118e-06, + "loss": 2.3208, + "step": 2562 + }, + { + "epoch": 3.7938257993384785, + "grad_norm": 2.382472038269043, + "learning_rate": 8.73529411764706e-06, + "loss": 2.2513, + "step": 2583 + }, + { + "epoch": 3.824696802646086, + "grad_norm": 2.7874560356140137, + "learning_rate": 8.725000000000002e-06, + "loss": 2.177, + "step": 2604 + }, + { + "epoch": 3.8555678059536937, + "grad_norm": 2.724936008453369, + "learning_rate": 8.714705882352942e-06, + "loss": 2.2365, + "step": 2625 + }, + { + "epoch": 3.886438809261301, + "grad_norm": 2.303502082824707, + "learning_rate": 8.704411764705884e-06, + "loss": 2.2006, + "step": 2646 + }, + { + "epoch": 3.9173098125689085, + "grad_norm": 2.214660406112671, + "learning_rate": 8.694117647058824e-06, + "loss": 2.2227, + "step": 2667 + }, + { + "epoch": 3.948180815876516, + "grad_norm": 3.070887804031372, + "learning_rate": 8.683823529411766e-06, + "loss": 2.1777, + "step": 2688 + }, + { + "epoch": 3.9790518191841233, + "grad_norm": 2.8172695636749268, + "learning_rate": 8.674019607843137e-06, + "loss": 2.3141, + "step": 2709 + }, + { + "epoch": 4.008820286659317, + "grad_norm": 2.524413585662842, + "learning_rate": 8.66372549019608e-06, + "loss": 2.1247, + "step": 2730 + }, + { + "epoch": 4.039691289966924, + "grad_norm": 2.7337212562561035, + "learning_rate": 8.653431372549021e-06, + "loss": 2.1418, + "step": 2751 + }, + { + "epoch": 4.070562293274532, + "grad_norm": 3.4945359230041504, + "learning_rate": 8.643137254901961e-06, + "loss": 2.1551, + "step": 2772 + }, + { + "epoch": 4.1014332965821385, + "grad_norm": 1.8705073595046997, + "learning_rate": 8.632843137254903e-06, + "loss": 2.1996, + "step": 2793 + }, + { + "epoch": 4.132304299889746, + "grad_norm": 2.1908445358276367, + "learning_rate": 8.622549019607844e-06, + "loss": 2.214, + "step": 2814 + }, + { + "epoch": 4.163175303197354, + "grad_norm": 2.8475587368011475, + "learning_rate": 8.612254901960786e-06, + "loss": 2.2809, + "step": 2835 + }, + { + "epoch": 4.194046306504961, + "grad_norm": 2.8551456928253174, + "learning_rate": 8.601960784313726e-06, + "loss": 2.2141, + "step": 2856 + }, + { + "epoch": 4.224917309812569, + "grad_norm": 2.450998544692993, + "learning_rate": 8.591666666666668e-06, + "loss": 2.3511, + "step": 2877 + }, + { + "epoch": 4.255788313120177, + "grad_norm": 2.4303412437438965, + "learning_rate": 8.581372549019608e-06, + "loss": 2.171, + "step": 2898 + }, + { + "epoch": 4.286659316427784, + "grad_norm": 1.9172242879867554, + "learning_rate": 8.57107843137255e-06, + "loss": 2.1682, + "step": 2919 + }, + { + "epoch": 4.317530319735392, + "grad_norm": 2.9315176010131836, + "learning_rate": 8.56078431372549e-06, + "loss": 2.3503, + "step": 2940 + }, + { + "epoch": 4.3484013230429985, + "grad_norm": 2.2080533504486084, + "learning_rate": 8.550490196078432e-06, + "loss": 2.1846, + "step": 2961 + }, + { + "epoch": 4.379272326350606, + "grad_norm": 2.6244797706604004, + "learning_rate": 8.540196078431374e-06, + "loss": 2.236, + "step": 2982 + }, + { + "epoch": 4.410143329658214, + "grad_norm": 2.147533655166626, + "learning_rate": 8.529901960784314e-06, + "loss": 2.2606, + "step": 3003 + }, + { + "epoch": 4.441014332965821, + "grad_norm": 1.7855578660964966, + "learning_rate": 8.519607843137256e-06, + "loss": 2.2133, + "step": 3024 + }, + { + "epoch": 4.471885336273429, + "grad_norm": 1.8556910753250122, + "learning_rate": 8.509313725490196e-06, + "loss": 2.0927, + "step": 3045 + }, + { + "epoch": 4.502756339581037, + "grad_norm": 2.6865439414978027, + "learning_rate": 8.499019607843138e-06, + "loss": 2.1611, + "step": 3066 + }, + { + "epoch": 4.533627342888644, + "grad_norm": 2.3935399055480957, + "learning_rate": 8.488725490196078e-06, + "loss": 2.2558, + "step": 3087 + }, + { + "epoch": 4.564498346196252, + "grad_norm": 2.4272029399871826, + "learning_rate": 8.47843137254902e-06, + "loss": 2.1133, + "step": 3108 + }, + { + "epoch": 4.595369349503859, + "grad_norm": 2.0114336013793945, + "learning_rate": 8.46813725490196e-06, + "loss": 2.2369, + "step": 3129 + }, + { + "epoch": 4.626240352811466, + "grad_norm": 2.586351156234741, + "learning_rate": 8.457843137254903e-06, + "loss": 2.4161, + "step": 3150 + }, + { + "epoch": 4.657111356119074, + "grad_norm": 2.777594804763794, + "learning_rate": 8.447549019607843e-06, + "loss": 2.2647, + "step": 3171 + }, + { + "epoch": 4.687982359426681, + "grad_norm": 2.0153422355651855, + "learning_rate": 8.437254901960785e-06, + "loss": 2.2976, + "step": 3192 + }, + { + "epoch": 4.718853362734289, + "grad_norm": 2.486306667327881, + "learning_rate": 8.426960784313727e-06, + "loss": 2.1799, + "step": 3213 + }, + { + "epoch": 4.749724366041896, + "grad_norm": 3.186861276626587, + "learning_rate": 8.416666666666667e-06, + "loss": 2.2084, + "step": 3234 + }, + { + "epoch": 4.780595369349504, + "grad_norm": 2.2380778789520264, + "learning_rate": 8.406372549019609e-06, + "loss": 2.0764, + "step": 3255 + }, + { + "epoch": 4.811466372657112, + "grad_norm": 3.3927717208862305, + "learning_rate": 8.396078431372549e-06, + "loss": 2.1141, + "step": 3276 + }, + { + "epoch": 4.842337375964719, + "grad_norm": 2.017042875289917, + "learning_rate": 8.385784313725491e-06, + "loss": 2.1955, + "step": 3297 + }, + { + "epoch": 4.873208379272326, + "grad_norm": 2.798187494277954, + "learning_rate": 8.375490196078431e-06, + "loss": 2.245, + "step": 3318 + }, + { + "epoch": 4.904079382579933, + "grad_norm": 2.209864377975464, + "learning_rate": 8.365196078431373e-06, + "loss": 2.1835, + "step": 3339 + }, + { + "epoch": 4.934950385887541, + "grad_norm": 2.4622409343719482, + "learning_rate": 8.354901960784313e-06, + "loss": 2.2274, + "step": 3360 + }, + { + "epoch": 4.965821389195149, + "grad_norm": 1.7298108339309692, + "learning_rate": 8.344607843137255e-06, + "loss": 2.1089, + "step": 3381 + }, + { + "epoch": 4.996692392502756, + "grad_norm": 3.1159820556640625, + "learning_rate": 8.334313725490195e-06, + "loss": 2.1574, + "step": 3402 + }, + { + "epoch": 5.0264608599779494, + "grad_norm": 1.9754016399383545, + "learning_rate": 8.324019607843137e-06, + "loss": 2.0488, + "step": 3423 + }, + { + "epoch": 5.057331863285556, + "grad_norm": 2.3963258266448975, + "learning_rate": 8.31372549019608e-06, + "loss": 2.2752, + "step": 3444 + }, + { + "epoch": 5.088202866593164, + "grad_norm": 3.8797078132629395, + "learning_rate": 8.30343137254902e-06, + "loss": 2.1056, + "step": 3465 + }, + { + "epoch": 5.119073869900772, + "grad_norm": 2.89670467376709, + "learning_rate": 8.293137254901961e-06, + "loss": 2.1916, + "step": 3486 + }, + { + "epoch": 5.149944873208379, + "grad_norm": 3.1157844066619873, + "learning_rate": 8.282843137254902e-06, + "loss": 2.1939, + "step": 3507 + }, + { + "epoch": 5.180815876515987, + "grad_norm": 3.1985909938812256, + "learning_rate": 8.272549019607844e-06, + "loss": 2.1605, + "step": 3528 + }, + { + "epoch": 5.211686879823595, + "grad_norm": 2.626016855239868, + "learning_rate": 8.262254901960784e-06, + "loss": 2.185, + "step": 3549 + }, + { + "epoch": 5.242557883131202, + "grad_norm": 2.8655667304992676, + "learning_rate": 8.251960784313726e-06, + "loss": 2.1658, + "step": 3570 + }, + { + "epoch": 5.2734288864388095, + "grad_norm": 2.5003795623779297, + "learning_rate": 8.241666666666668e-06, + "loss": 2.2202, + "step": 3591 + }, + { + "epoch": 5.304299889746416, + "grad_norm": 3.0103225708007812, + "learning_rate": 8.231372549019608e-06, + "loss": 2.251, + "step": 3612 + }, + { + "epoch": 5.335170893054024, + "grad_norm": 2.7656571865081787, + "learning_rate": 8.22107843137255e-06, + "loss": 2.1386, + "step": 3633 + }, + { + "epoch": 5.366041896361632, + "grad_norm": 2.1533167362213135, + "learning_rate": 8.21078431372549e-06, + "loss": 2.0554, + "step": 3654 + }, + { + "epoch": 5.396912899669239, + "grad_norm": 3.6254355907440186, + "learning_rate": 8.200490196078432e-06, + "loss": 2.1868, + "step": 3675 + }, + { + "epoch": 5.427783902976847, + "grad_norm": 3.0589587688446045, + "learning_rate": 8.190196078431372e-06, + "loss": 2.2171, + "step": 3696 + }, + { + "epoch": 5.458654906284455, + "grad_norm": 3.7360947132110596, + "learning_rate": 8.179901960784314e-06, + "loss": 2.191, + "step": 3717 + }, + { + "epoch": 5.489525909592062, + "grad_norm": 2.9356765747070312, + "learning_rate": 8.169607843137256e-06, + "loss": 2.0895, + "step": 3738 + }, + { + "epoch": 5.5203969128996695, + "grad_norm": 2.677701950073242, + "learning_rate": 8.159313725490196e-06, + "loss": 2.1751, + "step": 3759 + }, + { + "epoch": 5.551267916207276, + "grad_norm": 2.194253444671631, + "learning_rate": 8.149019607843138e-06, + "loss": 2.1054, + "step": 3780 + }, + { + "epoch": 5.582138919514884, + "grad_norm": 2.6590781211853027, + "learning_rate": 8.138725490196078e-06, + "loss": 2.1846, + "step": 3801 + }, + { + "epoch": 5.613009922822492, + "grad_norm": 2.1141388416290283, + "learning_rate": 8.12843137254902e-06, + "loss": 2.1489, + "step": 3822 + }, + { + "epoch": 5.643880926130099, + "grad_norm": 3.1729912757873535, + "learning_rate": 8.11813725490196e-06, + "loss": 2.098, + "step": 3843 + }, + { + "epoch": 5.674751929437707, + "grad_norm": 2.9490084648132324, + "learning_rate": 8.107843137254902e-06, + "loss": 2.1878, + "step": 3864 + }, + { + "epoch": 5.705622932745314, + "grad_norm": 1.9357086420059204, + "learning_rate": 8.097549019607844e-06, + "loss": 2.219, + "step": 3885 + }, + { + "epoch": 5.736493936052922, + "grad_norm": 2.848104476928711, + "learning_rate": 8.087254901960785e-06, + "loss": 2.2125, + "step": 3906 + }, + { + "epoch": 5.7673649393605295, + "grad_norm": 2.5720698833465576, + "learning_rate": 8.076960784313727e-06, + "loss": 2.1322, + "step": 3927 + }, + { + "epoch": 5.7982359426681365, + "grad_norm": 2.406402349472046, + "learning_rate": 8.066666666666667e-06, + "loss": 2.2639, + "step": 3948 + }, + { + "epoch": 5.829106945975744, + "grad_norm": 2.501682758331299, + "learning_rate": 8.056372549019609e-06, + "loss": 2.1916, + "step": 3969 + }, + { + "epoch": 5.859977949283351, + "grad_norm": 2.7590603828430176, + "learning_rate": 8.046078431372549e-06, + "loss": 2.2604, + "step": 3990 + }, + { + "epoch": 5.890848952590959, + "grad_norm": 3.130591630935669, + "learning_rate": 8.035784313725491e-06, + "loss": 2.1121, + "step": 4011 + }, + { + "epoch": 5.921719955898567, + "grad_norm": 2.693161725997925, + "learning_rate": 8.025490196078431e-06, + "loss": 2.2319, + "step": 4032 + }, + { + "epoch": 5.952590959206174, + "grad_norm": 3.0124995708465576, + "learning_rate": 8.015196078431373e-06, + "loss": 2.1797, + "step": 4053 + }, + { + "epoch": 5.983461962513782, + "grad_norm": 3.3224260807037354, + "learning_rate": 8.004901960784315e-06, + "loss": 2.2071, + "step": 4074 + }, + { + "epoch": 6.013230429988974, + "grad_norm": 2.305988073348999, + "learning_rate": 7.994607843137255e-06, + "loss": 2.0385, + "step": 4095 + }, + { + "epoch": 6.044101433296582, + "grad_norm": 2.905613660812378, + "learning_rate": 7.984313725490197e-06, + "loss": 2.194, + "step": 4116 + }, + { + "epoch": 6.07497243660419, + "grad_norm": 2.968784809112549, + "learning_rate": 7.974019607843137e-06, + "loss": 2.2794, + "step": 4137 + }, + { + "epoch": 6.105843439911797, + "grad_norm": 2.8147284984588623, + "learning_rate": 7.96372549019608e-06, + "loss": 2.1266, + "step": 4158 + }, + { + "epoch": 6.136714443219405, + "grad_norm": 2.993699073791504, + "learning_rate": 7.95343137254902e-06, + "loss": 2.1956, + "step": 4179 + }, + { + "epoch": 6.167585446527012, + "grad_norm": 1.8021845817565918, + "learning_rate": 7.943137254901961e-06, + "loss": 2.1361, + "step": 4200 + }, + { + "epoch": 6.1984564498346195, + "grad_norm": 2.0478949546813965, + "learning_rate": 7.932843137254903e-06, + "loss": 2.1633, + "step": 4221 + }, + { + "epoch": 6.229327453142227, + "grad_norm": 2.401198387145996, + "learning_rate": 7.922549019607844e-06, + "loss": 2.2218, + "step": 4242 + }, + { + "epoch": 6.260198456449834, + "grad_norm": 3.1930980682373047, + "learning_rate": 7.912254901960785e-06, + "loss": 2.2991, + "step": 4263 + }, + { + "epoch": 6.291069459757442, + "grad_norm": 2.734412908554077, + "learning_rate": 7.901960784313726e-06, + "loss": 2.0321, + "step": 4284 + }, + { + "epoch": 6.32194046306505, + "grad_norm": 1.8950814008712769, + "learning_rate": 7.891666666666668e-06, + "loss": 2.209, + "step": 4305 + }, + { + "epoch": 6.352811466372657, + "grad_norm": 2.232196807861328, + "learning_rate": 7.881372549019608e-06, + "loss": 2.1485, + "step": 4326 + }, + { + "epoch": 6.383682469680265, + "grad_norm": 2.378868579864502, + "learning_rate": 7.87107843137255e-06, + "loss": 2.1191, + "step": 4347 + }, + { + "epoch": 6.414553472987872, + "grad_norm": 2.716021776199341, + "learning_rate": 7.860784313725492e-06, + "loss": 2.1856, + "step": 4368 + }, + { + "epoch": 6.4454244762954795, + "grad_norm": 2.920015335083008, + "learning_rate": 7.850490196078432e-06, + "loss": 2.1965, + "step": 4389 + }, + { + "epoch": 6.476295479603087, + "grad_norm": 2.696876287460327, + "learning_rate": 7.840196078431374e-06, + "loss": 2.1233, + "step": 4410 + }, + { + "epoch": 6.507166482910694, + "grad_norm": 2.693169593811035, + "learning_rate": 7.829901960784314e-06, + "loss": 2.1467, + "step": 4431 + }, + { + "epoch": 6.538037486218302, + "grad_norm": 2.373929023742676, + "learning_rate": 7.819607843137256e-06, + "loss": 2.2374, + "step": 4452 + }, + { + "epoch": 6.56890848952591, + "grad_norm": 2.026304244995117, + "learning_rate": 7.809313725490196e-06, + "loss": 2.2376, + "step": 4473 + }, + { + "epoch": 6.599779492833517, + "grad_norm": 3.8536412715911865, + "learning_rate": 7.799019607843138e-06, + "loss": 2.2277, + "step": 4494 + }, + { + "epoch": 6.630650496141125, + "grad_norm": 3.2263355255126953, + "learning_rate": 7.78872549019608e-06, + "loss": 2.2015, + "step": 4515 + }, + { + "epoch": 6.661521499448732, + "grad_norm": 2.5766403675079346, + "learning_rate": 7.77843137254902e-06, + "loss": 2.1721, + "step": 4536 + }, + { + "epoch": 6.69239250275634, + "grad_norm": 2.6397998332977295, + "learning_rate": 7.768137254901962e-06, + "loss": 2.1238, + "step": 4557 + }, + { + "epoch": 6.723263506063947, + "grad_norm": 2.5694289207458496, + "learning_rate": 7.757843137254902e-06, + "loss": 2.2221, + "step": 4578 + }, + { + "epoch": 6.754134509371554, + "grad_norm": 2.568444013595581, + "learning_rate": 7.747549019607844e-06, + "loss": 2.1968, + "step": 4599 + }, + { + "epoch": 6.785005512679162, + "grad_norm": 3.3770861625671387, + "learning_rate": 7.737254901960785e-06, + "loss": 2.2389, + "step": 4620 + }, + { + "epoch": 6.815876515986769, + "grad_norm": 2.6761832237243652, + "learning_rate": 7.726960784313727e-06, + "loss": 2.1979, + "step": 4641 + }, + { + "epoch": 6.846747519294377, + "grad_norm": 3.2881479263305664, + "learning_rate": 7.716666666666667e-06, + "loss": 2.1772, + "step": 4662 + }, + { + "epoch": 6.877618522601985, + "grad_norm": 2.812613010406494, + "learning_rate": 7.706372549019609e-06, + "loss": 2.2137, + "step": 4683 + }, + { + "epoch": 6.908489525909592, + "grad_norm": 2.3768069744110107, + "learning_rate": 7.69607843137255e-06, + "loss": 2.165, + "step": 4704 + }, + { + "epoch": 6.9393605292172, + "grad_norm": 2.822144031524658, + "learning_rate": 7.68578431372549e-06, + "loss": 2.173, + "step": 4725 + }, + { + "epoch": 6.970231532524807, + "grad_norm": 2.994647979736328, + "learning_rate": 7.675490196078433e-06, + "loss": 2.2263, + "step": 4746 + }, + { + "epoch": 7.0, + "grad_norm": 0.8841678500175476, + "learning_rate": 7.665196078431373e-06, + "loss": 1.9977, + "step": 4767 + }, + { + "epoch": 7.030871003307608, + "grad_norm": 2.560551166534424, + "learning_rate": 7.654901960784315e-06, + "loss": 2.2829, + "step": 4788 + }, + { + "epoch": 7.061742006615215, + "grad_norm": 2.5852670669555664, + "learning_rate": 7.644607843137255e-06, + "loss": 2.1318, + "step": 4809 + }, + { + "epoch": 7.092613009922823, + "grad_norm": 3.1351985931396484, + "learning_rate": 7.634313725490197e-06, + "loss": 2.0308, + "step": 4830 + }, + { + "epoch": 7.12348401323043, + "grad_norm": 2.3059654235839844, + "learning_rate": 7.624019607843138e-06, + "loss": 2.0593, + "step": 4851 + }, + { + "epoch": 7.154355016538037, + "grad_norm": 2.390005111694336, + "learning_rate": 7.613725490196079e-06, + "loss": 2.2439, + "step": 4872 + }, + { + "epoch": 7.185226019845645, + "grad_norm": 2.5935399532318115, + "learning_rate": 7.603921568627451e-06, + "loss": 2.2061, + "step": 4893 + }, + { + "epoch": 7.216097023153252, + "grad_norm": 2.6763274669647217, + "learning_rate": 7.593627450980393e-06, + "loss": 2.2092, + "step": 4914 + }, + { + "epoch": 7.24696802646086, + "grad_norm": 2.63897967338562, + "learning_rate": 7.583333333333333e-06, + "loss": 2.1148, + "step": 4935 + }, + { + "epoch": 7.277839029768468, + "grad_norm": 2.247389078140259, + "learning_rate": 7.573039215686275e-06, + "loss": 2.1387, + "step": 4956 + }, + { + "epoch": 7.308710033076075, + "grad_norm": 3.124016046524048, + "learning_rate": 7.5627450980392154e-06, + "loss": 2.1628, + "step": 4977 + }, + { + "epoch": 7.339581036383683, + "grad_norm": 3.1294453144073486, + "learning_rate": 7.552450980392157e-06, + "loss": 2.0332, + "step": 4998 + }, + { + "epoch": 7.37045203969129, + "grad_norm": 3.3883707523345947, + "learning_rate": 7.542156862745099e-06, + "loss": 2.1588, + "step": 5019 + }, + { + "epoch": 7.401323042998897, + "grad_norm": 2.930655002593994, + "learning_rate": 7.5318627450980395e-06, + "loss": 2.1299, + "step": 5040 + }, + { + "epoch": 7.432194046306505, + "grad_norm": 2.963660478591919, + "learning_rate": 7.5215686274509814e-06, + "loss": 2.0596, + "step": 5061 + }, + { + "epoch": 7.463065049614112, + "grad_norm": 2.7922561168670654, + "learning_rate": 7.511274509803922e-06, + "loss": 2.0655, + "step": 5082 + }, + { + "epoch": 7.49393605292172, + "grad_norm": 2.352179765701294, + "learning_rate": 7.500980392156864e-06, + "loss": 2.1341, + "step": 5103 + }, + { + "epoch": 7.524807056229328, + "grad_norm": 3.8016703128814697, + "learning_rate": 7.490686274509804e-06, + "loss": 2.1156, + "step": 5124 + }, + { + "epoch": 7.555678059536935, + "grad_norm": 3.2462334632873535, + "learning_rate": 7.480392156862746e-06, + "loss": 2.25, + "step": 5145 + }, + { + "epoch": 7.586549062844543, + "grad_norm": 2.0880022048950195, + "learning_rate": 7.470098039215688e-06, + "loss": 2.2028, + "step": 5166 + }, + { + "epoch": 7.61742006615215, + "grad_norm": 2.5556790828704834, + "learning_rate": 7.459803921568628e-06, + "loss": 2.0374, + "step": 5187 + }, + { + "epoch": 7.6482910694597575, + "grad_norm": 3.2562601566314697, + "learning_rate": 7.44950980392157e-06, + "loss": 2.131, + "step": 5208 + }, + { + "epoch": 7.679162072767365, + "grad_norm": 2.942624807357788, + "learning_rate": 7.43921568627451e-06, + "loss": 2.1297, + "step": 5229 + }, + { + "epoch": 7.710033076074972, + "grad_norm": 2.845376968383789, + "learning_rate": 7.428921568627452e-06, + "loss": 2.1408, + "step": 5250 + }, + { + "epoch": 7.74090407938258, + "grad_norm": 2.4782392978668213, + "learning_rate": 7.418627450980392e-06, + "loss": 2.1182, + "step": 5271 + }, + { + "epoch": 7.771775082690187, + "grad_norm": 4.134006023406982, + "learning_rate": 7.408333333333334e-06, + "loss": 2.1222, + "step": 5292 + }, + { + "epoch": 7.802646085997795, + "grad_norm": 2.537743330001831, + "learning_rate": 7.398039215686274e-06, + "loss": 2.0851, + "step": 5313 + }, + { + "epoch": 7.833517089305403, + "grad_norm": 3.0612194538116455, + "learning_rate": 7.387745098039216e-06, + "loss": 2.2203, + "step": 5334 + }, + { + "epoch": 7.86438809261301, + "grad_norm": 2.5010664463043213, + "learning_rate": 7.377450980392158e-06, + "loss": 2.0808, + "step": 5355 + }, + { + "epoch": 7.8952590959206175, + "grad_norm": 2.4548556804656982, + "learning_rate": 7.367156862745098e-06, + "loss": 2.1164, + "step": 5376 + }, + { + "epoch": 7.926130099228224, + "grad_norm": 3.158270835876465, + "learning_rate": 7.35686274509804e-06, + "loss": 2.2378, + "step": 5397 + }, + { + "epoch": 7.957001102535832, + "grad_norm": 3.6559364795684814, + "learning_rate": 7.3465686274509806e-06, + "loss": 2.1605, + "step": 5418 + }, + { + "epoch": 7.98787210584344, + "grad_norm": 3.131807804107666, + "learning_rate": 7.3362745098039225e-06, + "loss": 2.1603, + "step": 5439 + }, + { + "epoch": 8.017640573318634, + "grad_norm": 2.334476947784424, + "learning_rate": 7.325980392156863e-06, + "loss": 2.0486, + "step": 5460 + }, + { + "epoch": 8.04851157662624, + "grad_norm": 3.2329330444335938, + "learning_rate": 7.315686274509805e-06, + "loss": 2.1824, + "step": 5481 + }, + { + "epoch": 8.079382579933847, + "grad_norm": 3.517030715942383, + "learning_rate": 7.3053921568627466e-06, + "loss": 2.1195, + "step": 5502 + }, + { + "epoch": 8.110253583241455, + "grad_norm": 2.7786850929260254, + "learning_rate": 7.295098039215687e-06, + "loss": 2.1134, + "step": 5523 + }, + { + "epoch": 8.141124586549063, + "grad_norm": 2.952907085418701, + "learning_rate": 7.284803921568629e-06, + "loss": 2.1647, + "step": 5544 + }, + { + "epoch": 8.171995589856671, + "grad_norm": 3.045341968536377, + "learning_rate": 7.274509803921569e-06, + "loss": 2.0899, + "step": 5565 + }, + { + "epoch": 8.202866593164277, + "grad_norm": 2.412834882736206, + "learning_rate": 7.264215686274511e-06, + "loss": 2.1993, + "step": 5586 + }, + { + "epoch": 8.233737596471885, + "grad_norm": 2.3728556632995605, + "learning_rate": 7.253921568627451e-06, + "loss": 2.1946, + "step": 5607 + }, + { + "epoch": 8.264608599779493, + "grad_norm": 3.4004909992218018, + "learning_rate": 7.243627450980393e-06, + "loss": 2.1066, + "step": 5628 + }, + { + "epoch": 8.2954796030871, + "grad_norm": 3.052344560623169, + "learning_rate": 7.233333333333334e-06, + "loss": 2.1316, + "step": 5649 + }, + { + "epoch": 8.326350606394708, + "grad_norm": 3.6270687580108643, + "learning_rate": 7.223039215686275e-06, + "loss": 2.1767, + "step": 5670 + }, + { + "epoch": 8.357221609702314, + "grad_norm": 2.74826717376709, + "learning_rate": 7.212745098039216e-06, + "loss": 2.0877, + "step": 5691 + }, + { + "epoch": 8.388092613009922, + "grad_norm": 2.5722084045410156, + "learning_rate": 7.202450980392157e-06, + "loss": 1.9884, + "step": 5712 + }, + { + "epoch": 8.41896361631753, + "grad_norm": 3.1915769577026367, + "learning_rate": 7.192156862745099e-06, + "loss": 2.0973, + "step": 5733 + }, + { + "epoch": 8.449834619625138, + "grad_norm": 3.181138277053833, + "learning_rate": 7.1818627450980395e-06, + "loss": 2.1478, + "step": 5754 + }, + { + "epoch": 8.480705622932746, + "grad_norm": 2.320770502090454, + "learning_rate": 7.171568627450981e-06, + "loss": 2.2276, + "step": 5775 + }, + { + "epoch": 8.511576626240354, + "grad_norm": 3.598928928375244, + "learning_rate": 7.1612745098039225e-06, + "loss": 2.2547, + "step": 5796 + }, + { + "epoch": 8.54244762954796, + "grad_norm": 2.7235798835754395, + "learning_rate": 7.1509803921568636e-06, + "loss": 2.2114, + "step": 5817 + }, + { + "epoch": 8.573318632855568, + "grad_norm": 3.4046123027801514, + "learning_rate": 7.140686274509805e-06, + "loss": 2.0938, + "step": 5838 + }, + { + "epoch": 8.604189636163175, + "grad_norm": 3.1639325618743896, + "learning_rate": 7.130392156862746e-06, + "loss": 2.1363, + "step": 5859 + }, + { + "epoch": 8.635060639470783, + "grad_norm": 3.2247660160064697, + "learning_rate": 7.120098039215687e-06, + "loss": 2.2053, + "step": 5880 + }, + { + "epoch": 8.665931642778391, + "grad_norm": 3.5176591873168945, + "learning_rate": 7.109803921568628e-06, + "loss": 2.1747, + "step": 5901 + }, + { + "epoch": 8.696802646085997, + "grad_norm": 2.0369083881378174, + "learning_rate": 7.099509803921569e-06, + "loss": 2.0933, + "step": 5922 + }, + { + "epoch": 8.727673649393605, + "grad_norm": 4.293984413146973, + "learning_rate": 7.08921568627451e-06, + "loss": 2.2752, + "step": 5943 + }, + { + "epoch": 8.758544652701213, + "grad_norm": 3.280562162399292, + "learning_rate": 7.078921568627451e-06, + "loss": 2.2512, + "step": 5964 + }, + { + "epoch": 8.78941565600882, + "grad_norm": 3.1418211460113525, + "learning_rate": 7.068627450980393e-06, + "loss": 2.0804, + "step": 5985 + }, + { + "epoch": 8.820286659316428, + "grad_norm": 3.2922585010528564, + "learning_rate": 7.058333333333334e-06, + "loss": 2.1726, + "step": 6006 + }, + { + "epoch": 8.851157662624034, + "grad_norm": 3.8439033031463623, + "learning_rate": 7.048039215686275e-06, + "loss": 2.0561, + "step": 6027 + }, + { + "epoch": 8.882028665931642, + "grad_norm": 2.2356507778167725, + "learning_rate": 7.037745098039216e-06, + "loss": 2.0462, + "step": 6048 + }, + { + "epoch": 8.91289966923925, + "grad_norm": 3.2431116104125977, + "learning_rate": 7.027450980392157e-06, + "loss": 2.1178, + "step": 6069 + }, + { + "epoch": 8.943770672546858, + "grad_norm": 3.199249267578125, + "learning_rate": 7.017156862745098e-06, + "loss": 2.0562, + "step": 6090 + }, + { + "epoch": 8.974641675854466, + "grad_norm": 2.0143613815307617, + "learning_rate": 7.0068627450980395e-06, + "loss": 2.2118, + "step": 6111 + }, + { + "epoch": 9.004410143329658, + "grad_norm": 2.6825101375579834, + "learning_rate": 6.996568627450981e-06, + "loss": 2.0181, + "step": 6132 + }, + { + "epoch": 9.035281146637265, + "grad_norm": 3.3078560829162598, + "learning_rate": 6.986274509803922e-06, + "loss": 2.1425, + "step": 6153 + }, + { + "epoch": 9.066152149944873, + "grad_norm": 3.204214334487915, + "learning_rate": 6.9759803921568635e-06, + "loss": 2.0554, + "step": 6174 + }, + { + "epoch": 9.097023153252481, + "grad_norm": 2.2293057441711426, + "learning_rate": 6.965686274509804e-06, + "loss": 2.0393, + "step": 6195 + }, + { + "epoch": 9.127894156560089, + "grad_norm": 2.959235429763794, + "learning_rate": 6.955392156862746e-06, + "loss": 2.0141, + "step": 6216 + }, + { + "epoch": 9.158765159867695, + "grad_norm": 3.337249755859375, + "learning_rate": 6.945098039215687e-06, + "loss": 2.1386, + "step": 6237 + }, + { + "epoch": 9.189636163175303, + "grad_norm": 3.5160326957702637, + "learning_rate": 6.934803921568628e-06, + "loss": 2.0975, + "step": 6258 + }, + { + "epoch": 9.22050716648291, + "grad_norm": 3.6730356216430664, + "learning_rate": 6.92450980392157e-06, + "loss": 2.0231, + "step": 6279 + }, + { + "epoch": 9.251378169790518, + "grad_norm": 2.872706890106201, + "learning_rate": 6.91421568627451e-06, + "loss": 2.2048, + "step": 6300 + }, + { + "epoch": 9.282249173098126, + "grad_norm": 3.1090550422668457, + "learning_rate": 6.903921568627452e-06, + "loss": 2.1596, + "step": 6321 + }, + { + "epoch": 9.313120176405732, + "grad_norm": 2.391374349594116, + "learning_rate": 6.893627450980392e-06, + "loss": 2.0706, + "step": 6342 + }, + { + "epoch": 9.34399117971334, + "grad_norm": 3.1433513164520264, + "learning_rate": 6.883333333333334e-06, + "loss": 2.2176, + "step": 6363 + }, + { + "epoch": 9.374862183020948, + "grad_norm": 2.974668264389038, + "learning_rate": 6.873039215686274e-06, + "loss": 2.1334, + "step": 6384 + }, + { + "epoch": 9.405733186328556, + "grad_norm": 2.1751785278320312, + "learning_rate": 6.862745098039216e-06, + "loss": 2.1712, + "step": 6405 + }, + { + "epoch": 9.436604189636164, + "grad_norm": 3.2119059562683105, + "learning_rate": 6.852450980392158e-06, + "loss": 2.0204, + "step": 6426 + }, + { + "epoch": 9.467475192943771, + "grad_norm": 2.9304966926574707, + "learning_rate": 6.842156862745098e-06, + "loss": 2.0646, + "step": 6447 + }, + { + "epoch": 9.498346196251378, + "grad_norm": 3.7001969814300537, + "learning_rate": 6.83186274509804e-06, + "loss": 2.2128, + "step": 6468 + }, + { + "epoch": 9.529217199558985, + "grad_norm": 2.783215284347534, + "learning_rate": 6.8215686274509805e-06, + "loss": 2.1408, + "step": 6489 + }, + { + "epoch": 9.560088202866593, + "grad_norm": 2.5412495136260986, + "learning_rate": 6.8112745098039224e-06, + "loss": 2.109, + "step": 6510 + }, + { + "epoch": 9.590959206174201, + "grad_norm": 2.7052578926086426, + "learning_rate": 6.800980392156863e-06, + "loss": 2.0865, + "step": 6531 + }, + { + "epoch": 9.621830209481809, + "grad_norm": 3.288067579269409, + "learning_rate": 6.790686274509805e-06, + "loss": 2.0056, + "step": 6552 + }, + { + "epoch": 9.652701212789415, + "grad_norm": 3.3594720363616943, + "learning_rate": 6.780392156862745e-06, + "loss": 2.1256, + "step": 6573 + }, + { + "epoch": 9.683572216097023, + "grad_norm": 4.163586616516113, + "learning_rate": 6.770098039215687e-06, + "loss": 2.1335, + "step": 6594 + }, + { + "epoch": 9.71444321940463, + "grad_norm": 2.5263030529022217, + "learning_rate": 6.759803921568629e-06, + "loss": 2.0532, + "step": 6615 + }, + { + "epoch": 9.745314222712238, + "grad_norm": 2.9361979961395264, + "learning_rate": 6.749509803921569e-06, + "loss": 2.114, + "step": 6636 + }, + { + "epoch": 9.776185226019846, + "grad_norm": 2.5235047340393066, + "learning_rate": 6.739215686274511e-06, + "loss": 2.1004, + "step": 6657 + }, + { + "epoch": 9.807056229327452, + "grad_norm": 3.3822638988494873, + "learning_rate": 6.728921568627451e-06, + "loss": 2.1218, + "step": 6678 + }, + { + "epoch": 9.83792723263506, + "grad_norm": 2.4693849086761475, + "learning_rate": 6.718627450980393e-06, + "loss": 2.0892, + "step": 6699 + }, + { + "epoch": 9.868798235942668, + "grad_norm": 2.9233691692352295, + "learning_rate": 6.708333333333333e-06, + "loss": 2.1292, + "step": 6720 + }, + { + "epoch": 9.899669239250276, + "grad_norm": 3.697094678878784, + "learning_rate": 6.698039215686275e-06, + "loss": 2.21, + "step": 6741 + }, + { + "epoch": 9.930540242557884, + "grad_norm": 3.1898422241210938, + "learning_rate": 6.687745098039217e-06, + "loss": 2.1406, + "step": 6762 + }, + { + "epoch": 9.961411245865492, + "grad_norm": 2.3322906494140625, + "learning_rate": 6.677450980392157e-06, + "loss": 2.0774, + "step": 6783 + }, + { + "epoch": 9.992282249173098, + "grad_norm": 3.2816247940063477, + "learning_rate": 6.667156862745099e-06, + "loss": 2.2821, + "step": 6804 + }, + { + "epoch": 10.022050716648291, + "grad_norm": 3.6437644958496094, + "learning_rate": 6.656862745098039e-06, + "loss": 1.9058, + "step": 6825 + }, + { + "epoch": 10.052921719955899, + "grad_norm": 4.289977550506592, + "learning_rate": 6.646568627450981e-06, + "loss": 2.155, + "step": 6846 + }, + { + "epoch": 10.083792723263507, + "grad_norm": 1.590816855430603, + "learning_rate": 6.6362745098039216e-06, + "loss": 1.956, + "step": 6867 + }, + { + "epoch": 10.114663726571113, + "grad_norm": 3.7391345500946045, + "learning_rate": 6.6259803921568635e-06, + "loss": 2.0601, + "step": 6888 + }, + { + "epoch": 10.14553472987872, + "grad_norm": 3.7508692741394043, + "learning_rate": 6.6156862745098046e-06, + "loss": 2.1054, + "step": 6909 + }, + { + "epoch": 10.176405733186328, + "grad_norm": 2.5421996116638184, + "learning_rate": 6.605392156862746e-06, + "loss": 2.2101, + "step": 6930 + }, + { + "epoch": 10.207276736493936, + "grad_norm": 2.4397692680358887, + "learning_rate": 6.595098039215687e-06, + "loss": 2.0759, + "step": 6951 + }, + { + "epoch": 10.238147739801544, + "grad_norm": 4.160530090332031, + "learning_rate": 6.584803921568628e-06, + "loss": 2.0361, + "step": 6972 + }, + { + "epoch": 10.26901874310915, + "grad_norm": 3.957812547683716, + "learning_rate": 6.57450980392157e-06, + "loss": 2.0717, + "step": 6993 + }, + { + "epoch": 10.299889746416758, + "grad_norm": 3.6107029914855957, + "learning_rate": 6.56421568627451e-06, + "loss": 2.143, + "step": 7014 + }, + { + "epoch": 10.330760749724366, + "grad_norm": 3.4517884254455566, + "learning_rate": 6.553921568627452e-06, + "loss": 2.1439, + "step": 7035 + }, + { + "epoch": 10.361631753031974, + "grad_norm": 2.700695753097534, + "learning_rate": 6.543627450980393e-06, + "loss": 2.0712, + "step": 7056 + }, + { + "epoch": 10.392502756339582, + "grad_norm": 2.3187098503112793, + "learning_rate": 6.533333333333334e-06, + "loss": 2.1131, + "step": 7077 + }, + { + "epoch": 10.42337375964719, + "grad_norm": 3.6724188327789307, + "learning_rate": 6.523529411764706e-06, + "loss": 2.0648, + "step": 7098 + }, + { + "epoch": 10.454244762954795, + "grad_norm": 3.3507304191589355, + "learning_rate": 6.513235294117648e-06, + "loss": 2.064, + "step": 7119 + }, + { + "epoch": 10.485115766262403, + "grad_norm": 2.6285359859466553, + "learning_rate": 6.502941176470589e-06, + "loss": 2.1908, + "step": 7140 + }, + { + "epoch": 10.515986769570011, + "grad_norm": 2.9801242351531982, + "learning_rate": 6.49264705882353e-06, + "loss": 2.1629, + "step": 7161 + }, + { + "epoch": 10.546857772877619, + "grad_norm": 3.342548370361328, + "learning_rate": 6.482352941176471e-06, + "loss": 2.1269, + "step": 7182 + }, + { + "epoch": 10.577728776185227, + "grad_norm": 3.217926025390625, + "learning_rate": 6.472058823529412e-06, + "loss": 2.1063, + "step": 7203 + }, + { + "epoch": 10.608599779492833, + "grad_norm": 4.631300449371338, + "learning_rate": 6.461764705882353e-06, + "loss": 2.09, + "step": 7224 + }, + { + "epoch": 10.63947078280044, + "grad_norm": 3.2653615474700928, + "learning_rate": 6.451470588235294e-06, + "loss": 2.1174, + "step": 7245 + }, + { + "epoch": 10.670341786108049, + "grad_norm": 3.6463816165924072, + "learning_rate": 6.441176470588236e-06, + "loss": 2.0675, + "step": 7266 + }, + { + "epoch": 10.701212789415656, + "grad_norm": 3.0588767528533936, + "learning_rate": 6.4308823529411765e-06, + "loss": 2.1324, + "step": 7287 + }, + { + "epoch": 10.732083792723264, + "grad_norm": 2.32501220703125, + "learning_rate": 6.420588235294118e-06, + "loss": 2.0737, + "step": 7308 + }, + { + "epoch": 10.76295479603087, + "grad_norm": 3.140713691711426, + "learning_rate": 6.410294117647059e-06, + "loss": 2.2039, + "step": 7329 + }, + { + "epoch": 10.793825799338478, + "grad_norm": 2.824019193649292, + "learning_rate": 6.4000000000000006e-06, + "loss": 2.0973, + "step": 7350 + }, + { + "epoch": 10.824696802646086, + "grad_norm": 3.9583792686462402, + "learning_rate": 6.389705882352941e-06, + "loss": 2.0894, + "step": 7371 + }, + { + "epoch": 10.855567805953694, + "grad_norm": 3.186405897140503, + "learning_rate": 6.379411764705883e-06, + "loss": 2.139, + "step": 7392 + }, + { + "epoch": 10.886438809261302, + "grad_norm": 2.8080101013183594, + "learning_rate": 6.369117647058825e-06, + "loss": 2.0255, + "step": 7413 + }, + { + "epoch": 10.91730981256891, + "grad_norm": 2.799795389175415, + "learning_rate": 6.358823529411765e-06, + "loss": 2.0883, + "step": 7434 + }, + { + "epoch": 10.948180815876515, + "grad_norm": 3.2571089267730713, + "learning_rate": 6.348529411764707e-06, + "loss": 2.1766, + "step": 7455 + }, + { + "epoch": 10.979051819184123, + "grad_norm": 3.5721755027770996, + "learning_rate": 6.338235294117647e-06, + "loss": 2.1196, + "step": 7476 + }, + { + "epoch": 11.008820286659317, + "grad_norm": 3.4969334602355957, + "learning_rate": 6.327941176470589e-06, + "loss": 1.967, + "step": 7497 + }, + { + "epoch": 11.039691289966925, + "grad_norm": 2.619086980819702, + "learning_rate": 6.317647058823529e-06, + "loss": 1.9869, + "step": 7518 + }, + { + "epoch": 11.07056229327453, + "grad_norm": 2.7135913372039795, + "learning_rate": 6.307352941176471e-06, + "loss": 2.0601, + "step": 7539 + }, + { + "epoch": 11.101433296582139, + "grad_norm": 2.352649450302124, + "learning_rate": 6.297058823529413e-06, + "loss": 2.1449, + "step": 7560 + }, + { + "epoch": 11.132304299889746, + "grad_norm": 3.9685213565826416, + "learning_rate": 6.286764705882353e-06, + "loss": 1.949, + "step": 7581 + }, + { + "epoch": 11.163175303197354, + "grad_norm": 2.865730047225952, + "learning_rate": 6.276470588235295e-06, + "loss": 2.0938, + "step": 7602 + }, + { + "epoch": 11.194046306504962, + "grad_norm": 2.6979782581329346, + "learning_rate": 6.266176470588235e-06, + "loss": 2.0482, + "step": 7623 + }, + { + "epoch": 11.224917309812568, + "grad_norm": 2.2605934143066406, + "learning_rate": 6.255882352941177e-06, + "loss": 2.1628, + "step": 7644 + }, + { + "epoch": 11.255788313120176, + "grad_norm": 3.395565986633301, + "learning_rate": 6.2455882352941175e-06, + "loss": 2.0532, + "step": 7665 + }, + { + "epoch": 11.286659316427784, + "grad_norm": 2.142057180404663, + "learning_rate": 6.2352941176470595e-06, + "loss": 2.0898, + "step": 7686 + }, + { + "epoch": 11.317530319735392, + "grad_norm": 3.811500310897827, + "learning_rate": 6.225000000000001e-06, + "loss": 2.0959, + "step": 7707 + }, + { + "epoch": 11.348401323043, + "grad_norm": 4.222660541534424, + "learning_rate": 6.214705882352942e-06, + "loss": 2.0377, + "step": 7728 + }, + { + "epoch": 11.379272326350605, + "grad_norm": 2.5871293544769287, + "learning_rate": 6.2044117647058835e-06, + "loss": 2.1134, + "step": 7749 + }, + { + "epoch": 11.410143329658213, + "grad_norm": 3.4915037155151367, + "learning_rate": 6.194117647058824e-06, + "loss": 2.166, + "step": 7770 + }, + { + "epoch": 11.441014332965821, + "grad_norm": 4.213758945465088, + "learning_rate": 6.183823529411766e-06, + "loss": 2.3109, + "step": 7791 + }, + { + "epoch": 11.471885336273429, + "grad_norm": 5.042109489440918, + "learning_rate": 6.173529411764706e-06, + "loss": 2.0819, + "step": 7812 + }, + { + "epoch": 11.502756339581037, + "grad_norm": 3.9552133083343506, + "learning_rate": 6.163235294117648e-06, + "loss": 1.9804, + "step": 7833 + }, + { + "epoch": 11.533627342888645, + "grad_norm": 4.496159553527832, + "learning_rate": 6.152941176470588e-06, + "loss": 2.1712, + "step": 7854 + }, + { + "epoch": 11.56449834619625, + "grad_norm": 2.321765422821045, + "learning_rate": 6.14264705882353e-06, + "loss": 2.0532, + "step": 7875 + }, + { + "epoch": 11.595369349503859, + "grad_norm": 3.6235439777374268, + "learning_rate": 6.132352941176472e-06, + "loss": 2.0967, + "step": 7896 + }, + { + "epoch": 11.626240352811466, + "grad_norm": 2.5655317306518555, + "learning_rate": 6.122058823529412e-06, + "loss": 2.0876, + "step": 7917 + }, + { + "epoch": 11.657111356119074, + "grad_norm": 5.025815486907959, + "learning_rate": 6.111764705882354e-06, + "loss": 2.0861, + "step": 7938 + }, + { + "epoch": 11.687982359426682, + "grad_norm": 3.1800806522369385, + "learning_rate": 6.101470588235294e-06, + "loss": 2.141, + "step": 7959 + }, + { + "epoch": 11.718853362734288, + "grad_norm": 4.988901138305664, + "learning_rate": 6.091176470588236e-06, + "loss": 2.0665, + "step": 7980 + }, + { + "epoch": 11.749724366041896, + "grad_norm": 4.063960552215576, + "learning_rate": 6.0808823529411764e-06, + "loss": 2.0464, + "step": 8001 + }, + { + "epoch": 11.780595369349504, + "grad_norm": 3.9658567905426025, + "learning_rate": 6.070588235294118e-06, + "loss": 2.2195, + "step": 8022 + }, + { + "epoch": 11.811466372657112, + "grad_norm": 2.566220760345459, + "learning_rate": 6.0602941176470594e-06, + "loss": 2.1139, + "step": 8043 + }, + { + "epoch": 11.84233737596472, + "grad_norm": 3.7702178955078125, + "learning_rate": 6.0500000000000005e-06, + "loss": 2.2112, + "step": 8064 + }, + { + "epoch": 11.873208379272326, + "grad_norm": 2.5686612129211426, + "learning_rate": 6.039705882352942e-06, + "loss": 2.1437, + "step": 8085 + }, + { + "epoch": 11.904079382579933, + "grad_norm": 2.8515195846557617, + "learning_rate": 6.029411764705883e-06, + "loss": 2.1262, + "step": 8106 + }, + { + "epoch": 11.934950385887541, + "grad_norm": 3.2659997940063477, + "learning_rate": 6.019117647058825e-06, + "loss": 2.0747, + "step": 8127 + }, + { + "epoch": 11.965821389195149, + "grad_norm": 2.8110339641571045, + "learning_rate": 6.008823529411765e-06, + "loss": 2.0746, + "step": 8148 + }, + { + "epoch": 11.996692392502757, + "grad_norm": 3.7431135177612305, + "learning_rate": 5.998529411764707e-06, + "loss": 2.0912, + "step": 8169 + }, + { + "epoch": 12.026460859977949, + "grad_norm": 4.557000637054443, + "learning_rate": 5.988235294117648e-06, + "loss": 2.1271, + "step": 8190 + }, + { + "epoch": 12.057331863285556, + "grad_norm": 2.926790475845337, + "learning_rate": 5.977941176470589e-06, + "loss": 2.0433, + "step": 8211 + }, + { + "epoch": 12.088202866593164, + "grad_norm": 2.975024938583374, + "learning_rate": 5.96764705882353e-06, + "loss": 2.0768, + "step": 8232 + }, + { + "epoch": 12.119073869900772, + "grad_norm": 3.581433057785034, + "learning_rate": 5.957352941176471e-06, + "loss": 2.0376, + "step": 8253 + }, + { + "epoch": 12.14994487320838, + "grad_norm": 5.864632606506348, + "learning_rate": 5.947058823529412e-06, + "loss": 2.146, + "step": 8274 + }, + { + "epoch": 12.180815876515986, + "grad_norm": 2.731471300125122, + "learning_rate": 5.936764705882353e-06, + "loss": 2.0287, + "step": 8295 + }, + { + "epoch": 12.211686879823594, + "grad_norm": 3.1022236347198486, + "learning_rate": 5.926470588235294e-06, + "loss": 2.131, + "step": 8316 + }, + { + "epoch": 12.242557883131202, + "grad_norm": 2.6037495136260986, + "learning_rate": 5.916176470588236e-06, + "loss": 2.0926, + "step": 8337 + }, + { + "epoch": 12.27342888643881, + "grad_norm": 2.68418550491333, + "learning_rate": 5.9058823529411764e-06, + "loss": 2.1563, + "step": 8358 + }, + { + "epoch": 12.304299889746417, + "grad_norm": 3.6252169609069824, + "learning_rate": 5.895588235294118e-06, + "loss": 2.1229, + "step": 8379 + }, + { + "epoch": 12.335170893054023, + "grad_norm": 2.7223503589630127, + "learning_rate": 5.8852941176470594e-06, + "loss": 2.1574, + "step": 8400 + }, + { + "epoch": 12.366041896361631, + "grad_norm": 3.2684574127197266, + "learning_rate": 5.8750000000000005e-06, + "loss": 2.0381, + "step": 8421 + }, + { + "epoch": 12.396912899669239, + "grad_norm": 2.6794166564941406, + "learning_rate": 5.864705882352942e-06, + "loss": 2.0885, + "step": 8442 + }, + { + "epoch": 12.427783902976847, + "grad_norm": 4.735394477844238, + "learning_rate": 5.854411764705883e-06, + "loss": 2.0897, + "step": 8463 + }, + { + "epoch": 12.458654906284455, + "grad_norm": 2.711524724960327, + "learning_rate": 5.8441176470588246e-06, + "loss": 2.1041, + "step": 8484 + }, + { + "epoch": 12.489525909592063, + "grad_norm": 3.3792128562927246, + "learning_rate": 5.833823529411765e-06, + "loss": 2.0796, + "step": 8505 + }, + { + "epoch": 12.520396912899669, + "grad_norm": 3.296825647354126, + "learning_rate": 5.823529411764707e-06, + "loss": 2.0124, + "step": 8526 + }, + { + "epoch": 12.551267916207276, + "grad_norm": 2.5053694248199463, + "learning_rate": 5.813235294117647e-06, + "loss": 1.9827, + "step": 8547 + }, + { + "epoch": 12.582138919514884, + "grad_norm": 2.5156023502349854, + "learning_rate": 5.802941176470589e-06, + "loss": 2.1209, + "step": 8568 + }, + { + "epoch": 12.613009922822492, + "grad_norm": 3.3136746883392334, + "learning_rate": 5.792647058823529e-06, + "loss": 2.0295, + "step": 8589 + }, + { + "epoch": 12.6438809261301, + "grad_norm": 4.127097129821777, + "learning_rate": 5.782352941176471e-06, + "loss": 2.0824, + "step": 8610 + }, + { + "epoch": 12.674751929437706, + "grad_norm": 2.4340720176696777, + "learning_rate": 5.772058823529412e-06, + "loss": 2.0821, + "step": 8631 + }, + { + "epoch": 12.705622932745314, + "grad_norm": 2.7903730869293213, + "learning_rate": 5.761764705882353e-06, + "loss": 1.9539, + "step": 8652 + }, + { + "epoch": 12.736493936052922, + "grad_norm": 3.9927330017089844, + "learning_rate": 5.751470588235295e-06, + "loss": 2.1137, + "step": 8673 + }, + { + "epoch": 12.76736493936053, + "grad_norm": 4.364806175231934, + "learning_rate": 5.741176470588235e-06, + "loss": 2.0973, + "step": 8694 + }, + { + "epoch": 12.798235942668137, + "grad_norm": 2.8431894779205322, + "learning_rate": 5.730882352941177e-06, + "loss": 2.0947, + "step": 8715 + }, + { + "epoch": 12.829106945975743, + "grad_norm": 4.133395671844482, + "learning_rate": 5.7205882352941175e-06, + "loss": 2.0374, + "step": 8736 + }, + { + "epoch": 12.859977949283351, + "grad_norm": 2.9242682456970215, + "learning_rate": 5.710294117647059e-06, + "loss": 2.1083, + "step": 8757 + }, + { + "epoch": 12.890848952590959, + "grad_norm": 4.007552146911621, + "learning_rate": 5.7e-06, + "loss": 2.1067, + "step": 8778 + }, + { + "epoch": 12.921719955898567, + "grad_norm": 3.1161675453186035, + "learning_rate": 5.6897058823529416e-06, + "loss": 2.0318, + "step": 8799 + }, + { + "epoch": 12.952590959206175, + "grad_norm": 3.6268036365509033, + "learning_rate": 5.6794117647058835e-06, + "loss": 2.0485, + "step": 8820 + }, + { + "epoch": 12.983461962513783, + "grad_norm": 4.202246189117432, + "learning_rate": 5.669117647058824e-06, + "loss": 2.0769, + "step": 8841 + }, + { + "epoch": 13.013230429988974, + "grad_norm": 2.679133653640747, + "learning_rate": 5.658823529411766e-06, + "loss": 2.1386, + "step": 8862 + }, + { + "epoch": 13.044101433296582, + "grad_norm": 2.7983431816101074, + "learning_rate": 5.648529411764706e-06, + "loss": 1.9623, + "step": 8883 + }, + { + "epoch": 13.07497243660419, + "grad_norm": 3.559689998626709, + "learning_rate": 5.638235294117648e-06, + "loss": 2.0616, + "step": 8904 + }, + { + "epoch": 13.105843439911798, + "grad_norm": 3.0902254581451416, + "learning_rate": 5.627941176470588e-06, + "loss": 2.0896, + "step": 8925 + }, + { + "epoch": 13.136714443219404, + "grad_norm": 2.4725775718688965, + "learning_rate": 5.61764705882353e-06, + "loss": 2.0846, + "step": 8946 + }, + { + "epoch": 13.167585446527012, + "grad_norm": 3.484569787979126, + "learning_rate": 5.607352941176472e-06, + "loss": 2.0743, + "step": 8967 + }, + { + "epoch": 13.19845644983462, + "grad_norm": 3.184817314147949, + "learning_rate": 5.597058823529412e-06, + "loss": 2.0133, + "step": 8988 + }, + { + "epoch": 13.229327453142227, + "grad_norm": 4.28677225112915, + "learning_rate": 5.586764705882354e-06, + "loss": 2.1234, + "step": 9009 + }, + { + "epoch": 13.260198456449835, + "grad_norm": 3.0605459213256836, + "learning_rate": 5.576470588235294e-06, + "loss": 2.083, + "step": 9030 + }, + { + "epoch": 13.291069459757441, + "grad_norm": 3.3841278553009033, + "learning_rate": 5.566176470588236e-06, + "loss": 2.1344, + "step": 9051 + }, + { + "epoch": 13.321940463065049, + "grad_norm": 2.9158005714416504, + "learning_rate": 5.555882352941176e-06, + "loss": 2.0873, + "step": 9072 + }, + { + "epoch": 13.352811466372657, + "grad_norm": 2.881852626800537, + "learning_rate": 5.545588235294118e-06, + "loss": 1.9675, + "step": 9093 + }, + { + "epoch": 13.383682469680265, + "grad_norm": 3.3890089988708496, + "learning_rate": 5.53529411764706e-06, + "loss": 2.1642, + "step": 9114 + }, + { + "epoch": 13.414553472987873, + "grad_norm": 3.3301126956939697, + "learning_rate": 5.5250000000000005e-06, + "loss": 2.0901, + "step": 9135 + }, + { + "epoch": 13.44542447629548, + "grad_norm": 3.285712957382202, + "learning_rate": 5.515196078431373e-06, + "loss": 1.9407, + "step": 9156 + }, + { + "epoch": 13.476295479603086, + "grad_norm": 2.797427177429199, + "learning_rate": 5.504901960784314e-06, + "loss": 2.1603, + "step": 9177 + }, + { + "epoch": 13.507166482910694, + "grad_norm": 2.386242151260376, + "learning_rate": 5.494607843137255e-06, + "loss": 2.0334, + "step": 9198 + }, + { + "epoch": 13.538037486218302, + "grad_norm": 2.7402749061584473, + "learning_rate": 5.4843137254901965e-06, + "loss": 2.1055, + "step": 9219 + }, + { + "epoch": 13.56890848952591, + "grad_norm": 3.5467629432678223, + "learning_rate": 5.4740196078431375e-06, + "loss": 2.0922, + "step": 9240 + }, + { + "epoch": 13.599779492833518, + "grad_norm": 4.005125999450684, + "learning_rate": 5.463725490196079e-06, + "loss": 2.0181, + "step": 9261 + }, + { + "epoch": 13.630650496141124, + "grad_norm": 2.6311771869659424, + "learning_rate": 5.45343137254902e-06, + "loss": 2.0758, + "step": 9282 + }, + { + "epoch": 13.661521499448732, + "grad_norm": 3.188577651977539, + "learning_rate": 5.443137254901962e-06, + "loss": 2.0835, + "step": 9303 + }, + { + "epoch": 13.69239250275634, + "grad_norm": 3.115765333175659, + "learning_rate": 5.432843137254903e-06, + "loss": 2.1527, + "step": 9324 + }, + { + "epoch": 13.723263506063947, + "grad_norm": 2.4878807067871094, + "learning_rate": 5.422549019607844e-06, + "loss": 2.0135, + "step": 9345 + }, + { + "epoch": 13.754134509371555, + "grad_norm": 1.9076848030090332, + "learning_rate": 5.412254901960785e-06, + "loss": 2.0666, + "step": 9366 + }, + { + "epoch": 13.785005512679161, + "grad_norm": 3.4870707988739014, + "learning_rate": 5.401960784313726e-06, + "loss": 2.1315, + "step": 9387 + }, + { + "epoch": 13.81587651598677, + "grad_norm": 2.885655403137207, + "learning_rate": 5.391666666666667e-06, + "loss": 1.9702, + "step": 9408 + }, + { + "epoch": 13.846747519294377, + "grad_norm": 3.8822875022888184, + "learning_rate": 5.381372549019608e-06, + "loss": 2.0141, + "step": 9429 + }, + { + "epoch": 13.877618522601985, + "grad_norm": 2.470426559448242, + "learning_rate": 5.371078431372549e-06, + "loss": 2.1559, + "step": 9450 + }, + { + "epoch": 13.908489525909593, + "grad_norm": 3.060715436935425, + "learning_rate": 5.360784313725491e-06, + "loss": 2.1225, + "step": 9471 + }, + { + "epoch": 13.9393605292172, + "grad_norm": 2.9717185497283936, + "learning_rate": 5.350490196078431e-06, + "loss": 2.1496, + "step": 9492 + }, + { + "epoch": 13.970231532524807, + "grad_norm": 3.471766471862793, + "learning_rate": 5.340196078431373e-06, + "loss": 2.1459, + "step": 9513 + }, + { + "epoch": 14.0, + "grad_norm": 1.8087966442108154, + "learning_rate": 5.329901960784314e-06, + "loss": 1.9732, + "step": 9534 + }, + { + "epoch": 14.030871003307608, + "grad_norm": 2.219148635864258, + "learning_rate": 5.319607843137255e-06, + "loss": 2.066, + "step": 9555 + }, + { + "epoch": 14.061742006615216, + "grad_norm": 2.4805564880371094, + "learning_rate": 5.3093137254901964e-06, + "loss": 2.055, + "step": 9576 + }, + { + "epoch": 14.092613009922822, + "grad_norm": 3.4554660320281982, + "learning_rate": 5.2990196078431375e-06, + "loss": 2.0085, + "step": 9597 + }, + { + "epoch": 14.12348401323043, + "grad_norm": 3.6150786876678467, + "learning_rate": 5.2887254901960794e-06, + "loss": 2.1289, + "step": 9618 + }, + { + "epoch": 14.154355016538037, + "grad_norm": 3.061882495880127, + "learning_rate": 5.27843137254902e-06, + "loss": 2.0114, + "step": 9639 + }, + { + "epoch": 14.185226019845645, + "grad_norm": 3.2985293865203857, + "learning_rate": 5.268137254901962e-06, + "loss": 2.1027, + "step": 9660 + }, + { + "epoch": 14.216097023153253, + "grad_norm": 3.460374116897583, + "learning_rate": 5.257843137254902e-06, + "loss": 2.1815, + "step": 9681 + }, + { + "epoch": 14.24696802646086, + "grad_norm": 2.6231353282928467, + "learning_rate": 5.247549019607844e-06, + "loss": 2.1611, + "step": 9702 + }, + { + "epoch": 14.277839029768467, + "grad_norm": 2.7419650554656982, + "learning_rate": 5.237254901960784e-06, + "loss": 2.069, + "step": 9723 + }, + { + "epoch": 14.308710033076075, + "grad_norm": 2.1007187366485596, + "learning_rate": 5.226960784313726e-06, + "loss": 1.9542, + "step": 9744 + }, + { + "epoch": 14.339581036383683, + "grad_norm": 2.824641227722168, + "learning_rate": 5.216666666666666e-06, + "loss": 2.1204, + "step": 9765 + }, + { + "epoch": 14.37045203969129, + "grad_norm": 3.708585739135742, + "learning_rate": 5.206372549019608e-06, + "loss": 2.0792, + "step": 9786 + }, + { + "epoch": 14.401323042998898, + "grad_norm": 3.4671897888183594, + "learning_rate": 5.19607843137255e-06, + "loss": 2.0354, + "step": 9807 + }, + { + "epoch": 14.432194046306504, + "grad_norm": 3.824810743331909, + "learning_rate": 5.18578431372549e-06, + "loss": 1.9401, + "step": 9828 + }, + { + "epoch": 14.463065049614112, + "grad_norm": 2.3937902450561523, + "learning_rate": 5.175490196078432e-06, + "loss": 2.0049, + "step": 9849 + }, + { + "epoch": 14.49393605292172, + "grad_norm": 3.4717845916748047, + "learning_rate": 5.165196078431372e-06, + "loss": 2.111, + "step": 9870 + }, + { + "epoch": 14.524807056229328, + "grad_norm": 3.0910346508026123, + "learning_rate": 5.154901960784314e-06, + "loss": 2.1328, + "step": 9891 + }, + { + "epoch": 14.555678059536936, + "grad_norm": 2.9736809730529785, + "learning_rate": 5.1446078431372545e-06, + "loss": 1.9529, + "step": 9912 + }, + { + "epoch": 14.586549062844542, + "grad_norm": 2.9420766830444336, + "learning_rate": 5.1343137254901964e-06, + "loss": 2.0487, + "step": 9933 + }, + { + "epoch": 14.61742006615215, + "grad_norm": 3.0031661987304688, + "learning_rate": 5.124019607843138e-06, + "loss": 2.0438, + "step": 9954 + }, + { + "epoch": 14.648291069459757, + "grad_norm": 2.0756826400756836, + "learning_rate": 5.113725490196079e-06, + "loss": 2.0878, + "step": 9975 + }, + { + "epoch": 14.679162072767365, + "grad_norm": 3.5476233959198, + "learning_rate": 5.1034313725490205e-06, + "loss": 2.0462, + "step": 9996 + }, + { + "epoch": 14.710033076074973, + "grad_norm": 3.7489500045776367, + "learning_rate": 5.093137254901961e-06, + "loss": 1.9793, + "step": 10017 + }, + { + "epoch": 14.74090407938258, + "grad_norm": 2.612318992614746, + "learning_rate": 5.082843137254903e-06, + "loss": 1.9353, + "step": 10038 + }, + { + "epoch": 14.771775082690187, + "grad_norm": 3.1954147815704346, + "learning_rate": 5.072549019607843e-06, + "loss": 2.1059, + "step": 10059 + }, + { + "epoch": 14.802646085997795, + "grad_norm": 2.8227596282958984, + "learning_rate": 5.062254901960785e-06, + "loss": 2.0501, + "step": 10080 + }, + { + "epoch": 14.833517089305403, + "grad_norm": 3.4035403728485107, + "learning_rate": 5.051960784313727e-06, + "loss": 2.0993, + "step": 10101 + }, + { + "epoch": 14.86438809261301, + "grad_norm": 2.661450147628784, + "learning_rate": 5.041666666666667e-06, + "loss": 1.9717, + "step": 10122 + }, + { + "epoch": 14.895259095920617, + "grad_norm": 3.4387428760528564, + "learning_rate": 5.031372549019609e-06, + "loss": 2.082, + "step": 10143 + }, + { + "epoch": 14.926130099228224, + "grad_norm": 3.7169981002807617, + "learning_rate": 5.021078431372549e-06, + "loss": 2.1306, + "step": 10164 + }, + { + "epoch": 14.957001102535832, + "grad_norm": 4.3067946434021, + "learning_rate": 5.010784313725491e-06, + "loss": 2.1733, + "step": 10185 + }, + { + "epoch": 14.98787210584344, + "grad_norm": 3.562462329864502, + "learning_rate": 5.000490196078431e-06, + "loss": 2.1481, + "step": 10206 + }, + { + "epoch": 15.017640573318634, + "grad_norm": 4.963381290435791, + "learning_rate": 4.990196078431373e-06, + "loss": 2.0355, + "step": 10227 + }, + { + "epoch": 15.04851157662624, + "grad_norm": 3.029024600982666, + "learning_rate": 4.979901960784314e-06, + "loss": 2.1029, + "step": 10248 + }, + { + "epoch": 15.079382579933847, + "grad_norm": 3.5499942302703857, + "learning_rate": 4.969607843137255e-06, + "loss": 2.1633, + "step": 10269 + }, + { + "epoch": 15.110253583241455, + "grad_norm": 3.7054245471954346, + "learning_rate": 4.959313725490196e-06, + "loss": 2.1461, + "step": 10290 + }, + { + "epoch": 15.141124586549063, + "grad_norm": 3.2250783443450928, + "learning_rate": 4.9490196078431375e-06, + "loss": 2.0355, + "step": 10311 + }, + { + "epoch": 15.171995589856671, + "grad_norm": 2.0336270332336426, + "learning_rate": 4.938725490196079e-06, + "loss": 2.0813, + "step": 10332 + }, + { + "epoch": 15.202866593164277, + "grad_norm": 3.070176839828491, + "learning_rate": 4.9284313725490205e-06, + "loss": 2.1598, + "step": 10353 + }, + { + "epoch": 15.233737596471885, + "grad_norm": 4.083742141723633, + "learning_rate": 4.9181372549019616e-06, + "loss": 2.1571, + "step": 10374 + }, + { + "epoch": 15.264608599779493, + "grad_norm": 2.932722568511963, + "learning_rate": 4.907843137254903e-06, + "loss": 2.0541, + "step": 10395 + }, + { + "epoch": 15.2954796030871, + "grad_norm": 4.044142723083496, + "learning_rate": 4.897549019607844e-06, + "loss": 2.0635, + "step": 10416 + }, + { + "epoch": 15.326350606394708, + "grad_norm": 2.9298267364501953, + "learning_rate": 4.887254901960785e-06, + "loss": 1.9621, + "step": 10437 + }, + { + "epoch": 15.357221609702314, + "grad_norm": 3.0545711517333984, + "learning_rate": 4.876960784313726e-06, + "loss": 2.117, + "step": 10458 + }, + { + "epoch": 15.388092613009922, + "grad_norm": 4.692001819610596, + "learning_rate": 4.866666666666667e-06, + "loss": 2.0084, + "step": 10479 + }, + { + "epoch": 15.41896361631753, + "grad_norm": 2.91191029548645, + "learning_rate": 4.856372549019608e-06, + "loss": 2.062, + "step": 10500 + }, + { + "epoch": 15.449834619625138, + "grad_norm": 2.914384603500366, + "learning_rate": 4.84607843137255e-06, + "loss": 2.0613, + "step": 10521 + }, + { + "epoch": 15.480705622932746, + "grad_norm": 2.4917314052581787, + "learning_rate": 4.835784313725491e-06, + "loss": 1.9963, + "step": 10542 + }, + { + "epoch": 15.511576626240354, + "grad_norm": 4.361894130706787, + "learning_rate": 4.825490196078432e-06, + "loss": 2.0153, + "step": 10563 + }, + { + "epoch": 15.54244762954796, + "grad_norm": 2.501007556915283, + "learning_rate": 4.815196078431373e-06, + "loss": 2.1068, + "step": 10584 + }, + { + "epoch": 15.573318632855568, + "grad_norm": 3.864947557449341, + "learning_rate": 4.804901960784314e-06, + "loss": 1.9808, + "step": 10605 + }, + { + "epoch": 15.604189636163175, + "grad_norm": 2.879890203475952, + "learning_rate": 4.794607843137255e-06, + "loss": 2.0443, + "step": 10626 + }, + { + "epoch": 15.635060639470783, + "grad_norm": 2.9568746089935303, + "learning_rate": 4.784313725490196e-06, + "loss": 2.1534, + "step": 10647 + }, + { + "epoch": 15.665931642778391, + "grad_norm": 3.456660032272339, + "learning_rate": 4.7740196078431375e-06, + "loss": 2.1061, + "step": 10668 + }, + { + "epoch": 15.696802646085997, + "grad_norm": 2.0130109786987305, + "learning_rate": 4.7637254901960785e-06, + "loss": 2.0547, + "step": 10689 + }, + { + "epoch": 15.727673649393605, + "grad_norm": 2.771479845046997, + "learning_rate": 4.75343137254902e-06, + "loss": 2.0917, + "step": 10710 + }, + { + "epoch": 15.758544652701213, + "grad_norm": 3.6473817825317383, + "learning_rate": 4.743137254901961e-06, + "loss": 2.0157, + "step": 10731 + }, + { + "epoch": 15.78941565600882, + "grad_norm": 3.1383354663848877, + "learning_rate": 4.732843137254902e-06, + "loss": 1.9199, + "step": 10752 + }, + { + "epoch": 15.820286659316428, + "grad_norm": 3.2463924884796143, + "learning_rate": 4.722549019607844e-06, + "loss": 2.053, + "step": 10773 + }, + { + "epoch": 15.851157662624034, + "grad_norm": 2.4401538372039795, + "learning_rate": 4.712254901960785e-06, + "loss": 2.0286, + "step": 10794 + }, + { + "epoch": 15.882028665931642, + "grad_norm": 3.72370982170105, + "learning_rate": 4.701960784313726e-06, + "loss": 2.0477, + "step": 10815 + }, + { + "epoch": 15.91289966923925, + "grad_norm": 2.85298228263855, + "learning_rate": 4.691666666666667e-06, + "loss": 1.9962, + "step": 10836 + }, + { + "epoch": 15.943770672546858, + "grad_norm": 3.810678482055664, + "learning_rate": 4.681372549019608e-06, + "loss": 2.1352, + "step": 10857 + }, + { + "epoch": 15.974641675854466, + "grad_norm": 3.461115598678589, + "learning_rate": 4.671078431372549e-06, + "loss": 2.1521, + "step": 10878 + }, + { + "epoch": 16.004410143329658, + "grad_norm": 3.260249376296997, + "learning_rate": 4.66078431372549e-06, + "loss": 1.9691, + "step": 10899 + }, + { + "epoch": 16.035281146637267, + "grad_norm": 3.3586559295654297, + "learning_rate": 4.650490196078431e-06, + "loss": 2.1661, + "step": 10920 + }, + { + "epoch": 16.066152149944873, + "grad_norm": 3.206244468688965, + "learning_rate": 4.640196078431372e-06, + "loss": 2.1357, + "step": 10941 + }, + { + "epoch": 16.09702315325248, + "grad_norm": 3.45607590675354, + "learning_rate": 4.629901960784314e-06, + "loss": 2.0576, + "step": 10962 + }, + { + "epoch": 16.12789415656009, + "grad_norm": 2.4897308349609375, + "learning_rate": 4.619607843137255e-06, + "loss": 2.152, + "step": 10983 + }, + { + "epoch": 16.158765159867695, + "grad_norm": 3.5065560340881348, + "learning_rate": 4.609313725490196e-06, + "loss": 1.964, + "step": 11004 + }, + { + "epoch": 16.189636163175305, + "grad_norm": 3.8698577880859375, + "learning_rate": 4.5990196078431375e-06, + "loss": 1.9255, + "step": 11025 + }, + { + "epoch": 16.22050716648291, + "grad_norm": 2.7581987380981445, + "learning_rate": 4.5887254901960785e-06, + "loss": 2.0786, + "step": 11046 + }, + { + "epoch": 16.251378169790517, + "grad_norm": 4.7639007568359375, + "learning_rate": 4.57843137254902e-06, + "loss": 2.1155, + "step": 11067 + }, + { + "epoch": 16.282249173098126, + "grad_norm": 2.4301273822784424, + "learning_rate": 4.568137254901961e-06, + "loss": 2.1352, + "step": 11088 + }, + { + "epoch": 16.313120176405732, + "grad_norm": 4.11771297454834, + "learning_rate": 4.557843137254903e-06, + "loss": 2.0469, + "step": 11109 + }, + { + "epoch": 16.343991179713342, + "grad_norm": 3.496967077255249, + "learning_rate": 4.547549019607844e-06, + "loss": 2.1648, + "step": 11130 + }, + { + "epoch": 16.374862183020948, + "grad_norm": 2.431269407272339, + "learning_rate": 4.537254901960785e-06, + "loss": 2.0452, + "step": 11151 + }, + { + "epoch": 16.405733186328554, + "grad_norm": 2.833159923553467, + "learning_rate": 4.526960784313726e-06, + "loss": 2.0595, + "step": 11172 + }, + { + "epoch": 16.436604189636164, + "grad_norm": 2.9650235176086426, + "learning_rate": 4.516666666666667e-06, + "loss": 2.0827, + "step": 11193 + }, + { + "epoch": 16.46747519294377, + "grad_norm": 3.284029483795166, + "learning_rate": 4.506372549019608e-06, + "loss": 2.1151, + "step": 11214 + }, + { + "epoch": 16.49834619625138, + "grad_norm": 2.8441848754882812, + "learning_rate": 4.496078431372549e-06, + "loss": 2.0386, + "step": 11235 + }, + { + "epoch": 16.529217199558985, + "grad_norm": 2.7832906246185303, + "learning_rate": 4.485784313725491e-06, + "loss": 2.0704, + "step": 11256 + }, + { + "epoch": 16.56008820286659, + "grad_norm": 2.812133550643921, + "learning_rate": 4.475490196078432e-06, + "loss": 2.0627, + "step": 11277 + }, + { + "epoch": 16.5909592061742, + "grad_norm": 4.428758144378662, + "learning_rate": 4.465196078431373e-06, + "loss": 2.0506, + "step": 11298 + }, + { + "epoch": 16.621830209481807, + "grad_norm": 3.15263032913208, + "learning_rate": 4.454901960784314e-06, + "loss": 2.0758, + "step": 11319 + }, + { + "epoch": 16.652701212789417, + "grad_norm": 3.6009435653686523, + "learning_rate": 4.445098039215687e-06, + "loss": 2.0106, + "step": 11340 + }, + { + "epoch": 16.683572216097023, + "grad_norm": 3.4212281703948975, + "learning_rate": 4.434803921568628e-06, + "loss": 1.9713, + "step": 11361 + }, + { + "epoch": 16.71444321940463, + "grad_norm": 5.794731140136719, + "learning_rate": 4.424509803921569e-06, + "loss": 2.0213, + "step": 11382 + }, + { + "epoch": 16.74531422271224, + "grad_norm": 3.2197704315185547, + "learning_rate": 4.41421568627451e-06, + "loss": 2.0999, + "step": 11403 + }, + { + "epoch": 16.776185226019845, + "grad_norm": 1.9434154033660889, + "learning_rate": 4.403921568627451e-06, + "loss": 2.1013, + "step": 11424 + }, + { + "epoch": 16.807056229327454, + "grad_norm": 2.352503538131714, + "learning_rate": 4.393627450980393e-06, + "loss": 1.9736, + "step": 11445 + }, + { + "epoch": 16.83792723263506, + "grad_norm": 3.7558541297912598, + "learning_rate": 4.383333333333334e-06, + "loss": 2.0952, + "step": 11466 + }, + { + "epoch": 16.86879823594267, + "grad_norm": 3.505101442337036, + "learning_rate": 4.373039215686275e-06, + "loss": 2.0759, + "step": 11487 + }, + { + "epoch": 16.899669239250276, + "grad_norm": 2.8822970390319824, + "learning_rate": 4.3627450980392164e-06, + "loss": 2.1724, + "step": 11508 + }, + { + "epoch": 16.930540242557882, + "grad_norm": 3.491542100906372, + "learning_rate": 4.3524509803921575e-06, + "loss": 2.0708, + "step": 11529 + }, + { + "epoch": 16.96141124586549, + "grad_norm": 3.233745574951172, + "learning_rate": 4.342156862745099e-06, + "loss": 2.0463, + "step": 11550 + }, + { + "epoch": 16.992282249173098, + "grad_norm": 3.1945841312408447, + "learning_rate": 4.33186274509804e-06, + "loss": 1.998, + "step": 11571 + }, + { + "epoch": 17.02205071664829, + "grad_norm": 2.081578254699707, + "learning_rate": 4.321568627450981e-06, + "loss": 2.0078, + "step": 11592 + }, + { + "epoch": 17.052921719955897, + "grad_norm": 3.096571683883667, + "learning_rate": 4.311274509803922e-06, + "loss": 1.958, + "step": 11613 + }, + { + "epoch": 17.083792723263507, + "grad_norm": 1.7668076753616333, + "learning_rate": 4.300980392156863e-06, + "loss": 2.0671, + "step": 11634 + }, + { + "epoch": 17.114663726571113, + "grad_norm": 3.3065104484558105, + "learning_rate": 4.290686274509804e-06, + "loss": 2.0577, + "step": 11655 + }, + { + "epoch": 17.145534729878722, + "grad_norm": 3.453615427017212, + "learning_rate": 4.280392156862745e-06, + "loss": 2.0605, + "step": 11676 + }, + { + "epoch": 17.17640573318633, + "grad_norm": 2.54748272895813, + "learning_rate": 4.270098039215687e-06, + "loss": 2.2096, + "step": 11697 + }, + { + "epoch": 17.207276736493935, + "grad_norm": 3.4268970489501953, + "learning_rate": 4.259803921568628e-06, + "loss": 2.0538, + "step": 11718 + }, + { + "epoch": 17.238147739801544, + "grad_norm": 3.91983699798584, + "learning_rate": 4.249509803921569e-06, + "loss": 2.0354, + "step": 11739 + }, + { + "epoch": 17.26901874310915, + "grad_norm": 3.7987680435180664, + "learning_rate": 4.23921568627451e-06, + "loss": 2.0894, + "step": 11760 + }, + { + "epoch": 17.29988974641676, + "grad_norm": 4.2465291023254395, + "learning_rate": 4.228921568627451e-06, + "loss": 2.1255, + "step": 11781 + }, + { + "epoch": 17.330760749724366, + "grad_norm": 5.107961654663086, + "learning_rate": 4.218627450980392e-06, + "loss": 2.0218, + "step": 11802 + }, + { + "epoch": 17.361631753031972, + "grad_norm": 3.0302984714508057, + "learning_rate": 4.208333333333333e-06, + "loss": 2.0429, + "step": 11823 + }, + { + "epoch": 17.39250275633958, + "grad_norm": 3.6439778804779053, + "learning_rate": 4.1980392156862745e-06, + "loss": 2.0699, + "step": 11844 + }, + { + "epoch": 17.423373759647188, + "grad_norm": 2.6481032371520996, + "learning_rate": 4.1877450980392156e-06, + "loss": 2.0194, + "step": 11865 + }, + { + "epoch": 17.454244762954797, + "grad_norm": 2.3916313648223877, + "learning_rate": 4.177450980392157e-06, + "loss": 1.9906, + "step": 11886 + }, + { + "epoch": 17.485115766262403, + "grad_norm": 3.7970051765441895, + "learning_rate": 4.167156862745098e-06, + "loss": 2.0747, + "step": 11907 + }, + { + "epoch": 17.51598676957001, + "grad_norm": 3.466770648956299, + "learning_rate": 4.15686274509804e-06, + "loss": 2.0312, + "step": 11928 + }, + { + "epoch": 17.54685777287762, + "grad_norm": 3.2934865951538086, + "learning_rate": 4.146568627450981e-06, + "loss": 2.1119, + "step": 11949 + }, + { + "epoch": 17.577728776185225, + "grad_norm": 3.815096855163574, + "learning_rate": 4.136274509803922e-06, + "loss": 2.0647, + "step": 11970 + }, + { + "epoch": 17.608599779492835, + "grad_norm": 2.554340362548828, + "learning_rate": 4.125980392156863e-06, + "loss": 2.1036, + "step": 11991 + }, + { + "epoch": 17.63947078280044, + "grad_norm": 3.861665964126587, + "learning_rate": 4.115686274509804e-06, + "loss": 1.9365, + "step": 12012 + }, + { + "epoch": 17.670341786108047, + "grad_norm": 4.2214226722717285, + "learning_rate": 4.105392156862745e-06, + "loss": 2.1702, + "step": 12033 + }, + { + "epoch": 17.701212789415656, + "grad_norm": 2.405787706375122, + "learning_rate": 4.095098039215686e-06, + "loss": 2.0042, + "step": 12054 + }, + { + "epoch": 17.732083792723262, + "grad_norm": 3.1533408164978027, + "learning_rate": 4.084803921568628e-06, + "loss": 1.851, + "step": 12075 + }, + { + "epoch": 17.762954796030872, + "grad_norm": 3.3998310565948486, + "learning_rate": 4.074509803921569e-06, + "loss": 2.1838, + "step": 12096 + }, + { + "epoch": 17.793825799338478, + "grad_norm": 3.0578980445861816, + "learning_rate": 4.06421568627451e-06, + "loss": 2.0094, + "step": 12117 + }, + { + "epoch": 17.824696802646088, + "grad_norm": 2.25312876701355, + "learning_rate": 4.053921568627451e-06, + "loss": 2.0077, + "step": 12138 + }, + { + "epoch": 17.855567805953694, + "grad_norm": 2.8547134399414062, + "learning_rate": 4.043627450980392e-06, + "loss": 2.096, + "step": 12159 + }, + { + "epoch": 17.8864388092613, + "grad_norm": 2.6908249855041504, + "learning_rate": 4.033333333333333e-06, + "loss": 2.0115, + "step": 12180 + }, + { + "epoch": 17.91730981256891, + "grad_norm": 4.099616527557373, + "learning_rate": 4.0230392156862745e-06, + "loss": 2.015, + "step": 12201 + }, + { + "epoch": 17.948180815876515, + "grad_norm": 3.3205764293670654, + "learning_rate": 4.0127450980392155e-06, + "loss": 1.9277, + "step": 12222 + }, + { + "epoch": 17.979051819184125, + "grad_norm": 3.4432685375213623, + "learning_rate": 4.0024509803921575e-06, + "loss": 2.0977, + "step": 12243 + }, + { + "epoch": 18.008820286659315, + "grad_norm": 3.4045350551605225, + "learning_rate": 3.9921568627450985e-06, + "loss": 1.8597, + "step": 12264 + }, + { + "epoch": 18.039691289966925, + "grad_norm": 2.844184637069702, + "learning_rate": 3.98186274509804e-06, + "loss": 2.0898, + "step": 12285 + }, + { + "epoch": 18.07056229327453, + "grad_norm": 3.8207850456237793, + "learning_rate": 3.971568627450981e-06, + "loss": 2.0866, + "step": 12306 + }, + { + "epoch": 18.10143329658214, + "grad_norm": 2.78249192237854, + "learning_rate": 3.961274509803922e-06, + "loss": 2.1116, + "step": 12327 + }, + { + "epoch": 18.132304299889746, + "grad_norm": 3.2231979370117188, + "learning_rate": 3.950980392156863e-06, + "loss": 2.0347, + "step": 12348 + }, + { + "epoch": 18.163175303197352, + "grad_norm": 3.118004560470581, + "learning_rate": 3.940686274509804e-06, + "loss": 2.1377, + "step": 12369 + }, + { + "epoch": 18.194046306504962, + "grad_norm": 2.7039101123809814, + "learning_rate": 3.930392156862746e-06, + "loss": 2.0915, + "step": 12390 + }, + { + "epoch": 18.224917309812568, + "grad_norm": 5.230814456939697, + "learning_rate": 3.920098039215687e-06, + "loss": 2.0506, + "step": 12411 + }, + { + "epoch": 18.255788313120178, + "grad_norm": 4.035606861114502, + "learning_rate": 3.909803921568628e-06, + "loss": 2.0546, + "step": 12432 + }, + { + "epoch": 18.286659316427784, + "grad_norm": 2.651754379272461, + "learning_rate": 3.899509803921569e-06, + "loss": 2.1167, + "step": 12453 + }, + { + "epoch": 18.31753031973539, + "grad_norm": 2.6406948566436768, + "learning_rate": 3.88921568627451e-06, + "loss": 2.0703, + "step": 12474 + }, + { + "epoch": 18.348401323043, + "grad_norm": 2.9610483646392822, + "learning_rate": 3.878921568627451e-06, + "loss": 2.0869, + "step": 12495 + }, + { + "epoch": 18.379272326350605, + "grad_norm": 3.5515575408935547, + "learning_rate": 3.868627450980392e-06, + "loss": 2.0525, + "step": 12516 + }, + { + "epoch": 18.410143329658215, + "grad_norm": 4.212573528289795, + "learning_rate": 3.858333333333333e-06, + "loss": 2.0139, + "step": 12537 + }, + { + "epoch": 18.44101433296582, + "grad_norm": 5.508068561553955, + "learning_rate": 3.848039215686275e-06, + "loss": 2.0703, + "step": 12558 + }, + { + "epoch": 18.471885336273427, + "grad_norm": 2.8144259452819824, + "learning_rate": 3.837745098039216e-06, + "loss": 2.0188, + "step": 12579 + }, + { + "epoch": 18.502756339581037, + "grad_norm": 2.812675714492798, + "learning_rate": 3.8274509803921575e-06, + "loss": 2.061, + "step": 12600 + }, + { + "epoch": 18.533627342888643, + "grad_norm": 3.4826412200927734, + "learning_rate": 3.8171568627450985e-06, + "loss": 2.1514, + "step": 12621 + }, + { + "epoch": 18.564498346196252, + "grad_norm": 3.2510266304016113, + "learning_rate": 3.8068627450980396e-06, + "loss": 2.0922, + "step": 12642 + }, + { + "epoch": 18.59536934950386, + "grad_norm": 2.7289392948150635, + "learning_rate": 3.7965686274509807e-06, + "loss": 1.9753, + "step": 12663 + }, + { + "epoch": 18.626240352811465, + "grad_norm": 3.323258876800537, + "learning_rate": 3.7862745098039218e-06, + "loss": 2.0766, + "step": 12684 + }, + { + "epoch": 18.657111356119074, + "grad_norm": 2.709409713745117, + "learning_rate": 3.7759803921568633e-06, + "loss": 2.0631, + "step": 12705 + }, + { + "epoch": 18.68798235942668, + "grad_norm": 3.7730557918548584, + "learning_rate": 3.7656862745098043e-06, + "loss": 1.9911, + "step": 12726 + }, + { + "epoch": 18.71885336273429, + "grad_norm": 3.651437997817993, + "learning_rate": 3.7553921568627454e-06, + "loss": 2.1125, + "step": 12747 + }, + { + "epoch": 18.749724366041896, + "grad_norm": 3.381253242492676, + "learning_rate": 3.7450980392156865e-06, + "loss": 1.9971, + "step": 12768 + }, + { + "epoch": 18.780595369349506, + "grad_norm": 3.1010146141052246, + "learning_rate": 3.7348039215686276e-06, + "loss": 2.0648, + "step": 12789 + }, + { + "epoch": 18.81146637265711, + "grad_norm": 3.745161533355713, + "learning_rate": 3.7245098039215686e-06, + "loss": 2.0145, + "step": 12810 + }, + { + "epoch": 18.842337375964718, + "grad_norm": 4.621708393096924, + "learning_rate": 3.7142156862745097e-06, + "loss": 1.9998, + "step": 12831 + }, + { + "epoch": 18.873208379272327, + "grad_norm": 3.5155012607574463, + "learning_rate": 3.7039215686274516e-06, + "loss": 2.0464, + "step": 12852 + }, + { + "epoch": 18.904079382579933, + "grad_norm": 2.5564968585968018, + "learning_rate": 3.6936274509803927e-06, + "loss": 2.0925, + "step": 12873 + }, + { + "epoch": 18.934950385887543, + "grad_norm": 3.048408031463623, + "learning_rate": 3.6833333333333338e-06, + "loss": 2.0354, + "step": 12894 + }, + { + "epoch": 18.96582138919515, + "grad_norm": 3.5387840270996094, + "learning_rate": 3.673039215686275e-06, + "loss": 1.9691, + "step": 12915 + }, + { + "epoch": 18.996692392502755, + "grad_norm": 3.683196544647217, + "learning_rate": 3.662745098039216e-06, + "loss": 2.0746, + "step": 12936 + }, + { + "epoch": 19.02646085997795, + "grad_norm": 5.585927486419678, + "learning_rate": 3.652450980392157e-06, + "loss": 1.9729, + "step": 12957 + }, + { + "epoch": 19.057331863285558, + "grad_norm": 3.5750479698181152, + "learning_rate": 3.642156862745098e-06, + "loss": 2.0793, + "step": 12978 + }, + { + "epoch": 19.088202866593164, + "grad_norm": 3.356248617172241, + "learning_rate": 3.631862745098039e-06, + "loss": 2.0644, + "step": 12999 + }, + { + "epoch": 19.11907386990077, + "grad_norm": 3.57338809967041, + "learning_rate": 3.621568627450981e-06, + "loss": 2.1432, + "step": 13020 + }, + { + "epoch": 19.14994487320838, + "grad_norm": 4.2975287437438965, + "learning_rate": 3.611274509803922e-06, + "loss": 2.0979, + "step": 13041 + }, + { + "epoch": 19.180815876515986, + "grad_norm": 3.7634999752044678, + "learning_rate": 3.6009803921568632e-06, + "loss": 2.1543, + "step": 13062 + }, + { + "epoch": 19.211686879823596, + "grad_norm": 3.75382924079895, + "learning_rate": 3.5906862745098043e-06, + "loss": 2.0601, + "step": 13083 + }, + { + "epoch": 19.2425578831312, + "grad_norm": 3.5403940677642822, + "learning_rate": 3.5803921568627454e-06, + "loss": 2.0395, + "step": 13104 + }, + { + "epoch": 19.273428886438808, + "grad_norm": 2.899658203125, + "learning_rate": 3.5700980392156865e-06, + "loss": 2.0467, + "step": 13125 + }, + { + "epoch": 19.304299889746417, + "grad_norm": 3.702956199645996, + "learning_rate": 3.5598039215686275e-06, + "loss": 2.0043, + "step": 13146 + }, + { + "epoch": 19.335170893054023, + "grad_norm": 3.56923508644104, + "learning_rate": 3.549509803921569e-06, + "loss": 1.9066, + "step": 13167 + }, + { + "epoch": 19.366041896361633, + "grad_norm": 2.911870241165161, + "learning_rate": 3.53921568627451e-06, + "loss": 2.0302, + "step": 13188 + }, + { + "epoch": 19.39691289966924, + "grad_norm": 2.913762331008911, + "learning_rate": 3.528921568627451e-06, + "loss": 1.9829, + "step": 13209 + }, + { + "epoch": 19.427783902976845, + "grad_norm": 3.171879768371582, + "learning_rate": 3.5186274509803927e-06, + "loss": 1.9945, + "step": 13230 + }, + { + "epoch": 19.458654906284455, + "grad_norm": 2.787626266479492, + "learning_rate": 3.5083333333333338e-06, + "loss": 2.0112, + "step": 13251 + }, + { + "epoch": 19.48952590959206, + "grad_norm": 3.769746780395508, + "learning_rate": 3.498039215686275e-06, + "loss": 1.9996, + "step": 13272 + }, + { + "epoch": 19.52039691289967, + "grad_norm": 3.2027320861816406, + "learning_rate": 3.487745098039216e-06, + "loss": 1.9983, + "step": 13293 + }, + { + "epoch": 19.551267916207276, + "grad_norm": 3.7969839572906494, + "learning_rate": 3.477450980392157e-06, + "loss": 2.1563, + "step": 13314 + }, + { + "epoch": 19.582138919514883, + "grad_norm": 2.9992904663085938, + "learning_rate": 3.4671568627450985e-06, + "loss": 2.0276, + "step": 13335 + }, + { + "epoch": 19.613009922822492, + "grad_norm": 3.2763264179229736, + "learning_rate": 3.4568627450980396e-06, + "loss": 2.0982, + "step": 13356 + }, + { + "epoch": 19.643880926130098, + "grad_norm": 3.3335490226745605, + "learning_rate": 3.4465686274509806e-06, + "loss": 2.0123, + "step": 13377 + }, + { + "epoch": 19.674751929437708, + "grad_norm": 3.0416243076324463, + "learning_rate": 3.4362745098039217e-06, + "loss": 2.0464, + "step": 13398 + }, + { + "epoch": 19.705622932745314, + "grad_norm": 3.625033140182495, + "learning_rate": 3.425980392156863e-06, + "loss": 2.0404, + "step": 13419 + }, + { + "epoch": 19.736493936052923, + "grad_norm": 3.487109422683716, + "learning_rate": 3.415686274509804e-06, + "loss": 2.1132, + "step": 13440 + }, + { + "epoch": 19.76736493936053, + "grad_norm": 3.335360050201416, + "learning_rate": 3.405392156862745e-06, + "loss": 2.0071, + "step": 13461 + }, + { + "epoch": 19.798235942668136, + "grad_norm": 3.624753952026367, + "learning_rate": 3.395098039215687e-06, + "loss": 2.05, + "step": 13482 + }, + { + "epoch": 19.829106945975745, + "grad_norm": 3.325979709625244, + "learning_rate": 3.384803921568628e-06, + "loss": 1.8625, + "step": 13503 + }, + { + "epoch": 19.85997794928335, + "grad_norm": 3.8328804969787598, + "learning_rate": 3.374509803921569e-06, + "loss": 1.9652, + "step": 13524 + }, + { + "epoch": 19.89084895259096, + "grad_norm": 4.318526744842529, + "learning_rate": 3.364705882352942e-06, + "loss": 1.955, + "step": 13545 + }, + { + "epoch": 19.921719955898567, + "grad_norm": 3.1736397743225098, + "learning_rate": 3.354411764705883e-06, + "loss": 2.0086, + "step": 13566 + }, + { + "epoch": 19.952590959206173, + "grad_norm": 2.356720209121704, + "learning_rate": 3.344117647058824e-06, + "loss": 2.0913, + "step": 13587 + }, + { + "epoch": 19.983461962513783, + "grad_norm": 3.434952735900879, + "learning_rate": 3.333823529411765e-06, + "loss": 2.0391, + "step": 13608 + }, + { + "epoch": 20.013230429988976, + "grad_norm": 3.4107272624969482, + "learning_rate": 3.323529411764706e-06, + "loss": 2.0666, + "step": 13629 + }, + { + "epoch": 20.044101433296582, + "grad_norm": 4.133201599121094, + "learning_rate": 3.313235294117647e-06, + "loss": 2.0181, + "step": 13650 + }, + { + "epoch": 20.074972436604188, + "grad_norm": 3.949664831161499, + "learning_rate": 3.3029411764705887e-06, + "loss": 2.033, + "step": 13671 + }, + { + "epoch": 20.105843439911798, + "grad_norm": 3.270611047744751, + "learning_rate": 3.2926470588235298e-06, + "loss": 2.0423, + "step": 13692 + }, + { + "epoch": 20.136714443219404, + "grad_norm": 3.6626546382904053, + "learning_rate": 3.282352941176471e-06, + "loss": 2.085, + "step": 13713 + }, + { + "epoch": 20.167585446527013, + "grad_norm": 2.613144874572754, + "learning_rate": 3.272058823529412e-06, + "loss": 1.9857, + "step": 13734 + }, + { + "epoch": 20.19845644983462, + "grad_norm": 5.440834999084473, + "learning_rate": 3.261764705882353e-06, + "loss": 2.0182, + "step": 13755 + }, + { + "epoch": 20.229327453142226, + "grad_norm": 3.383056402206421, + "learning_rate": 3.2514705882352945e-06, + "loss": 2.067, + "step": 13776 + }, + { + "epoch": 20.260198456449835, + "grad_norm": 3.9195475578308105, + "learning_rate": 3.2411764705882356e-06, + "loss": 2.0912, + "step": 13797 + }, + { + "epoch": 20.29106945975744, + "grad_norm": 3.1253345012664795, + "learning_rate": 3.2308823529411766e-06, + "loss": 1.9867, + "step": 13818 + }, + { + "epoch": 20.32194046306505, + "grad_norm": 1.728356957435608, + "learning_rate": 3.220588235294118e-06, + "loss": 1.9589, + "step": 13839 + }, + { + "epoch": 20.352811466372657, + "grad_norm": 5.2193827629089355, + "learning_rate": 3.210294117647059e-06, + "loss": 2.0627, + "step": 13860 + }, + { + "epoch": 20.383682469680263, + "grad_norm": 4.1548309326171875, + "learning_rate": 3.2000000000000003e-06, + "loss": 2.0675, + "step": 13881 + }, + { + "epoch": 20.414553472987873, + "grad_norm": 3.0767784118652344, + "learning_rate": 3.1897058823529414e-06, + "loss": 2.1164, + "step": 13902 + }, + { + "epoch": 20.44542447629548, + "grad_norm": 3.6371419429779053, + "learning_rate": 3.1794117647058824e-06, + "loss": 1.9204, + "step": 13923 + }, + { + "epoch": 20.47629547960309, + "grad_norm": 2.9141464233398438, + "learning_rate": 3.1691176470588235e-06, + "loss": 2.0619, + "step": 13944 + }, + { + "epoch": 20.507166482910694, + "grad_norm": 3.0113942623138428, + "learning_rate": 3.1588235294117646e-06, + "loss": 2.0679, + "step": 13965 + }, + { + "epoch": 20.5380374862183, + "grad_norm": 3.8592772483825684, + "learning_rate": 3.1485294117647065e-06, + "loss": 2.1119, + "step": 13986 + }, + { + "epoch": 20.56890848952591, + "grad_norm": 4.055545330047607, + "learning_rate": 3.1382352941176476e-06, + "loss": 2.0723, + "step": 14007 + }, + { + "epoch": 20.599779492833516, + "grad_norm": 4.263570785522461, + "learning_rate": 3.1279411764705887e-06, + "loss": 1.9356, + "step": 14028 + }, + { + "epoch": 20.630650496141126, + "grad_norm": 3.5049495697021484, + "learning_rate": 3.1176470588235297e-06, + "loss": 2.064, + "step": 14049 + }, + { + "epoch": 20.66152149944873, + "grad_norm": 3.676957130432129, + "learning_rate": 3.107352941176471e-06, + "loss": 1.9775, + "step": 14070 + }, + { + "epoch": 20.69239250275634, + "grad_norm": 2.424762725830078, + "learning_rate": 3.097058823529412e-06, + "loss": 2.0656, + "step": 14091 + }, + { + "epoch": 20.723263506063947, + "grad_norm": 2.600409984588623, + "learning_rate": 3.086764705882353e-06, + "loss": 1.9777, + "step": 14112 + }, + { + "epoch": 20.754134509371553, + "grad_norm": 3.8351240158081055, + "learning_rate": 3.076470588235294e-06, + "loss": 2.0204, + "step": 14133 + }, + { + "epoch": 20.785005512679163, + "grad_norm": 2.652747631072998, + "learning_rate": 3.066176470588236e-06, + "loss": 1.9423, + "step": 14154 + }, + { + "epoch": 20.81587651598677, + "grad_norm": 3.5386784076690674, + "learning_rate": 3.055882352941177e-06, + "loss": 2.1863, + "step": 14175 + }, + { + "epoch": 20.84674751929438, + "grad_norm": 3.687246799468994, + "learning_rate": 3.045588235294118e-06, + "loss": 1.9755, + "step": 14196 + }, + { + "epoch": 20.877618522601985, + "grad_norm": 2.620722532272339, + "learning_rate": 3.035294117647059e-06, + "loss": 2.0561, + "step": 14217 + }, + { + "epoch": 20.90848952590959, + "grad_norm": 2.740910053253174, + "learning_rate": 3.0250000000000003e-06, + "loss": 2.0948, + "step": 14238 + }, + { + "epoch": 20.9393605292172, + "grad_norm": 2.521815538406372, + "learning_rate": 3.0147058823529413e-06, + "loss": 1.9918, + "step": 14259 + }, + { + "epoch": 20.970231532524807, + "grad_norm": 3.51527738571167, + "learning_rate": 3.0044117647058824e-06, + "loss": 2.1119, + "step": 14280 + }, + { + "epoch": 21.0, + "grad_norm": 1.8354216814041138, + "learning_rate": 2.994117647058824e-06, + "loss": 1.9956, + "step": 14301 + }, + { + "epoch": 21.030871003307606, + "grad_norm": 3.5836827754974365, + "learning_rate": 2.983823529411765e-06, + "loss": 2.0313, + "step": 14322 + }, + { + "epoch": 21.061742006615216, + "grad_norm": 4.276784420013428, + "learning_rate": 2.973529411764706e-06, + "loss": 2.009, + "step": 14343 + }, + { + "epoch": 21.09261300992282, + "grad_norm": 4.545126914978027, + "learning_rate": 2.963235294117647e-06, + "loss": 1.9624, + "step": 14364 + }, + { + "epoch": 21.12348401323043, + "grad_norm": 3.138828754425049, + "learning_rate": 2.9529411764705882e-06, + "loss": 2.0172, + "step": 14385 + }, + { + "epoch": 21.154355016538037, + "grad_norm": 3.7198119163513184, + "learning_rate": 2.9426470588235297e-06, + "loss": 1.9557, + "step": 14406 + }, + { + "epoch": 21.185226019845643, + "grad_norm": 3.2042577266693115, + "learning_rate": 2.932352941176471e-06, + "loss": 2.076, + "step": 14427 + }, + { + "epoch": 21.216097023153253, + "grad_norm": 3.6730806827545166, + "learning_rate": 2.9220588235294123e-06, + "loss": 1.9988, + "step": 14448 + }, + { + "epoch": 21.24696802646086, + "grad_norm": 3.7986257076263428, + "learning_rate": 2.9117647058823534e-06, + "loss": 2.02, + "step": 14469 + }, + { + "epoch": 21.27783902976847, + "grad_norm": 2.8649749755859375, + "learning_rate": 2.9014705882352944e-06, + "loss": 2.0309, + "step": 14490 + }, + { + "epoch": 21.308710033076075, + "grad_norm": 3.3312325477600098, + "learning_rate": 2.8911764705882355e-06, + "loss": 2.1171, + "step": 14511 + }, + { + "epoch": 21.33958103638368, + "grad_norm": 5.849648475646973, + "learning_rate": 2.8808823529411766e-06, + "loss": 1.9797, + "step": 14532 + }, + { + "epoch": 21.37045203969129, + "grad_norm": 2.860837697982788, + "learning_rate": 2.8705882352941177e-06, + "loss": 1.9537, + "step": 14553 + }, + { + "epoch": 21.401323042998897, + "grad_norm": 3.9823107719421387, + "learning_rate": 2.8602941176470587e-06, + "loss": 1.9863, + "step": 14574 + }, + { + "epoch": 21.432194046306506, + "grad_norm": 3.9077060222625732, + "learning_rate": 2.85e-06, + "loss": 1.9336, + "step": 14595 + }, + { + "epoch": 21.463065049614112, + "grad_norm": 3.0670223236083984, + "learning_rate": 2.8397058823529417e-06, + "loss": 2.1698, + "step": 14616 + }, + { + "epoch": 21.49393605292172, + "grad_norm": 3.615760326385498, + "learning_rate": 2.829411764705883e-06, + "loss": 2.1763, + "step": 14637 + }, + { + "epoch": 21.524807056229328, + "grad_norm": 2.5886383056640625, + "learning_rate": 2.819117647058824e-06, + "loss": 2.0616, + "step": 14658 + }, + { + "epoch": 21.555678059536934, + "grad_norm": 3.460663080215454, + "learning_rate": 2.808823529411765e-06, + "loss": 1.9536, + "step": 14679 + }, + { + "epoch": 21.586549062844544, + "grad_norm": 2.1539037227630615, + "learning_rate": 2.798529411764706e-06, + "loss": 2.0335, + "step": 14700 + }, + { + "epoch": 21.61742006615215, + "grad_norm": 3.484496831893921, + "learning_rate": 2.788235294117647e-06, + "loss": 2.0183, + "step": 14721 + }, + { + "epoch": 21.64829106945976, + "grad_norm": 2.755068302154541, + "learning_rate": 2.777941176470588e-06, + "loss": 1.8785, + "step": 14742 + }, + { + "epoch": 21.679162072767365, + "grad_norm": 3.6181225776672363, + "learning_rate": 2.76764705882353e-06, + "loss": 2.1339, + "step": 14763 + }, + { + "epoch": 21.71003307607497, + "grad_norm": 2.7359960079193115, + "learning_rate": 2.757352941176471e-06, + "loss": 1.9992, + "step": 14784 + }, + { + "epoch": 21.74090407938258, + "grad_norm": 3.1301567554473877, + "learning_rate": 2.7470588235294123e-06, + "loss": 1.9978, + "step": 14805 + }, + { + "epoch": 21.771775082690187, + "grad_norm": 3.758486032485962, + "learning_rate": 2.7367647058823533e-06, + "loss": 2.0067, + "step": 14826 + }, + { + "epoch": 21.802646085997797, + "grad_norm": 3.277071475982666, + "learning_rate": 2.7264705882352944e-06, + "loss": 2.1184, + "step": 14847 + }, + { + "epoch": 21.833517089305403, + "grad_norm": 4.998867511749268, + "learning_rate": 2.7161764705882355e-06, + "loss": 2.0494, + "step": 14868 + }, + { + "epoch": 21.86438809261301, + "grad_norm": 3.6457066535949707, + "learning_rate": 2.7058823529411766e-06, + "loss": 2.1115, + "step": 14889 + }, + { + "epoch": 21.89525909592062, + "grad_norm": 2.19140625, + "learning_rate": 2.6955882352941176e-06, + "loss": 1.9649, + "step": 14910 + }, + { + "epoch": 21.926130099228224, + "grad_norm": 3.393760919570923, + "learning_rate": 2.685294117647059e-06, + "loss": 2.0337, + "step": 14931 + }, + { + "epoch": 21.957001102535834, + "grad_norm": 2.6467642784118652, + "learning_rate": 2.6750000000000002e-06, + "loss": 2.0755, + "step": 14952 + }, + { + "epoch": 21.98787210584344, + "grad_norm": 2.9383254051208496, + "learning_rate": 2.6647058823529413e-06, + "loss": 2.0456, + "step": 14973 + }, + { + "epoch": 22.017640573318634, + "grad_norm": 2.71541690826416, + "learning_rate": 2.6544117647058824e-06, + "loss": 1.8908, + "step": 14994 + }, + { + "epoch": 22.04851157662624, + "grad_norm": 3.295368194580078, + "learning_rate": 2.644117647058824e-06, + "loss": 2.1089, + "step": 15015 + }, + { + "epoch": 22.07938257993385, + "grad_norm": 2.6102893352508545, + "learning_rate": 2.633823529411765e-06, + "loss": 2.0108, + "step": 15036 + }, + { + "epoch": 22.110253583241455, + "grad_norm": 3.5170047283172607, + "learning_rate": 2.623529411764706e-06, + "loss": 2.0225, + "step": 15057 + }, + { + "epoch": 22.14112458654906, + "grad_norm": 3.3258235454559326, + "learning_rate": 2.6132352941176475e-06, + "loss": 2.1019, + "step": 15078 + }, + { + "epoch": 22.17199558985667, + "grad_norm": 4.083648204803467, + "learning_rate": 2.6029411764705886e-06, + "loss": 1.9216, + "step": 15099 + }, + { + "epoch": 22.202866593164277, + "grad_norm": 2.2198421955108643, + "learning_rate": 2.5926470588235297e-06, + "loss": 2.057, + "step": 15120 + }, + { + "epoch": 22.233737596471887, + "grad_norm": 3.2718071937561035, + "learning_rate": 2.5823529411764708e-06, + "loss": 2.0215, + "step": 15141 + }, + { + "epoch": 22.264608599779493, + "grad_norm": 4.054749965667725, + "learning_rate": 2.572058823529412e-06, + "loss": 2.0821, + "step": 15162 + }, + { + "epoch": 22.2954796030871, + "grad_norm": 3.719663143157959, + "learning_rate": 2.561764705882353e-06, + "loss": 2.0058, + "step": 15183 + }, + { + "epoch": 22.32635060639471, + "grad_norm": 4.2490363121032715, + "learning_rate": 2.551470588235294e-06, + "loss": 1.9647, + "step": 15204 + }, + { + "epoch": 22.357221609702314, + "grad_norm": 2.9652185440063477, + "learning_rate": 2.541176470588235e-06, + "loss": 2.1335, + "step": 15225 + }, + { + "epoch": 22.388092613009924, + "grad_norm": 2.423184394836426, + "learning_rate": 2.530882352941177e-06, + "loss": 2.06, + "step": 15246 + }, + { + "epoch": 22.41896361631753, + "grad_norm": 4.045749187469482, + "learning_rate": 2.520588235294118e-06, + "loss": 2.1137, + "step": 15267 + }, + { + "epoch": 22.449834619625136, + "grad_norm": 2.758460283279419, + "learning_rate": 2.510294117647059e-06, + "loss": 1.9363, + "step": 15288 + }, + { + "epoch": 22.480705622932746, + "grad_norm": 3.1217832565307617, + "learning_rate": 2.5e-06, + "loss": 2.0578, + "step": 15309 + }, + { + "epoch": 22.511576626240352, + "grad_norm": 3.535093069076538, + "learning_rate": 2.4897058823529413e-06, + "loss": 2.0919, + "step": 15330 + }, + { + "epoch": 22.54244762954796, + "grad_norm": 2.6681010723114014, + "learning_rate": 2.4794117647058828e-06, + "loss": 1.9656, + "step": 15351 + }, + { + "epoch": 22.573318632855568, + "grad_norm": 3.941215753555298, + "learning_rate": 2.469117647058824e-06, + "loss": 2.0865, + "step": 15372 + }, + { + "epoch": 22.604189636163177, + "grad_norm": 3.9376718997955322, + "learning_rate": 2.458823529411765e-06, + "loss": 2.0817, + "step": 15393 + }, + { + "epoch": 22.635060639470783, + "grad_norm": 4.634395122528076, + "learning_rate": 2.448529411764706e-06, + "loss": 1.9958, + "step": 15414 + }, + { + "epoch": 22.66593164277839, + "grad_norm": 4.030196189880371, + "learning_rate": 2.4382352941176475e-06, + "loss": 1.9103, + "step": 15435 + }, + { + "epoch": 22.696802646086, + "grad_norm": 3.294787883758545, + "learning_rate": 2.4279411764705886e-06, + "loss": 1.979, + "step": 15456 + }, + { + "epoch": 22.727673649393605, + "grad_norm": 4.601010322570801, + "learning_rate": 2.4176470588235297e-06, + "loss": 2.037, + "step": 15477 + }, + { + "epoch": 22.75854465270121, + "grad_norm": 2.8019866943359375, + "learning_rate": 2.4073529411764707e-06, + "loss": 1.9568, + "step": 15498 + }, + { + "epoch": 22.78941565600882, + "grad_norm": 4.673406600952148, + "learning_rate": 2.3970588235294122e-06, + "loss": 2.1818, + "step": 15519 + }, + { + "epoch": 22.820286659316427, + "grad_norm": 3.075392723083496, + "learning_rate": 2.3867647058823533e-06, + "loss": 2.0665, + "step": 15540 + }, + { + "epoch": 22.851157662624036, + "grad_norm": 4.0568437576293945, + "learning_rate": 2.3764705882352944e-06, + "loss": 1.9772, + "step": 15561 + }, + { + "epoch": 22.882028665931642, + "grad_norm": 22.847898483276367, + "learning_rate": 2.3661764705882355e-06, + "loss": 2.1377, + "step": 15582 + }, + { + "epoch": 22.912899669239252, + "grad_norm": 5.353538990020752, + "learning_rate": 2.3558823529411765e-06, + "loss": 2.0693, + "step": 15603 + }, + { + "epoch": 22.943770672546858, + "grad_norm": 3.0493805408477783, + "learning_rate": 2.345588235294118e-06, + "loss": 2.0442, + "step": 15624 + }, + { + "epoch": 22.974641675854464, + "grad_norm": 2.932695150375366, + "learning_rate": 2.335294117647059e-06, + "loss": 2.1378, + "step": 15645 + }, + { + "epoch": 23.004410143329658, + "grad_norm": 2.9815673828125, + "learning_rate": 2.325e-06, + "loss": 1.9206, + "step": 15666 + }, + { + "epoch": 23.035281146637267, + "grad_norm": 2.5419440269470215, + "learning_rate": 2.3147058823529413e-06, + "loss": 2.017, + "step": 15687 + }, + { + "epoch": 23.066152149944873, + "grad_norm": 3.180964708328247, + "learning_rate": 2.3044117647058823e-06, + "loss": 2.0926, + "step": 15708 + }, + { + "epoch": 23.09702315325248, + "grad_norm": 3.7221603393554688, + "learning_rate": 2.2941176470588234e-06, + "loss": 2.0389, + "step": 15729 + }, + { + "epoch": 23.12789415656009, + "grad_norm": 2.8961706161499023, + "learning_rate": 2.283823529411765e-06, + "loss": 1.9742, + "step": 15750 + }, + { + "epoch": 23.158765159867695, + "grad_norm": 4.145342826843262, + "learning_rate": 2.273529411764706e-06, + "loss": 1.9546, + "step": 15771 + }, + { + "epoch": 23.189636163175305, + "grad_norm": 3.063854217529297, + "learning_rate": 2.263235294117647e-06, + "loss": 1.9493, + "step": 15792 + }, + { + "epoch": 23.22050716648291, + "grad_norm": 3.4941153526306152, + "learning_rate": 2.252941176470588e-06, + "loss": 2.1055, + "step": 15813 + }, + { + "epoch": 23.251378169790517, + "grad_norm": 5.007169723510742, + "learning_rate": 2.243137254901961e-06, + "loss": 1.9953, + "step": 15834 + }, + { + "epoch": 23.282249173098126, + "grad_norm": 2.7144691944122314, + "learning_rate": 2.2328431372549024e-06, + "loss": 2.0099, + "step": 15855 + }, + { + "epoch": 23.313120176405732, + "grad_norm": 3.789856433868408, + "learning_rate": 2.2225490196078435e-06, + "loss": 2.0094, + "step": 15876 + }, + { + "epoch": 23.343991179713342, + "grad_norm": 3.635158061981201, + "learning_rate": 2.2122549019607846e-06, + "loss": 2.0724, + "step": 15897 + }, + { + "epoch": 23.374862183020948, + "grad_norm": 3.294782876968384, + "learning_rate": 2.2019607843137256e-06, + "loss": 2.1435, + "step": 15918 + }, + { + "epoch": 23.405733186328554, + "grad_norm": 3.1448404788970947, + "learning_rate": 2.191666666666667e-06, + "loss": 1.9845, + "step": 15939 + }, + { + "epoch": 23.436604189636164, + "grad_norm": 3.8233425617218018, + "learning_rate": 2.1813725490196082e-06, + "loss": 2.0245, + "step": 15960 + }, + { + "epoch": 23.46747519294377, + "grad_norm": 3.341221570968628, + "learning_rate": 2.1710784313725493e-06, + "loss": 1.9936, + "step": 15981 + }, + { + "epoch": 23.49834619625138, + "grad_norm": 3.8249101638793945, + "learning_rate": 2.1607843137254904e-06, + "loss": 1.9964, + "step": 16002 + }, + { + "epoch": 23.529217199558985, + "grad_norm": 3.033024549484253, + "learning_rate": 2.1504901960784314e-06, + "loss": 1.9725, + "step": 16023 + }, + { + "epoch": 23.56008820286659, + "grad_norm": 2.373525619506836, + "learning_rate": 2.1401960784313725e-06, + "loss": 1.9035, + "step": 16044 + }, + { + "epoch": 23.5909592061742, + "grad_norm": 3.101956844329834, + "learning_rate": 2.129901960784314e-06, + "loss": 1.9845, + "step": 16065 + }, + { + "epoch": 23.621830209481807, + "grad_norm": 3.2224278450012207, + "learning_rate": 2.119607843137255e-06, + "loss": 2.0678, + "step": 16086 + }, + { + "epoch": 23.652701212789417, + "grad_norm": 3.791844129562378, + "learning_rate": 2.109313725490196e-06, + "loss": 1.9764, + "step": 16107 + }, + { + "epoch": 23.683572216097023, + "grad_norm": 2.1430459022521973, + "learning_rate": 2.0990196078431372e-06, + "loss": 2.1508, + "step": 16128 + }, + { + "epoch": 23.71444321940463, + "grad_norm": 3.8360910415649414, + "learning_rate": 2.0887254901960783e-06, + "loss": 2.073, + "step": 16149 + }, + { + "epoch": 23.74531422271224, + "grad_norm": 4.548948287963867, + "learning_rate": 2.07843137254902e-06, + "loss": 2.0117, + "step": 16170 + }, + { + "epoch": 23.776185226019845, + "grad_norm": 2.1734726428985596, + "learning_rate": 2.068137254901961e-06, + "loss": 2.0523, + "step": 16191 + }, + { + "epoch": 23.807056229327454, + "grad_norm": 2.869358539581299, + "learning_rate": 2.057843137254902e-06, + "loss": 2.0996, + "step": 16212 + }, + { + "epoch": 23.83792723263506, + "grad_norm": 2.855677843093872, + "learning_rate": 2.047549019607843e-06, + "loss": 2.0153, + "step": 16233 + }, + { + "epoch": 23.86879823594267, + "grad_norm": 2.8188350200653076, + "learning_rate": 2.0372549019607845e-06, + "loss": 2.0767, + "step": 16254 + }, + { + "epoch": 23.899669239250276, + "grad_norm": 2.9440088272094727, + "learning_rate": 2.0269607843137256e-06, + "loss": 1.911, + "step": 16275 + }, + { + "epoch": 23.930540242557882, + "grad_norm": 3.6652421951293945, + "learning_rate": 2.0166666666666667e-06, + "loss": 1.9595, + "step": 16296 + }, + { + "epoch": 23.96141124586549, + "grad_norm": 2.7821288108825684, + "learning_rate": 2.0063725490196078e-06, + "loss": 2.0475, + "step": 16317 + }, + { + "epoch": 23.992282249173098, + "grad_norm": 4.175220966339111, + "learning_rate": 1.9960784313725493e-06, + "loss": 1.97, + "step": 16338 + }, + { + "epoch": 24.02205071664829, + "grad_norm": 3.457138776779175, + "learning_rate": 1.9857843137254903e-06, + "loss": 1.9588, + "step": 16359 + }, + { + "epoch": 24.052921719955897, + "grad_norm": 3.436939001083374, + "learning_rate": 1.9754901960784314e-06, + "loss": 2.0322, + "step": 16380 + }, + { + "epoch": 24.083792723263507, + "grad_norm": 5.357500076293945, + "learning_rate": 1.965196078431373e-06, + "loss": 1.9858, + "step": 16401 + }, + { + "epoch": 24.114663726571113, + "grad_norm": 3.293443202972412, + "learning_rate": 1.954901960784314e-06, + "loss": 1.9989, + "step": 16422 + }, + { + "epoch": 24.145534729878722, + "grad_norm": 4.073799133300781, + "learning_rate": 1.944607843137255e-06, + "loss": 2.0617, + "step": 16443 + }, + { + "epoch": 24.17640573318633, + "grad_norm": 2.9053542613983154, + "learning_rate": 1.934313725490196e-06, + "loss": 2.031, + "step": 16464 + }, + { + "epoch": 24.207276736493935, + "grad_norm": 4.001552581787109, + "learning_rate": 1.9240196078431377e-06, + "loss": 1.905, + "step": 16485 + }, + { + "epoch": 24.238147739801544, + "grad_norm": 3.245598316192627, + "learning_rate": 1.9137254901960787e-06, + "loss": 2.0186, + "step": 16506 + }, + { + "epoch": 24.26901874310915, + "grad_norm": 2.63043212890625, + "learning_rate": 1.9034313725490198e-06, + "loss": 2.0694, + "step": 16527 + }, + { + "epoch": 24.29988974641676, + "grad_norm": 3.8443353176116943, + "learning_rate": 1.8931372549019609e-06, + "loss": 1.8988, + "step": 16548 + }, + { + "epoch": 24.330760749724366, + "grad_norm": 2.937856674194336, + "learning_rate": 1.8828431372549022e-06, + "loss": 2.0204, + "step": 16569 + }, + { + "epoch": 24.361631753031972, + "grad_norm": 3.2340915203094482, + "learning_rate": 1.8725490196078432e-06, + "loss": 2.0266, + "step": 16590 + }, + { + "epoch": 24.39250275633958, + "grad_norm": 4.433572769165039, + "learning_rate": 1.8622549019607843e-06, + "loss": 2.0059, + "step": 16611 + }, + { + "epoch": 24.423373759647188, + "grad_norm": 3.6761868000030518, + "learning_rate": 1.8519607843137258e-06, + "loss": 2.0048, + "step": 16632 + }, + { + "epoch": 24.454244762954797, + "grad_norm": 3.846853017807007, + "learning_rate": 1.8416666666666669e-06, + "loss": 1.9698, + "step": 16653 + }, + { + "epoch": 24.485115766262403, + "grad_norm": 4.937806129455566, + "learning_rate": 1.831372549019608e-06, + "loss": 1.9939, + "step": 16674 + }, + { + "epoch": 24.51598676957001, + "grad_norm": 2.8848650455474854, + "learning_rate": 1.821078431372549e-06, + "loss": 2.0912, + "step": 16695 + }, + { + "epoch": 24.54685777287762, + "grad_norm": 2.890634536743164, + "learning_rate": 1.8107843137254905e-06, + "loss": 1.9191, + "step": 16716 + }, + { + "epoch": 24.577728776185225, + "grad_norm": 2.678192615509033, + "learning_rate": 1.8004901960784316e-06, + "loss": 2.0085, + "step": 16737 + }, + { + "epoch": 24.608599779492835, + "grad_norm": 2.798147201538086, + "learning_rate": 1.7901960784313727e-06, + "loss": 2.1472, + "step": 16758 + }, + { + "epoch": 24.63947078280044, + "grad_norm": 3.4055185317993164, + "learning_rate": 1.7799019607843138e-06, + "loss": 1.9615, + "step": 16779 + }, + { + "epoch": 24.670341786108047, + "grad_norm": 2.357825517654419, + "learning_rate": 1.769607843137255e-06, + "loss": 1.9813, + "step": 16800 + }, + { + "epoch": 24.701212789415656, + "grad_norm": 3.224651575088501, + "learning_rate": 1.7593137254901963e-06, + "loss": 2.037, + "step": 16821 + }, + { + "epoch": 24.732083792723262, + "grad_norm": 3.342982053756714, + "learning_rate": 1.7490196078431374e-06, + "loss": 2.0401, + "step": 16842 + }, + { + "epoch": 24.762954796030872, + "grad_norm": 4.797550201416016, + "learning_rate": 1.7387254901960785e-06, + "loss": 1.9856, + "step": 16863 + }, + { + "epoch": 24.793825799338478, + "grad_norm": 3.657365322113037, + "learning_rate": 1.7284313725490198e-06, + "loss": 1.9802, + "step": 16884 + }, + { + "epoch": 24.824696802646088, + "grad_norm": 3.4711039066314697, + "learning_rate": 1.7181372549019609e-06, + "loss": 1.9883, + "step": 16905 + }, + { + "epoch": 24.855567805953694, + "grad_norm": 3.5633745193481445, + "learning_rate": 1.707843137254902e-06, + "loss": 2.0446, + "step": 16926 + }, + { + "epoch": 24.8864388092613, + "grad_norm": 3.0469162464141846, + "learning_rate": 1.6975490196078434e-06, + "loss": 2.0676, + "step": 16947 + }, + { + "epoch": 24.91730981256891, + "grad_norm": 3.1432437896728516, + "learning_rate": 1.6872549019607845e-06, + "loss": 2.007, + "step": 16968 + }, + { + "epoch": 24.948180815876515, + "grad_norm": 3.7878284454345703, + "learning_rate": 1.6769607843137256e-06, + "loss": 2.0496, + "step": 16989 + }, + { + "epoch": 24.979051819184125, + "grad_norm": 3.5506863594055176, + "learning_rate": 1.6666666666666667e-06, + "loss": 2.0364, + "step": 17010 + }, + { + "epoch": 25.008820286659315, + "grad_norm": 3.147472620010376, + "learning_rate": 1.6563725490196082e-06, + "loss": 1.9508, + "step": 17031 + }, + { + "epoch": 25.039691289966925, + "grad_norm": 2.918980598449707, + "learning_rate": 1.6460784313725492e-06, + "loss": 2.0846, + "step": 17052 + }, + { + "epoch": 25.07056229327453, + "grad_norm": 3.624821186065674, + "learning_rate": 1.6357843137254903e-06, + "loss": 1.968, + "step": 17073 + }, + { + "epoch": 25.10143329658214, + "grad_norm": 3.0075933933258057, + "learning_rate": 1.6254901960784314e-06, + "loss": 2.2606, + "step": 17094 + }, + { + "epoch": 25.132304299889746, + "grad_norm": 3.014009475708008, + "learning_rate": 1.6151960784313727e-06, + "loss": 2.0683, + "step": 17115 + }, + { + "epoch": 25.163175303197352, + "grad_norm": 2.4412238597869873, + "learning_rate": 1.604901960784314e-06, + "loss": 1.8993, + "step": 17136 + }, + { + "epoch": 25.194046306504962, + "grad_norm": 3.6228857040405273, + "learning_rate": 1.594607843137255e-06, + "loss": 2.014, + "step": 17157 + }, + { + "epoch": 25.224917309812568, + "grad_norm": 3.0460023880004883, + "learning_rate": 1.5843137254901961e-06, + "loss": 1.9859, + "step": 17178 + }, + { + "epoch": 25.255788313120178, + "grad_norm": 3.527524948120117, + "learning_rate": 1.5740196078431374e-06, + "loss": 2.0807, + "step": 17199 + }, + { + "epoch": 25.286659316427784, + "grad_norm": 3.9341087341308594, + "learning_rate": 1.5637254901960785e-06, + "loss": 2.0596, + "step": 17220 + }, + { + "epoch": 25.31753031973539, + "grad_norm": 2.719883441925049, + "learning_rate": 1.5534313725490196e-06, + "loss": 2.0104, + "step": 17241 + }, + { + "epoch": 25.348401323043, + "grad_norm": 2.8454911708831787, + "learning_rate": 1.543137254901961e-06, + "loss": 1.8807, + "step": 17262 + }, + { + "epoch": 25.379272326350605, + "grad_norm": 3.0369486808776855, + "learning_rate": 1.5328431372549021e-06, + "loss": 2.0453, + "step": 17283 + }, + { + "epoch": 25.410143329658215, + "grad_norm": 2.745230197906494, + "learning_rate": 1.5225490196078432e-06, + "loss": 1.9671, + "step": 17304 + }, + { + "epoch": 25.44101433296582, + "grad_norm": 2.962475299835205, + "learning_rate": 1.5122549019607843e-06, + "loss": 1.9908, + "step": 17325 + }, + { + "epoch": 25.471885336273427, + "grad_norm": 2.3031718730926514, + "learning_rate": 1.5019607843137258e-06, + "loss": 2.0485, + "step": 17346 + }, + { + "epoch": 25.502756339581037, + "grad_norm": 3.512769937515259, + "learning_rate": 1.4916666666666669e-06, + "loss": 1.9571, + "step": 17367 + }, + { + "epoch": 25.533627342888643, + "grad_norm": 4.890634536743164, + "learning_rate": 1.481372549019608e-06, + "loss": 2.0696, + "step": 17388 + }, + { + "epoch": 25.564498346196252, + "grad_norm": 4.985838890075684, + "learning_rate": 1.471078431372549e-06, + "loss": 2.0484, + "step": 17409 + }, + { + "epoch": 25.59536934950386, + "grad_norm": 4.576741695404053, + "learning_rate": 1.4607843137254903e-06, + "loss": 2.0638, + "step": 17430 + }, + { + "epoch": 25.626240352811465, + "grad_norm": 3.1261324882507324, + "learning_rate": 1.4504901960784316e-06, + "loss": 2.0689, + "step": 17451 + }, + { + "epoch": 25.657111356119074, + "grad_norm": 2.712101936340332, + "learning_rate": 1.4401960784313727e-06, + "loss": 1.9987, + "step": 17472 + }, + { + "epoch": 25.68798235942668, + "grad_norm": 4.180928707122803, + "learning_rate": 1.429901960784314e-06, + "loss": 2.0525, + "step": 17493 + }, + { + "epoch": 25.71885336273429, + "grad_norm": 3.562640428543091, + "learning_rate": 1.419607843137255e-06, + "loss": 2.0431, + "step": 17514 + }, + { + "epoch": 25.749724366041896, + "grad_norm": 3.860323429107666, + "learning_rate": 1.409313725490196e-06, + "loss": 1.9785, + "step": 17535 + }, + { + "epoch": 25.780595369349506, + "grad_norm": 3.463897705078125, + "learning_rate": 1.3990196078431374e-06, + "loss": 1.9708, + "step": 17556 + }, + { + "epoch": 25.81146637265711, + "grad_norm": 3.1898350715637207, + "learning_rate": 1.3887254901960787e-06, + "loss": 1.981, + "step": 17577 + }, + { + "epoch": 25.842337375964718, + "grad_norm": 2.384361982345581, + "learning_rate": 1.3784313725490197e-06, + "loss": 2.0021, + "step": 17598 + }, + { + "epoch": 25.873208379272327, + "grad_norm": 3.8873538970947266, + "learning_rate": 1.3681372549019608e-06, + "loss": 2.1231, + "step": 17619 + }, + { + "epoch": 25.904079382579933, + "grad_norm": 4.486485958099365, + "learning_rate": 1.357843137254902e-06, + "loss": 1.9089, + "step": 17640 + }, + { + "epoch": 25.934950385887543, + "grad_norm": 2.862537145614624, + "learning_rate": 1.3475490196078434e-06, + "loss": 2.0553, + "step": 17661 + }, + { + "epoch": 25.96582138919515, + "grad_norm": 3.2039759159088135, + "learning_rate": 1.3372549019607845e-06, + "loss": 1.914, + "step": 17682 + }, + { + "epoch": 25.996692392502755, + "grad_norm": 2.9676759243011475, + "learning_rate": 1.3269607843137255e-06, + "loss": 1.9942, + "step": 17703 + }, + { + "epoch": 26.02646085997795, + "grad_norm": 4.259377479553223, + "learning_rate": 1.3166666666666666e-06, + "loss": 1.9375, + "step": 17724 + }, + { + "epoch": 26.057331863285558, + "grad_norm": 2.7348697185516357, + "learning_rate": 1.3063725490196081e-06, + "loss": 2.018, + "step": 17745 + }, + { + "epoch": 26.088202866593164, + "grad_norm": 3.2641003131866455, + "learning_rate": 1.2960784313725492e-06, + "loss": 2.021, + "step": 17766 + }, + { + "epoch": 26.11907386990077, + "grad_norm": 3.0980889797210693, + "learning_rate": 1.2857843137254903e-06, + "loss": 2.0851, + "step": 17787 + }, + { + "epoch": 26.14994487320838, + "grad_norm": 2.84696888923645, + "learning_rate": 1.2754901960784316e-06, + "loss": 2.0336, + "step": 17808 + }, + { + "epoch": 26.180815876515986, + "grad_norm": 2.650468111038208, + "learning_rate": 1.2651960784313726e-06, + "loss": 2.035, + "step": 17829 + }, + { + "epoch": 26.211686879823596, + "grad_norm": 2.750750780105591, + "learning_rate": 1.2549019607843137e-06, + "loss": 1.9548, + "step": 17850 + }, + { + "epoch": 26.2425578831312, + "grad_norm": 2.1722657680511475, + "learning_rate": 1.244607843137255e-06, + "loss": 1.9388, + "step": 17871 + }, + { + "epoch": 26.273428886438808, + "grad_norm": 3.936103105545044, + "learning_rate": 1.234313725490196e-06, + "loss": 2.0783, + "step": 17892 + }, + { + "epoch": 26.304299889746417, + "grad_norm": 3.3649473190307617, + "learning_rate": 1.2240196078431374e-06, + "loss": 1.9849, + "step": 17913 + }, + { + "epoch": 26.335170893054023, + "grad_norm": 3.6519925594329834, + "learning_rate": 1.2137254901960784e-06, + "loss": 2.0773, + "step": 17934 + }, + { + "epoch": 26.366041896361633, + "grad_norm": 4.6296067237854, + "learning_rate": 1.2034313725490197e-06, + "loss": 2.0144, + "step": 17955 + }, + { + "epoch": 26.39691289966924, + "grad_norm": 3.302056312561035, + "learning_rate": 1.1931372549019608e-06, + "loss": 2.0479, + "step": 17976 + }, + { + "epoch": 26.427783902976845, + "grad_norm": 3.523118734359741, + "learning_rate": 1.182843137254902e-06, + "loss": 2.0372, + "step": 17997 + }, + { + "epoch": 26.458654906284455, + "grad_norm": 4.209780216217041, + "learning_rate": 1.1725490196078432e-06, + "loss": 2.1403, + "step": 18018 + }, + { + "epoch": 26.48952590959206, + "grad_norm": 3.7631661891937256, + "learning_rate": 1.1622549019607845e-06, + "loss": 2.1286, + "step": 18039 + }, + { + "epoch": 26.52039691289967, + "grad_norm": 4.490149021148682, + "learning_rate": 1.1519607843137255e-06, + "loss": 2.0241, + "step": 18060 + }, + { + "epoch": 26.551267916207276, + "grad_norm": 2.7082109451293945, + "learning_rate": 1.1416666666666668e-06, + "loss": 2.025, + "step": 18081 + }, + { + "epoch": 26.582138919514883, + "grad_norm": 3.7196762561798096, + "learning_rate": 1.1313725490196079e-06, + "loss": 2.0884, + "step": 18102 + }, + { + "epoch": 26.613009922822492, + "grad_norm": 4.222052574157715, + "learning_rate": 1.1210784313725492e-06, + "loss": 2.0388, + "step": 18123 + }, + { + "epoch": 26.643880926130098, + "grad_norm": 4.171756267547607, + "learning_rate": 1.1107843137254903e-06, + "loss": 2.028, + "step": 18144 + }, + { + "epoch": 26.674751929437708, + "grad_norm": 4.496833324432373, + "learning_rate": 1.1004901960784315e-06, + "loss": 2.093, + "step": 18165 + }, + { + "epoch": 26.705622932745314, + "grad_norm": 2.4819180965423584, + "learning_rate": 1.0901960784313726e-06, + "loss": 1.8364, + "step": 18186 + }, + { + "epoch": 26.736493936052923, + "grad_norm": 3.3765549659729004, + "learning_rate": 1.0799019607843137e-06, + "loss": 2.0641, + "step": 18207 + }, + { + "epoch": 26.76736493936053, + "grad_norm": 3.363340139389038, + "learning_rate": 1.069607843137255e-06, + "loss": 2.0086, + "step": 18228 + }, + { + "epoch": 26.798235942668136, + "grad_norm": 2.692979335784912, + "learning_rate": 1.059313725490196e-06, + "loss": 2.0157, + "step": 18249 + }, + { + "epoch": 26.829106945975745, + "grad_norm": 3.666555881500244, + "learning_rate": 1.0490196078431373e-06, + "loss": 2.0233, + "step": 18270 + }, + { + "epoch": 26.85997794928335, + "grad_norm": 3.171398639678955, + "learning_rate": 1.0387254901960784e-06, + "loss": 1.976, + "step": 18291 + }, + { + "epoch": 26.89084895259096, + "grad_norm": 4.0071868896484375, + "learning_rate": 1.0284313725490197e-06, + "loss": 1.9523, + "step": 18312 + }, + { + "epoch": 26.921719955898567, + "grad_norm": 4.549193859100342, + "learning_rate": 1.0181372549019608e-06, + "loss": 1.9475, + "step": 18333 + }, + { + "epoch": 26.952590959206173, + "grad_norm": 4.22450590133667, + "learning_rate": 1.007843137254902e-06, + "loss": 1.9244, + "step": 18354 + }, + { + "epoch": 26.983461962513783, + "grad_norm": 3.5466229915618896, + "learning_rate": 9.975490196078434e-07, + "loss": 2.0134, + "step": 18375 + }, + { + "epoch": 27.013230429988976, + "grad_norm": 3.650482416152954, + "learning_rate": 9.872549019607844e-07, + "loss": 1.9523, + "step": 18396 + }, + { + "epoch": 27.044101433296582, + "grad_norm": 3.628394842147827, + "learning_rate": 9.769607843137257e-07, + "loss": 2.126, + "step": 18417 + }, + { + "epoch": 27.074972436604188, + "grad_norm": 2.50268816947937, + "learning_rate": 9.666666666666668e-07, + "loss": 2.1265, + "step": 18438 + }, + { + "epoch": 27.105843439911798, + "grad_norm": 2.1516237258911133, + "learning_rate": 9.563725490196079e-07, + "loss": 2.0307, + "step": 18459 + }, + { + "epoch": 27.136714443219404, + "grad_norm": 3.48169207572937, + "learning_rate": 9.460784313725491e-07, + "loss": 2.0434, + "step": 18480 + }, + { + "epoch": 27.167585446527013, + "grad_norm": 5.042967796325684, + "learning_rate": 9.357843137254903e-07, + "loss": 2.0246, + "step": 18501 + }, + { + "epoch": 27.19845644983462, + "grad_norm": 2.58532977104187, + "learning_rate": 9.254901960784314e-07, + "loss": 2.0705, + "step": 18522 + }, + { + "epoch": 27.229327453142226, + "grad_norm": 2.8869588375091553, + "learning_rate": 9.151960784313726e-07, + "loss": 2.0355, + "step": 18543 + }, + { + "epoch": 27.260198456449835, + "grad_norm": 3.409188985824585, + "learning_rate": 9.049019607843138e-07, + "loss": 1.8979, + "step": 18564 + }, + { + "epoch": 27.29106945975744, + "grad_norm": 2.7081124782562256, + "learning_rate": 8.94607843137255e-07, + "loss": 1.9425, + "step": 18585 + }, + { + "epoch": 27.32194046306505, + "grad_norm": 2.522266149520874, + "learning_rate": 8.84313725490196e-07, + "loss": 1.9749, + "step": 18606 + }, + { + "epoch": 27.352811466372657, + "grad_norm": 2.937058210372925, + "learning_rate": 8.740196078431373e-07, + "loss": 1.981, + "step": 18627 + }, + { + "epoch": 27.383682469680263, + "grad_norm": 3.0787739753723145, + "learning_rate": 8.637254901960786e-07, + "loss": 2.0818, + "step": 18648 + }, + { + "epoch": 27.414553472987873, + "grad_norm": 4.102266311645508, + "learning_rate": 8.534313725490197e-07, + "loss": 1.8458, + "step": 18669 + }, + { + "epoch": 27.44542447629548, + "grad_norm": 5.012358665466309, + "learning_rate": 8.431372549019609e-07, + "loss": 2.0026, + "step": 18690 + }, + { + "epoch": 27.47629547960309, + "grad_norm": 3.763712167739868, + "learning_rate": 8.328431372549021e-07, + "loss": 2.091, + "step": 18711 + }, + { + "epoch": 27.507166482910694, + "grad_norm": 3.3943490982055664, + "learning_rate": 8.230392156862746e-07, + "loss": 2.0389, + "step": 18732 + }, + { + "epoch": 27.5380374862183, + "grad_norm": 3.6211483478546143, + "learning_rate": 8.127450980392157e-07, + "loss": 2.0371, + "step": 18753 + }, + { + "epoch": 27.56890848952591, + "grad_norm": 3.4411706924438477, + "learning_rate": 8.02450980392157e-07, + "loss": 1.9773, + "step": 18774 + }, + { + "epoch": 27.599779492833516, + "grad_norm": 2.2726285457611084, + "learning_rate": 7.921568627450981e-07, + "loss": 1.9814, + "step": 18795 + }, + { + "epoch": 27.630650496141126, + "grad_norm": 6.275850296020508, + "learning_rate": 7.818627450980392e-07, + "loss": 2.009, + "step": 18816 + }, + { + "epoch": 27.66152149944873, + "grad_norm": 3.4813263416290283, + "learning_rate": 7.715686274509805e-07, + "loss": 1.9447, + "step": 18837 + }, + { + "epoch": 27.69239250275634, + "grad_norm": 3.8926095962524414, + "learning_rate": 7.612745098039216e-07, + "loss": 2.1061, + "step": 18858 + }, + { + "epoch": 27.723263506063947, + "grad_norm": 3.2279534339904785, + "learning_rate": 7.509803921568629e-07, + "loss": 2.1357, + "step": 18879 + }, + { + "epoch": 27.754134509371553, + "grad_norm": 4.1901631355285645, + "learning_rate": 7.40686274509804e-07, + "loss": 2.0081, + "step": 18900 + }, + { + "epoch": 27.785005512679163, + "grad_norm": 3.5056240558624268, + "learning_rate": 7.303921568627451e-07, + "loss": 2.0354, + "step": 18921 + }, + { + "epoch": 27.81587651598677, + "grad_norm": 3.4158339500427246, + "learning_rate": 7.200980392156863e-07, + "loss": 2.0163, + "step": 18942 + }, + { + "epoch": 27.84674751929438, + "grad_norm": 2.8889598846435547, + "learning_rate": 7.098039215686275e-07, + "loss": 2.0046, + "step": 18963 + }, + { + "epoch": 27.877618522601985, + "grad_norm": 3.9091246128082275, + "learning_rate": 6.995098039215687e-07, + "loss": 1.9854, + "step": 18984 + }, + { + "epoch": 27.90848952590959, + "grad_norm": 4.670572280883789, + "learning_rate": 6.892156862745099e-07, + "loss": 2.0157, + "step": 19005 + }, + { + "epoch": 27.9393605292172, + "grad_norm": 2.7897861003875732, + "learning_rate": 6.78921568627451e-07, + "loss": 2.03, + "step": 19026 + }, + { + "epoch": 27.970231532524807, + "grad_norm": 3.8964619636535645, + "learning_rate": 6.686274509803922e-07, + "loss": 2.028, + "step": 19047 + }, + { + "epoch": 28.0, + "grad_norm": 1.9693920612335205, + "learning_rate": 6.583333333333333e-07, + "loss": 1.8564, + "step": 19068 + }, + { + "epoch": 28.030871003307606, + "grad_norm": 4.751320838928223, + "learning_rate": 6.480392156862746e-07, + "loss": 1.9493, + "step": 19089 + }, + { + "epoch": 28.061742006615216, + "grad_norm": 3.4831652641296387, + "learning_rate": 6.377450980392158e-07, + "loss": 1.9572, + "step": 19110 + }, + { + "epoch": 28.09261300992282, + "grad_norm": 3.4712512493133545, + "learning_rate": 6.274509803921569e-07, + "loss": 2.0309, + "step": 19131 + }, + { + "epoch": 28.12348401323043, + "grad_norm": 3.9922566413879395, + "learning_rate": 6.17156862745098e-07, + "loss": 1.9811, + "step": 19152 + }, + { + "epoch": 28.154355016538037, + "grad_norm": 4.792636871337891, + "learning_rate": 6.068627450980392e-07, + "loss": 1.9534, + "step": 19173 + }, + { + "epoch": 28.185226019845643, + "grad_norm": 3.729520559310913, + "learning_rate": 5.965686274509804e-07, + "loss": 1.9456, + "step": 19194 + }, + { + "epoch": 28.216097023153253, + "grad_norm": 2.783691167831421, + "learning_rate": 5.862745098039216e-07, + "loss": 2.0291, + "step": 19215 + }, + { + "epoch": 28.24696802646086, + "grad_norm": 3.3077173233032227, + "learning_rate": 5.759803921568628e-07, + "loss": 2.0922, + "step": 19236 + }, + { + "epoch": 28.27783902976847, + "grad_norm": 3.1509790420532227, + "learning_rate": 5.656862745098039e-07, + "loss": 2.1581, + "step": 19257 + }, + { + "epoch": 28.308710033076075, + "grad_norm": 4.7033538818359375, + "learning_rate": 5.553921568627451e-07, + "loss": 1.9384, + "step": 19278 + }, + { + "epoch": 28.33958103638368, + "grad_norm": 4.201630115509033, + "learning_rate": 5.450980392156863e-07, + "loss": 1.9448, + "step": 19299 + }, + { + "epoch": 28.37045203969129, + "grad_norm": 3.8276824951171875, + "learning_rate": 5.348039215686275e-07, + "loss": 1.9769, + "step": 19320 + }, + { + "epoch": 28.401323042998897, + "grad_norm": 4.129844665527344, + "learning_rate": 5.245098039215687e-07, + "loss": 2.1262, + "step": 19341 + }, + { + "epoch": 28.432194046306506, + "grad_norm": 3.277721643447876, + "learning_rate": 5.142156862745099e-07, + "loss": 1.9226, + "step": 19362 + }, + { + "epoch": 28.463065049614112, + "grad_norm": 3.1267642974853516, + "learning_rate": 5.03921568627451e-07, + "loss": 2.1073, + "step": 19383 + }, + { + "epoch": 28.49393605292172, + "grad_norm": 3.5480620861053467, + "learning_rate": 4.936274509803922e-07, + "loss": 2.0114, + "step": 19404 + }, + { + "epoch": 28.524807056229328, + "grad_norm": 3.172790765762329, + "learning_rate": 4.833333333333334e-07, + "loss": 2.0302, + "step": 19425 + }, + { + "epoch": 28.555678059536934, + "grad_norm": 3.0664799213409424, + "learning_rate": 4.7303921568627453e-07, + "loss": 2.0294, + "step": 19446 + }, + { + "epoch": 28.586549062844544, + "grad_norm": 2.906982421875, + "learning_rate": 4.627450980392157e-07, + "loss": 2.0385, + "step": 19467 + }, + { + "epoch": 28.61742006615215, + "grad_norm": 4.4187541007995605, + "learning_rate": 4.524509803921569e-07, + "loss": 2.0023, + "step": 19488 + }, + { + "epoch": 28.64829106945976, + "grad_norm": 3.3276491165161133, + "learning_rate": 4.42156862745098e-07, + "loss": 2.0314, + "step": 19509 + }, + { + "epoch": 28.679162072767365, + "grad_norm": 3.633134365081787, + "learning_rate": 4.318627450980393e-07, + "loss": 2.0017, + "step": 19530 + }, + { + "epoch": 28.71003307607497, + "grad_norm": 3.469681978225708, + "learning_rate": 4.2156862745098044e-07, + "loss": 1.9007, + "step": 19551 + }, + { + "epoch": 28.74090407938258, + "grad_norm": 3.641199827194214, + "learning_rate": 4.112745098039216e-07, + "loss": 2.0142, + "step": 19572 + }, + { + "epoch": 28.771775082690187, + "grad_norm": 3.490839719772339, + "learning_rate": 4.009803921568628e-07, + "loss": 2.106, + "step": 19593 + }, + { + "epoch": 28.802646085997797, + "grad_norm": 4.220351219177246, + "learning_rate": 3.90686274509804e-07, + "loss": 1.9925, + "step": 19614 + }, + { + "epoch": 28.833517089305403, + "grad_norm": 3.093439817428589, + "learning_rate": 3.8039215686274516e-07, + "loss": 1.9348, + "step": 19635 + }, + { + "epoch": 28.86438809261301, + "grad_norm": 3.168133497238159, + "learning_rate": 3.700980392156863e-07, + "loss": 2.1087, + "step": 19656 + }, + { + "epoch": 28.89525909592062, + "grad_norm": 3.433194875717163, + "learning_rate": 3.5980392156862747e-07, + "loss": 1.9117, + "step": 19677 + }, + { + "epoch": 28.926130099228224, + "grad_norm": 3.6745409965515137, + "learning_rate": 3.4950980392156865e-07, + "loss": 2.0456, + "step": 19698 + }, + { + "epoch": 28.957001102535834, + "grad_norm": 4.280655384063721, + "learning_rate": 3.3921568627450984e-07, + "loss": 2.0921, + "step": 19719 + }, + { + "epoch": 28.98787210584344, + "grad_norm": 2.682467460632324, + "learning_rate": 3.2892156862745096e-07, + "loss": 2.1163, + "step": 19740 + }, + { + "epoch": 29.017640573318634, + "grad_norm": 4.8197712898254395, + "learning_rate": 3.1862745098039215e-07, + "loss": 1.9863, + "step": 19761 + }, + { + "epoch": 29.04851157662624, + "grad_norm": 3.6110095977783203, + "learning_rate": 3.083333333333334e-07, + "loss": 2.014, + "step": 19782 + }, + { + "epoch": 29.07938257993385, + "grad_norm": 4.205295085906982, + "learning_rate": 2.9803921568627456e-07, + "loss": 1.9606, + "step": 19803 + }, + { + "epoch": 29.110253583241455, + "grad_norm": 4.5187907218933105, + "learning_rate": 2.877450980392157e-07, + "loss": 2.0288, + "step": 19824 + }, + { + "epoch": 29.14112458654906, + "grad_norm": 3.4736719131469727, + "learning_rate": 2.7745098039215687e-07, + "loss": 1.9578, + "step": 19845 + }, + { + "epoch": 29.17199558985667, + "grad_norm": 3.5511319637298584, + "learning_rate": 2.6715686274509805e-07, + "loss": 1.9772, + "step": 19866 + }, + { + "epoch": 29.202866593164277, + "grad_norm": 2.8322770595550537, + "learning_rate": 2.5686274509803924e-07, + "loss": 1.9669, + "step": 19887 + }, + { + "epoch": 29.233737596471887, + "grad_norm": 5.3508477210998535, + "learning_rate": 2.465686274509804e-07, + "loss": 2.0524, + "step": 19908 + }, + { + "epoch": 29.264608599779493, + "grad_norm": 4.745840549468994, + "learning_rate": 2.362745098039216e-07, + "loss": 2.0862, + "step": 19929 + }, + { + "epoch": 29.2954796030871, + "grad_norm": 4.096988677978516, + "learning_rate": 2.2598039215686275e-07, + "loss": 1.954, + "step": 19950 + }, + { + "epoch": 29.32635060639471, + "grad_norm": 2.3860230445861816, + "learning_rate": 2.1568627450980394e-07, + "loss": 2.0573, + "step": 19971 + }, + { + "epoch": 29.357221609702314, + "grad_norm": 3.304518461227417, + "learning_rate": 2.0539215686274512e-07, + "loss": 2.0871, + "step": 19992 + }, + { + "epoch": 29.388092613009924, + "grad_norm": 3.948188304901123, + "learning_rate": 1.9509803921568627e-07, + "loss": 2.0126, + "step": 20013 + }, + { + "epoch": 29.41896361631753, + "grad_norm": 2.696289300918579, + "learning_rate": 1.8480392156862748e-07, + "loss": 2.0085, + "step": 20034 + }, + { + "epoch": 29.449834619625136, + "grad_norm": 3.4525928497314453, + "learning_rate": 1.7450980392156866e-07, + "loss": 2.0798, + "step": 20055 + }, + { + "epoch": 29.480705622932746, + "grad_norm": 3.622753620147705, + "learning_rate": 1.6421568627450982e-07, + "loss": 2.0391, + "step": 20076 + }, + { + "epoch": 29.511576626240352, + "grad_norm": 3.2289085388183594, + "learning_rate": 1.53921568627451e-07, + "loss": 1.992, + "step": 20097 + }, + { + "epoch": 29.54244762954796, + "grad_norm": 3.7036259174346924, + "learning_rate": 1.4362745098039215e-07, + "loss": 2.0901, + "step": 20118 + }, + { + "epoch": 29.573318632855568, + "grad_norm": 3.6887247562408447, + "learning_rate": 1.3333333333333336e-07, + "loss": 2.0561, + "step": 20139 + }, + { + "epoch": 29.604189636163177, + "grad_norm": 2.8597869873046875, + "learning_rate": 1.2303921568627452e-07, + "loss": 2.072, + "step": 20160 + }, + { + "epoch": 29.635060639470783, + "grad_norm": 3.587333917617798, + "learning_rate": 1.127450980392157e-07, + "loss": 2.0183, + "step": 20181 + }, + { + "epoch": 29.66593164277839, + "grad_norm": 2.980628490447998, + "learning_rate": 1.0245098039215687e-07, + "loss": 2.0967, + "step": 20202 + }, + { + "epoch": 29.696802646086, + "grad_norm": 3.0355567932128906, + "learning_rate": 9.215686274509805e-08, + "loss": 2.0283, + "step": 20223 + }, + { + "epoch": 29.727673649393605, + "grad_norm": 3.7858340740203857, + "learning_rate": 8.186274509803922e-08, + "loss": 1.9461, + "step": 20244 + }, + { + "epoch": 29.75854465270121, + "grad_norm": 3.9610328674316406, + "learning_rate": 7.15686274509804e-08, + "loss": 2.0268, + "step": 20265 + }, + { + "epoch": 29.78941565600882, + "grad_norm": 3.412227153778076, + "learning_rate": 6.127450980392157e-08, + "loss": 1.903, + "step": 20286 + }, + { + "epoch": 29.820286659316427, + "grad_norm": 3.366365671157837, + "learning_rate": 5.098039215686275e-08, + "loss": 2.0263, + "step": 20307 + }, + { + "epoch": 29.851157662624036, + "grad_norm": 3.1891021728515625, + "learning_rate": 4.0686274509803924e-08, + "loss": 1.9993, + "step": 20328 + }, + { + "epoch": 29.882028665931642, + "grad_norm": 2.5397629737854004, + "learning_rate": 3.03921568627451e-08, + "loss": 1.8939, + "step": 20349 + }, + { + "epoch": 29.912899669239252, + "grad_norm": 3.0380682945251465, + "learning_rate": 2.0098039215686278e-08, + "loss": 2.0146, + "step": 20370 + }, + { + "epoch": 29.943770672546858, + "grad_norm": 4.6516828536987305, + "learning_rate": 9.803921568627451e-09, + "loss": 2.0806, + "step": 20391 + }, + { + "epoch": 29.957001102535834, + "step": 20400, + "total_flos": 1.0698124218303099e+18, + "train_loss": 2.100168639164345, + "train_runtime": 22354.634, + "train_samples_per_second": 182.572, + "train_steps_per_second": 0.913 + } + ], + "logging_steps": 21, + "max_steps": 20400, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 6800, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0698124218303099e+18, + "train_batch_size": 50, + "trial_name": null, + "trial_params": null +}