diff --git "a/checkpoint-10188/trainer_state.json" "b/checkpoint-10188/trainer_state.json" --- "a/checkpoint-10188/trainer_state.json" +++ "b/checkpoint-10188/trainer_state.json" @@ -1,7 +1,7 @@ { "best_global_step": 7641, - "best_metric": 0.19104745984077454, - "best_model_checkpoint": "/tmp/tmpvs_embab/adapter-multilabel/checkpoint-7641", + "best_metric": 0.2000712752342224, + "best_model_checkpoint": "/tmp/tmpfnnwubec/adapter-multilabel/checkpoint-7641", "epoch": 4.0, "eval_steps": 500, "global_step": 10188, @@ -11,7160 +11,7160 @@ "log_history": [ { "epoch": 0.0039261876717707105, - "grad_norm": 3.9793148040771484, + "grad_norm": 3.481051445007324, "learning_rate": 8.829300196206672e-08, - "loss": 0.965, + "loss": 1.0581, "step": 10 }, { "epoch": 0.007852375343541421, - "grad_norm": 5.651288986206055, + "grad_norm": 2.726672410964966, "learning_rate": 1.8639633747547416e-07, - "loss": 1.0887, + "loss": 1.0359, "step": 20 }, { "epoch": 0.011778563015312132, - "grad_norm": 3.057249069213867, + "grad_norm": 3.473125457763672, "learning_rate": 2.844996729888816e-07, - "loss": 0.8455, + "loss": 0.8935, "step": 30 }, { "epoch": 0.015704750687082842, - "grad_norm": 4.313498497009277, + "grad_norm": 3.2374024391174316, "learning_rate": 3.8260300850228906e-07, - "loss": 0.9154, + "loss": 0.9647, "step": 40 }, { "epoch": 0.01963093835885355, - "grad_norm": 5.464332580566406, + "grad_norm": 6.737728595733643, "learning_rate": 4.807063440156965e-07, - "loss": 0.9871, + "loss": 1.1304, "step": 50 }, { "epoch": 0.023557126030624265, - "grad_norm": 3.1885132789611816, + "grad_norm": 3.1872334480285645, "learning_rate": 5.788096795291039e-07, - "loss": 1.0026, + "loss": 0.9217, "step": 60 }, { "epoch": 0.027483313702394974, - "grad_norm": 5.261380672454834, + "grad_norm": 5.807702541351318, "learning_rate": 6.769130150425114e-07, - "loss": 0.9838, + "loss": 1.0496, "step": 70 }, { "epoch": 0.031409501374165684, - "grad_norm": 7.684381484985352, + "grad_norm": 7.070641994476318, "learning_rate": 7.750163505559188e-07, - "loss": 1.0479, + "loss": 1.0097, "step": 80 }, { "epoch": 0.0353356890459364, - "grad_norm": 2.8870339393615723, + "grad_norm": 3.3689141273498535, "learning_rate": 8.731196860693265e-07, - "loss": 0.9857, + "loss": 1.0643, "step": 90 }, { "epoch": 0.0392618767177071, - "grad_norm": 2.8800153732299805, + "grad_norm": 3.1216671466827393, "learning_rate": 9.712230215827338e-07, - "loss": 0.9496, + "loss": 0.9438, "step": 100 }, { "epoch": 0.043188064389477816, - "grad_norm": 4.835630893707275, + "grad_norm": 5.534323692321777, "learning_rate": 1.0693263570961413e-06, - "loss": 0.9488, + "loss": 0.9201, "step": 110 }, { "epoch": 0.04711425206124853, - "grad_norm": 6.1992645263671875, + "grad_norm": 5.127967357635498, "learning_rate": 1.1674296926095486e-06, - "loss": 1.0918, + "loss": 1.077, "step": 120 }, { "epoch": 0.051040439733019236, - "grad_norm": 3.0620675086975098, + "grad_norm": 2.993151903152466, "learning_rate": 1.2655330281229563e-06, - "loss": 0.9755, + "loss": 0.9443, "step": 130 }, { "epoch": 0.05496662740478995, - "grad_norm": 2.86425518989563, + "grad_norm": 3.4643309116363525, "learning_rate": 1.3636363636363636e-06, - "loss": 0.9964, + "loss": 0.9822, "step": 140 }, { "epoch": 0.05889281507656066, - "grad_norm": 5.927287578582764, + "grad_norm": 2.5459845066070557, "learning_rate": 1.4617396991497711e-06, - "loss": 0.9554, + "loss": 0.8806, "step": 150 }, { "epoch": 0.06281900274833137, - "grad_norm": 3.132922649383545, + "grad_norm": 3.301366090774536, "learning_rate": 1.5598430346631786e-06, - "loss": 0.9391, + "loss": 0.9089, "step": 160 }, { "epoch": 0.06674519042010207, - "grad_norm": 7.593597412109375, + "grad_norm": 9.29371166229248, "learning_rate": 1.657946370176586e-06, - "loss": 0.9444, + "loss": 0.9116, "step": 170 }, { "epoch": 0.0706713780918728, - "grad_norm": 2.500877857208252, + "grad_norm": 2.902604341506958, "learning_rate": 1.7560497056899937e-06, - "loss": 0.8082, + "loss": 0.8661, "step": 180 }, { "epoch": 0.0745975657636435, - "grad_norm": 3.361327648162842, + "grad_norm": 3.1522815227508545, "learning_rate": 1.854153041203401e-06, - "loss": 0.7621, + "loss": 0.8099, "step": 190 }, { "epoch": 0.0785237534354142, - "grad_norm": 6.126520156860352, + "grad_norm": 3.7005128860473633, "learning_rate": 1.9522563767168085e-06, - "loss": 1.1926, + "loss": 1.1836, "step": 200 }, { "epoch": 0.08244994110718493, - "grad_norm": 2.6630825996398926, + "grad_norm": 2.894810914993286, "learning_rate": 2.050359712230216e-06, - "loss": 0.7418, + "loss": 0.7575, "step": 210 }, { "epoch": 0.08637612877895563, - "grad_norm": 4.095630168914795, + "grad_norm": 4.037342071533203, "learning_rate": 2.1484630477436235e-06, - "loss": 0.9533, + "loss": 0.9232, "step": 220 }, { "epoch": 0.09030231645072634, - "grad_norm": 3.654273271560669, + "grad_norm": 2.4140400886535645, "learning_rate": 2.246566383257031e-06, - "loss": 0.9681, + "loss": 0.9533, "step": 230 }, { "epoch": 0.09422850412249706, - "grad_norm": 3.933027505874634, + "grad_norm": 3.2682995796203613, "learning_rate": 2.344669718770438e-06, - "loss": 1.0246, + "loss": 1.007, "step": 240 }, { "epoch": 0.09815469179426776, - "grad_norm": 3.5743088722229004, + "grad_norm": 2.970691442489624, "learning_rate": 2.442773054283846e-06, - "loss": 1.0907, + "loss": 1.0529, "step": 250 }, { "epoch": 0.10208087946603847, - "grad_norm": 5.295337200164795, + "grad_norm": 7.472085475921631, "learning_rate": 2.540876389797253e-06, - "loss": 0.9328, + "loss": 0.9884, "step": 260 }, { "epoch": 0.10600706713780919, - "grad_norm": 2.843456983566284, + "grad_norm": 2.9394443035125732, "learning_rate": 2.6389797253106604e-06, - "loss": 0.9408, + "loss": 0.9467, "step": 270 }, { "epoch": 0.1099332548095799, - "grad_norm": 3.5099680423736572, + "grad_norm": 3.025452136993408, "learning_rate": 2.7370830608240677e-06, - "loss": 0.6865, + "loss": 0.6476, "step": 280 }, { "epoch": 0.1138594424813506, - "grad_norm": 3.213709831237793, + "grad_norm": 2.8236961364746094, "learning_rate": 2.8351863963374755e-06, - "loss": 0.9818, + "loss": 0.9491, "step": 290 }, { "epoch": 0.11778563015312132, - "grad_norm": 3.519076347351074, + "grad_norm": 3.2733681201934814, "learning_rate": 2.933289731850883e-06, - "loss": 0.896, + "loss": 0.9067, "step": 300 }, { "epoch": 0.12171181782489203, - "grad_norm": 4.259673118591309, + "grad_norm": 4.042492866516113, "learning_rate": 3.0313930673642905e-06, - "loss": 1.0808, + "loss": 0.9983, "step": 310 }, { "epoch": 0.12563800549666274, - "grad_norm": 5.529066562652588, + "grad_norm": 7.6011552810668945, "learning_rate": 3.1294964028776982e-06, - "loss": 0.8814, + "loss": 0.8858, "step": 320 }, { "epoch": 0.12956419316843346, - "grad_norm": 2.591524124145508, + "grad_norm": 2.6120543479919434, "learning_rate": 3.2275997383911055e-06, - "loss": 1.0676, + "loss": 1.1023, "step": 330 }, { "epoch": 0.13349038084020415, - "grad_norm": 3.512587308883667, + "grad_norm": 4.009816646575928, "learning_rate": 3.325703073904513e-06, - "loss": 0.7467, + "loss": 0.7161, "step": 340 }, { "epoch": 0.13741656851197487, - "grad_norm": 7.953798294067383, + "grad_norm": 5.020063877105713, "learning_rate": 3.42380640941792e-06, - "loss": 0.8037, + "loss": 0.7537, "step": 350 }, { "epoch": 0.1413427561837456, - "grad_norm": 4.76967191696167, + "grad_norm": 5.510372638702393, "learning_rate": 3.521909744931328e-06, - "loss": 0.7553, + "loss": 0.6776, "step": 360 }, { "epoch": 0.14526894385551628, - "grad_norm": 3.284162998199463, + "grad_norm": 3.419509172439575, "learning_rate": 3.620013080444735e-06, - "loss": 0.91, + "loss": 0.8777, "step": 370 }, { "epoch": 0.149195131527287, - "grad_norm": 5.602280139923096, + "grad_norm": 5.865817546844482, "learning_rate": 3.7181164159581425e-06, - "loss": 0.7732, + "loss": 0.7006, "step": 380 }, { "epoch": 0.15312131919905772, - "grad_norm": 15.13402271270752, + "grad_norm": 19.455717086791992, "learning_rate": 3.81621975147155e-06, - "loss": 0.9715, + "loss": 1.0258, "step": 390 }, { "epoch": 0.1570475068708284, - "grad_norm": 8.506321907043457, + "grad_norm": 8.902175903320312, "learning_rate": 3.914323086984958e-06, - "loss": 0.8935, + "loss": 0.8563, "step": 400 }, { "epoch": 0.16097369454259913, - "grad_norm": 17.040428161621094, + "grad_norm": 20.07317352294922, "learning_rate": 4.012426422498365e-06, - "loss": 0.9555, + "loss": 0.9596, "step": 410 }, { "epoch": 0.16489988221436985, - "grad_norm": 13.54706859588623, + "grad_norm": 13.66897201538086, "learning_rate": 4.1105297580117725e-06, - "loss": 0.7936, + "loss": 0.84, "step": 420 }, { "epoch": 0.16882606988614055, - "grad_norm": 3.683502674102783, + "grad_norm": 4.654616832733154, "learning_rate": 4.208633093525179e-06, - "loss": 0.8609, + "loss": 0.8919, "step": 430 }, { "epoch": 0.17275225755791127, - "grad_norm": 23.729930877685547, + "grad_norm": 28.85814666748047, "learning_rate": 4.306736429038587e-06, - "loss": 1.0903, + "loss": 1.1332, "step": 440 }, { "epoch": 0.17667844522968199, - "grad_norm": 3.782012701034546, + "grad_norm": 4.021274089813232, "learning_rate": 4.404839764551996e-06, - "loss": 0.9143, + "loss": 0.9797, "step": 450 }, { "epoch": 0.18060463290145268, - "grad_norm": 2.5681381225585938, + "grad_norm": 2.763209104537964, "learning_rate": 4.5029431000654026e-06, - "loss": 0.7391, + "loss": 0.7481, "step": 460 }, { "epoch": 0.1845308205732234, - "grad_norm": 4.402597904205322, + "grad_norm": 4.355885028839111, "learning_rate": 4.60104643557881e-06, - "loss": 0.9954, + "loss": 1.0002, "step": 470 }, { "epoch": 0.18845700824499412, - "grad_norm": 8.97122573852539, + "grad_norm": 6.94189977645874, "learning_rate": 4.699149771092217e-06, - "loss": 0.882, + "loss": 0.9269, "step": 480 }, { "epoch": 0.1923831959167648, - "grad_norm": 7.4766621589660645, + "grad_norm": 9.310148239135742, "learning_rate": 4.797253106605625e-06, - "loss": 0.7004, + "loss": 0.694, "step": 490 }, { "epoch": 0.19630938358853553, - "grad_norm": 10.86671257019043, + "grad_norm": 10.130315780639648, "learning_rate": 4.895356442119032e-06, - "loss": 0.767, + "loss": 0.7089, "step": 500 }, { "epoch": 0.20023557126030625, - "grad_norm": 21.112340927124023, + "grad_norm": 22.3338680267334, "learning_rate": 4.9934597776324395e-06, - "loss": 0.939, + "loss": 0.9072, "step": 510 }, { "epoch": 0.20416175893207694, - "grad_norm": 10.99935531616211, + "grad_norm": 9.217657089233398, "learning_rate": 5.091563113145846e-06, - "loss": 0.9089, + "loss": 0.9927, "step": 520 }, { "epoch": 0.20808794660384766, - "grad_norm": 4.02927827835083, + "grad_norm": 4.5457258224487305, "learning_rate": 5.189666448659255e-06, - "loss": 0.7263, + "loss": 0.6635, "step": 530 }, { "epoch": 0.21201413427561838, - "grad_norm": 15.078070640563965, + "grad_norm": 19.128801345825195, "learning_rate": 5.287769784172662e-06, - "loss": 0.8193, + "loss": 0.7747, "step": 540 }, { "epoch": 0.21594032194738907, - "grad_norm": 6.6269330978393555, + "grad_norm": 5.427403926849365, "learning_rate": 5.3858731196860696e-06, - "loss": 0.8408, + "loss": 0.78, "step": 550 }, { "epoch": 0.2198665096191598, - "grad_norm": 4.6614298820495605, + "grad_norm": 5.932145118713379, "learning_rate": 5.483976455199477e-06, - "loss": 0.6491, + "loss": 0.6514, "step": 560 }, { "epoch": 0.22379269729093051, - "grad_norm": 8.660489082336426, + "grad_norm": 8.63182544708252, "learning_rate": 5.582079790712884e-06, - "loss": 0.8506, + "loss": 0.9256, "step": 570 }, { "epoch": 0.2277188849627012, - "grad_norm": 6.674793243408203, + "grad_norm": 5.8081183433532715, "learning_rate": 5.680183126226292e-06, - "loss": 0.5018, + "loss": 0.5146, "step": 580 }, { "epoch": 0.23164507263447193, - "grad_norm": 10.671791076660156, + "grad_norm": 10.12514591217041, "learning_rate": 5.778286461739699e-06, - "loss": 0.7786, + "loss": 0.8669, "step": 590 }, { "epoch": 0.23557126030624265, - "grad_norm": 5.927751541137695, + "grad_norm": 6.432973384857178, "learning_rate": 5.876389797253107e-06, - "loss": 0.6311, + "loss": 0.5922, "step": 600 }, { "epoch": 0.23949744797801334, - "grad_norm": 11.92956256866455, + "grad_norm": 8.82177734375, "learning_rate": 5.974493132766514e-06, - "loss": 0.8829, + "loss": 0.7083, "step": 610 }, { "epoch": 0.24342363564978406, - "grad_norm": 7.990926265716553, + "grad_norm": 7.296743869781494, "learning_rate": 6.072596468279922e-06, - "loss": 0.557, + "loss": 0.5272, "step": 620 }, { "epoch": 0.24734982332155478, - "grad_norm": 3.7177248001098633, + "grad_norm": 3.679032802581787, "learning_rate": 6.170699803793329e-06, - "loss": 0.9214, + "loss": 0.8344, "step": 630 }, { "epoch": 0.25127601099332547, - "grad_norm": 9.946877479553223, + "grad_norm": 14.788595199584961, "learning_rate": 6.2688031393067365e-06, - "loss": 0.6521, + "loss": 0.7171, "step": 640 }, { "epoch": 0.2552021986650962, - "grad_norm": 5.775181770324707, + "grad_norm": 4.707211017608643, "learning_rate": 6.366906474820144e-06, - "loss": 0.5572, + "loss": 0.5191, "step": 650 }, { "epoch": 0.2591283863368669, - "grad_norm": 29.76291847229004, + "grad_norm": 19.173988342285156, "learning_rate": 6.465009810333551e-06, - "loss": 0.7875, + "loss": 0.7087, "step": 660 }, { "epoch": 0.26305457400863763, - "grad_norm": 9.139225006103516, + "grad_norm": 20.141216278076172, "learning_rate": 6.563113145846959e-06, - "loss": 0.9778, + "loss": 0.9065, "step": 670 }, { "epoch": 0.2669807616804083, - "grad_norm": 6.76605749130249, + "grad_norm": 5.210958480834961, "learning_rate": 6.661216481360367e-06, - "loss": 0.5986, + "loss": 0.6185, "step": 680 }, { "epoch": 0.270906949352179, - "grad_norm": 10.687578201293945, + "grad_norm": 10.236114501953125, "learning_rate": 6.759319816873774e-06, - "loss": 0.655, + "loss": 0.6902, "step": 690 }, { "epoch": 0.27483313702394974, - "grad_norm": 4.13920259475708, + "grad_norm": 4.203823089599609, "learning_rate": 6.857423152387181e-06, - "loss": 0.4445, + "loss": 0.5212, "step": 700 }, { "epoch": 0.27875932469572046, - "grad_norm": 9.511812210083008, + "grad_norm": 10.880992889404297, "learning_rate": 6.955526487900589e-06, - "loss": 0.6284, + "loss": 0.6867, "step": 710 }, { "epoch": 0.2826855123674912, - "grad_norm": 23.41804313659668, + "grad_norm": 15.194366455078125, "learning_rate": 7.053629823413996e-06, - "loss": 0.5792, + "loss": 0.5287, "step": 720 }, { "epoch": 0.2866117000392619, - "grad_norm": 2.432250738143921, + "grad_norm": 2.420100212097168, "learning_rate": 7.1517331589274035e-06, - "loss": 0.7348, + "loss": 0.6973, "step": 730 }, { "epoch": 0.29053788771103256, - "grad_norm": 11.461575508117676, + "grad_norm": 3.989758253097534, "learning_rate": 7.24983649444081e-06, - "loss": 0.407, + "loss": 0.3747, "step": 740 }, { "epoch": 0.2944640753828033, - "grad_norm": 24.64181900024414, + "grad_norm": 7.890944480895996, "learning_rate": 7.347939829954219e-06, - "loss": 0.7198, + "loss": 0.7078, "step": 750 }, { "epoch": 0.298390263054574, - "grad_norm": 3.43178653717041, + "grad_norm": 3.2260732650756836, "learning_rate": 7.446043165467627e-06, - "loss": 0.7367, + "loss": 0.7524, "step": 760 }, { "epoch": 0.3023164507263447, - "grad_norm": 11.215662956237793, + "grad_norm": 8.550039291381836, "learning_rate": 7.544146500981034e-06, - "loss": 0.6012, + "loss": 0.656, "step": 770 }, { "epoch": 0.30624263839811544, - "grad_norm": 2.662524700164795, + "grad_norm": 7.197936534881592, "learning_rate": 7.64224983649444e-06, - "loss": 0.624, + "loss": 0.6351, "step": 780 }, { "epoch": 0.31016882606988616, - "grad_norm": 4.465322017669678, + "grad_norm": 7.162957191467285, "learning_rate": 7.740353172007849e-06, - "loss": 0.4292, + "loss": 0.4419, "step": 790 }, { "epoch": 0.3140950137416568, - "grad_norm": 14.8330659866333, + "grad_norm": 15.707671165466309, "learning_rate": 7.838456507521257e-06, - "loss": 0.4668, + "loss": 0.5304, "step": 800 }, { "epoch": 0.31802120141342755, - "grad_norm": 3.697566032409668, + "grad_norm": 3.9049336910247803, "learning_rate": 7.936559843034663e-06, - "loss": 0.6978, + "loss": 0.6481, "step": 810 }, { "epoch": 0.32194738908519827, - "grad_norm": 3.810598611831665, + "grad_norm": 2.9919142723083496, "learning_rate": 8.03466317854807e-06, - "loss": 0.6638, + "loss": 0.6765, "step": 820 }, { "epoch": 0.325873576756969, - "grad_norm": 6.719821929931641, + "grad_norm": 6.209725856781006, "learning_rate": 8.132766514061478e-06, - "loss": 0.391, + "loss": 0.361, "step": 830 }, { "epoch": 0.3297997644287397, - "grad_norm": 2.087430953979492, + "grad_norm": 1.5867514610290527, "learning_rate": 8.230869849574886e-06, - "loss": 0.4951, + "loss": 0.4874, "step": 840 }, { "epoch": 0.3337259521005104, - "grad_norm": 18.407089233398438, + "grad_norm": 25.201175689697266, "learning_rate": 8.328973185088292e-06, - "loss": 0.5426, + "loss": 0.5692, "step": 850 }, { "epoch": 0.3376521397722811, - "grad_norm": 19.534862518310547, + "grad_norm": 17.566923141479492, "learning_rate": 8.427076520601701e-06, - "loss": 0.6793, + "loss": 0.6265, "step": 860 }, { "epoch": 0.3415783274440518, - "grad_norm": 17.81478500366211, + "grad_norm": 16.987472534179688, "learning_rate": 8.525179856115109e-06, - "loss": 0.4712, + "loss": 0.4832, "step": 870 }, { "epoch": 0.34550451511582253, - "grad_norm": 5.050530910491943, + "grad_norm": 6.545525074005127, "learning_rate": 8.623283191628515e-06, - "loss": 0.556, + "loss": 0.5877, "step": 880 }, { "epoch": 0.34943070278759325, - "grad_norm": 15.126139640808105, + "grad_norm": 12.33557415008545, "learning_rate": 8.721386527141923e-06, - "loss": 0.6758, + "loss": 0.6599, "step": 890 }, { "epoch": 0.35335689045936397, - "grad_norm": 4.586108684539795, + "grad_norm": 5.927364826202393, "learning_rate": 8.81948986265533e-06, - "loss": 0.3972, + "loss": 0.419, "step": 900 }, { "epoch": 0.3572830781311347, - "grad_norm": 4.0998969078063965, + "grad_norm": 3.0960209369659424, "learning_rate": 8.917593198168738e-06, - "loss": 0.4785, + "loss": 0.4855, "step": 910 }, { "epoch": 0.36120926580290535, - "grad_norm": 2.8743770122528076, + "grad_norm": 4.590017318725586, "learning_rate": 9.015696533682144e-06, - "loss": 0.5175, + "loss": 0.4808, "step": 920 }, { "epoch": 0.3651354534746761, - "grad_norm": 11.98127269744873, + "grad_norm": 9.836238861083984, "learning_rate": 9.113799869195552e-06, - "loss": 0.4073, + "loss": 0.3716, "step": 930 }, { "epoch": 0.3690616411464468, - "grad_norm": 28.792461395263672, + "grad_norm": 25.86206817626953, "learning_rate": 9.211903204708962e-06, - "loss": 0.4693, + "loss": 0.4711, "step": 940 }, { "epoch": 0.3729878288182175, - "grad_norm": 24.169824600219727, + "grad_norm": 31.329652786254883, "learning_rate": 9.310006540222368e-06, - "loss": 0.8149, + "loss": 0.7231, "step": 950 }, { "epoch": 0.37691401648998824, - "grad_norm": 7.978569507598877, + "grad_norm": 14.603381156921387, "learning_rate": 9.408109875735775e-06, - "loss": 0.4088, + "loss": 0.5075, "step": 960 }, { "epoch": 0.38084020416175896, - "grad_norm": 2.8425447940826416, + "grad_norm": 3.2160160541534424, "learning_rate": 9.506213211249183e-06, - "loss": 0.321, + "loss": 0.2756, "step": 970 }, { "epoch": 0.3847663918335296, - "grad_norm": 8.290285110473633, + "grad_norm": 10.547919273376465, "learning_rate": 9.60431654676259e-06, - "loss": 0.3932, + "loss": 0.4526, "step": 980 }, { "epoch": 0.38869257950530034, - "grad_norm": 4.763232231140137, + "grad_norm": 3.9408512115478516, "learning_rate": 9.702419882275997e-06, - "loss": 0.7244, + "loss": 0.6724, "step": 990 }, { "epoch": 0.39261876717707106, - "grad_norm": 4.444037914276123, + "grad_norm": 3.9256722927093506, "learning_rate": 9.800523217789404e-06, - "loss": 0.5837, + "loss": 0.6349, "step": 1000 }, { "epoch": 0.3965449548488418, - "grad_norm": 24.227767944335938, + "grad_norm": 15.100317001342773, "learning_rate": 9.898626553302814e-06, - "loss": 0.4705, + "loss": 0.4085, "step": 1010 }, { "epoch": 0.4004711425206125, - "grad_norm": 6.601070404052734, + "grad_norm": 6.286873817443848, "learning_rate": 9.99672988881622e-06, - "loss": 0.6687, + "loss": 0.4736, "step": 1020 }, { "epoch": 0.4043973301923832, - "grad_norm": 13.607088088989258, + "grad_norm": 14.044744491577148, "learning_rate": 1.0094833224329628e-05, - "loss": 0.3664, + "loss": 0.3989, "step": 1030 }, { "epoch": 0.4083235178641539, - "grad_norm": 4.271781921386719, + "grad_norm": 2.022460460662842, "learning_rate": 1.0192936559843035e-05, - "loss": 0.3109, + "loss": 0.3296, "step": 1040 }, { "epoch": 0.4122497055359246, - "grad_norm": 4.805093765258789, + "grad_norm": 6.2514519691467285, "learning_rate": 1.0291039895356443e-05, - "loss": 0.3202, + "loss": 0.283, "step": 1050 }, { "epoch": 0.4161758932076953, - "grad_norm": 3.546109676361084, + "grad_norm": 2.508761167526245, "learning_rate": 1.0389143230869849e-05, - "loss": 0.3466, + "loss": 0.3208, "step": 1060 }, { "epoch": 0.42010208087946604, - "grad_norm": 8.463558197021484, + "grad_norm": 8.403465270996094, "learning_rate": 1.0487246566383257e-05, - "loss": 0.3949, + "loss": 0.4725, "step": 1070 }, { "epoch": 0.42402826855123676, - "grad_norm": 1.6530474424362183, + "grad_norm": 2.1996490955352783, "learning_rate": 1.0585349901896665e-05, - "loss": 0.4115, + "loss": 0.2682, "step": 1080 }, { "epoch": 0.4279544562230075, - "grad_norm": 4.2548675537109375, + "grad_norm": 3.522963285446167, "learning_rate": 1.0683453237410072e-05, - "loss": 0.3653, + "loss": 0.4021, "step": 1090 }, { "epoch": 0.43188064389477815, - "grad_norm": 1.8331494331359863, + "grad_norm": 5.853711128234863, "learning_rate": 1.078155657292348e-05, - "loss": 0.3914, + "loss": 0.3718, "step": 1100 }, { "epoch": 0.43580683156654887, - "grad_norm": 17.74492835998535, + "grad_norm": 15.8383150100708, "learning_rate": 1.0879659908436886e-05, - "loss": 0.4573, + "loss": 0.3823, "step": 1110 }, { "epoch": 0.4397330192383196, - "grad_norm": 6.226461410522461, + "grad_norm": 7.292287826538086, "learning_rate": 1.0977763243950295e-05, - "loss": 0.2844, + "loss": 0.278, "step": 1120 }, { "epoch": 0.4436592069100903, - "grad_norm": 5.841897964477539, + "grad_norm": 2.243644952774048, "learning_rate": 1.1075866579463702e-05, - "loss": 0.2996, + "loss": 0.3343, "step": 1130 }, { "epoch": 0.44758539458186103, - "grad_norm": 7.31948184967041, + "grad_norm": 7.754064559936523, "learning_rate": 1.117396991497711e-05, - "loss": 0.3439, + "loss": 0.3685, "step": 1140 }, { "epoch": 0.45151158225363175, - "grad_norm": 11.942208290100098, + "grad_norm": 14.604247093200684, "learning_rate": 1.1272073250490517e-05, - "loss": 0.568, + "loss": 0.4292, "step": 1150 }, { "epoch": 0.4554377699254024, - "grad_norm": 5.836273670196533, + "grad_norm": 6.38851261138916, "learning_rate": 1.1370176586003925e-05, - "loss": 0.4736, + "loss": 0.3697, "step": 1160 }, { "epoch": 0.45936395759717313, - "grad_norm": 2.1570844650268555, + "grad_norm": 2.8847551345825195, "learning_rate": 1.1468279921517332e-05, - "loss": 0.414, + "loss": 0.4678, "step": 1170 }, { "epoch": 0.46329014526894385, - "grad_norm": 3.391273021697998, + "grad_norm": 2.991736888885498, "learning_rate": 1.1566383257030738e-05, - "loss": 0.3917, + "loss": 0.3648, "step": 1180 }, { "epoch": 0.4672163329407146, - "grad_norm": 20.387121200561523, + "grad_norm": 12.23653507232666, "learning_rate": 1.1664486592544148e-05, - "loss": 0.307, + "loss": 0.2225, "step": 1190 }, { "epoch": 0.4711425206124853, - "grad_norm": 21.665388107299805, + "grad_norm": 15.220344543457031, "learning_rate": 1.1762589928057554e-05, - "loss": 0.4551, + "loss": 0.4655, "step": 1200 }, { "epoch": 0.475068708284256, - "grad_norm": 7.036664962768555, + "grad_norm": 4.383800506591797, "learning_rate": 1.1860693263570962e-05, - "loss": 0.2739, + "loss": 0.2441, "step": 1210 }, { "epoch": 0.4789948959560267, - "grad_norm": 24.953664779663086, + "grad_norm": 33.76412582397461, "learning_rate": 1.1958796599084368e-05, - "loss": 0.3859, + "loss": 0.3759, "step": 1220 }, { "epoch": 0.4829210836277974, - "grad_norm": 43.237674713134766, + "grad_norm": 27.416797637939453, "learning_rate": 1.2056899934597777e-05, - "loss": 0.5316, + "loss": 0.4576, "step": 1230 }, { "epoch": 0.4868472712995681, - "grad_norm": 9.316898345947266, + "grad_norm": 8.021793365478516, "learning_rate": 1.2155003270111185e-05, - "loss": 0.3097, + "loss": 0.3437, "step": 1240 }, { "epoch": 0.49077345897133884, - "grad_norm": 4.275661945343018, + "grad_norm": 3.793288469314575, "learning_rate": 1.225310660562459e-05, - "loss": 0.2938, + "loss": 0.2485, "step": 1250 }, { "epoch": 0.49469964664310956, - "grad_norm": 36.5017204284668, + "grad_norm": 31.524032592773438, "learning_rate": 1.2351209941138e-05, - "loss": 0.6633, + "loss": 0.4996, "step": 1260 }, { "epoch": 0.4986258343148803, - "grad_norm": 4.175166130065918, + "grad_norm": 4.508017539978027, "learning_rate": 1.2449313276651406e-05, - "loss": 0.2933, + "loss": 0.2595, "step": 1270 }, { "epoch": 0.5025520219866509, - "grad_norm": 10.313667297363281, + "grad_norm": 5.323174476623535, "learning_rate": 1.2547416612164814e-05, - "loss": 0.357, + "loss": 0.3821, "step": 1280 }, { "epoch": 0.5064782096584217, - "grad_norm": 2.864466667175293, + "grad_norm": 1.561851143836975, "learning_rate": 1.264551994767822e-05, - "loss": 0.4922, + "loss": 0.3681, "step": 1290 }, { "epoch": 0.5104043973301924, - "grad_norm": 9.19117546081543, + "grad_norm": 4.669619083404541, "learning_rate": 1.274362328319163e-05, - "loss": 0.2615, + "loss": 0.3328, "step": 1300 }, { "epoch": 0.514330585001963, - "grad_norm": 51.292728424072266, + "grad_norm": 59.23523712158203, "learning_rate": 1.2841726618705037e-05, - "loss": 0.5405, + "loss": 0.58, "step": 1310 }, { "epoch": 0.5182567726737338, - "grad_norm": 6.370532512664795, + "grad_norm": 5.40714693069458, "learning_rate": 1.2939829954218443e-05, - "loss": 0.4008, + "loss": 0.4346, "step": 1320 }, { "epoch": 0.5221829603455045, - "grad_norm": 2.372314929962158, + "grad_norm": 8.357551574707031, "learning_rate": 1.3037933289731851e-05, - "loss": 0.5068, + "loss": 0.5634, "step": 1330 }, { "epoch": 0.5261091480172753, - "grad_norm": 0.9157946109771729, + "grad_norm": 1.5698715448379517, "learning_rate": 1.3136036625245259e-05, - "loss": 0.1776, + "loss": 0.1326, "step": 1340 }, { "epoch": 0.5300353356890459, - "grad_norm": 3.708653688430786, + "grad_norm": 5.228177070617676, "learning_rate": 1.3234139960758666e-05, - "loss": 0.2262, + "loss": 0.2445, "step": 1350 }, { "epoch": 0.5339615233608166, - "grad_norm": 3.8156628608703613, + "grad_norm": 6.554355144500732, "learning_rate": 1.3332243296272072e-05, - "loss": 0.2908, + "loss": 0.3139, "step": 1360 }, { "epoch": 0.5378877110325874, - "grad_norm": 46.9285774230957, + "grad_norm": 34.03114700317383, "learning_rate": 1.3430346631785482e-05, - "loss": 0.2828, + "loss": 0.3401, "step": 1370 }, { "epoch": 0.541813898704358, - "grad_norm": 45.85068130493164, + "grad_norm": 20.547780990600586, "learning_rate": 1.3528449967298888e-05, - "loss": 0.4336, + "loss": 0.4378, "step": 1380 }, { "epoch": 0.5457400863761288, - "grad_norm": 4.289827346801758, + "grad_norm": 4.3763017654418945, "learning_rate": 1.3626553302812296e-05, - "loss": 0.1254, + "loss": 0.1077, "step": 1390 }, { "epoch": 0.5496662740478995, - "grad_norm": 20.87666130065918, + "grad_norm": 50.1994743347168, "learning_rate": 1.3724656638325703e-05, - "loss": 0.2531, + "loss": 0.2442, "step": 1400 }, { "epoch": 0.5535924617196702, - "grad_norm": 7.489846229553223, + "grad_norm": 1.3502081632614136, "learning_rate": 1.3822759973839111e-05, - "loss": 0.3416, + "loss": 0.4841, "step": 1410 }, { "epoch": 0.5575186493914409, - "grad_norm": 20.740631103515625, + "grad_norm": 19.93968963623047, "learning_rate": 1.3920863309352519e-05, - "loss": 0.3422, + "loss": 0.3302, "step": 1420 }, { "epoch": 0.5614448370632116, - "grad_norm": 4.524540424346924, + "grad_norm": 3.2099475860595703, "learning_rate": 1.4018966644865925e-05, - "loss": 0.2961, + "loss": 0.3642, "step": 1430 }, { "epoch": 0.5653710247349824, - "grad_norm": 15.30322265625, + "grad_norm": 10.53823471069336, "learning_rate": 1.4117069980379334e-05, - "loss": 0.3909, + "loss": 0.3544, "step": 1440 }, { "epoch": 0.569297212406753, - "grad_norm": 2.1286799907684326, + "grad_norm": 0.9103022813796997, "learning_rate": 1.421517331589274e-05, - "loss": 0.3773, + "loss": 0.3088, "step": 1450 }, { "epoch": 0.5732234000785238, - "grad_norm": 14.987198829650879, + "grad_norm": 6.787320613861084, "learning_rate": 1.4313276651406148e-05, - "loss": 0.3777, + "loss": 0.3217, "step": 1460 }, { "epoch": 0.5771495877502945, - "grad_norm": 81.90992736816406, + "grad_norm": 60.254310607910156, "learning_rate": 1.4411379986919556e-05, - "loss": 0.5291, + "loss": 0.5068, "step": 1470 }, { "epoch": 0.5810757754220651, - "grad_norm": 44.91868209838867, + "grad_norm": 15.669028282165527, "learning_rate": 1.4509483322432963e-05, - "loss": 0.4396, + "loss": 0.3124, "step": 1480 }, { "epoch": 0.5850019630938359, - "grad_norm": 2.5824217796325684, + "grad_norm": 10.544931411743164, "learning_rate": 1.4607586657946371e-05, - "loss": 0.4743, + "loss": 0.5415, "step": 1490 }, { "epoch": 0.5889281507656066, - "grad_norm": 15.979462623596191, + "grad_norm": 13.226673126220703, "learning_rate": 1.4705689993459777e-05, - "loss": 0.256, + "loss": 0.22, "step": 1500 }, { "epoch": 0.5928543384373773, - "grad_norm": 3.5519607067108154, + "grad_norm": 5.586292266845703, "learning_rate": 1.4803793328973185e-05, - "loss": 0.304, + "loss": 0.3003, "step": 1510 }, { "epoch": 0.596780526109148, - "grad_norm": 2.2138147354125977, + "grad_norm": 2.5064399242401123, "learning_rate": 1.4901896664486593e-05, - "loss": 0.1901, + "loss": 0.1977, "step": 1520 }, { "epoch": 0.6007067137809188, - "grad_norm": 29.590654373168945, + "grad_norm": 38.85649108886719, "learning_rate": 1.5e-05, - "loss": 0.6663, + "loss": 0.6154, "step": 1530 }, { "epoch": 0.6046329014526894, - "grad_norm": 18.716819763183594, + "grad_norm": 22.79119110107422, "learning_rate": 1.4989093288737004e-05, - "loss": 0.2989, + "loss": 0.2872, "step": 1540 }, { "epoch": 0.6085590891244601, - "grad_norm": 107.53730010986328, + "grad_norm": 12.569866180419922, "learning_rate": 1.4978186577474007e-05, - "loss": 0.2168, + "loss": 0.2073, "step": 1550 }, { "epoch": 0.6124852767962309, - "grad_norm": 24.21065330505371, + "grad_norm": 68.21198272705078, "learning_rate": 1.4967279866211009e-05, - "loss": 0.3952, + "loss": 0.5122, "step": 1560 }, { "epoch": 0.6164114644680015, - "grad_norm": 8.248003959655762, + "grad_norm": 3.8244588375091553, "learning_rate": 1.4956373154948012e-05, - "loss": 0.1231, + "loss": 0.1663, "step": 1570 }, { "epoch": 0.6203376521397723, - "grad_norm": 36.4151725769043, + "grad_norm": 38.16170883178711, "learning_rate": 1.4945466443685014e-05, - "loss": 0.3092, + "loss": 0.2609, "step": 1580 }, { "epoch": 0.624263839811543, - "grad_norm": 34.05025863647461, + "grad_norm": 34.50796890258789, "learning_rate": 1.4934559732422017e-05, - "loss": 0.2369, + "loss": 0.273, "step": 1590 }, { "epoch": 0.6281900274833137, - "grad_norm": 39.28768539428711, + "grad_norm": 51.55370330810547, "learning_rate": 1.4923653021159019e-05, - "loss": 0.3376, + "loss": 0.5104, "step": 1600 }, { "epoch": 0.6321162151550844, - "grad_norm": 1.5521210432052612, + "grad_norm": 1.395621418952942, "learning_rate": 1.4912746309896022e-05, - "loss": 0.351, + "loss": 0.5353, "step": 1610 }, { "epoch": 0.6360424028268551, - "grad_norm": 8.46679973602295, + "grad_norm": 7.185873985290527, "learning_rate": 1.4901839598633026e-05, - "loss": 0.3411, + "loss": 0.3141, "step": 1620 }, { "epoch": 0.6399685904986259, - "grad_norm": 4.40542459487915, + "grad_norm": 3.089205026626587, "learning_rate": 1.489093288737003e-05, - "loss": 0.2118, + "loss": 0.2287, "step": 1630 }, { "epoch": 0.6438947781703965, - "grad_norm": 12.468846321105957, + "grad_norm": 6.155888080596924, "learning_rate": 1.4880026176107033e-05, - "loss": 0.5091, + "loss": 0.4378, "step": 1640 }, { "epoch": 0.6478209658421673, - "grad_norm": 2.155667543411255, + "grad_norm": 3.5708343982696533, "learning_rate": 1.4869119464844034e-05, - "loss": 0.4592, + "loss": 0.6447, "step": 1650 }, { "epoch": 0.651747153513938, - "grad_norm": 3.5184009075164795, + "grad_norm": 3.6848175525665283, "learning_rate": 1.4858212753581036e-05, - "loss": 0.1962, + "loss": 0.2191, "step": 1660 }, { "epoch": 0.6556733411857086, - "grad_norm": 7.525317192077637, + "grad_norm": 28.661956787109375, "learning_rate": 1.484730604231804e-05, - "loss": 0.1126, + "loss": 0.1384, "step": 1670 }, { "epoch": 0.6595995288574794, - "grad_norm": 14.463057518005371, + "grad_norm": 10.65005111694336, "learning_rate": 1.4836399331055043e-05, - "loss": 0.3144, + "loss": 0.5317, "step": 1680 }, { "epoch": 0.6635257165292501, - "grad_norm": 3.4677093029022217, + "grad_norm": 6.081112861633301, "learning_rate": 1.4825492619792046e-05, - "loss": 0.2725, + "loss": 0.2438, "step": 1690 }, { "epoch": 0.6674519042010209, - "grad_norm": 4.494072437286377, + "grad_norm": 6.957218170166016, "learning_rate": 1.4814585908529048e-05, - "loss": 0.1777, + "loss": 0.2193, "step": 1700 }, { "epoch": 0.6713780918727915, - "grad_norm": 19.921783447265625, + "grad_norm": 20.77627182006836, "learning_rate": 1.4803679197266051e-05, - "loss": 0.7467, + "loss": 0.7343, "step": 1710 }, { "epoch": 0.6753042795445622, - "grad_norm": 2.0078423023223877, + "grad_norm": 3.5985021591186523, "learning_rate": 1.4792772486003055e-05, - "loss": 0.2109, + "loss": 0.2465, "step": 1720 }, { "epoch": 0.679230467216333, - "grad_norm": 0.6054943203926086, + "grad_norm": 0.5627488493919373, "learning_rate": 1.4781865774740058e-05, - "loss": 0.1331, + "loss": 0.198, "step": 1730 }, { "epoch": 0.6831566548881036, - "grad_norm": 1.2508567571640015, + "grad_norm": 5.338821887969971, "learning_rate": 1.477095906347706e-05, - "loss": 0.0905, + "loss": 0.1211, "step": 1740 }, { "epoch": 0.6870828425598744, - "grad_norm": 6.846147060394287, + "grad_norm": 1.4694796800613403, "learning_rate": 1.4760052352214062e-05, - "loss": 0.2772, + "loss": 0.1806, "step": 1750 }, { "epoch": 0.6910090302316451, - "grad_norm": 5.680413722991943, + "grad_norm": 5.876432418823242, "learning_rate": 1.4749145640951065e-05, - "loss": 0.3237, + "loss": 0.2966, "step": 1760 }, { "epoch": 0.6949352179034158, - "grad_norm": 4.05276346206665, + "grad_norm": 3.7161383628845215, "learning_rate": 1.4738238929688068e-05, - "loss": 0.4071, + "loss": 0.4394, "step": 1770 }, { "epoch": 0.6988614055751865, - "grad_norm": 40.03810119628906, + "grad_norm": 50.05641555786133, "learning_rate": 1.4727332218425072e-05, - "loss": 0.4735, + "loss": 0.648, "step": 1780 }, { "epoch": 0.7027875932469572, - "grad_norm": 2.5294904708862305, + "grad_norm": 2.0114455223083496, "learning_rate": 1.4716425507162074e-05, - "loss": 0.1842, + "loss": 0.2052, "step": 1790 }, { "epoch": 0.7067137809187279, - "grad_norm": 53.05503845214844, + "grad_norm": 39.20153045654297, "learning_rate": 1.4705518795899077e-05, - "loss": 0.3101, + "loss": 0.2375, "step": 1800 }, { "epoch": 0.7106399685904986, - "grad_norm": 2.4650866985321045, + "grad_norm": 2.8059744834899902, "learning_rate": 1.469461208463608e-05, - "loss": 0.2056, + "loss": 0.2261, "step": 1810 }, { "epoch": 0.7145661562622694, - "grad_norm": 33.811710357666016, + "grad_norm": 17.432003021240234, "learning_rate": 1.4683705373373084e-05, - "loss": 0.5613, + "loss": 0.5645, "step": 1820 }, { "epoch": 0.71849234393404, - "grad_norm": 2.3604888916015625, + "grad_norm": 2.1211421489715576, "learning_rate": 1.4672798662110085e-05, - "loss": 0.1128, + "loss": 0.1432, "step": 1830 }, { "epoch": 0.7224185316058107, - "grad_norm": 9.413778305053711, + "grad_norm": 29.23116111755371, "learning_rate": 1.4661891950847087e-05, - "loss": 0.1632, + "loss": 0.2002, "step": 1840 }, { "epoch": 0.7263447192775815, - "grad_norm": 10.638350486755371, + "grad_norm": 2.884796380996704, "learning_rate": 1.465098523958409e-05, - "loss": 0.4124, + "loss": 0.3871, "step": 1850 }, { "epoch": 0.7302709069493521, - "grad_norm": 1.8169736862182617, + "grad_norm": 3.6958603858947754, "learning_rate": 1.4640078528321094e-05, - "loss": 0.2498, + "loss": 0.2806, "step": 1860 }, { "epoch": 0.7341970946211229, - "grad_norm": 2.663595676422119, + "grad_norm": 24.932758331298828, "learning_rate": 1.4629171817058097e-05, - "loss": 0.5092, + "loss": 0.3542, "step": 1870 }, { "epoch": 0.7381232822928936, - "grad_norm": 7.008359432220459, + "grad_norm": 6.772890090942383, "learning_rate": 1.4618265105795099e-05, - "loss": 0.2777, + "loss": 0.4096, "step": 1880 }, { "epoch": 0.7420494699646644, - "grad_norm": 0.9332793951034546, + "grad_norm": 0.8791089653968811, "learning_rate": 1.4607358394532102e-05, - "loss": 0.2573, + "loss": 0.2725, "step": 1890 }, { "epoch": 0.745975657636435, - "grad_norm": 1.6770204305648804, + "grad_norm": 2.153632164001465, "learning_rate": 1.4596451683269106e-05, - "loss": 0.307, + "loss": 0.2802, "step": 1900 }, { "epoch": 0.7499018453082057, - "grad_norm": 1.2598676681518555, + "grad_norm": 2.6855568885803223, "learning_rate": 1.458554497200611e-05, - "loss": 0.3337, + "loss": 0.3497, "step": 1910 }, { "epoch": 0.7538280329799765, - "grad_norm": 4.97174072265625, + "grad_norm": 4.875283718109131, "learning_rate": 1.4574638260743111e-05, - "loss": 0.1169, + "loss": 0.1366, "step": 1920 }, { "epoch": 0.7577542206517471, - "grad_norm": 5.666492938995361, + "grad_norm": 1.6532658338546753, "learning_rate": 1.4563731549480113e-05, - "loss": 0.4084, + "loss": 0.3219, "step": 1930 }, { "epoch": 0.7616804083235179, - "grad_norm": 10.250621795654297, + "grad_norm": 7.924410343170166, "learning_rate": 1.4552824838217116e-05, - "loss": 0.4612, + "loss": 0.5275, "step": 1940 }, { "epoch": 0.7656065959952886, - "grad_norm": 1.7088961601257324, + "grad_norm": 1.0707097053527832, "learning_rate": 1.454191812695412e-05, - "loss": 0.3609, + "loss": 0.2263, "step": 1950 }, { "epoch": 0.7695327836670592, - "grad_norm": 2.604320764541626, + "grad_norm": 2.635694980621338, "learning_rate": 1.4531011415691123e-05, - "loss": 0.1592, + "loss": 0.1502, "step": 1960 }, { "epoch": 0.77345897133883, - "grad_norm": 0.36692944169044495, + "grad_norm": 1.0742791891098022, "learning_rate": 1.4520104704428125e-05, - "loss": 0.7023, + "loss": 0.5652, "step": 1970 }, { "epoch": 0.7773851590106007, - "grad_norm": 1.9514503479003906, + "grad_norm": 1.5840063095092773, "learning_rate": 1.4509197993165128e-05, - "loss": 0.4717, + "loss": 0.3439, "step": 1980 }, { "epoch": 0.7813113466823715, - "grad_norm": 27.791797637939453, + "grad_norm": 34.938560485839844, "learning_rate": 1.4498291281902131e-05, - "loss": 0.489, + "loss": 0.6397, "step": 1990 }, { "epoch": 0.7852375343541421, - "grad_norm": 29.59618377685547, + "grad_norm": 63.408504486083984, "learning_rate": 1.4487384570639133e-05, - "loss": 0.1403, + "loss": 0.2482, "step": 2000 }, { "epoch": 0.7891637220259128, - "grad_norm": 2.8945841789245605, + "grad_norm": 2.8896102905273438, "learning_rate": 1.4476477859376136e-05, - "loss": 0.1756, + "loss": 0.2754, "step": 2010 }, { "epoch": 0.7930899096976836, - "grad_norm": 0.3147704601287842, + "grad_norm": 0.10139458626508713, "learning_rate": 1.4465571148113138e-05, - "loss": 0.5243, + "loss": 0.4273, "step": 2020 }, { "epoch": 0.7970160973694542, - "grad_norm": 13.201154708862305, + "grad_norm": 11.02223014831543, "learning_rate": 1.4454664436850142e-05, - "loss": 0.4569, + "loss": 0.3829, "step": 2030 }, { "epoch": 0.800942285041225, - "grad_norm": 5.532883167266846, + "grad_norm": 1.813936471939087, "learning_rate": 1.4443757725587145e-05, - "loss": 0.4276, + "loss": 0.3874, "step": 2040 }, { "epoch": 0.8048684727129957, - "grad_norm": 3.3129794597625732, + "grad_norm": 17.101606369018555, "learning_rate": 1.4432851014324148e-05, - "loss": 0.6454, + "loss": 0.4317, "step": 2050 }, { "epoch": 0.8087946603847664, - "grad_norm": 4.767651557922363, + "grad_norm": 3.2440969944000244, "learning_rate": 1.4421944303061152e-05, - "loss": 0.341, + "loss": 0.3372, "step": 2060 }, { "epoch": 0.8127208480565371, - "grad_norm": 3.7348949909210205, + "grad_norm": 3.52420973777771, "learning_rate": 1.4411037591798153e-05, - "loss": 0.2142, + "loss": 0.1441, "step": 2070 }, { "epoch": 0.8166470357283078, - "grad_norm": 1.919364333152771, + "grad_norm": 1.4761594533920288, "learning_rate": 1.4400130880535157e-05, - "loss": 0.32, + "loss": 0.3104, "step": 2080 }, { "epoch": 0.8205732234000785, - "grad_norm": 10.20119857788086, + "grad_norm": 7.558804512023926, "learning_rate": 1.4389224169272159e-05, - "loss": 0.2062, + "loss": 0.1633, "step": 2090 }, { "epoch": 0.8244994110718492, - "grad_norm": 33.3404426574707, + "grad_norm": 51.0029182434082, "learning_rate": 1.4378317458009162e-05, - "loss": 0.4958, + "loss": 0.7646, "step": 2100 }, { "epoch": 0.82842559874362, - "grad_norm": 0.6379029750823975, + "grad_norm": 2.8390491008758545, "learning_rate": 1.4367410746746164e-05, - "loss": 0.648, + "loss": 0.6187, "step": 2110 }, { "epoch": 0.8323517864153906, - "grad_norm": 16.49692153930664, + "grad_norm": 8.160420417785645, "learning_rate": 1.4356504035483167e-05, - "loss": 0.1945, + "loss": 0.1643, "step": 2120 }, { "epoch": 0.8362779740871613, - "grad_norm": 11.915961265563965, + "grad_norm": 16.129671096801758, "learning_rate": 1.434559732422017e-05, - "loss": 0.4641, + "loss": 0.341, "step": 2130 }, { "epoch": 0.8402041617589321, - "grad_norm": 2.487652063369751, + "grad_norm": 1.4582387208938599, "learning_rate": 1.4334690612957174e-05, - "loss": 0.301, + "loss": 0.2998, "step": 2140 }, { "epoch": 0.8441303494307028, - "grad_norm": 1.9147950410842896, + "grad_norm": 2.485788583755493, "learning_rate": 1.4323783901694177e-05, - "loss": 0.1974, + "loss": 0.1933, "step": 2150 }, { "epoch": 0.8480565371024735, - "grad_norm": 13.185128211975098, + "grad_norm": 12.96564769744873, "learning_rate": 1.4312877190431179e-05, - "loss": 0.3197, + "loss": 0.3559, "step": 2160 }, { "epoch": 0.8519827247742442, - "grad_norm": 3.5867531299591064, + "grad_norm": 3.1576311588287354, "learning_rate": 1.4301970479168182e-05, - "loss": 0.1984, + "loss": 0.2083, "step": 2170 }, { "epoch": 0.855908912446015, - "grad_norm": 12.109413146972656, + "grad_norm": 6.451188564300537, "learning_rate": 1.4291063767905184e-05, - "loss": 0.1036, + "loss": 0.1177, "step": 2180 }, { "epoch": 0.8598351001177856, - "grad_norm": 7.396296977996826, + "grad_norm": 7.6136956214904785, "learning_rate": 1.4280157056642188e-05, - "loss": 0.45, + "loss": 0.4799, "step": 2190 }, { "epoch": 0.8637612877895563, - "grad_norm": 1.3671844005584717, + "grad_norm": 5.497119903564453, "learning_rate": 1.4269250345379191e-05, - "loss": 0.1636, + "loss": 0.1579, "step": 2200 }, { "epoch": 0.8676874754613271, - "grad_norm": 7.776515007019043, + "grad_norm": 7.1131415367126465, "learning_rate": 1.4258343634116193e-05, - "loss": 0.4378, + "loss": 0.3308, "step": 2210 }, { "epoch": 0.8716136631330977, - "grad_norm": 6.528609275817871, + "grad_norm": 21.18025016784668, "learning_rate": 1.4247436922853196e-05, - "loss": 0.2979, + "loss": 0.321, "step": 2220 }, { "epoch": 0.8755398508048685, - "grad_norm": 11.839767456054688, + "grad_norm": 12.188064575195312, "learning_rate": 1.42365302115902e-05, - "loss": 0.3736, + "loss": 0.3795, "step": 2230 }, { "epoch": 0.8794660384766392, - "grad_norm": 35.86703109741211, + "grad_norm": 26.001676559448242, "learning_rate": 1.4225623500327203e-05, - "loss": 0.2929, + "loss": 0.2857, "step": 2240 }, { "epoch": 0.8833922261484098, - "grad_norm": 21.705886840820312, + "grad_norm": 34.258766174316406, "learning_rate": 1.4214716789064205e-05, - "loss": 0.2329, + "loss": 0.2769, "step": 2250 }, { "epoch": 0.8873184138201806, - "grad_norm": 6.472341537475586, + "grad_norm": 9.625612258911133, "learning_rate": 1.4203810077801206e-05, - "loss": 0.2986, + "loss": 0.2652, "step": 2260 }, { "epoch": 0.8912446014919513, - "grad_norm": 4.334987640380859, + "grad_norm": 7.745572566986084, "learning_rate": 1.419290336653821e-05, - "loss": 0.1771, + "loss": 0.1705, "step": 2270 }, { "epoch": 0.8951707891637221, - "grad_norm": 28.425695419311523, + "grad_norm": 15.009160041809082, "learning_rate": 1.4181996655275213e-05, - "loss": 0.3309, + "loss": 0.3216, "step": 2280 }, { "epoch": 0.8990969768354927, - "grad_norm": 3.070922374725342, + "grad_norm": 8.771262168884277, "learning_rate": 1.4171089944012216e-05, - "loss": 0.2103, + "loss": 0.1815, "step": 2290 }, { "epoch": 0.9030231645072635, - "grad_norm": 6.562528610229492, + "grad_norm": 33.691436767578125, "learning_rate": 1.4160183232749218e-05, - "loss": 0.289, + "loss": 0.2609, "step": 2300 }, { "epoch": 0.9069493521790342, - "grad_norm": 5.212559223175049, + "grad_norm": 2.230038642883301, "learning_rate": 1.4149276521486222e-05, - "loss": 0.4259, + "loss": 0.4041, "step": 2310 }, { "epoch": 0.9108755398508048, - "grad_norm": 0.4170234203338623, + "grad_norm": 2.0432682037353516, "learning_rate": 1.4138369810223225e-05, - "loss": 0.2592, + "loss": 0.311, "step": 2320 }, { "epoch": 0.9148017275225756, - "grad_norm": 2.584571123123169, + "grad_norm": 5.312524795532227, "learning_rate": 1.4127463098960228e-05, - "loss": 0.3831, + "loss": 0.2871, "step": 2330 }, { "epoch": 0.9187279151943463, - "grad_norm": 24.72283935546875, + "grad_norm": 9.85692310333252, "learning_rate": 1.411655638769723e-05, - "loss": 0.8012, + "loss": 0.6296, "step": 2340 }, { "epoch": 0.922654102866117, - "grad_norm": 15.69322681427002, + "grad_norm": 22.06463623046875, "learning_rate": 1.4105649676434232e-05, - "loss": 0.1919, + "loss": 0.1887, "step": 2350 }, { "epoch": 0.9265802905378877, - "grad_norm": 8.938844680786133, + "grad_norm": 2.899306058883667, "learning_rate": 1.4094742965171235e-05, - "loss": 0.238, + "loss": 0.2502, "step": 2360 }, { "epoch": 0.9305064782096584, - "grad_norm": 5.273670196533203, + "grad_norm": 4.303149700164795, "learning_rate": 1.4083836253908239e-05, - "loss": 0.3718, + "loss": 0.3998, "step": 2370 }, { "epoch": 0.9344326658814291, - "grad_norm": 2.937269449234009, + "grad_norm": 2.372411012649536, "learning_rate": 1.4072929542645242e-05, - "loss": 0.1491, + "loss": 0.2195, "step": 2380 }, { "epoch": 0.9383588535531998, - "grad_norm": 23.44415855407715, + "grad_norm": 34.01797866821289, "learning_rate": 1.4062022831382244e-05, - "loss": 0.4142, + "loss": 0.4571, "step": 2390 }, { "epoch": 0.9422850412249706, - "grad_norm": 1.3673181533813477, + "grad_norm": 1.4108043909072876, "learning_rate": 1.4051116120119247e-05, - "loss": 0.1441, + "loss": 0.17, "step": 2400 }, { "epoch": 0.9462112288967413, - "grad_norm": 28.430185317993164, + "grad_norm": 23.5246524810791, "learning_rate": 1.404020940885625e-05, - "loss": 0.2624, + "loss": 0.1801, "step": 2410 }, { "epoch": 0.950137416568512, - "grad_norm": 1.981184959411621, + "grad_norm": 0.6515095233917236, "learning_rate": 1.4029302697593254e-05, - "loss": 0.1699, + "loss": 0.1063, "step": 2420 }, { "epoch": 0.9540636042402827, - "grad_norm": 17.230236053466797, + "grad_norm": 20.522192001342773, "learning_rate": 1.4018395986330256e-05, - "loss": 0.2959, + "loss": 0.2979, "step": 2430 }, { "epoch": 0.9579897919120534, - "grad_norm": 93.46700286865234, + "grad_norm": 25.96772003173828, "learning_rate": 1.4007489275067257e-05, - "loss": 0.3253, + "loss": 0.2838, "step": 2440 }, { "epoch": 0.9619159795838241, - "grad_norm": 12.560220718383789, + "grad_norm": 16.852500915527344, "learning_rate": 1.399658256380426e-05, - "loss": 0.4247, + "loss": 0.3366, "step": 2450 }, { "epoch": 0.9658421672555948, - "grad_norm": 1.3784681558609009, + "grad_norm": 1.536694884300232, "learning_rate": 1.3985675852541264e-05, - "loss": 0.1077, + "loss": 0.1328, "step": 2460 }, { "epoch": 0.9697683549273656, - "grad_norm": 4.360574722290039, + "grad_norm": 10.68250560760498, "learning_rate": 1.3974769141278267e-05, - "loss": 0.1708, + "loss": 0.1843, "step": 2470 }, { "epoch": 0.9736945425991362, - "grad_norm": 4.308298587799072, + "grad_norm": 3.7731075286865234, "learning_rate": 1.396386243001527e-05, - "loss": 0.5731, + "loss": 0.4689, "step": 2480 }, { "epoch": 0.9776207302709069, - "grad_norm": 2.4784011840820312, + "grad_norm": 6.545607089996338, "learning_rate": 1.3952955718752273e-05, - "loss": 0.1688, + "loss": 0.1579, "step": 2490 }, { "epoch": 0.9815469179426777, - "grad_norm": 1.948642373085022, + "grad_norm": 2.8131589889526367, "learning_rate": 1.3942049007489276e-05, - "loss": 0.4797, + "loss": 0.3835, "step": 2500 }, { "epoch": 0.9854731056144483, - "grad_norm": 1.4803906679153442, + "grad_norm": 1.0107388496398926, "learning_rate": 1.393114229622628e-05, - "loss": 0.2496, + "loss": 0.3368, "step": 2510 }, { "epoch": 0.9893992932862191, - "grad_norm": 4.6783366203308105, + "grad_norm": 2.4708988666534424, "learning_rate": 1.3920235584963281e-05, - "loss": 0.8178, + "loss": 0.6378, "step": 2520 }, { "epoch": 0.9933254809579898, - "grad_norm": 16.2108154296875, + "grad_norm": 17.677087783813477, "learning_rate": 1.3909328873700283e-05, - "loss": 0.1354, + "loss": 0.1769, "step": 2530 }, { "epoch": 0.9972516686297606, - "grad_norm": 3.2173478603363037, + "grad_norm": 2.584199905395508, "learning_rate": 1.3898422162437286e-05, - "loss": 0.4002, + "loss": 0.2427, "step": 2540 }, { "epoch": 1.0, - "eval_loss": 0.2975941002368927, - "eval_runtime": 12.7436, - "eval_samples_per_second": 177.658, - "eval_steps_per_second": 22.207, + "eval_loss": 0.33665597438812256, + "eval_runtime": 14.3034, + "eval_samples_per_second": 158.284, + "eval_steps_per_second": 19.785, "step": 2547 }, { "epoch": 1.0011778563015312, - "grad_norm": 7.409631729125977, + "grad_norm": 34.32118225097656, "learning_rate": 1.388751545117429e-05, - "loss": 0.4656, + "loss": 0.4117, "step": 2550 }, { "epoch": 1.0051040439733019, - "grad_norm": 1.942702293395996, + "grad_norm": 7.000924110412598, "learning_rate": 1.3876608739911293e-05, - "loss": 0.1432, + "loss": 0.1964, "step": 2560 }, { "epoch": 1.0090302316450726, - "grad_norm": 3.2248356342315674, + "grad_norm": 2.952646255493164, "learning_rate": 1.3865702028648296e-05, - "loss": 0.0917, + "loss": 0.1702, "step": 2570 }, { "epoch": 1.0129564193168434, - "grad_norm": 3.640166997909546, + "grad_norm": 7.024879455566406, "learning_rate": 1.3854795317385298e-05, - "loss": 0.422, + "loss": 0.4806, "step": 2580 }, { "epoch": 1.016882606988614, - "grad_norm": 3.5013017654418945, + "grad_norm": 21.01302146911621, "learning_rate": 1.3843888606122301e-05, - "loss": 0.2037, + "loss": 0.2703, "step": 2590 }, { "epoch": 1.0208087946603848, - "grad_norm": 2.7399351596832275, + "grad_norm": 1.915413498878479, "learning_rate": 1.3832981894859303e-05, - "loss": 0.1133, + "loss": 0.1459, "step": 2600 }, { "epoch": 1.0247349823321554, - "grad_norm": 30.039278030395508, + "grad_norm": 12.596318244934082, "learning_rate": 1.3822075183596307e-05, - "loss": 0.3891, + "loss": 0.1538, "step": 2610 }, { "epoch": 1.028661170003926, - "grad_norm": 0.2973475456237793, + "grad_norm": 0.6418520212173462, "learning_rate": 1.3811168472333308e-05, - "loss": 0.3665, + "loss": 0.3064, "step": 2620 }, { "epoch": 1.032587357675697, - "grad_norm": 0.652389407157898, + "grad_norm": 0.7288719415664673, "learning_rate": 1.3800261761070312e-05, - "loss": 0.4428, + "loss": 0.3236, "step": 2630 }, { "epoch": 1.0365135453474676, - "grad_norm": 5.137771129608154, + "grad_norm": 8.18033218383789, "learning_rate": 1.3789355049807315e-05, - "loss": 0.2409, + "loss": 0.2083, "step": 2640 }, { "epoch": 1.0404397330192383, - "grad_norm": 2.8443734645843506, + "grad_norm": 6.915499210357666, "learning_rate": 1.3778448338544318e-05, - "loss": 0.2288, + "loss": 0.1449, "step": 2650 }, { "epoch": 1.044365920691009, - "grad_norm": 2.019512176513672, + "grad_norm": 2.7883493900299072, "learning_rate": 1.3767541627281322e-05, - "loss": 0.4846, + "loss": 0.3375, "step": 2660 }, { "epoch": 1.0482921083627796, - "grad_norm": 2.1162519454956055, + "grad_norm": 1.9019161462783813, "learning_rate": 1.3756634916018324e-05, - "loss": 0.21, + "loss": 0.1619, "step": 2670 }, { "epoch": 1.0522182960345505, - "grad_norm": 2.31459641456604, + "grad_norm": 1.9549580812454224, "learning_rate": 1.3745728204755327e-05, - "loss": 0.5574, + "loss": 0.4853, "step": 2680 }, { "epoch": 1.0561444837063212, - "grad_norm": 28.970691680908203, + "grad_norm": 27.3566837310791, "learning_rate": 1.3734821493492329e-05, - "loss": 0.3405, + "loss": 0.3912, "step": 2690 }, { "epoch": 1.0600706713780919, - "grad_norm": 76.02347564697266, + "grad_norm": 80.08515167236328, "learning_rate": 1.3723914782229332e-05, - "loss": 0.403, + "loss": 0.4208, "step": 2700 }, { "epoch": 1.0639968590498625, - "grad_norm": 5.529140949249268, + "grad_norm": 3.5745654106140137, "learning_rate": 1.3713008070966336e-05, - "loss": 0.4342, + "loss": 0.5019, "step": 2710 }, { "epoch": 1.0679230467216332, - "grad_norm": 11.524633407592773, + "grad_norm": 2.5056707859039307, "learning_rate": 1.3702101359703337e-05, - "loss": 0.1142, + "loss": 0.1025, "step": 2720 }, { "epoch": 1.071849234393404, - "grad_norm": 102.33007049560547, + "grad_norm": 99.27290344238281, "learning_rate": 1.369119464844034e-05, - "loss": 0.6877, + "loss": 0.5603, "step": 2730 }, { "epoch": 1.0757754220651747, - "grad_norm": 0.11869838833808899, + "grad_norm": 0.07891564071178436, "learning_rate": 1.3680287937177344e-05, - "loss": 0.73, + "loss": 0.5738, "step": 2740 }, { "epoch": 1.0797016097369454, - "grad_norm": 2.562869071960449, + "grad_norm": 2.426074504852295, "learning_rate": 1.3669381225914347e-05, - "loss": 0.3726, + "loss": 0.4077, "step": 2750 }, { "epoch": 1.083627797408716, - "grad_norm": 8.394923210144043, + "grad_norm": 8.385368347167969, "learning_rate": 1.3658474514651349e-05, - "loss": 0.0962, + "loss": 0.1085, "step": 2760 }, { "epoch": 1.087553985080487, - "grad_norm": 1.0309988260269165, + "grad_norm": 1.1436806917190552, "learning_rate": 1.364756780338835e-05, - "loss": 0.6096, + "loss": 0.5882, "step": 2770 }, { "epoch": 1.0914801727522576, - "grad_norm": 0.15654361248016357, + "grad_norm": 0.08624885231256485, "learning_rate": 1.3636661092125354e-05, - "loss": 0.4344, + "loss": 0.3577, "step": 2780 }, { "epoch": 1.0954063604240283, - "grad_norm": 0.7564122080802917, + "grad_norm": 0.6857588887214661, "learning_rate": 1.3625754380862358e-05, - "loss": 0.2118, + "loss": 0.3343, "step": 2790 }, { "epoch": 1.099332548095799, - "grad_norm": 1.588985562324524, + "grad_norm": 2.0301759243011475, "learning_rate": 1.3614847669599361e-05, - "loss": 0.1315, + "loss": 0.1016, "step": 2800 }, { "epoch": 1.1032587357675696, - "grad_norm": 3.0313401222229004, + "grad_norm": 2.3979074954986572, "learning_rate": 1.3603940958336363e-05, - "loss": 0.2724, + "loss": 0.2548, "step": 2810 }, { "epoch": 1.1071849234393405, - "grad_norm": 1.4999165534973145, + "grad_norm": 0.6892157196998596, "learning_rate": 1.3593034247073366e-05, - "loss": 0.2012, + "loss": 0.1922, "step": 2820 }, { "epoch": 1.1111111111111112, - "grad_norm": 45.549354553222656, + "grad_norm": 33.415714263916016, "learning_rate": 1.358212753581037e-05, - "loss": 0.1023, + "loss": 0.0971, "step": 2830 }, { "epoch": 1.1150372987828818, - "grad_norm": 44.6949348449707, + "grad_norm": 28.732221603393555, "learning_rate": 1.3571220824547373e-05, - "loss": 0.3726, + "loss": 0.2828, "step": 2840 }, { "epoch": 1.1189634864546525, - "grad_norm": 0.7977939248085022, + "grad_norm": 0.9036908745765686, "learning_rate": 1.3560314113284375e-05, - "loss": 0.1861, + "loss": 0.2128, "step": 2850 }, { "epoch": 1.1228896741264232, - "grad_norm": 0.11477933824062347, + "grad_norm": 0.03526393696665764, "learning_rate": 1.3549407402021376e-05, - "loss": 0.1233, + "loss": 0.1307, "step": 2860 }, { "epoch": 1.126815861798194, - "grad_norm": 0.12229687720537186, + "grad_norm": 0.30302172899246216, "learning_rate": 1.353850069075838e-05, - "loss": 0.1204, + "loss": 0.1134, "step": 2870 }, { "epoch": 1.1307420494699647, - "grad_norm": 3.3404996395111084, + "grad_norm": 1.8320659399032593, "learning_rate": 1.3527593979495383e-05, - "loss": 0.1081, + "loss": 0.1662, "step": 2880 }, { "epoch": 1.1346682371417354, - "grad_norm": 3.561617136001587, + "grad_norm": 0.36011096835136414, "learning_rate": 1.3516687268232387e-05, - "loss": 0.2778, + "loss": 0.2805, "step": 2890 }, { "epoch": 1.138594424813506, - "grad_norm": 5.524371147155762, + "grad_norm": 5.343972206115723, "learning_rate": 1.3505780556969388e-05, - "loss": 0.3032, + "loss": 0.3577, "step": 2900 }, { "epoch": 1.142520612485277, - "grad_norm": 1.7958314418792725, + "grad_norm": 1.8759875297546387, "learning_rate": 1.3494873845706392e-05, - "loss": 0.2301, + "loss": 0.1896, "step": 2910 }, { "epoch": 1.1464468001570476, - "grad_norm": 6.156078338623047, + "grad_norm": 4.169508934020996, "learning_rate": 1.3483967134443395e-05, - "loss": 0.0856, + "loss": 0.1213, "step": 2920 }, { "epoch": 1.1503729878288182, - "grad_norm": 2.6006457805633545, + "grad_norm": 6.427751541137695, "learning_rate": 1.3473060423180398e-05, - "loss": 0.1874, + "loss": 0.2339, "step": 2930 }, { "epoch": 1.154299175500589, - "grad_norm": 0.5849444270133972, + "grad_norm": 2.1072142124176025, "learning_rate": 1.34621537119174e-05, - "loss": 0.3247, + "loss": 0.2456, "step": 2940 }, { "epoch": 1.1582253631723596, - "grad_norm": 0.8616421222686768, + "grad_norm": 0.5070629715919495, "learning_rate": 1.3451247000654402e-05, - "loss": 0.1195, + "loss": 0.0872, "step": 2950 }, { "epoch": 1.1621515508441305, - "grad_norm": 18.601743698120117, + "grad_norm": 17.096242904663086, "learning_rate": 1.3440340289391405e-05, - "loss": 0.0933, + "loss": 0.1178, "step": 2960 }, { "epoch": 1.1660777385159011, - "grad_norm": 0.067299023270607, + "grad_norm": 0.03149707615375519, "learning_rate": 1.3429433578128409e-05, - "loss": 0.3427, + "loss": 0.4904, "step": 2970 }, { "epoch": 1.1700039261876718, - "grad_norm": 3.093510150909424, + "grad_norm": 7.501014709472656, "learning_rate": 1.3418526866865412e-05, - "loss": 0.1068, + "loss": 0.1489, "step": 2980 }, { "epoch": 1.1739301138594425, - "grad_norm": 2.36521315574646, + "grad_norm": 2.3471922874450684, "learning_rate": 1.3407620155602414e-05, - "loss": 0.0761, + "loss": 0.0742, "step": 2990 }, { "epoch": 1.1778563015312131, - "grad_norm": 8.149934768676758, + "grad_norm": 8.472454071044922, "learning_rate": 1.3396713444339417e-05, - "loss": 0.432, + "loss": 0.5187, "step": 3000 }, { "epoch": 1.181782489202984, - "grad_norm": 5.193252086639404, + "grad_norm": 3.597513198852539, "learning_rate": 1.338580673307642e-05, - "loss": 0.1718, + "loss": 0.2327, "step": 3010 }, { "epoch": 1.1857086768747547, - "grad_norm": 6.139854907989502, + "grad_norm": 2.3216969966888428, "learning_rate": 1.3374900021813424e-05, - "loss": 0.1816, + "loss": 0.1741, "step": 3020 }, { "epoch": 1.1896348645465253, - "grad_norm": 1.7385913133621216, + "grad_norm": 1.5170233249664307, "learning_rate": 1.3363993310550426e-05, - "loss": 0.1209, + "loss": 0.077, "step": 3030 }, { "epoch": 1.193561052218296, - "grad_norm": 36.645137786865234, + "grad_norm": 30.894716262817383, "learning_rate": 1.3353086599287427e-05, - "loss": 0.4752, + "loss": 0.3785, "step": 3040 }, { "epoch": 1.1974872398900667, - "grad_norm": 21.392906188964844, + "grad_norm": 17.768352508544922, "learning_rate": 1.334217988802443e-05, - "loss": 0.2006, + "loss": 0.3431, "step": 3050 }, { "epoch": 1.2014134275618376, - "grad_norm": 19.5700740814209, + "grad_norm": 20.75244903564453, "learning_rate": 1.3331273176761434e-05, - "loss": 0.1962, + "loss": 0.2009, "step": 3060 }, { "epoch": 1.2053396152336082, - "grad_norm": 2.765977144241333, + "grad_norm": 2.387011766433716, "learning_rate": 1.3320366465498438e-05, - "loss": 0.1451, + "loss": 0.1762, "step": 3070 }, { "epoch": 1.2092658029053789, - "grad_norm": 1.6447166204452515, + "grad_norm": 2.0431301593780518, "learning_rate": 1.3309459754235441e-05, - "loss": 0.3088, + "loss": 0.3011, "step": 3080 }, { "epoch": 1.2131919905771495, - "grad_norm": 24.882009506225586, + "grad_norm": 15.98243236541748, "learning_rate": 1.3298553042972443e-05, - "loss": 0.3091, + "loss": 0.211, "step": 3090 }, { "epoch": 1.2171181782489202, - "grad_norm": 2.055807113647461, + "grad_norm": 2.6845574378967285, "learning_rate": 1.3287646331709446e-05, - "loss": 0.1509, + "loss": 0.1572, "step": 3100 }, { "epoch": 1.221044365920691, - "grad_norm": 36.39563751220703, + "grad_norm": 36.89638137817383, "learning_rate": 1.3276739620446448e-05, - "loss": 0.2606, + "loss": 0.2266, "step": 3110 }, { "epoch": 1.2249705535924618, - "grad_norm": 13.238870620727539, + "grad_norm": 35.31357192993164, "learning_rate": 1.3265832909183451e-05, - "loss": 0.5394, + "loss": 0.6833, "step": 3120 }, { "epoch": 1.2288967412642324, - "grad_norm": 4.6781744956970215, + "grad_norm": 4.742762565612793, "learning_rate": 1.3254926197920453e-05, - "loss": 0.2222, + "loss": 0.2172, "step": 3130 }, { "epoch": 1.232822928936003, - "grad_norm": 0.360545814037323, + "grad_norm": 0.8355633020401001, "learning_rate": 1.3244019486657456e-05, - "loss": 0.1314, + "loss": 0.1387, "step": 3140 }, { "epoch": 1.2367491166077738, - "grad_norm": 2.7406344413757324, + "grad_norm": 5.099259853363037, "learning_rate": 1.323311277539446e-05, - "loss": 0.3277, + "loss": 0.2845, "step": 3150 }, { "epoch": 1.2406753042795446, - "grad_norm": 3.2115094661712646, + "grad_norm": 3.624067544937134, "learning_rate": 1.3222206064131463e-05, - "loss": 0.4304, + "loss": 0.4714, "step": 3160 }, { "epoch": 1.2446014919513153, - "grad_norm": 34.796424865722656, + "grad_norm": 28.65761947631836, "learning_rate": 1.3211299352868466e-05, - "loss": 0.4436, + "loss": 0.2961, "step": 3170 }, { "epoch": 1.248527679623086, - "grad_norm": 1.2256044149398804, + "grad_norm": 3.235868215560913, "learning_rate": 1.3200392641605468e-05, - "loss": 0.1309, + "loss": 0.1539, "step": 3180 }, { "epoch": 1.2524538672948566, - "grad_norm": 43.63812255859375, + "grad_norm": 55.52271270751953, "learning_rate": 1.3189485930342472e-05, - "loss": 0.196, + "loss": 0.2383, "step": 3190 }, { "epoch": 1.2563800549666273, - "grad_norm": 0.013691261410713196, + "grad_norm": 0.00822574831545353, "learning_rate": 1.3178579219079473e-05, - "loss": 0.1449, + "loss": 0.1519, "step": 3200 }, { "epoch": 1.2603062426383982, - "grad_norm": 3.356618642807007, + "grad_norm": 2.4268360137939453, "learning_rate": 1.3167672507816477e-05, - "loss": 0.226, + "loss": 0.1949, "step": 3210 }, { "epoch": 1.2642324303101689, - "grad_norm": 2.6706573963165283, + "grad_norm": 2.268681049346924, "learning_rate": 1.3156765796553478e-05, - "loss": 0.0745, + "loss": 0.0874, "step": 3220 }, { "epoch": 1.2681586179819395, - "grad_norm": 0.24871893227100372, + "grad_norm": 1.7323014736175537, "learning_rate": 1.3145859085290482e-05, - "loss": 0.1303, + "loss": 0.1722, "step": 3230 }, { "epoch": 1.2720848056537102, - "grad_norm": 3.6240484714508057, + "grad_norm": 2.859678268432617, "learning_rate": 1.3134952374027485e-05, - "loss": 0.5143, + "loss": 0.5015, "step": 3240 }, { "epoch": 1.2760109933254808, - "grad_norm": 0.17671142518520355, + "grad_norm": 0.2620829641819, "learning_rate": 1.3124045662764489e-05, - "loss": 0.1413, + "loss": 0.1433, "step": 3250 }, { "epoch": 1.2799371809972517, - "grad_norm": 18.125869750976562, + "grad_norm": 16.924570083618164, "learning_rate": 1.3113138951501492e-05, - "loss": 0.2226, + "loss": 0.2371, "step": 3260 }, { "epoch": 1.2838633686690224, - "grad_norm": 28.14374351501465, + "grad_norm": 18.177810668945312, "learning_rate": 1.3102232240238494e-05, - "loss": 0.2505, + "loss": 0.2207, "step": 3270 }, { "epoch": 1.287789556340793, - "grad_norm": 1.8290412425994873, + "grad_norm": 5.893682479858398, "learning_rate": 1.3091325528975497e-05, - "loss": 0.1061, + "loss": 0.1739, "step": 3280 }, { "epoch": 1.2917157440125637, - "grad_norm": 1.2448176145553589, + "grad_norm": 1.1524956226348877, "learning_rate": 1.3080418817712499e-05, - "loss": 0.1009, + "loss": 0.085, "step": 3290 }, { "epoch": 1.2956419316843344, - "grad_norm": 2.7430367469787598, + "grad_norm": 3.3578295707702637, "learning_rate": 1.3069512106449502e-05, - "loss": 0.2654, + "loss": 0.4043, "step": 3300 }, { "epoch": 1.2995681193561053, - "grad_norm": 14.66954231262207, + "grad_norm": 57.29066467285156, "learning_rate": 1.3058605395186506e-05, - "loss": 0.2037, + "loss": 0.2397, "step": 3310 }, { "epoch": 1.303494307027876, - "grad_norm": 0.715758740901947, + "grad_norm": 0.04882551357150078, "learning_rate": 1.3047698683923507e-05, - "loss": 0.2786, + "loss": 0.2966, "step": 3320 }, { "epoch": 1.3074204946996466, - "grad_norm": 4.3318891525268555, + "grad_norm": 2.631953239440918, "learning_rate": 1.303679197266051e-05, - "loss": 0.1928, + "loss": 0.1364, "step": 3330 }, { "epoch": 1.3113466823714173, - "grad_norm": 2.3164262771606445, + "grad_norm": 2.76619029045105, "learning_rate": 1.3025885261397514e-05, - "loss": 0.1148, + "loss": 0.1506, "step": 3340 }, { "epoch": 1.315272870043188, - "grad_norm": 9.428380966186523, + "grad_norm": 0.1661418080329895, "learning_rate": 1.3014978550134518e-05, - "loss": 0.3608, + "loss": 0.336, "step": 3350 }, { "epoch": 1.3191990577149588, - "grad_norm": 4.136178970336914, + "grad_norm": 2.402981996536255, "learning_rate": 1.300407183887152e-05, - "loss": 0.5773, + "loss": 0.6321, "step": 3360 }, { "epoch": 1.3231252453867295, - "grad_norm": 2.9020094871520996, + "grad_norm": 3.1364150047302246, "learning_rate": 1.2993165127608521e-05, - "loss": 0.1264, + "loss": 0.1517, "step": 3370 }, { "epoch": 1.3270514330585002, - "grad_norm": 0.36572468280792236, + "grad_norm": 1.0921645164489746, "learning_rate": 1.2982258416345524e-05, - "loss": 0.398, + "loss": 0.3848, "step": 3380 }, { "epoch": 1.330977620730271, - "grad_norm": 2.6275601387023926, + "grad_norm": 2.7638635635375977, "learning_rate": 1.2971351705082528e-05, - "loss": 0.0474, + "loss": 0.0466, "step": 3390 }, { "epoch": 1.3349038084020415, - "grad_norm": 2.499476671218872, + "grad_norm": 11.197885513305664, "learning_rate": 1.2960444993819531e-05, - "loss": 0.5042, + "loss": 0.4811, "step": 3400 }, { "epoch": 1.3388299960738124, - "grad_norm": 1.1841174364089966, + "grad_norm": 1.3396183252334595, "learning_rate": 1.2949538282556533e-05, - "loss": 0.2347, + "loss": 0.4053, "step": 3410 }, { "epoch": 1.342756183745583, - "grad_norm": 2.8502907752990723, + "grad_norm": 2.4852423667907715, "learning_rate": 1.2938631571293536e-05, - "loss": 0.1357, + "loss": 0.1044, "step": 3420 }, { "epoch": 1.3466823714173537, - "grad_norm": 2.846534252166748, + "grad_norm": 6.505958557128906, "learning_rate": 1.292772486003054e-05, - "loss": 0.1816, + "loss": 0.1946, "step": 3430 }, { "epoch": 1.3506085590891246, - "grad_norm": 19.991262435913086, + "grad_norm": 35.329280853271484, "learning_rate": 1.2916818148767543e-05, - "loss": 0.6397, + "loss": 0.4008, "step": 3440 }, { "epoch": 1.3545347467608952, - "grad_norm": 6.915536880493164, + "grad_norm": 9.852523803710938, "learning_rate": 1.2905911437504546e-05, - "loss": 0.2666, + "loss": 0.246, "step": 3450 }, { "epoch": 1.358460934432666, - "grad_norm": 0.6370344161987305, + "grad_norm": 0.5630434155464172, "learning_rate": 1.2895004726241546e-05, - "loss": 0.2399, + "loss": 0.2228, "step": 3460 }, { "epoch": 1.3623871221044366, - "grad_norm": 3.8076541423797607, + "grad_norm": 5.466976642608643, "learning_rate": 1.288409801497855e-05, - "loss": 0.3958, + "loss": 0.308, "step": 3470 }, { "epoch": 1.3663133097762072, - "grad_norm": 1.0442428588867188, + "grad_norm": 1.2237000465393066, "learning_rate": 1.2873191303715553e-05, - "loss": 0.23, + "loss": 0.1621, "step": 3480 }, { "epoch": 1.3702394974479781, - "grad_norm": 0.08702751249074936, + "grad_norm": 0.22505871951580048, "learning_rate": 1.2862284592452557e-05, - "loss": 0.3506, + "loss": 0.3134, "step": 3490 }, { "epoch": 1.3741656851197488, - "grad_norm": 2.7526333332061768, + "grad_norm": 2.4502227306365967, "learning_rate": 1.2851377881189558e-05, - "loss": 0.4472, + "loss": 0.4901, "step": 3500 }, { "epoch": 1.3780918727915195, - "grad_norm": 2.654585838317871, + "grad_norm": 2.7651402950286865, "learning_rate": 1.2840471169926562e-05, - "loss": 0.1181, + "loss": 0.133, "step": 3510 }, { "epoch": 1.3820180604632901, - "grad_norm": 94.29911041259766, + "grad_norm": 70.7117919921875, "learning_rate": 1.2829564458663565e-05, - "loss": 0.4487, + "loss": 0.3747, "step": 3520 }, { "epoch": 1.3859442481350608, - "grad_norm": 2.7882180213928223, + "grad_norm": 2.1751351356506348, "learning_rate": 1.2818657747400569e-05, - "loss": 0.242, + "loss": 0.2179, "step": 3530 }, { "epoch": 1.3898704358068317, - "grad_norm": 25.820287704467773, + "grad_norm": 33.76871109008789, "learning_rate": 1.280775103613757e-05, - "loss": 0.3822, + "loss": 0.1683, "step": 3540 }, { "epoch": 1.3937966234786023, - "grad_norm": 1.8992029428482056, + "grad_norm": 1.7627195119857788, "learning_rate": 1.2796844324874572e-05, - "loss": 0.2421, + "loss": 0.2762, "step": 3550 }, { "epoch": 1.397722811150373, - "grad_norm": 8.083194732666016, + "grad_norm": 5.963202953338623, "learning_rate": 1.2785937613611575e-05, - "loss": 0.5646, + "loss": 0.6422, "step": 3560 }, { "epoch": 1.4016489988221437, - "grad_norm": 11.878625869750977, + "grad_norm": 4.193851470947266, "learning_rate": 1.2775030902348579e-05, - "loss": 0.1885, + "loss": 0.2083, "step": 3570 }, { "epoch": 1.4055751864939143, - "grad_norm": 1.0640068054199219, + "grad_norm": 0.357085645198822, "learning_rate": 1.2764124191085582e-05, - "loss": 0.2894, + "loss": 0.2474, "step": 3580 }, { "epoch": 1.4095013741656852, - "grad_norm": 9.506266593933105, + "grad_norm": 8.560419082641602, "learning_rate": 1.2753217479822584e-05, - "loss": 0.592, + "loss": 0.4779, "step": 3590 }, { "epoch": 1.4134275618374559, - "grad_norm": 2.0636656284332275, + "grad_norm": 1.5039420127868652, "learning_rate": 1.2742310768559587e-05, - "loss": 0.285, + "loss": 0.3662, "step": 3600 }, { "epoch": 1.4173537495092265, - "grad_norm": 5.572286128997803, + "grad_norm": 2.1456620693206787, "learning_rate": 1.273140405729659e-05, - "loss": 0.2655, + "loss": 0.1302, "step": 3610 }, { "epoch": 1.4212799371809972, - "grad_norm": 8.564207077026367, + "grad_norm": 3.4741525650024414, "learning_rate": 1.2720497346033594e-05, - "loss": 0.3474, + "loss": 0.214, "step": 3620 }, { "epoch": 1.4252061248527679, - "grad_norm": 39.71969985961914, + "grad_norm": 13.244150161743164, "learning_rate": 1.2709590634770596e-05, - "loss": 0.3601, + "loss": 0.3641, "step": 3630 }, { "epoch": 1.4291323125245388, - "grad_norm": 0.8744980096817017, + "grad_norm": 1.468261957168579, "learning_rate": 1.2698683923507597e-05, - "loss": 0.3159, + "loss": 0.3924, "step": 3640 }, { "epoch": 1.4330585001963094, - "grad_norm": 98.5473403930664, + "grad_norm": 64.67282104492188, "learning_rate": 1.2687777212244601e-05, - "loss": 0.3269, + "loss": 0.1727, "step": 3650 }, { "epoch": 1.43698468786808, - "grad_norm": 0.214829221367836, + "grad_norm": 0.03878183290362358, "learning_rate": 1.2676870500981604e-05, - "loss": 0.5745, + "loss": 0.4648, "step": 3660 }, { "epoch": 1.4409108755398508, - "grad_norm": 5.37337064743042, + "grad_norm": 6.684301853179932, "learning_rate": 1.2665963789718608e-05, - "loss": 0.1973, + "loss": 0.2147, "step": 3670 }, { "epoch": 1.4448370632116214, - "grad_norm": 48.92639923095703, + "grad_norm": 41.179874420166016, "learning_rate": 1.2655057078455611e-05, - "loss": 0.8013, + "loss": 0.5565, "step": 3680 }, { "epoch": 1.4487632508833923, - "grad_norm": 1.9772289991378784, + "grad_norm": 1.7266539335250854, "learning_rate": 1.2644150367192613e-05, - "loss": 0.1212, + "loss": 0.1024, "step": 3690 }, { "epoch": 1.452689438555163, - "grad_norm": 2.272261142730713, + "grad_norm": 2.14715838432312, "learning_rate": 1.2633243655929616e-05, - "loss": 0.1137, + "loss": 0.1039, "step": 3700 }, { "epoch": 1.4566156262269336, - "grad_norm": 18.0383243560791, + "grad_norm": 8.050750732421875, "learning_rate": 1.2622336944666618e-05, - "loss": 0.2685, + "loss": 0.351, "step": 3710 }, { "epoch": 1.4605418138987043, - "grad_norm": 5.253210067749023, + "grad_norm": 27.680932998657227, "learning_rate": 1.2611430233403621e-05, - "loss": 0.5217, + "loss": 0.5489, "step": 3720 }, { "epoch": 1.464468001570475, - "grad_norm": 4.792253494262695, + "grad_norm": 18.96026611328125, "learning_rate": 1.2600523522140623e-05, - "loss": 0.3644, + "loss": 0.4985, "step": 3730 }, { "epoch": 1.4683941892422459, - "grad_norm": 3.2499141693115234, + "grad_norm": 4.689240455627441, "learning_rate": 1.2589616810877626e-05, - "loss": 0.2957, + "loss": 0.3302, "step": 3740 }, { "epoch": 1.4723203769140165, - "grad_norm": 15.382283210754395, + "grad_norm": 7.312971591949463, "learning_rate": 1.257871009961463e-05, - "loss": 0.1865, + "loss": 0.2522, "step": 3750 }, { "epoch": 1.4762465645857872, - "grad_norm": 15.411844253540039, + "grad_norm": 2.906958818435669, "learning_rate": 1.2567803388351633e-05, - "loss": 0.2047, + "loss": 0.1583, "step": 3760 }, { "epoch": 1.4801727522575578, - "grad_norm": 2.8935534954071045, + "grad_norm": 2.277543067932129, "learning_rate": 1.2556896677088637e-05, - "loss": 0.5395, + "loss": 0.4741, "step": 3770 }, { "epoch": 1.4840989399293285, - "grad_norm": 47.827903747558594, + "grad_norm": 32.105567932128906, "learning_rate": 1.2545989965825638e-05, - "loss": 0.343, + "loss": 0.3196, "step": 3780 }, { "epoch": 1.4880251276010994, - "grad_norm": 65.80414581298828, + "grad_norm": 39.72721481323242, "learning_rate": 1.2535083254562642e-05, - "loss": 0.4888, + "loss": 0.3714, "step": 3790 }, { "epoch": 1.49195131527287, - "grad_norm": 6.793389797210693, + "grad_norm": 9.799948692321777, "learning_rate": 1.2524176543299643e-05, - "loss": 0.5024, + "loss": 0.4716, "step": 3800 }, { "epoch": 1.4958775029446407, - "grad_norm": 17.61514663696289, + "grad_norm": 1.5319210290908813, "learning_rate": 1.2513269832036647e-05, - "loss": 0.42, + "loss": 0.3437, "step": 3810 }, { "epoch": 1.4998036906164114, - "grad_norm": 0.013786498457193375, + "grad_norm": 0.008060574531555176, "learning_rate": 1.250236312077365e-05, - "loss": 0.1227, + "loss": 0.2543, "step": 3820 }, { "epoch": 1.503729878288182, - "grad_norm": 39.4918327331543, + "grad_norm": 38.86592102050781, "learning_rate": 1.2491456409510652e-05, - "loss": 0.2433, + "loss": 0.3501, "step": 3830 }, { "epoch": 1.507656065959953, - "grad_norm": 69.49808502197266, + "grad_norm": 21.065832138061523, "learning_rate": 1.2480549698247655e-05, - "loss": 0.1745, + "loss": 0.2924, "step": 3840 }, { "epoch": 1.5115822536317236, - "grad_norm": 2.1593613624572754, + "grad_norm": 2.18723726272583, "learning_rate": 1.2469642986984659e-05, - "loss": 0.5716, + "loss": 0.5621, "step": 3850 }, { "epoch": 1.5155084413034943, - "grad_norm": 6.5500006675720215, + "grad_norm": 8.56038761138916, "learning_rate": 1.2458736275721662e-05, - "loss": 0.2036, + "loss": 0.1278, "step": 3860 }, { "epoch": 1.5194346289752652, - "grad_norm": 101.1977767944336, + "grad_norm": 53.09764862060547, "learning_rate": 1.2447829564458664e-05, - "loss": 0.6963, + "loss": 0.4007, "step": 3870 }, { "epoch": 1.5233608166470356, - "grad_norm": 37.124359130859375, + "grad_norm": 34.460472106933594, "learning_rate": 1.2436922853195666e-05, - "loss": 0.5787, + "loss": 0.3771, "step": 3880 }, { "epoch": 1.5272870043188065, - "grad_norm": 0.41297638416290283, + "grad_norm": 0.2860153913497925, "learning_rate": 1.2426016141932669e-05, - "loss": 0.1384, + "loss": 0.1802, "step": 3890 }, { "epoch": 1.5312131919905771, - "grad_norm": 8.20191478729248, + "grad_norm": 7.442525386810303, "learning_rate": 1.2415109430669672e-05, - "loss": 0.082, + "loss": 0.0922, "step": 3900 }, { "epoch": 1.5351393796623478, - "grad_norm": 54.57821273803711, + "grad_norm": 8.071239471435547, "learning_rate": 1.2404202719406676e-05, - "loss": 0.1489, + "loss": 0.0994, "step": 3910 }, { "epoch": 1.5390655673341187, - "grad_norm": 0.012444762513041496, + "grad_norm": 0.0044219596311450005, "learning_rate": 1.2393296008143677e-05, - "loss": 0.2557, + "loss": 0.1932, "step": 3920 }, { "epoch": 1.5429917550058891, - "grad_norm": 25.229501724243164, + "grad_norm": 1.1259160041809082, "learning_rate": 1.2382389296880681e-05, - "loss": 0.0846, + "loss": 0.0733, "step": 3930 }, { "epoch": 1.54691794267766, - "grad_norm": 0.8175008296966553, + "grad_norm": 0.5113934874534607, "learning_rate": 1.2371482585617684e-05, - "loss": 0.1047, + "loss": 0.0693, "step": 3940 }, { "epoch": 1.5508441303494307, - "grad_norm": 0.6168592572212219, + "grad_norm": 0.6272404193878174, "learning_rate": 1.2360575874354688e-05, - "loss": 0.2308, + "loss": 0.0561, "step": 3950 }, { "epoch": 1.5547703180212014, - "grad_norm": 10.85566520690918, + "grad_norm": 5.58217191696167, "learning_rate": 1.234966916309169e-05, - "loss": 0.39, + "loss": 0.3467, "step": 3960 }, { "epoch": 1.5586965056929722, - "grad_norm": 4.265470504760742, + "grad_norm": 3.3610289096832275, "learning_rate": 1.2338762451828691e-05, - "loss": 0.235, + "loss": 0.2746, "step": 3970 }, { "epoch": 1.5626226933647427, - "grad_norm": 0.7159315347671509, + "grad_norm": 0.4434475600719452, "learning_rate": 1.2327855740565694e-05, - "loss": 0.4779, + "loss": 0.5352, "step": 3980 }, { "epoch": 1.5665488810365136, - "grad_norm": 15.867265701293945, + "grad_norm": 32.69227600097656, "learning_rate": 1.2316949029302698e-05, - "loss": 0.1256, + "loss": 0.1573, "step": 3990 }, { "epoch": 1.5704750687082842, - "grad_norm": 3.7823917865753174, + "grad_norm": 4.054953098297119, "learning_rate": 1.2306042318039701e-05, - "loss": 0.2497, + "loss": 0.1534, "step": 4000 }, { "epoch": 1.574401256380055, - "grad_norm": 0.9219305515289307, + "grad_norm": 1.0914356708526611, "learning_rate": 1.2295135606776703e-05, - "loss": 0.3294, + "loss": 0.384, "step": 4010 }, { "epoch": 1.5783274440518258, - "grad_norm": 7.009582042694092, + "grad_norm": 4.943174362182617, "learning_rate": 1.2284228895513706e-05, - "loss": 0.2944, + "loss": 0.2237, "step": 4020 }, { "epoch": 1.5822536317235962, - "grad_norm": 4.1940507888793945, + "grad_norm": 24.511137008666992, "learning_rate": 1.227332218425071e-05, - "loss": 0.1097, + "loss": 0.1579, "step": 4030 }, { "epoch": 1.5861798193953671, - "grad_norm": 2.5770161151885986, + "grad_norm": 2.4812803268432617, "learning_rate": 1.2262415472987713e-05, - "loss": 0.0695, + "loss": 0.0762, "step": 4040 }, { "epoch": 1.5901060070671378, - "grad_norm": 43.279598236083984, + "grad_norm": 33.02486801147461, "learning_rate": 1.2251508761724715e-05, - "loss": 0.2461, + "loss": 0.26, "step": 4050 }, { "epoch": 1.5940321947389084, - "grad_norm": 21.83753204345703, + "grad_norm": 42.50679397583008, "learning_rate": 1.2240602050461717e-05, - "loss": 0.1192, + "loss": 0.2323, "step": 4060 }, { "epoch": 1.5979583824106793, - "grad_norm": 27.78812599182129, + "grad_norm": 31.61253547668457, "learning_rate": 1.222969533919872e-05, - "loss": 0.0964, + "loss": 0.1465, "step": 4070 }, { "epoch": 1.6018845700824498, - "grad_norm": 30.291053771972656, + "grad_norm": 16.291244506835938, "learning_rate": 1.2218788627935723e-05, - "loss": 0.211, + "loss": 0.2368, "step": 4080 }, { "epoch": 1.6058107577542207, - "grad_norm": 47.00924301147461, + "grad_norm": 36.47014617919922, "learning_rate": 1.2207881916672727e-05, - "loss": 0.5632, + "loss": 0.3985, "step": 4090 }, { "epoch": 1.6097369454259913, - "grad_norm": 0.9587500691413879, + "grad_norm": 1.4661973714828491, "learning_rate": 1.2196975205409728e-05, - "loss": 0.5037, + "loss": 0.4125, "step": 4100 }, { "epoch": 1.613663133097762, - "grad_norm": 30.255420684814453, + "grad_norm": 21.816913604736328, "learning_rate": 1.2186068494146732e-05, - "loss": 0.4127, + "loss": 0.416, "step": 4110 }, { "epoch": 1.6175893207695329, - "grad_norm": 4.81709098815918, + "grad_norm": 3.423997163772583, "learning_rate": 1.2175161782883735e-05, - "loss": 0.1723, + "loss": 0.2471, "step": 4120 }, { "epoch": 1.6215155084413035, - "grad_norm": 2.9998042583465576, + "grad_norm": 1.4516773223876953, "learning_rate": 1.2164255071620739e-05, - "loss": 0.0546, + "loss": 0.0581, "step": 4130 }, { "epoch": 1.6254416961130742, - "grad_norm": 0.4158773720264435, + "grad_norm": 0.8106865286827087, "learning_rate": 1.215334836035774e-05, - "loss": 0.8328, + "loss": 0.7108, "step": 4140 }, { "epoch": 1.6293678837848449, - "grad_norm": 29.774818420410156, + "grad_norm": 28.893798828125, "learning_rate": 1.2142441649094742e-05, - "loss": 0.4989, + "loss": 0.5593, "step": 4150 }, { "epoch": 1.6332940714566155, - "grad_norm": 2.8384060859680176, + "grad_norm": 1.2560535669326782, "learning_rate": 1.2131534937831746e-05, - "loss": 0.224, + "loss": 0.2434, "step": 4160 }, { "epoch": 1.6372202591283864, - "grad_norm": 2.5227696895599365, + "grad_norm": 3.4033455848693848, "learning_rate": 1.2120628226568749e-05, - "loss": 0.3677, + "loss": 0.3323, "step": 4170 }, { "epoch": 1.641146446800157, - "grad_norm": 3.062748908996582, + "grad_norm": 4.629420757293701, "learning_rate": 1.2109721515305752e-05, - "loss": 0.1755, + "loss": 0.262, "step": 4180 }, { "epoch": 1.6450726344719278, - "grad_norm": 13.409513473510742, + "grad_norm": 1.8610162734985352, "learning_rate": 1.2098814804042756e-05, - "loss": 0.1801, + "loss": 0.1929, "step": 4190 }, { "epoch": 1.6489988221436984, - "grad_norm": 1.779102087020874, + "grad_norm": 2.187347650527954, "learning_rate": 1.2087908092779757e-05, - "loss": 0.2987, + "loss": 0.1955, "step": 4200 }, { "epoch": 1.652925009815469, - "grad_norm": 0.5692170262336731, + "grad_norm": 0.6556830406188965, "learning_rate": 1.207700138151676e-05, - "loss": 0.2419, + "loss": 0.194, "step": 4210 }, { "epoch": 1.65685119748724, - "grad_norm": 9.442747116088867, + "grad_norm": 10.433953285217285, "learning_rate": 1.2066094670253763e-05, - "loss": 0.3116, + "loss": 0.2146, "step": 4220 }, { "epoch": 1.6607773851590106, - "grad_norm": 6.343710422515869, + "grad_norm": 4.713764667510986, "learning_rate": 1.2055187958990766e-05, - "loss": 0.141, + "loss": 0.075, "step": 4230 }, { "epoch": 1.6647035728307813, - "grad_norm": 2.3418519496917725, + "grad_norm": 11.79529094696045, "learning_rate": 1.2044281247727768e-05, - "loss": 0.2045, + "loss": 0.113, "step": 4240 }, { "epoch": 1.6686297605025522, - "grad_norm": 0.11656112223863602, + "grad_norm": 0.48563656210899353, "learning_rate": 1.2033374536464771e-05, - "loss": 0.1237, + "loss": 0.1807, "step": 4250 }, { "epoch": 1.6725559481743226, - "grad_norm": 2.9281299114227295, + "grad_norm": 3.780322551727295, "learning_rate": 1.2022467825201774e-05, - "loss": 0.1874, + "loss": 0.2485, "step": 4260 }, { "epoch": 1.6764821358460935, - "grad_norm": 21.003536224365234, + "grad_norm": 13.414273262023926, "learning_rate": 1.2011561113938778e-05, - "loss": 0.3257, + "loss": 0.2225, "step": 4270 }, { "epoch": 1.6804083235178642, - "grad_norm": 2.155855417251587, + "grad_norm": 1.58971107006073, "learning_rate": 1.2000654402675781e-05, - "loss": 0.1054, + "loss": 0.139, "step": 4280 }, { "epoch": 1.6843345111896348, - "grad_norm": 0.8147543668746948, + "grad_norm": 0.8417547941207886, "learning_rate": 1.1989747691412783e-05, - "loss": 0.8279, + "loss": 0.7407, "step": 4290 }, { "epoch": 1.6882606988614057, - "grad_norm": 35.95346450805664, + "grad_norm": 21.29022789001465, "learning_rate": 1.1978840980149786e-05, - "loss": 0.2114, + "loss": 0.1743, "step": 4300 }, { "epoch": 1.6921868865331762, - "grad_norm": 0.14950741827487946, + "grad_norm": 0.06733459234237671, "learning_rate": 1.1967934268886788e-05, - "loss": 0.1203, + "loss": 0.1736, "step": 4310 }, { "epoch": 1.696113074204947, - "grad_norm": 2.692833185195923, + "grad_norm": 18.892793655395508, "learning_rate": 1.1957027557623791e-05, - "loss": 0.1026, + "loss": 0.2538, "step": 4320 }, { "epoch": 1.7000392618767177, - "grad_norm": 2.132970094680786, + "grad_norm": 1.9906818866729736, "learning_rate": 1.1946120846360793e-05, - "loss": 0.4647, + "loss": 0.3897, "step": 4330 }, { "epoch": 1.7039654495484884, - "grad_norm": 3.76112699508667, + "grad_norm": 2.6699600219726562, "learning_rate": 1.1935214135097797e-05, - "loss": 0.1074, + "loss": 0.1141, "step": 4340 }, { "epoch": 1.7078916372202593, - "grad_norm": 2.332601547241211, + "grad_norm": 2.170525550842285, "learning_rate": 1.19243074238348e-05, - "loss": 0.1042, + "loss": 0.095, "step": 4350 }, { "epoch": 1.7118178248920297, - "grad_norm": 31.587160110473633, + "grad_norm": 42.41236114501953, "learning_rate": 1.1913400712571803e-05, - "loss": 0.1447, + "loss": 0.2352, "step": 4360 }, { "epoch": 1.7157440125638006, - "grad_norm": 9.502628326416016, + "grad_norm": 2.2652878761291504, "learning_rate": 1.1902494001308807e-05, - "loss": 0.2851, + "loss": 0.1746, "step": 4370 }, { "epoch": 1.7196702002355713, - "grad_norm": 1.109468936920166, + "grad_norm": 0.7272818088531494, "learning_rate": 1.1891587290045808e-05, - "loss": 0.1257, + "loss": 0.1502, "step": 4380 }, { "epoch": 1.723596387907342, - "grad_norm": 46.283287048339844, + "grad_norm": 52.35148620605469, "learning_rate": 1.1880680578782812e-05, - "loss": 0.4181, + "loss": 0.5154, "step": 4390 }, { "epoch": 1.7275225755791128, - "grad_norm": 1.7998861074447632, + "grad_norm": 1.9060238599777222, "learning_rate": 1.1869773867519814e-05, - "loss": 0.2095, + "loss": 0.3174, "step": 4400 }, { "epoch": 1.7314487632508833, - "grad_norm": 4.281277656555176, + "grad_norm": 7.425998687744141, "learning_rate": 1.1858867156256817e-05, - "loss": 0.2439, + "loss": 0.279, "step": 4410 }, { "epoch": 1.7353749509226541, - "grad_norm": 3.0704283714294434, + "grad_norm": 0.37728676199913025, "learning_rate": 1.184796044499382e-05, - "loss": 0.0505, + "loss": 0.0357, "step": 4420 }, { "epoch": 1.7393011385944248, - "grad_norm": 2.278432607650757, + "grad_norm": 0.2503686249256134, "learning_rate": 1.1837053733730822e-05, - "loss": 0.2598, + "loss": 0.2153, "step": 4430 }, { "epoch": 1.7432273262661955, - "grad_norm": 0.35137739777565, + "grad_norm": 0.5800251364707947, "learning_rate": 1.1826147022467825e-05, - "loss": 0.1042, + "loss": 0.1235, "step": 4440 }, { "epoch": 1.7471535139379664, - "grad_norm": 0.06673158705234528, + "grad_norm": 0.01660076342523098, "learning_rate": 1.1815240311204829e-05, - "loss": 0.335, + "loss": 0.3932, "step": 4450 }, { "epoch": 1.7510797016097368, - "grad_norm": 12.628028869628906, + "grad_norm": 4.8997321128845215, "learning_rate": 1.1804333599941832e-05, - "loss": 0.2652, + "loss": 0.0691, "step": 4460 }, { "epoch": 1.7550058892815077, - "grad_norm": 47.03285598754883, + "grad_norm": 42.73733139038086, "learning_rate": 1.1793426888678834e-05, - "loss": 0.4436, + "loss": 0.4522, "step": 4470 }, { "epoch": 1.7589320769532784, - "grad_norm": 0.9875231385231018, + "grad_norm": 1.2741503715515137, "learning_rate": 1.1782520177415836e-05, - "loss": 0.4397, + "loss": 0.5499, "step": 4480 }, { "epoch": 1.762858264625049, - "grad_norm": 1.1983519792556763, + "grad_norm": 0.10601712763309479, "learning_rate": 1.1771613466152839e-05, - "loss": 0.3532, + "loss": 0.2581, "step": 4490 }, { "epoch": 1.76678445229682, - "grad_norm": 0.02381601557135582, + "grad_norm": 0.017394443973898888, "learning_rate": 1.1760706754889842e-05, - "loss": 0.1159, + "loss": 0.1202, "step": 4500 }, { "epoch": 1.7707106399685903, - "grad_norm": 13.333812713623047, + "grad_norm": 15.199203491210938, "learning_rate": 1.1749800043626846e-05, - "loss": 0.1921, + "loss": 0.2225, "step": 4510 }, { "epoch": 1.7746368276403612, - "grad_norm": 2.6007888317108154, + "grad_norm": 1.049607753753662, "learning_rate": 1.1738893332363848e-05, - "loss": 0.0632, + "loss": 0.1454, "step": 4520 }, { "epoch": 1.778563015312132, - "grad_norm": 34.8441047668457, + "grad_norm": 2.6150224208831787, "learning_rate": 1.1727986621100851e-05, - "loss": 0.1967, + "loss": 0.1444, "step": 4530 }, { "epoch": 1.7824892029839026, - "grad_norm": 0.4005063772201538, + "grad_norm": 0.3684069514274597, "learning_rate": 1.1717079909837854e-05, - "loss": 0.1965, + "loss": 0.3143, "step": 4540 }, { "epoch": 1.7864153906556735, - "grad_norm": 9.887832641601562, + "grad_norm": 17.626483917236328, "learning_rate": 1.1706173198574858e-05, - "loss": 0.2359, + "loss": 0.1998, "step": 4550 }, { "epoch": 1.790341578327444, - "grad_norm": 0.302543967962265, + "grad_norm": 0.33435943722724915, "learning_rate": 1.169526648731186e-05, - "loss": 0.3309, + "loss": 0.2809, "step": 4560 }, { "epoch": 1.7942677659992148, - "grad_norm": 14.347996711730957, + "grad_norm": 0.18420487642288208, "learning_rate": 1.1684359776048861e-05, - "loss": 0.0679, + "loss": 0.0667, "step": 4570 }, { "epoch": 1.7981939536709854, - "grad_norm": 1.054168462753296, + "grad_norm": 1.2188926935195923, "learning_rate": 1.1673453064785865e-05, - "loss": 0.0944, + "loss": 0.0812, "step": 4580 }, { "epoch": 1.802120141342756, - "grad_norm": 3.1250557899475098, + "grad_norm": 8.814946174621582, "learning_rate": 1.1662546353522868e-05, - "loss": 0.1724, + "loss": 0.1069, "step": 4590 }, { "epoch": 1.806046329014527, - "grad_norm": 0.7301310896873474, + "grad_norm": 0.5256406664848328, "learning_rate": 1.1651639642259871e-05, - "loss": 0.1065, + "loss": 0.1838, "step": 4600 }, { "epoch": 1.8099725166862974, - "grad_norm": 10.358341217041016, + "grad_norm": 27.83869743347168, "learning_rate": 1.1640732930996873e-05, - "loss": 0.4261, + "loss": 0.3041, "step": 4610 }, { "epoch": 1.8138987043580683, - "grad_norm": 7.623668670654297, + "grad_norm": 17.37398338317871, "learning_rate": 1.1629826219733876e-05, - "loss": 0.4021, + "loss": 0.4189, "step": 4620 }, { "epoch": 1.817824892029839, - "grad_norm": 4.492222785949707, + "grad_norm": 0.6335673928260803, "learning_rate": 1.161891950847088e-05, - "loss": 0.1024, + "loss": 0.049, "step": 4630 }, { "epoch": 1.8217510797016097, - "grad_norm": 21.047353744506836, + "grad_norm": 26.77448272705078, "learning_rate": 1.1608012797207883e-05, - "loss": 0.4526, + "loss": 0.4465, "step": 4640 }, { "epoch": 1.8256772673733805, - "grad_norm": 35.66292953491211, + "grad_norm": 25.042078018188477, "learning_rate": 1.1597106085944885e-05, - "loss": 0.6816, + "loss": 0.5236, "step": 4650 }, { "epoch": 1.8296034550451512, - "grad_norm": 4.782094478607178, + "grad_norm": 2.642307758331299, "learning_rate": 1.1586199374681887e-05, - "loss": 0.2171, + "loss": 0.211, "step": 4660 }, { "epoch": 1.8335296427169219, - "grad_norm": 0.2317655384540558, + "grad_norm": 0.2744080424308777, "learning_rate": 1.157529266341889e-05, - "loss": 0.028, + "loss": 0.0392, "step": 4670 }, { "epoch": 1.8374558303886925, - "grad_norm": 0.8448551297187805, + "grad_norm": 1.4772825241088867, "learning_rate": 1.1564385952155894e-05, - "loss": 0.0303, + "loss": 0.0385, "step": 4680 }, { "epoch": 1.8413820180604632, - "grad_norm": 5.708097457885742, + "grad_norm": 24.301664352416992, "learning_rate": 1.1553479240892897e-05, - "loss": 0.0676, + "loss": 0.0873, "step": 4690 }, { "epoch": 1.845308205732234, - "grad_norm": 23.819332122802734, + "grad_norm": 36.702293395996094, "learning_rate": 1.1542572529629899e-05, - "loss": 0.4183, + "loss": 0.4341, "step": 4700 }, { "epoch": 1.8492343934040048, - "grad_norm": 1.0077656507492065, + "grad_norm": 1.1580719947814941, "learning_rate": 1.1531665818366902e-05, - "loss": 0.0536, + "loss": 0.0554, "step": 4710 }, { "epoch": 1.8531605810757754, - "grad_norm": 9.621075630187988, + "grad_norm": 25.359813690185547, "learning_rate": 1.1520759107103905e-05, - "loss": 0.3384, + "loss": 0.3721, "step": 4720 }, { "epoch": 1.857086768747546, - "grad_norm": 38.56547546386719, + "grad_norm": 34.623477935791016, "learning_rate": 1.1509852395840909e-05, - "loss": 0.2478, + "loss": 0.2872, "step": 4730 }, { "epoch": 1.8610129564193167, - "grad_norm": 3.536519765853882, + "grad_norm": 0.39129844307899475, "learning_rate": 1.149894568457791e-05, - "loss": 0.1468, + "loss": 0.1829, "step": 4740 }, { "epoch": 1.8649391440910876, - "grad_norm": 3.9039463996887207, + "grad_norm": 2.954010248184204, "learning_rate": 1.1488038973314912e-05, - "loss": 0.0693, + "loss": 0.0688, "step": 4750 }, { "epoch": 1.8688653317628583, - "grad_norm": 42.27299880981445, + "grad_norm": 12.764348030090332, "learning_rate": 1.1477132262051916e-05, - "loss": 0.2891, + "loss": 0.2326, "step": 4760 }, { "epoch": 1.872791519434629, - "grad_norm": 2.438563108444214, + "grad_norm": 1.5827277898788452, "learning_rate": 1.1466225550788919e-05, - "loss": 0.2541, + "loss": 0.2279, "step": 4770 }, { "epoch": 1.8767177071063998, - "grad_norm": 1.3442409038543701, + "grad_norm": 2.375059127807617, "learning_rate": 1.1455318839525922e-05, - "loss": 0.0291, + "loss": 0.0587, "step": 4780 }, { "epoch": 1.8806438947781703, - "grad_norm": 1.46486234664917, + "grad_norm": 0.25809821486473083, "learning_rate": 1.1444412128262926e-05, - "loss": 0.342, + "loss": 0.3066, "step": 4790 }, { "epoch": 1.8845700824499412, - "grad_norm": 26.426105499267578, + "grad_norm": 30.938188552856445, "learning_rate": 1.1433505416999928e-05, - "loss": 0.6372, + "loss": 0.5834, "step": 4800 }, { "epoch": 1.8884962701217118, - "grad_norm": 105.84485626220703, + "grad_norm": 57.43980026245117, "learning_rate": 1.1422598705736931e-05, - "loss": 0.4038, + "loss": 0.4127, "step": 4810 }, { "epoch": 1.8924224577934825, - "grad_norm": 0.9661233425140381, + "grad_norm": 3.9162137508392334, "learning_rate": 1.1411691994473933e-05, - "loss": 0.1516, + "loss": 0.1693, "step": 4820 }, { "epoch": 1.8963486454652534, - "grad_norm": 3.0319085121154785, + "grad_norm": 1.3653476238250732, "learning_rate": 1.1400785283210936e-05, - "loss": 0.2057, + "loss": 0.1286, "step": 4830 }, { "epoch": 1.9002748331370238, - "grad_norm": 1.0012201070785522, + "grad_norm": 4.350405216217041, "learning_rate": 1.1389878571947938e-05, - "loss": 0.4942, + "loss": 0.2301, "step": 4840 }, { "epoch": 1.9042010208087947, - "grad_norm": 3.7785074710845947, + "grad_norm": 2.9020628929138184, "learning_rate": 1.1378971860684941e-05, - "loss": 0.2612, + "loss": 0.2505, "step": 4850 }, { "epoch": 1.9081272084805654, - "grad_norm": 0.15459251403808594, + "grad_norm": 0.07572176307439804, "learning_rate": 1.1368065149421945e-05, - "loss": 0.3275, + "loss": 0.3087, "step": 4860 }, { "epoch": 1.912053396152336, - "grad_norm": 0.06626954674720764, + "grad_norm": 0.1045960858464241, "learning_rate": 1.1357158438158948e-05, - "loss": 0.2472, + "loss": 0.2448, "step": 4870 }, { "epoch": 1.915979583824107, - "grad_norm": 0.43407300114631653, + "grad_norm": 3.5578835010528564, "learning_rate": 1.1346251726895951e-05, - "loss": 0.117, + "loss": 0.1693, "step": 4880 }, { "epoch": 1.9199057714958774, - "grad_norm": 64.02273559570312, + "grad_norm": 88.63399505615234, "learning_rate": 1.1335345015632953e-05, - "loss": 0.5139, + "loss": 0.4683, "step": 4890 }, { "epoch": 1.9238319591676483, - "grad_norm": 0.4571993947029114, + "grad_norm": 0.09000589698553085, "learning_rate": 1.1324438304369956e-05, - "loss": 0.1381, + "loss": 0.369, "step": 4900 }, { "epoch": 1.927758146839419, - "grad_norm": 2.856790542602539, + "grad_norm": 2.1689536571502686, "learning_rate": 1.1313531593106958e-05, - "loss": 0.3868, + "loss": 0.3601, "step": 4910 }, { "epoch": 1.9316843345111896, - "grad_norm": 0.8793458938598633, + "grad_norm": 0.4397715926170349, "learning_rate": 1.1302624881843962e-05, - "loss": 0.051, + "loss": 0.081, "step": 4920 }, { "epoch": 1.9356105221829605, - "grad_norm": 0.5418068170547485, + "grad_norm": 3.5679731369018555, "learning_rate": 1.1291718170580965e-05, - "loss": 0.1776, + "loss": 0.0969, "step": 4930 }, { "epoch": 1.939536709854731, - "grad_norm": 2.6040823459625244, + "grad_norm": 4.980875492095947, "learning_rate": 1.1280811459317967e-05, - "loss": 0.7193, + "loss": 0.7658, "step": 4940 }, { "epoch": 1.9434628975265018, - "grad_norm": 2.320516347885132, + "grad_norm": 3.793043375015259, "learning_rate": 1.126990474805497e-05, - "loss": 0.1382, + "loss": 0.1816, "step": 4950 }, { "epoch": 1.9473890851982725, - "grad_norm": 1.2318557500839233, + "grad_norm": 1.4687343835830688, "learning_rate": 1.1258998036791973e-05, - "loss": 0.4078, + "loss": 0.3441, "step": 4960 }, { "epoch": 1.9513152728700431, - "grad_norm": 0.9136293530464172, + "grad_norm": 0.48666736483573914, "learning_rate": 1.1248091325528977e-05, - "loss": 0.1331, + "loss": 0.0811, "step": 4970 }, { "epoch": 1.955241460541814, - "grad_norm": 1.3649396896362305, + "grad_norm": 0.5860501527786255, "learning_rate": 1.1237184614265979e-05, - "loss": 0.1447, + "loss": 0.0951, "step": 4980 }, { "epoch": 1.9591676482135845, - "grad_norm": 64.03887939453125, + "grad_norm": 51.60679626464844, "learning_rate": 1.122627790300298e-05, - "loss": 0.8119, + "loss": 0.6194, "step": 4990 }, { "epoch": 1.9630938358853554, - "grad_norm": 4.18821907043457, + "grad_norm": 3.2067317962646484, "learning_rate": 1.1215371191739984e-05, - "loss": 0.0396, + "loss": 0.0553, "step": 5000 }, { "epoch": 1.967020023557126, - "grad_norm": 5.9112677574157715, + "grad_norm": 2.8314120769500732, "learning_rate": 1.1204464480476987e-05, - "loss": 0.1586, + "loss": 0.0731, "step": 5010 }, { "epoch": 1.9709462112288967, - "grad_norm": 0.9457396268844604, + "grad_norm": 1.1886752843856812, "learning_rate": 1.119355776921399e-05, - "loss": 0.3625, + "loss": 0.2312, "step": 5020 }, { "epoch": 1.9748723989006676, - "grad_norm": 27.326642990112305, + "grad_norm": 10.633919715881348, "learning_rate": 1.1182651057950992e-05, - "loss": 0.4076, + "loss": 0.3473, "step": 5030 }, { "epoch": 1.978798586572438, - "grad_norm": 16.921142578125, + "grad_norm": 13.639116287231445, "learning_rate": 1.1171744346687996e-05, - "loss": 0.1716, + "loss": 0.1756, "step": 5040 }, { "epoch": 1.982724774244209, - "grad_norm": 5.490754127502441, + "grad_norm": 1.5369274616241455, "learning_rate": 1.1160837635424999e-05, - "loss": 0.2818, + "loss": 0.352, "step": 5050 }, { "epoch": 1.9866509619159796, - "grad_norm": 0.17912153899669647, + "grad_norm": 0.2064032107591629, "learning_rate": 1.1149930924162002e-05, - "loss": 0.3383, + "loss": 0.3489, "step": 5060 }, { "epoch": 1.9905771495877502, - "grad_norm": 7.714686393737793, + "grad_norm": 6.747467041015625, "learning_rate": 1.1139024212899006e-05, - "loss": 0.084, + "loss": 0.0484, "step": 5070 }, { "epoch": 1.9945033372595211, - "grad_norm": 0.24603542685508728, + "grad_norm": 0.48481491208076477, "learning_rate": 1.1128117501636006e-05, - "loss": 0.0684, + "loss": 0.0895, "step": 5080 }, { "epoch": 1.9984295249312916, - "grad_norm": 0.4901498854160309, + "grad_norm": 1.7066333293914795, "learning_rate": 1.111721079037301e-05, - "loss": 0.4822, + "loss": 0.608, "step": 5090 }, { "epoch": 2.0, - "eval_loss": 0.33433255553245544, - "eval_runtime": 11.3479, - "eval_samples_per_second": 199.508, - "eval_steps_per_second": 24.938, + "eval_loss": 0.2542453408241272, + "eval_runtime": 12.0862, + "eval_samples_per_second": 187.32, + "eval_steps_per_second": 23.415, "step": 5094 }, { "epoch": 2.0023557126030624, - "grad_norm": 0.43537086248397827, + "grad_norm": 0.660499095916748, "learning_rate": 1.1106304079110013e-05, - "loss": 0.0601, + "loss": 0.0524, "step": 5100 }, { "epoch": 2.0062819002748333, - "grad_norm": 10.666902542114258, + "grad_norm": 14.827805519104004, "learning_rate": 1.1095397367847016e-05, - "loss": 0.0952, + "loss": 0.1765, "step": 5110 }, { "epoch": 2.0102080879466038, - "grad_norm": 1.4851411581039429, + "grad_norm": 19.803085327148438, "learning_rate": 1.1084490656584018e-05, - "loss": 0.3158, + "loss": 0.3992, "step": 5120 }, { "epoch": 2.0141342756183747, - "grad_norm": 50.06395721435547, + "grad_norm": 91.47151947021484, "learning_rate": 1.1073583945321021e-05, - "loss": 0.0463, + "loss": 0.0607, "step": 5130 }, { "epoch": 2.018060463290145, - "grad_norm": 61.58543395996094, + "grad_norm": 42.167030334472656, "learning_rate": 1.1062677234058024e-05, - "loss": 0.4353, + "loss": 0.3195, "step": 5140 }, { "epoch": 2.021986650961916, - "grad_norm": 0.015496039763092995, + "grad_norm": 0.003961506299674511, "learning_rate": 1.1051770522795028e-05, - "loss": 0.1596, + "loss": 0.2835, "step": 5150 }, { "epoch": 2.025912838633687, - "grad_norm": 0.6960954070091248, + "grad_norm": 2.5522396564483643, "learning_rate": 1.104086381153203e-05, - "loss": 0.2004, + "loss": 0.1716, "step": 5160 }, { "epoch": 2.0298390263054573, - "grad_norm": 2.7587497234344482, + "grad_norm": 7.234408378601074, "learning_rate": 1.1029957100269031e-05, - "loss": 0.0841, + "loss": 0.1154, "step": 5170 }, { "epoch": 2.033765213977228, - "grad_norm": 3.3305282592773438, + "grad_norm": 4.5936598777771, "learning_rate": 1.1019050389006035e-05, - "loss": 0.0487, + "loss": 0.0496, "step": 5180 }, { "epoch": 2.0376914016489986, - "grad_norm": 12.296683311462402, + "grad_norm": 4.262065410614014, "learning_rate": 1.1008143677743038e-05, - "loss": 0.3119, + "loss": 0.2505, "step": 5190 }, { "epoch": 2.0416175893207695, - "grad_norm": 7.0005083084106445, + "grad_norm": 6.03399658203125, "learning_rate": 1.0997236966480042e-05, - "loss": 0.1063, + "loss": 0.0637, "step": 5200 }, { "epoch": 2.0455437769925404, - "grad_norm": 5.020710468292236, + "grad_norm": 1.7100470066070557, "learning_rate": 1.0986330255217043e-05, - "loss": 0.0662, + "loss": 0.0313, "step": 5210 }, { "epoch": 2.049469964664311, - "grad_norm": 0.8911005854606628, + "grad_norm": 2.106959581375122, "learning_rate": 1.0975423543954047e-05, - "loss": 0.2814, + "loss": 0.1689, "step": 5220 }, { "epoch": 2.0533961523360817, - "grad_norm": 1.4800794124603271, + "grad_norm": 2.25296688079834, "learning_rate": 1.096451683269105e-05, - "loss": 0.0567, + "loss": 0.0406, "step": 5230 }, { "epoch": 2.057322340007852, - "grad_norm": 92.8011703491211, + "grad_norm": 31.612632751464844, "learning_rate": 1.0953610121428053e-05, - "loss": 0.5465, + "loss": 0.4152, "step": 5240 }, { "epoch": 2.061248527679623, - "grad_norm": 0.2573162317276001, + "grad_norm": 0.09129858016967773, "learning_rate": 1.0942703410165055e-05, - "loss": 0.1104, + "loss": 0.1341, "step": 5250 }, { "epoch": 2.065174715351394, - "grad_norm": 0.8492686748504639, + "grad_norm": 1.2951576709747314, "learning_rate": 1.0931796698902057e-05, - "loss": 0.337, + "loss": 0.1399, "step": 5260 }, { "epoch": 2.0691009030231644, - "grad_norm": 25.304157257080078, + "grad_norm": 33.41801834106445, "learning_rate": 1.092088998763906e-05, - "loss": 0.2449, + "loss": 0.2409, "step": 5270 }, { "epoch": 2.0730270906949353, - "grad_norm": 2.877927780151367, + "grad_norm": 18.708669662475586, "learning_rate": 1.0909983276376064e-05, - "loss": 0.2925, + "loss": 0.3017, "step": 5280 }, { "epoch": 2.0769532783667057, - "grad_norm": 10.76980972290039, + "grad_norm": 10.98039436340332, "learning_rate": 1.0899076565113067e-05, - "loss": 0.1931, + "loss": 0.1891, "step": 5290 }, { "epoch": 2.0808794660384766, - "grad_norm": 14.985898971557617, + "grad_norm": 11.020075798034668, "learning_rate": 1.088816985385007e-05, - "loss": 0.0395, + "loss": 0.0517, "step": 5300 }, { "epoch": 2.0848056537102475, - "grad_norm": 1.4945588111877441, + "grad_norm": 2.3587729930877686, "learning_rate": 1.0877263142587072e-05, - "loss": 0.1286, + "loss": 0.2053, "step": 5310 }, { "epoch": 2.088731841382018, - "grad_norm": 34.53713607788086, + "grad_norm": 13.901846885681152, "learning_rate": 1.0866356431324076e-05, - "loss": 0.1039, + "loss": 0.08, "step": 5320 }, { "epoch": 2.092658029053789, - "grad_norm": 2.387420654296875, + "grad_norm": 5.135604381561279, "learning_rate": 1.0855449720061077e-05, - "loss": 0.4098, + "loss": 0.3673, "step": 5330 }, { "epoch": 2.0965842167255593, - "grad_norm": 1.549958348274231, + "grad_norm": 1.2400366067886353, "learning_rate": 1.084454300879808e-05, - "loss": 0.3115, + "loss": 0.3504, "step": 5340 }, { "epoch": 2.10051040439733, - "grad_norm": 0.2978038489818573, + "grad_norm": 1.3022143840789795, "learning_rate": 1.0833636297535082e-05, - "loss": 0.1452, + "loss": 0.0868, "step": 5350 }, { "epoch": 2.104436592069101, - "grad_norm": 1.8551357984542847, + "grad_norm": 2.3019893169403076, "learning_rate": 1.0822729586272086e-05, - "loss": 0.0903, + "loss": 0.1497, "step": 5360 }, { "epoch": 2.1083627797408715, - "grad_norm": 0.33167847990989685, + "grad_norm": 14.929694175720215, "learning_rate": 1.0811822875009089e-05, - "loss": 0.2144, + "loss": 0.4463, "step": 5370 }, { "epoch": 2.1122889674126424, - "grad_norm": 0.230964794754982, + "grad_norm": 1.7946845293045044, "learning_rate": 1.0800916163746093e-05, - "loss": 0.1916, + "loss": 0.2282, "step": 5380 }, { "epoch": 2.116215155084413, - "grad_norm": 2.453498363494873, + "grad_norm": 0.5966055393218994, "learning_rate": 1.0790009452483096e-05, - "loss": 0.0453, + "loss": 0.0549, "step": 5390 }, { "epoch": 2.1201413427561837, - "grad_norm": 0.12110085040330887, + "grad_norm": 0.38297879695892334, "learning_rate": 1.0779102741220098e-05, - "loss": 0.5143, + "loss": 0.3811, "step": 5400 }, { "epoch": 2.1240675304279546, - "grad_norm": 52.622493743896484, + "grad_norm": 39.88683319091797, "learning_rate": 1.0768196029957101e-05, - "loss": 0.2527, + "loss": 0.2065, "step": 5410 }, { "epoch": 2.127993718099725, - "grad_norm": 2.6107842922210693, + "grad_norm": 1.3031110763549805, "learning_rate": 1.0757289318694103e-05, - "loss": 0.6835, + "loss": 0.6112, "step": 5420 }, { "epoch": 2.131919905771496, - "grad_norm": 2.568737506866455, + "grad_norm": 2.865288019180298, "learning_rate": 1.0746382607431106e-05, - "loss": 0.0909, + "loss": 0.0672, "step": 5430 }, { "epoch": 2.1358460934432664, - "grad_norm": 2.7886953353881836, + "grad_norm": 7.15928316116333, "learning_rate": 1.073547589616811e-05, - "loss": 0.4157, + "loss": 0.1668, "step": 5440 }, { "epoch": 2.1397722811150373, - "grad_norm": 15.35631275177002, + "grad_norm": 4.131762504577637, "learning_rate": 1.0724569184905111e-05, - "loss": 0.099, + "loss": 0.072, "step": 5450 }, { "epoch": 2.143698468786808, - "grad_norm": 2.943362236022949, + "grad_norm": 2.9077038764953613, "learning_rate": 1.0713662473642115e-05, - "loss": 0.3043, + "loss": 0.2916, "step": 5460 }, { "epoch": 2.1476246564585786, - "grad_norm": 0.37295016646385193, + "grad_norm": 0.5987065434455872, "learning_rate": 1.0702755762379118e-05, - "loss": 0.0575, + "loss": 0.0533, "step": 5470 }, { "epoch": 2.1515508441303495, - "grad_norm": 30.756572723388672, + "grad_norm": 50.157630920410156, "learning_rate": 1.0691849051116121e-05, - "loss": 0.4514, + "loss": 0.4383, "step": 5480 }, { "epoch": 2.1554770318021204, - "grad_norm": 5.921298980712891, + "grad_norm": 7.79362678527832, "learning_rate": 1.0680942339853123e-05, - "loss": 0.0446, + "loss": 0.0465, "step": 5490 }, { "epoch": 2.159403219473891, - "grad_norm": 0.29499348998069763, + "grad_norm": 0.06476526707410812, "learning_rate": 1.0670035628590125e-05, - "loss": 0.12, + "loss": 0.162, "step": 5500 }, { "epoch": 2.1633294071456617, - "grad_norm": 48.97966766357422, + "grad_norm": 86.66081237792969, "learning_rate": 1.0659128917327128e-05, - "loss": 0.3755, + "loss": 0.8574, "step": 5510 }, { "epoch": 2.167255594817432, - "grad_norm": 26.55067253112793, + "grad_norm": 26.79175567626953, "learning_rate": 1.0648222206064132e-05, - "loss": 0.3144, + "loss": 0.3911, "step": 5520 }, { "epoch": 2.171181782489203, - "grad_norm": 41.20008087158203, + "grad_norm": 38.800514221191406, "learning_rate": 1.0637315494801135e-05, - "loss": 0.1857, + "loss": 0.1874, "step": 5530 }, { "epoch": 2.175107970160974, - "grad_norm": 1.8148813247680664, + "grad_norm": 1.2191575765609741, "learning_rate": 1.0626408783538137e-05, - "loss": 0.2164, + "loss": 0.2042, "step": 5540 }, { "epoch": 2.1790341578327443, - "grad_norm": 1.0039079189300537, + "grad_norm": 2.576686143875122, "learning_rate": 1.061550207227514e-05, - "loss": 0.2514, + "loss": 0.2109, "step": 5550 }, { "epoch": 2.1829603455045152, - "grad_norm": 2.235701560974121, + "grad_norm": 1.5496302843093872, "learning_rate": 1.0604595361012144e-05, - "loss": 0.2934, + "loss": 0.2716, "step": 5560 }, { "epoch": 2.1868865331762857, - "grad_norm": 61.45900344848633, + "grad_norm": 11.906233787536621, "learning_rate": 1.0593688649749147e-05, - "loss": 0.131, + "loss": 0.0606, "step": 5570 }, { "epoch": 2.1908127208480566, - "grad_norm": 49.589115142822266, + "grad_norm": 46.65951156616211, "learning_rate": 1.0582781938486149e-05, - "loss": 0.3368, + "loss": 0.2559, "step": 5580 }, { "epoch": 2.1947389085198274, - "grad_norm": 0.4478747248649597, + "grad_norm": 0.7134725451469421, "learning_rate": 1.057187522722315e-05, - "loss": 0.1052, + "loss": 0.0599, "step": 5590 }, { "epoch": 2.198665096191598, - "grad_norm": 1.428010106086731, + "grad_norm": 1.6857045888900757, "learning_rate": 1.0560968515960154e-05, - "loss": 0.1176, + "loss": 0.1693, "step": 5600 }, { "epoch": 2.2025912838633688, - "grad_norm": 12.0214262008667, + "grad_norm": 4.220065593719482, "learning_rate": 1.0550061804697157e-05, - "loss": 0.0927, + "loss": 0.0529, "step": 5610 }, { "epoch": 2.206517471535139, - "grad_norm": 0.9961761236190796, + "grad_norm": 0.08061785995960236, "learning_rate": 1.053915509343416e-05, - "loss": 0.3056, + "loss": 0.2428, "step": 5620 }, { "epoch": 2.21044365920691, - "grad_norm": 30.738323211669922, + "grad_norm": 30.183252334594727, "learning_rate": 1.0528248382171162e-05, - "loss": 0.3152, + "loss": 0.2599, "step": 5630 }, { "epoch": 2.214369846878681, - "grad_norm": 4.3228349685668945, + "grad_norm": 3.4865050315856934, "learning_rate": 1.0517341670908166e-05, - "loss": 0.2429, + "loss": 0.2122, "step": 5640 }, { "epoch": 2.2182960345504514, - "grad_norm": 2.5322418212890625, + "grad_norm": 0.2593090236186981, "learning_rate": 1.0506434959645169e-05, - "loss": 0.0922, + "loss": 0.1209, "step": 5650 }, { "epoch": 2.2222222222222223, - "grad_norm": 0.1956188976764679, + "grad_norm": 0.4189735949039459, "learning_rate": 1.0495528248382172e-05, - "loss": 0.0586, + "loss": 0.0925, "step": 5660 }, { "epoch": 2.2261484098939928, - "grad_norm": 0.16448786854743958, + "grad_norm": 0.04027146100997925, "learning_rate": 1.0484621537119174e-05, - "loss": 0.1751, + "loss": 0.2284, "step": 5670 }, { "epoch": 2.2300745975657636, - "grad_norm": 4.043306827545166, + "grad_norm": 0.05509791523218155, "learning_rate": 1.0473714825856176e-05, - "loss": 0.5134, + "loss": 0.362, "step": 5680 }, { "epoch": 2.2340007852375345, - "grad_norm": 4.274496555328369, + "grad_norm": 4.667411804199219, "learning_rate": 1.046280811459318e-05, - "loss": 0.0639, + "loss": 0.1076, "step": 5690 }, { "epoch": 2.237926972909305, - "grad_norm": 0.6351669430732727, + "grad_norm": 0.09081613272428513, "learning_rate": 1.0451901403330183e-05, - "loss": 0.0868, + "loss": 0.1723, "step": 5700 }, { "epoch": 2.241853160581076, - "grad_norm": 0.8180021643638611, + "grad_norm": 0.12606559693813324, "learning_rate": 1.0440994692067186e-05, - "loss": 0.0415, + "loss": 0.0358, "step": 5710 }, { "epoch": 2.2457793482528463, - "grad_norm": 1.776466727256775, + "grad_norm": 1.1984368562698364, "learning_rate": 1.0430087980804188e-05, - "loss": 0.1675, + "loss": 0.1789, "step": 5720 }, { "epoch": 2.249705535924617, - "grad_norm": 2.777200937271118, + "grad_norm": 2.3475677967071533, "learning_rate": 1.0419181269541191e-05, - "loss": 0.3172, + "loss": 0.3125, "step": 5730 }, { "epoch": 2.253631723596388, - "grad_norm": 22.90363121032715, + "grad_norm": 25.091045379638672, "learning_rate": 1.0408274558278195e-05, - "loss": 0.1138, + "loss": 0.0959, "step": 5740 }, { "epoch": 2.2575579112681585, - "grad_norm": 3.341553211212158, + "grad_norm": 6.505715370178223, "learning_rate": 1.0397367847015198e-05, - "loss": 0.2159, + "loss": 0.261, "step": 5750 }, { "epoch": 2.2614840989399294, - "grad_norm": 9.112372398376465, + "grad_norm": 4.189174175262451, "learning_rate": 1.03864611357522e-05, - "loss": 0.4136, + "loss": 0.3947, "step": 5760 }, { "epoch": 2.2654102866117, - "grad_norm": 0.50634765625, + "grad_norm": 0.14621849358081818, "learning_rate": 1.0375554424489201e-05, - "loss": 0.0461, + "loss": 0.0361, "step": 5770 }, { "epoch": 2.2693364742834707, - "grad_norm": 0.504768431186676, + "grad_norm": 0.25471073389053345, "learning_rate": 1.0364647713226205e-05, - "loss": 0.1365, + "loss": 0.1035, "step": 5780 }, { "epoch": 2.2732626619552416, - "grad_norm": 0.2869071364402771, + "grad_norm": 0.9693595170974731, "learning_rate": 1.0353741001963208e-05, - "loss": 0.0361, + "loss": 0.0287, "step": 5790 }, { "epoch": 2.277188849627012, - "grad_norm": 0.20939621329307556, + "grad_norm": 0.04725302383303642, "learning_rate": 1.0342834290700212e-05, - "loss": 0.0331, + "loss": 0.0156, "step": 5800 }, { "epoch": 2.281115037298783, - "grad_norm": 1.0972102880477905, + "grad_norm": 1.0130659341812134, "learning_rate": 1.0331927579437215e-05, - "loss": 0.4512, + "loss": 0.3945, "step": 5810 }, { "epoch": 2.285041224970554, - "grad_norm": 1.0246741771697998, + "grad_norm": 0.6472530364990234, "learning_rate": 1.0321020868174217e-05, - "loss": 0.1271, + "loss": 0.1146, "step": 5820 }, { "epoch": 2.2889674126423243, - "grad_norm": 1.0128393173217773, + "grad_norm": 0.35015296936035156, "learning_rate": 1.031011415691122e-05, - "loss": 0.2283, + "loss": 0.4945, "step": 5830 }, { "epoch": 2.292893600314095, - "grad_norm": 0.8622622489929199, + "grad_norm": 0.055610235780477524, "learning_rate": 1.0299207445648224e-05, - "loss": 0.4103, + "loss": 0.2469, "step": 5840 }, { "epoch": 2.2968197879858656, - "grad_norm": 38.8965950012207, + "grad_norm": 37.27722930908203, "learning_rate": 1.0288300734385225e-05, - "loss": 0.1741, + "loss": 0.156, "step": 5850 }, { "epoch": 2.3007459756576365, - "grad_norm": 6.6226067543029785, + "grad_norm": 10.902371406555176, "learning_rate": 1.0277394023122227e-05, - "loss": 0.2215, + "loss": 0.1503, "step": 5860 }, { "epoch": 2.304672163329407, - "grad_norm": 14.37143325805664, + "grad_norm": 10.099776268005371, "learning_rate": 1.026648731185923e-05, - "loss": 0.1994, + "loss": 0.1318, "step": 5870 }, { "epoch": 2.308598351001178, - "grad_norm": 1.2804064750671387, + "grad_norm": 0.5221976041793823, "learning_rate": 1.0255580600596234e-05, - "loss": 0.0738, + "loss": 0.0754, "step": 5880 }, { "epoch": 2.3125245386729487, - "grad_norm": 0.07044283300638199, + "grad_norm": 0.04902220889925957, "learning_rate": 1.0244673889333237e-05, - "loss": 0.0573, + "loss": 0.0725, "step": 5890 }, { "epoch": 2.316450726344719, - "grad_norm": 2.4032301902770996, + "grad_norm": 4.8667683601379395, "learning_rate": 1.023376717807024e-05, - "loss": 0.086, + "loss": 0.0616, "step": 5900 }, { "epoch": 2.32037691401649, - "grad_norm": 0.21904931962490082, + "grad_norm": 1.924928903579712, "learning_rate": 1.0222860466807242e-05, - "loss": 0.0883, + "loss": 0.3201, "step": 5910 }, { "epoch": 2.324303101688261, - "grad_norm": 0.22715237736701965, + "grad_norm": 0.07030254602432251, "learning_rate": 1.0211953755544246e-05, - "loss": 0.3552, + "loss": 0.3232, "step": 5920 }, { "epoch": 2.3282292893600314, - "grad_norm": 17.506547927856445, + "grad_norm": 0.7331899404525757, "learning_rate": 1.0201047044281247e-05, - "loss": 0.6008, + "loss": 0.377, "step": 5930 }, { "epoch": 2.3321554770318023, - "grad_norm": 0.23176495730876923, + "grad_norm": 0.5421873331069946, "learning_rate": 1.019014033301825e-05, - "loss": 0.2604, + "loss": 0.2775, "step": 5940 }, { "epoch": 2.3360816647035727, - "grad_norm": 1.0355746746063232, + "grad_norm": 0.08875811100006104, "learning_rate": 1.0179233621755252e-05, - "loss": 0.384, + "loss": 0.228, "step": 5950 }, { "epoch": 2.3400078523753436, - "grad_norm": 1.2482649087905884, + "grad_norm": 0.06304715573787689, "learning_rate": 1.0168326910492256e-05, - "loss": 0.2553, + "loss": 0.1826, "step": 5960 }, { "epoch": 2.343934040047114, - "grad_norm": 4.462619304656982, + "grad_norm": 3.389239549636841, "learning_rate": 1.015742019922926e-05, - "loss": 0.2477, + "loss": 0.279, "step": 5970 }, { "epoch": 2.347860227718885, - "grad_norm": 1.4509198665618896, + "grad_norm": 0.8488080501556396, "learning_rate": 1.0146513487966263e-05, - "loss": 0.1935, + "loss": 0.143, "step": 5980 }, { "epoch": 2.351786415390656, - "grad_norm": 4.005868911743164, + "grad_norm": 9.877676963806152, "learning_rate": 1.0135606776703266e-05, - "loss": 0.1906, + "loss": 0.2996, "step": 5990 }, { "epoch": 2.3557126030624262, - "grad_norm": 1.8947813510894775, + "grad_norm": 1.2593107223510742, "learning_rate": 1.0124700065440268e-05, - "loss": 0.7414, + "loss": 0.5849, "step": 6000 }, { "epoch": 2.359638790734197, - "grad_norm": 4.249821186065674, + "grad_norm": 3.665756940841675, "learning_rate": 1.0113793354177271e-05, - "loss": 0.2717, + "loss": 0.2642, "step": 6010 }, { "epoch": 2.363564978405968, - "grad_norm": 12.452858924865723, + "grad_norm": 4.754635334014893, "learning_rate": 1.0102886642914273e-05, - "loss": 0.1839, + "loss": 0.1281, "step": 6020 }, { "epoch": 2.3674911660777385, - "grad_norm": 32.34346389770508, + "grad_norm": 53.92820739746094, "learning_rate": 1.0091979931651276e-05, - "loss": 0.148, + "loss": 0.0986, "step": 6030 }, { "epoch": 2.3714173537495093, - "grad_norm": 0.9306933283805847, + "grad_norm": 2.3993618488311768, "learning_rate": 1.008107322038828e-05, - "loss": 0.3147, + "loss": 0.41, "step": 6040 }, { "epoch": 2.37534354142128, - "grad_norm": 5.001786708831787, + "grad_norm": 2.18753981590271, "learning_rate": 1.0070166509125281e-05, - "loss": 0.1905, + "loss": 0.204, "step": 6050 }, { "epoch": 2.3792697290930507, - "grad_norm": 1.3732270002365112, + "grad_norm": 0.08793739974498749, "learning_rate": 1.0059259797862285e-05, - "loss": 0.203, + "loss": 0.1858, "step": 6060 }, { "epoch": 2.383195916764821, - "grad_norm": 3.645620822906494, + "grad_norm": 4.743882179260254, "learning_rate": 1.0048353086599288e-05, - "loss": 0.3835, + "loss": 0.1761, "step": 6070 }, { "epoch": 2.387122104436592, - "grad_norm": 1.74021315574646, + "grad_norm": 2.0335168838500977, "learning_rate": 1.0037446375336292e-05, - "loss": 0.3149, + "loss": 0.187, "step": 6080 }, { "epoch": 2.391048292108363, - "grad_norm": 48.43793487548828, + "grad_norm": 59.3013801574707, "learning_rate": 1.0026539664073293e-05, - "loss": 0.1736, + "loss": 0.1815, "step": 6090 }, { "epoch": 2.3949744797801333, - "grad_norm": 0.8240251541137695, + "grad_norm": 0.6419391632080078, "learning_rate": 1.0015632952810295e-05, - "loss": 0.1713, + "loss": 0.1378, "step": 6100 }, { "epoch": 2.3989006674519042, - "grad_norm": 1.2377926111221313, + "grad_norm": 0.4858306348323822, "learning_rate": 1.0004726241547298e-05, - "loss": 0.0916, + "loss": 0.1293, "step": 6110 }, { "epoch": 2.402826855123675, - "grad_norm": 7.041884899139404, + "grad_norm": 0.3772069811820984, "learning_rate": 9.993819530284302e-06, - "loss": 0.7885, + "loss": 0.7755, "step": 6120 }, { "epoch": 2.4067530427954456, - "grad_norm": 12.270745277404785, + "grad_norm": 14.143595695495605, "learning_rate": 9.982912819021305e-06, - "loss": 0.0351, + "loss": 0.0596, "step": 6130 }, { "epoch": 2.4106792304672164, - "grad_norm": 0.023543832823634148, + "grad_norm": 0.010892852209508419, "learning_rate": 9.972006107758307e-06, - "loss": 0.5872, + "loss": 0.3678, "step": 6140 }, { "epoch": 2.414605418138987, - "grad_norm": 46.477821350097656, + "grad_norm": 9.54448127746582, "learning_rate": 9.96109939649531e-06, - "loss": 0.3621, + "loss": 0.314, "step": 6150 }, { "epoch": 2.4185316058107578, - "grad_norm": 14.68813705444336, + "grad_norm": 24.08507537841797, "learning_rate": 9.950192685232314e-06, - "loss": 0.1268, + "loss": 0.1195, "step": 6160 }, { "epoch": 2.4224577934825287, - "grad_norm": 1.923462152481079, + "grad_norm": 1.3726921081542969, "learning_rate": 9.939285973969317e-06, - "loss": 0.4947, + "loss": 0.551, "step": 6170 }, { "epoch": 2.426383981154299, - "grad_norm": 0.11958777904510498, + "grad_norm": 0.05125840753316879, "learning_rate": 9.92837926270632e-06, - "loss": 0.2668, + "loss": 0.2258, "step": 6180 }, { "epoch": 2.43031016882607, - "grad_norm": 2.3362338542938232, + "grad_norm": 3.7574574947357178, "learning_rate": 9.91747255144332e-06, - "loss": 0.1264, + "loss": 0.1338, "step": 6190 }, { "epoch": 2.4342363564978404, - "grad_norm": 47.766571044921875, + "grad_norm": 0.7719956636428833, "learning_rate": 9.906565840180324e-06, - "loss": 0.0966, + "loss": 0.0907, "step": 6200 }, { "epoch": 2.4381625441696113, - "grad_norm": 31.81584930419922, + "grad_norm": 70.89276123046875, "learning_rate": 9.895659128917327e-06, - "loss": 0.4733, + "loss": 0.7711, "step": 6210 }, { "epoch": 2.442088731841382, - "grad_norm": 1.4350743293762207, + "grad_norm": 2.345156669616699, "learning_rate": 9.88475241765433e-06, - "loss": 0.1829, + "loss": 0.141, "step": 6220 }, { "epoch": 2.4460149195131526, - "grad_norm": 1.9439697265625, + "grad_norm": 1.57748281955719, "learning_rate": 9.873845706391332e-06, - "loss": 0.0671, + "loss": 0.0542, "step": 6230 }, { "epoch": 2.4499411071849235, - "grad_norm": 18.077722549438477, + "grad_norm": 35.03361892700195, "learning_rate": 9.862938995128336e-06, - "loss": 0.166, + "loss": 0.1458, "step": 6240 }, { "epoch": 2.453867294856694, - "grad_norm": 0.06180582195520401, + "grad_norm": 0.022890105843544006, "learning_rate": 9.85203228386534e-06, - "loss": 0.1776, + "loss": 0.1489, "step": 6250 }, { "epoch": 2.457793482528465, - "grad_norm": 45.23520278930664, + "grad_norm": 21.752466201782227, "learning_rate": 9.841125572602343e-06, - "loss": 0.2891, + "loss": 0.2103, "step": 6260 }, { "epoch": 2.4617196702002357, - "grad_norm": 36.45094299316406, + "grad_norm": 22.51886558532715, "learning_rate": 9.830218861339344e-06, - "loss": 0.2247, + "loss": 0.3046, "step": 6270 }, { "epoch": 2.465645857872006, - "grad_norm": 33.61916732788086, + "grad_norm": 50.965476989746094, "learning_rate": 9.819312150076346e-06, - "loss": 0.2423, + "loss": 0.2663, "step": 6280 }, { "epoch": 2.469572045543777, - "grad_norm": 0.5178603529930115, + "grad_norm": 13.096270561218262, "learning_rate": 9.80840543881335e-06, - "loss": 0.033, + "loss": 0.0314, "step": 6290 }, { "epoch": 2.4734982332155475, - "grad_norm": 64.42828369140625, + "grad_norm": 17.061594009399414, "learning_rate": 9.797498727550353e-06, - "loss": 0.2502, + "loss": 0.3736, "step": 6300 }, { "epoch": 2.4774244208873184, - "grad_norm": 2.209799289703369, + "grad_norm": 2.3116650581359863, "learning_rate": 9.786592016287356e-06, - "loss": 0.0336, + "loss": 0.0711, "step": 6310 }, { "epoch": 2.4813506085590893, - "grad_norm": 3.8669638633728027, + "grad_norm": 2.641169786453247, "learning_rate": 9.775685305024358e-06, - "loss": 0.1431, + "loss": 0.1374, "step": 6320 }, { "epoch": 2.4852767962308597, - "grad_norm": 4.620296478271484, + "grad_norm": 19.763687133789062, "learning_rate": 9.764778593761361e-06, - "loss": 0.1392, + "loss": 0.1357, "step": 6330 }, { "epoch": 2.4892029839026306, - "grad_norm": 0.1899837851524353, + "grad_norm": 0.4087248146533966, "learning_rate": 9.753871882498365e-06, - "loss": 0.2397, + "loss": 0.2838, "step": 6340 }, { "epoch": 2.4931291715744015, - "grad_norm": 2.969348430633545, + "grad_norm": 1.8627806901931763, "learning_rate": 9.742965171235368e-06, - "loss": 0.2477, + "loss": 0.2778, "step": 6350 }, { "epoch": 2.497055359246172, - "grad_norm": 0.382293164730072, + "grad_norm": 0.4517025053501129, "learning_rate": 9.73205845997237e-06, - "loss": 0.0624, + "loss": 0.0168, "step": 6360 }, { "epoch": 2.500981546917943, - "grad_norm": 8.5194091796875, + "grad_norm": 0.15071170032024384, "learning_rate": 9.721151748709372e-06, - "loss": 0.3377, + "loss": 0.131, "step": 6370 }, { "epoch": 2.5049077345897133, - "grad_norm": 0.1314937174320221, + "grad_norm": 0.7975783348083496, "learning_rate": 9.710245037446375e-06, - "loss": 0.0207, + "loss": 0.0407, "step": 6380 }, { "epoch": 2.508833922261484, - "grad_norm": 1.6568098068237305, + "grad_norm": 2.8868019580841064, "learning_rate": 9.699338326183378e-06, - "loss": 0.2955, + "loss": 0.2311, "step": 6390 }, { "epoch": 2.5127601099332546, - "grad_norm": 0.19203191995620728, + "grad_norm": 1.2170113325119019, "learning_rate": 9.688431614920382e-06, - "loss": 0.3306, + "loss": 0.2593, "step": 6400 }, { "epoch": 2.5166862976050255, - "grad_norm": 2.4867968559265137, + "grad_norm": 0.2477523684501648, "learning_rate": 9.677524903657385e-06, - "loss": 0.0742, + "loss": 0.0891, "step": 6410 }, { "epoch": 2.5206124852767964, - "grad_norm": 0.002447237726300955, + "grad_norm": 0.003454150166362524, "learning_rate": 9.666618192394387e-06, - "loss": 0.215, + "loss": 0.1108, "step": 6420 }, { "epoch": 2.524538672948567, - "grad_norm": 0.6441366076469421, + "grad_norm": 0.7503302693367004, "learning_rate": 9.65571148113139e-06, - "loss": 0.0727, + "loss": 0.0951, "step": 6430 }, { "epoch": 2.5284648606203377, - "grad_norm": 2.135335922241211, + "grad_norm": 0.9688203930854797, "learning_rate": 9.644804769868392e-06, - "loss": 0.2282, + "loss": 0.3109, "step": 6440 }, { "epoch": 2.5323910482921086, - "grad_norm": 0.40267398953437805, + "grad_norm": 1.489219069480896, "learning_rate": 9.633898058605395e-06, - "loss": 0.0622, + "loss": 0.1244, "step": 6450 }, { "epoch": 2.536317235963879, - "grad_norm": 1.9927395582199097, + "grad_norm": 1.4182342290878296, "learning_rate": 9.622991347342397e-06, - "loss": 0.1404, + "loss": 0.0937, "step": 6460 }, { "epoch": 2.54024342363565, - "grad_norm": 19.26555633544922, + "grad_norm": 7.870201110839844, "learning_rate": 9.6120846360794e-06, - "loss": 0.4519, + "loss": 0.4896, "step": 6470 }, { "epoch": 2.5441696113074204, - "grad_norm": 36.916839599609375, + "grad_norm": 34.24169921875, "learning_rate": 9.601177924816404e-06, - "loss": 0.471, + "loss": 0.4117, "step": 6480 }, { "epoch": 2.5480957989791913, - "grad_norm": 2.4214043617248535, + "grad_norm": 1.055472731590271, "learning_rate": 9.590271213553407e-06, - "loss": 0.0723, + "loss": 0.0936, "step": 6490 }, { "epoch": 2.5520219866509617, - "grad_norm": 1.9171044826507568, + "grad_norm": 2.697065591812134, "learning_rate": 9.57936450229041e-06, - "loss": 0.1246, + "loss": 0.0895, "step": 6500 }, { "epoch": 2.5559481743227326, - "grad_norm": 1.9892618656158447, + "grad_norm": 3.796678066253662, "learning_rate": 9.568457791027412e-06, - "loss": 0.2743, + "loss": 0.2644, "step": 6510 }, { "epoch": 2.5598743619945035, - "grad_norm": 0.6479685306549072, + "grad_norm": 0.3166428208351135, "learning_rate": 9.557551079764416e-06, - "loss": 0.1877, + "loss": 0.1672, "step": 6520 }, { "epoch": 2.563800549666274, - "grad_norm": 0.07352516055107117, + "grad_norm": 1.1901061534881592, "learning_rate": 9.546644368501417e-06, - "loss": 0.3366, + "loss": 0.2412, "step": 6530 }, { "epoch": 2.567726737338045, - "grad_norm": 0.0705084353685379, + "grad_norm": 0.2481738030910492, "learning_rate": 9.535737657238421e-06, - "loss": 0.1234, + "loss": 0.0997, "step": 6540 }, { "epoch": 2.5716529250098157, - "grad_norm": 53.518978118896484, + "grad_norm": 15.139571189880371, "learning_rate": 9.524830945975424e-06, - "loss": 0.0777, + "loss": 0.0788, "step": 6550 }, { "epoch": 2.575579112681586, - "grad_norm": 89.0527572631836, + "grad_norm": 82.8722915649414, "learning_rate": 9.513924234712426e-06, - "loss": 0.2485, + "loss": 0.2965, "step": 6560 }, { "epoch": 2.579505300353357, - "grad_norm": 13.943245887756348, + "grad_norm": 11.235033988952637, "learning_rate": 9.50301752344943e-06, - "loss": 0.0835, + "loss": 0.0448, "step": 6570 }, { "epoch": 2.5834314880251275, - "grad_norm": 1.6598663330078125, + "grad_norm": 5.954591751098633, "learning_rate": 9.492110812186433e-06, - "loss": 0.0357, + "loss": 0.0422, "step": 6580 }, { "epoch": 2.5873576756968983, - "grad_norm": 0.2817022204399109, + "grad_norm": 0.1243339478969574, "learning_rate": 9.481204100923436e-06, - "loss": 0.3677, + "loss": 0.4456, "step": 6590 }, { "epoch": 2.591283863368669, - "grad_norm": 6.064679145812988, + "grad_norm": 6.766524314880371, "learning_rate": 9.470297389660438e-06, - "loss": 0.1253, + "loss": 0.0706, "step": 6600 }, { "epoch": 2.5952100510404397, - "grad_norm": 0.19657225906848907, + "grad_norm": 0.05468232184648514, "learning_rate": 9.45939067839744e-06, - "loss": 0.195, + "loss": 0.2343, "step": 6610 }, { "epoch": 2.5991362387122106, - "grad_norm": 4.953568458557129, + "grad_norm": 2.8523168563842773, "learning_rate": 9.448483967134443e-06, - "loss": 0.2652, + "loss": 0.3042, "step": 6620 }, { "epoch": 2.603062426383981, - "grad_norm": 0.1797858625650406, + "grad_norm": 0.21883773803710938, "learning_rate": 9.437577255871446e-06, - "loss": 0.4599, + "loss": 0.4721, "step": 6630 }, { "epoch": 2.606988614055752, - "grad_norm": 0.04305833950638771, + "grad_norm": 0.006246656179428101, "learning_rate": 9.42667054460845e-06, - "loss": 0.0214, + "loss": 0.0277, "step": 6640 }, { "epoch": 2.6109148017275228, - "grad_norm": 0.18950684368610382, + "grad_norm": 0.23834118247032166, "learning_rate": 9.415763833345451e-06, - "loss": 0.2616, + "loss": 0.1367, "step": 6650 }, { "epoch": 2.614840989399293, - "grad_norm": 48.84832763671875, + "grad_norm": 11.69246768951416, "learning_rate": 9.404857122082455e-06, - "loss": 0.3987, + "loss": 0.2859, "step": 6660 }, { "epoch": 2.618767177071064, - "grad_norm": 4.163015365600586, + "grad_norm": 3.6627304553985596, "learning_rate": 9.393950410819458e-06, - "loss": 0.2297, + "loss": 0.2954, "step": 6670 }, { "epoch": 2.6226933647428345, - "grad_norm": 0.011907198466360569, + "grad_norm": 0.0038379228208214045, "learning_rate": 9.383043699556462e-06, - "loss": 0.2658, + "loss": 0.2394, "step": 6680 }, { "epoch": 2.6266195524146054, - "grad_norm": 3.6187052726745605, + "grad_norm": 4.187229156494141, "learning_rate": 9.372136988293463e-06, - "loss": 0.2698, + "loss": 0.2207, "step": 6690 }, { "epoch": 2.630545740086376, - "grad_norm": 36.80274963378906, + "grad_norm": 12.471536636352539, "learning_rate": 9.361230277030465e-06, - "loss": 0.418, + "loss": 0.2345, "step": 6700 }, { "epoch": 2.6344719277581468, - "grad_norm": 0.4680624008178711, + "grad_norm": 0.020097658038139343, "learning_rate": 9.350323565767469e-06, - "loss": 0.0367, + "loss": 0.0173, "step": 6710 }, { "epoch": 2.6383981154299176, - "grad_norm": 2.078639268875122, + "grad_norm": 2.3731331825256348, "learning_rate": 9.339416854504472e-06, - "loss": 0.4063, + "loss": 0.3146, "step": 6720 }, { "epoch": 2.642324303101688, - "grad_norm": 4.436471939086914, + "grad_norm": 3.2486610412597656, "learning_rate": 9.328510143241475e-06, - "loss": 0.4433, + "loss": 0.2428, "step": 6730 }, { "epoch": 2.646250490773459, - "grad_norm": 0.4126029312610626, + "grad_norm": 0.36934366822242737, "learning_rate": 9.317603431978477e-06, - "loss": 0.3173, + "loss": 0.3588, "step": 6740 }, { "epoch": 2.65017667844523, - "grad_norm": 27.0905704498291, + "grad_norm": 9.044105529785156, "learning_rate": 9.30669672071548e-06, - "loss": 0.3562, + "loss": 0.3364, "step": 6750 }, { "epoch": 2.6541028661170003, - "grad_norm": 61.60869598388672, + "grad_norm": 50.49497985839844, "learning_rate": 9.295790009452484e-06, - "loss": 0.0641, + "loss": 0.0961, "step": 6760 }, { "epoch": 2.658029053788771, - "grad_norm": 3.42254638671875, + "grad_norm": 2.147735357284546, "learning_rate": 9.284883298189487e-06, - "loss": 0.1292, + "loss": 0.0842, "step": 6770 }, { "epoch": 2.661955241460542, - "grad_norm": 2.070840358734131, + "grad_norm": 25.78204345703125, "learning_rate": 9.273976586926489e-06, - "loss": 0.0369, + "loss": 0.0455, "step": 6780 }, { "epoch": 2.6658814291323125, - "grad_norm": 5.448889255523682, + "grad_norm": 5.900356292724609, "learning_rate": 9.26306987566349e-06, - "loss": 0.0922, + "loss": 0.1106, "step": 6790 }, { "epoch": 2.669807616804083, - "grad_norm": 3.0283076763153076, + "grad_norm": 0.19095315039157867, "learning_rate": 9.252163164400494e-06, - "loss": 0.5027, + "loss": 0.4659, "step": 6800 }, { "epoch": 2.673733804475854, - "grad_norm": 3.915083408355713, + "grad_norm": 1.449488878250122, "learning_rate": 9.241256453137497e-06, - "loss": 0.1452, + "loss": 0.111, "step": 6810 }, { "epoch": 2.6776599921476247, - "grad_norm": 0.46686550974845886, + "grad_norm": 2.1792714595794678, "learning_rate": 9.2303497418745e-06, - "loss": 0.0898, + "loss": 0.0665, "step": 6820 }, { "epoch": 2.681586179819395, - "grad_norm": 0.6087507605552673, + "grad_norm": 0.26577094197273254, "learning_rate": 9.219443030611503e-06, - "loss": 0.0717, + "loss": 0.014, "step": 6830 }, { "epoch": 2.685512367491166, - "grad_norm": 5.16321325302124, + "grad_norm": 12.622210502624512, "learning_rate": 9.208536319348506e-06, - "loss": 0.2094, + "loss": 0.3798, "step": 6840 }, { "epoch": 2.689438555162937, - "grad_norm": 3.0265049934387207, + "grad_norm": 1.8348852396011353, "learning_rate": 9.19762960808551e-06, - "loss": 0.017, + "loss": 0.0185, "step": 6850 }, { "epoch": 2.6933647428347074, - "grad_norm": 0.040980808436870575, + "grad_norm": 0.09319444745779037, "learning_rate": 9.186722896822513e-06, - "loss": 0.1454, + "loss": 0.1319, "step": 6860 }, { "epoch": 2.6972909305064783, - "grad_norm": 2.1761698722839355, + "grad_norm": 2.9519989490509033, "learning_rate": 9.175816185559514e-06, - "loss": 0.3257, + "loss": 0.2812, "step": 6870 }, { "epoch": 2.701217118178249, - "grad_norm": 0.4063561260700226, + "grad_norm": 2.134477376937866, "learning_rate": 9.164909474296516e-06, - "loss": 0.2656, + "loss": 0.1922, "step": 6880 }, { "epoch": 2.7051433058500196, - "grad_norm": 0.04433223232626915, + "grad_norm": 0.23005551099777222, "learning_rate": 9.15400276303352e-06, - "loss": 0.1275, + "loss": 0.1887, "step": 6890 }, { "epoch": 2.7090694935217905, - "grad_norm": 59.92265701293945, + "grad_norm": 56.31394958496094, "learning_rate": 9.143096051770523e-06, - "loss": 0.4339, + "loss": 0.6077, "step": 6900 }, { "epoch": 2.712995681193561, - "grad_norm": 114.50326538085938, + "grad_norm": 12.379575729370117, "learning_rate": 9.132189340507526e-06, - "loss": 0.1435, + "loss": 0.0845, "step": 6910 }, { "epoch": 2.716921868865332, - "grad_norm": 16.42485237121582, + "grad_norm": 23.357824325561523, "learning_rate": 9.12128262924453e-06, - "loss": 0.4926, + "loss": 0.4226, "step": 6920 }, { "epoch": 2.7208480565371023, - "grad_norm": 0.2809946835041046, + "grad_norm": 1.7891526222229004, "learning_rate": 9.110375917981531e-06, - "loss": 0.1381, + "loss": 0.1366, "step": 6930 }, { "epoch": 2.724774244208873, - "grad_norm": 0.7081897258758545, + "grad_norm": 0.6277144551277161, "learning_rate": 9.099469206718535e-06, - "loss": 0.206, + "loss": 0.2135, "step": 6940 }, { "epoch": 2.728700431880644, - "grad_norm": 3.5805580615997314, + "grad_norm": 1.3398008346557617, "learning_rate": 9.088562495455538e-06, - "loss": 0.4331, + "loss": 0.1367, "step": 6950 }, { "epoch": 2.7326266195524145, - "grad_norm": 2.8939030170440674, + "grad_norm": 2.7261030673980713, "learning_rate": 9.07765578419254e-06, - "loss": 0.0773, + "loss": 0.0352, "step": 6960 }, { "epoch": 2.7365528072241854, - "grad_norm": 2.2288002967834473, + "grad_norm": 0.1663471758365631, "learning_rate": 9.066749072929542e-06, - "loss": 0.4044, + "loss": 0.2284, "step": 6970 }, { "epoch": 2.7404789948959563, - "grad_norm": 2.2127175331115723, + "grad_norm": 0.6146605014801025, "learning_rate": 9.055842361666545e-06, - "loss": 0.2409, + "loss": 0.1686, "step": 6980 }, { "epoch": 2.7444051825677267, - "grad_norm": 1.2632640600204468, + "grad_norm": 2.2091305255889893, "learning_rate": 9.044935650403548e-06, - "loss": 0.0593, + "loss": 0.0772, "step": 6990 }, { "epoch": 2.7483313702394976, - "grad_norm": 0.7995769381523132, + "grad_norm": 39.840965270996094, "learning_rate": 9.034028939140552e-06, - "loss": 0.0973, + "loss": 0.187, "step": 7000 }, { "epoch": 2.752257557911268, - "grad_norm": 0.10134366154670715, + "grad_norm": 0.19906871020793915, "learning_rate": 9.023122227877555e-06, - "loss": 0.251, + "loss": 0.1496, "step": 7010 }, { "epoch": 2.756183745583039, - "grad_norm": 0.43296894431114197, + "grad_norm": 1.3489853143692017, "learning_rate": 9.012215516614557e-06, - "loss": 0.0387, + "loss": 0.0375, "step": 7020 }, { "epoch": 2.7601099332548094, - "grad_norm": 0.6308151483535767, + "grad_norm": 0.5379261374473572, "learning_rate": 9.00130880535156e-06, - "loss": 0.2209, + "loss": 0.2555, "step": 7030 }, { "epoch": 2.7640361209265802, - "grad_norm": 0.49843630194664, + "grad_norm": 0.07734381407499313, "learning_rate": 8.990402094088562e-06, - "loss": 0.4072, + "loss": 0.2476, "step": 7040 }, { "epoch": 2.767962308598351, - "grad_norm": 0.032789766788482666, + "grad_norm": 0.0302397720515728, "learning_rate": 8.979495382825565e-06, - "loss": 0.0973, + "loss": 0.1488, "step": 7050 }, { "epoch": 2.7718884962701216, - "grad_norm": 0.8880012631416321, + "grad_norm": 0.9817191958427429, "learning_rate": 8.968588671562567e-06, - "loss": 0.1363, + "loss": 0.1296, "step": 7060 }, { "epoch": 2.7758146839418925, - "grad_norm": 0.08528878539800644, + "grad_norm": 0.12161799520254135, "learning_rate": 8.95768196029957e-06, - "loss": 0.2197, + "loss": 0.3024, "step": 7070 }, { "epoch": 2.7797408716136633, - "grad_norm": 11.822524070739746, + "grad_norm": 17.695600509643555, "learning_rate": 8.946775249036574e-06, - "loss": 0.1505, + "loss": 0.1548, "step": 7080 }, { "epoch": 2.783667059285434, - "grad_norm": 26.118513107299805, + "grad_norm": 16.336050033569336, "learning_rate": 8.935868537773577e-06, - "loss": 0.1566, + "loss": 0.1997, "step": 7090 }, { "epoch": 2.7875932469572047, - "grad_norm": 0.7850349545478821, + "grad_norm": 0.7082786560058594, "learning_rate": 8.92496182651058e-06, - "loss": 0.1298, + "loss": 0.0398, "step": 7100 }, { "epoch": 2.791519434628975, - "grad_norm": 2.4232444763183594, + "grad_norm": 1.084769368171692, "learning_rate": 8.914055115247582e-06, - "loss": 0.5379, + "loss": 0.5019, "step": 7110 }, { "epoch": 2.795445622300746, - "grad_norm": 0.10379453003406525, + "grad_norm": 0.035373296588659286, "learning_rate": 8.903148403984586e-06, - "loss": 0.1128, + "loss": 0.1823, "step": 7120 }, { "epoch": 2.7993718099725164, - "grad_norm": 26.30695343017578, + "grad_norm": 16.239042282104492, "learning_rate": 8.892241692721588e-06, - "loss": 0.1491, + "loss": 0.1251, "step": 7130 }, { "epoch": 2.8032979976442873, - "grad_norm": 0.6674609780311584, + "grad_norm": 2.1121015548706055, "learning_rate": 8.881334981458591e-06, - "loss": 0.1727, + "loss": 0.2137, "step": 7140 }, { "epoch": 2.807224185316058, - "grad_norm": 0.264403760433197, + "grad_norm": 0.9777934551239014, "learning_rate": 8.870428270195594e-06, - "loss": 0.1027, + "loss": 0.0636, "step": 7150 }, { "epoch": 2.8111503729878287, - "grad_norm": 0.8759916424751282, + "grad_norm": 1.0806931257247925, "learning_rate": 8.859521558932596e-06, - "loss": 0.1808, + "loss": 0.2271, "step": 7160 }, { "epoch": 2.8150765606595995, - "grad_norm": 0.4683006703853607, + "grad_norm": 0.13286283612251282, "learning_rate": 8.8486148476696e-06, - "loss": 0.0775, + "loss": 0.078, "step": 7170 }, { "epoch": 2.8190027483313704, - "grad_norm": 0.07266096025705338, + "grad_norm": 0.09164169430732727, "learning_rate": 8.837708136406603e-06, - "loss": 0.3055, + "loss": 0.4122, "step": 7180 }, { "epoch": 2.822928936003141, - "grad_norm": 2.1959285736083984, + "grad_norm": 2.906200885772705, "learning_rate": 8.826801425143606e-06, - "loss": 0.0455, + "loss": 0.0415, "step": 7190 }, { "epoch": 2.8268551236749118, - "grad_norm": 4.996140956878662, + "grad_norm": 9.495919227600098, "learning_rate": 8.815894713880608e-06, - "loss": 0.1018, + "loss": 0.207, "step": 7200 }, { "epoch": 2.830781311346682, - "grad_norm": 0.2156902700662613, + "grad_norm": 0.10078518092632294, "learning_rate": 8.80498800261761e-06, - "loss": 0.241, + "loss": 0.4851, "step": 7210 }, { "epoch": 2.834707499018453, - "grad_norm": 19.904874801635742, + "grad_norm": 12.097614288330078, "learning_rate": 8.794081291354613e-06, - "loss": 0.3019, + "loss": 0.2573, "step": 7220 }, { "epoch": 2.8386336866902235, - "grad_norm": 15.15645980834961, + "grad_norm": 46.391700744628906, "learning_rate": 8.783174580091617e-06, - "loss": 0.0795, + "loss": 0.0987, "step": 7230 }, { "epoch": 2.8425598743619944, - "grad_norm": 0.3111743628978729, + "grad_norm": 0.01854799874126911, "learning_rate": 8.77226786882862e-06, - "loss": 0.275, + "loss": 0.2614, "step": 7240 }, { "epoch": 2.8464860620337653, - "grad_norm": 0.0960872620344162, + "grad_norm": 0.14692328870296478, "learning_rate": 8.761361157565622e-06, - "loss": 0.1438, + "loss": 0.1644, "step": 7250 }, { "epoch": 2.8504122497055357, - "grad_norm": 3.237501382827759, + "grad_norm": 15.00971508026123, "learning_rate": 8.750454446302625e-06, - "loss": 0.3122, + "loss": 0.3345, "step": 7260 }, { "epoch": 2.8543384373773066, - "grad_norm": 34.941776275634766, + "grad_norm": 29.544809341430664, "learning_rate": 8.739547735039628e-06, - "loss": 0.1453, + "loss": 0.1797, "step": 7270 }, { "epoch": 2.8582646250490775, - "grad_norm": 0.013809976167976856, + "grad_norm": 0.05832759663462639, "learning_rate": 8.728641023776632e-06, - "loss": 0.2703, + "loss": 0.1006, "step": 7280 }, { "epoch": 2.862190812720848, - "grad_norm": 4.848148345947266, + "grad_norm": 3.7172763347625732, "learning_rate": 8.717734312513635e-06, - "loss": 0.0874, + "loss": 0.0644, "step": 7290 }, { "epoch": 2.866117000392619, - "grad_norm": 6.772768020629883, + "grad_norm": 37.88047409057617, "learning_rate": 8.706827601250635e-06, - "loss": 0.0549, + "loss": 0.1037, "step": 7300 }, { "epoch": 2.8700431880643897, - "grad_norm": 4.1676483154296875, + "grad_norm": 3.8841378688812256, "learning_rate": 8.695920889987639e-06, - "loss": 0.4636, + "loss": 0.3338, "step": 7310 }, { "epoch": 2.87396937573616, - "grad_norm": 0.10867290198802948, + "grad_norm": 0.018319543451070786, "learning_rate": 8.685014178724642e-06, - "loss": 0.1106, + "loss": 0.0737, "step": 7320 }, { "epoch": 2.8778955634079306, - "grad_norm": 0.1281859576702118, + "grad_norm": 0.10322518646717072, "learning_rate": 8.674107467461645e-06, - "loss": 0.2999, + "loss": 0.334, "step": 7330 }, { "epoch": 2.8818217510797015, - "grad_norm": 7.64890193939209, + "grad_norm": 12.942615509033203, "learning_rate": 8.663200756198647e-06, - "loss": 0.4394, + "loss": 0.327, "step": 7340 }, { "epoch": 2.8857479387514724, - "grad_norm": 3.995598554611206, + "grad_norm": 3.6553971767425537, "learning_rate": 8.65229404493565e-06, - "loss": 0.0527, + "loss": 0.0907, "step": 7350 }, { "epoch": 2.889674126423243, - "grad_norm": 0.7157646417617798, + "grad_norm": 0.36398857831954956, "learning_rate": 8.641387333672654e-06, - "loss": 0.051, + "loss": 0.0803, "step": 7360 }, { "epoch": 2.8936003140950137, - "grad_norm": 0.4813132882118225, + "grad_norm": 1.0698785781860352, "learning_rate": 8.630480622409657e-06, - "loss": 0.2694, + "loss": 0.1574, "step": 7370 }, { "epoch": 2.8975265017667846, - "grad_norm": 0.8804084062576294, + "grad_norm": 0.04046402871608734, "learning_rate": 8.619573911146659e-06, - "loss": 0.3021, + "loss": 0.437, "step": 7380 }, { "epoch": 2.901452689438555, - "grad_norm": 13.784459114074707, + "grad_norm": 6.628992080688477, "learning_rate": 8.60866719988366e-06, - "loss": 0.2616, + "loss": 0.2852, "step": 7390 }, { "epoch": 2.905378877110326, - "grad_norm": 0.34155967831611633, + "grad_norm": 0.07181015610694885, "learning_rate": 8.597760488620664e-06, - "loss": 0.1507, + "loss": 0.1233, "step": 7400 }, { "epoch": 2.909305064782097, - "grad_norm": 30.922632217407227, + "grad_norm": 33.79270935058594, "learning_rate": 8.586853777357668e-06, - "loss": 0.073, + "loss": 0.1382, "step": 7410 }, { "epoch": 2.9132312524538673, - "grad_norm": 0.09361296892166138, + "grad_norm": 0.012769495137035847, "learning_rate": 8.575947066094671e-06, - "loss": 0.2751, + "loss": 0.1818, "step": 7420 }, { "epoch": 2.917157440125638, - "grad_norm": 20.691898345947266, + "grad_norm": 0.9529443383216858, "learning_rate": 8.565040354831674e-06, - "loss": 0.3467, + "loss": 0.3313, "step": 7430 }, { "epoch": 2.9210836277974086, - "grad_norm": 44.624263763427734, + "grad_norm": 43.84505844116211, "learning_rate": 8.554133643568676e-06, - "loss": 0.2427, + "loss": 0.1978, "step": 7440 }, { "epoch": 2.9250098154691795, - "grad_norm": 0.14301219582557678, + "grad_norm": 0.13246089220046997, "learning_rate": 8.54322693230568e-06, - "loss": 0.0627, + "loss": 0.0481, "step": 7450 }, { "epoch": 2.92893600314095, - "grad_norm": 7.879396915435791, + "grad_norm": 6.40209436416626, "learning_rate": 8.532320221042683e-06, - "loss": 0.4229, + "loss": 0.3866, "step": 7460 }, { "epoch": 2.932862190812721, - "grad_norm": 1.421470046043396, + "grad_norm": 0.985817551612854, "learning_rate": 8.521413509779685e-06, - "loss": 0.0576, + "loss": 0.1343, "step": 7470 }, { "epoch": 2.9367883784844917, - "grad_norm": 0.04031867906451225, + "grad_norm": 0.052463430911302567, "learning_rate": 8.510506798516686e-06, - "loss": 0.2258, + "loss": 0.2171, "step": 7480 }, { "epoch": 2.940714566156262, - "grad_norm": 1.08770751953125, + "grad_norm": 0.7640523314476013, "learning_rate": 8.49960008725369e-06, - "loss": 0.1644, + "loss": 0.1308, "step": 7490 }, { "epoch": 2.944640753828033, - "grad_norm": 27.244140625, + "grad_norm": 17.417028427124023, "learning_rate": 8.488693375990693e-06, - "loss": 0.2951, + "loss": 0.2983, "step": 7500 }, { "epoch": 2.948566941499804, - "grad_norm": 2.9796953201293945, + "grad_norm": 2.886998414993286, "learning_rate": 8.477786664727696e-06, - "loss": 0.1556, + "loss": 0.0883, "step": 7510 }, { "epoch": 2.9524931291715744, - "grad_norm": 0.8287544846534729, + "grad_norm": 0.5142208933830261, "learning_rate": 8.4668799534647e-06, - "loss": 0.0344, + "loss": 0.0501, "step": 7520 }, { "epoch": 2.9564193168433452, - "grad_norm": 5.502281188964844, + "grad_norm": 2.3838653564453125, "learning_rate": 8.455973242201702e-06, - "loss": 0.204, + "loss": 0.2846, "step": 7530 }, { "epoch": 2.9603455045151157, - "grad_norm": 22.03399658203125, + "grad_norm": 40.50202560424805, "learning_rate": 8.445066530938705e-06, - "loss": 0.2888, + "loss": 0.2859, "step": 7540 }, { "epoch": 2.9642716921868866, - "grad_norm": 3.150238037109375, + "grad_norm": 2.1738104820251465, "learning_rate": 8.434159819675707e-06, - "loss": 0.1666, + "loss": 0.0693, "step": 7550 }, { "epoch": 2.968197879858657, - "grad_norm": 13.789191246032715, + "grad_norm": 29.985048294067383, "learning_rate": 8.42325310841271e-06, - "loss": 0.1586, + "loss": 0.2554, "step": 7560 }, { "epoch": 2.972124067530428, - "grad_norm": 0.5419015884399414, + "grad_norm": 0.03450935333967209, "learning_rate": 8.412346397149712e-06, - "loss": 0.0602, + "loss": 0.0944, "step": 7570 }, { "epoch": 2.976050255202199, - "grad_norm": 3.3520283699035645, + "grad_norm": 2.9536733627319336, "learning_rate": 8.401439685886715e-06, - "loss": 0.1901, + "loss": 0.2586, "step": 7580 }, { "epoch": 2.9799764428739692, - "grad_norm": 30.982398986816406, + "grad_norm": 28.326351165771484, "learning_rate": 8.390532974623719e-06, - "loss": 0.658, + "loss": 0.5322, "step": 7590 }, { "epoch": 2.98390263054574, - "grad_norm": 1.175611138343811, + "grad_norm": 0.5654705166816711, "learning_rate": 8.379626263360722e-06, - "loss": 0.2894, + "loss": 0.2642, "step": 7600 }, { "epoch": 2.987828818217511, - "grad_norm": 0.03015066310763359, + "grad_norm": 0.01225184090435505, "learning_rate": 8.368719552097725e-06, - "loss": 0.2842, + "loss": 0.2029, "step": 7610 }, { "epoch": 2.9917550058892814, - "grad_norm": 28.496431350708008, + "grad_norm": 37.016441345214844, "learning_rate": 8.357812840834727e-06, - "loss": 0.2976, + "loss": 0.3349, "step": 7620 }, { "epoch": 2.9956811935610523, - "grad_norm": 56.41312026977539, + "grad_norm": 19.842605590820312, "learning_rate": 8.34690612957173e-06, - "loss": 0.2029, + "loss": 0.1141, "step": 7630 }, { "epoch": 2.999607381232823, - "grad_norm": 3.412231206893921, + "grad_norm": 2.9952104091644287, "learning_rate": 8.335999418308732e-06, - "loss": 0.1069, + "loss": 0.2037, "step": 7640 }, { "epoch": 3.0, - "eval_loss": 0.19104745984077454, - "eval_runtime": 11.3908, - "eval_samples_per_second": 198.758, - "eval_steps_per_second": 24.845, + "eval_loss": 0.2000712752342224, + "eval_runtime": 12.0817, + "eval_samples_per_second": 187.392, + "eval_steps_per_second": 23.424, "step": 7641 }, { "epoch": 3.0035335689045937, - "grad_norm": 2.9253339767456055, + "grad_norm": 0.6600627303123474, "learning_rate": 8.325092707045736e-06, - "loss": 0.105, + "loss": 0.1094, "step": 7650 }, { "epoch": 3.0074597565763646, - "grad_norm": 0.24066723883152008, + "grad_norm": 0.307326078414917, "learning_rate": 8.314185995782739e-06, - "loss": 0.0627, + "loss": 0.0542, "step": 7660 }, { "epoch": 3.011385944248135, - "grad_norm": 0.19360512495040894, + "grad_norm": 0.15401798486709595, "learning_rate": 8.30327928451974e-06, - "loss": 0.0291, + "loss": 0.0161, "step": 7670 }, { "epoch": 3.015312131919906, - "grad_norm": 0.27192139625549316, + "grad_norm": 0.2852010726928711, "learning_rate": 8.292372573256744e-06, - "loss": 0.2564, + "loss": 0.3104, "step": 7680 }, { "epoch": 3.0192383195916763, - "grad_norm": 5.074106216430664, + "grad_norm": 3.0000016689300537, "learning_rate": 8.281465861993747e-06, - "loss": 0.0179, + "loss": 0.0242, "step": 7690 }, { "epoch": 3.023164507263447, - "grad_norm": 32.16835021972656, + "grad_norm": 18.037723541259766, "learning_rate": 8.270559150730751e-06, - "loss": 0.1794, + "loss": 0.166, "step": 7700 }, { "epoch": 3.027090694935218, - "grad_norm": 2.7100470066070557, + "grad_norm": 7.298466682434082, "learning_rate": 8.259652439467753e-06, - "loss": 0.0469, + "loss": 0.0505, "step": 7710 }, { "epoch": 3.0310168826069885, - "grad_norm": 9.974100112915039, + "grad_norm": 1.8767467737197876, "learning_rate": 8.248745728204754e-06, - "loss": 0.0831, + "loss": 0.0715, "step": 7720 }, { "epoch": 3.0349430702787594, - "grad_norm": 11.936753273010254, + "grad_norm": 6.157838344573975, "learning_rate": 8.237839016941758e-06, - "loss": 0.3263, + "loss": 0.1423, "step": 7730 }, { "epoch": 3.03886925795053, - "grad_norm": 40.117919921875, + "grad_norm": 2.6973254680633545, "learning_rate": 8.226932305678761e-06, - "loss": 0.1292, + "loss": 0.1354, "step": 7740 }, { "epoch": 3.0427954456223008, - "grad_norm": 4.400332927703857, + "grad_norm": 6.5018839836120605, "learning_rate": 8.216025594415765e-06, - "loss": 0.025, + "loss": 0.0573, "step": 7750 }, { "epoch": 3.0467216332940716, - "grad_norm": 1.2223269939422607, + "grad_norm": 1.431301474571228, "learning_rate": 8.205118883152766e-06, - "loss": 0.1637, + "loss": 0.1932, "step": 7760 }, { "epoch": 3.050647820965842, - "grad_norm": 36.453365325927734, + "grad_norm": 33.391014099121094, "learning_rate": 8.19421217188977e-06, - "loss": 0.2328, + "loss": 0.1983, "step": 7770 }, { "epoch": 3.054574008637613, - "grad_norm": 0.16483445465564728, + "grad_norm": 0.6226595640182495, "learning_rate": 8.183305460626773e-06, - "loss": 0.0497, + "loss": 0.037, "step": 7780 }, { "epoch": 3.0585001963093834, - "grad_norm": 22.267799377441406, + "grad_norm": 14.073274612426758, "learning_rate": 8.172398749363776e-06, - "loss": 0.262, + "loss": 0.2616, "step": 7790 }, { "epoch": 3.0624263839811543, - "grad_norm": 30.272363662719727, + "grad_norm": 78.72065734863281, "learning_rate": 8.16149203810078e-06, - "loss": 0.1598, + "loss": 0.2439, "step": 7800 }, { "epoch": 3.066352571652925, - "grad_norm": 3.393531560897827, + "grad_norm": 3.3628101348876953, "learning_rate": 8.15058532683778e-06, - "loss": 0.0781, + "loss": 0.1024, "step": 7810 }, { "epoch": 3.0702787593246956, - "grad_norm": 108.46585845947266, + "grad_norm": 4.851952075958252, "learning_rate": 8.139678615574783e-06, - "loss": 0.1869, + "loss": 0.0973, "step": 7820 }, { "epoch": 3.0742049469964665, - "grad_norm": 5.491083145141602, + "grad_norm": 5.918734073638916, "learning_rate": 8.128771904311787e-06, - "loss": 0.1296, + "loss": 0.1485, "step": 7830 }, { "epoch": 3.078131134668237, - "grad_norm": 5.934266090393066, + "grad_norm": 0.20275111496448517, "learning_rate": 8.11786519304879e-06, - "loss": 0.0504, + "loss": 0.0376, "step": 7840 }, { "epoch": 3.082057322340008, - "grad_norm": 13.079463958740234, + "grad_norm": 13.090296745300293, "learning_rate": 8.106958481785792e-06, - "loss": 0.1379, + "loss": 0.1017, "step": 7850 }, { "epoch": 3.0859835100117787, - "grad_norm": 0.5619011521339417, + "grad_norm": 0.14617076516151428, "learning_rate": 8.096051770522795e-06, - "loss": 0.0655, + "loss": 0.1248, "step": 7860 }, { "epoch": 3.089909697683549, - "grad_norm": 30.218957901000977, + "grad_norm": 6.19636869430542, "learning_rate": 8.085145059259799e-06, - "loss": 0.1642, + "loss": 0.2893, "step": 7870 }, { "epoch": 3.09383588535532, - "grad_norm": 0.2990739047527313, + "grad_norm": 0.07450656592845917, "learning_rate": 8.074238347996802e-06, - "loss": 0.0353, + "loss": 0.0229, "step": 7880 }, { "epoch": 3.0977620730270905, - "grad_norm": 0.6670025587081909, + "grad_norm": 11.89781379699707, "learning_rate": 8.063331636733804e-06, - "loss": 0.0396, + "loss": 0.0792, "step": 7890 }, { "epoch": 3.1016882606988614, - "grad_norm": 2.651264190673828, + "grad_norm": 2.6443698406219482, "learning_rate": 8.052424925470805e-06, - "loss": 0.0656, + "loss": 0.1012, "step": 7900 }, { "epoch": 3.1056144483706323, - "grad_norm": 8.138945579528809, + "grad_norm": 33.47779083251953, "learning_rate": 8.041518214207809e-06, - "loss": 0.2098, + "loss": 0.2697, "step": 7910 }, { "epoch": 3.1095406360424027, - "grad_norm": 45.713130950927734, + "grad_norm": 45.60388946533203, "learning_rate": 8.030611502944812e-06, - "loss": 0.2846, + "loss": 0.2473, "step": 7920 }, { "epoch": 3.1134668237141736, - "grad_norm": 0.6482675075531006, + "grad_norm": 1.1584374904632568, "learning_rate": 8.019704791681816e-06, - "loss": 0.2537, + "loss": 0.1987, "step": 7930 }, { "epoch": 3.117393011385944, - "grad_norm": 1.3989120721817017, + "grad_norm": 0.21901720762252808, "learning_rate": 8.008798080418817e-06, - "loss": 0.2275, + "loss": 0.1889, "step": 7940 }, { "epoch": 3.121319199057715, - "grad_norm": 0.014421232044696808, + "grad_norm": 0.007941018790006638, "learning_rate": 7.99789136915582e-06, - "loss": 0.2363, + "loss": 0.2029, "step": 7950 }, { "epoch": 3.125245386729486, - "grad_norm": 0.654133141040802, + "grad_norm": 0.17286254465579987, "learning_rate": 7.986984657892824e-06, - "loss": 0.0254, + "loss": 0.0271, "step": 7960 }, { "epoch": 3.1291715744012563, - "grad_norm": 1.1010600328445435, + "grad_norm": 1.0573610067367554, "learning_rate": 7.976077946629827e-06, - "loss": 0.0262, + "loss": 0.0379, "step": 7970 }, { "epoch": 3.133097762073027, - "grad_norm": 0.012308135628700256, + "grad_norm": 0.012497455812990665, "learning_rate": 7.965171235366829e-06, - "loss": 0.2935, + "loss": 0.3462, "step": 7980 }, { "epoch": 3.1370239497447976, - "grad_norm": 4.330096244812012, + "grad_norm": 1.0641800165176392, "learning_rate": 7.954264524103831e-06, - "loss": 0.2423, + "loss": 0.0752, "step": 7990 }, { "epoch": 3.1409501374165685, - "grad_norm": 42.24994659423828, + "grad_norm": 37.56785202026367, "learning_rate": 7.943357812840834e-06, - "loss": 0.3442, + "loss": 0.3157, "step": 8000 }, { "epoch": 3.1448763250883394, - "grad_norm": 0.139979749917984, + "grad_norm": 0.07521757483482361, "learning_rate": 7.932451101577838e-06, - "loss": 0.1328, + "loss": 0.067, "step": 8010 }, { "epoch": 3.14880251276011, - "grad_norm": 0.5079324245452881, + "grad_norm": 0.5178630948066711, "learning_rate": 7.921544390314841e-06, - "loss": 0.3609, + "loss": 0.2734, "step": 8020 }, { "epoch": 3.1527287004318807, - "grad_norm": 0.07123516499996185, + "grad_norm": 0.14671330153942108, "learning_rate": 7.910637679051844e-06, - "loss": 0.2678, + "loss": 0.2286, "step": 8030 }, { "epoch": 3.1566548881036516, - "grad_norm": 0.45936712622642517, + "grad_norm": 0.10935986042022705, "learning_rate": 7.899730967788846e-06, - "loss": 0.0588, + "loss": 0.0202, "step": 8040 }, { "epoch": 3.160581075775422, - "grad_norm": 1.8183523416519165, + "grad_norm": 4.84882116317749, "learning_rate": 7.88882425652585e-06, - "loss": 0.2015, + "loss": 0.1372, "step": 8050 }, { "epoch": 3.164507263447193, - "grad_norm": 0.1024223268032074, + "grad_norm": 0.4085615277290344, "learning_rate": 7.877917545262853e-06, - "loss": 0.0312, + "loss": 0.0197, "step": 8060 }, { "epoch": 3.1684334511189634, - "grad_norm": 50.8315315246582, + "grad_norm": 52.41949462890625, "learning_rate": 7.867010833999855e-06, - "loss": 0.1948, + "loss": 0.2874, "step": 8070 }, { "epoch": 3.1723596387907342, - "grad_norm": 0.17993241548538208, + "grad_norm": 0.2706127166748047, "learning_rate": 7.856104122736856e-06, - "loss": 0.1591, + "loss": 0.1317, "step": 8080 }, { "epoch": 3.1762858264625047, - "grad_norm": 49.28693771362305, + "grad_norm": 26.715774536132812, "learning_rate": 7.84519741147386e-06, - "loss": 0.1469, + "loss": 0.0594, "step": 8090 }, { "epoch": 3.1802120141342756, - "grad_norm": 0.21395254135131836, + "grad_norm": 0.12675605714321136, "learning_rate": 7.834290700210863e-06, - "loss": 0.1303, + "loss": 0.052, "step": 8100 }, { "epoch": 3.1841382018060465, - "grad_norm": 5.759995937347412, + "grad_norm": 8.121604919433594, "learning_rate": 7.823383988947867e-06, - "loss": 0.2316, + "loss": 0.2119, "step": 8110 }, { "epoch": 3.188064389477817, - "grad_norm": 1.8371074199676514, + "grad_norm": 2.4823458194732666, "learning_rate": 7.81247727768487e-06, - "loss": 0.0873, + "loss": 0.0772, "step": 8120 }, { "epoch": 3.191990577149588, - "grad_norm": 0.48116979002952576, + "grad_norm": 2.690593719482422, "learning_rate": 7.801570566421872e-06, - "loss": 0.6231, + "loss": 0.2574, "step": 8130 }, { "epoch": 3.1959167648213587, - "grad_norm": 35.71897506713867, + "grad_norm": 20.89065933227539, "learning_rate": 7.790663855158875e-06, - "loss": 0.1157, + "loss": 0.0559, "step": 8140 }, { "epoch": 3.199842952493129, - "grad_norm": 45.561302185058594, + "grad_norm": 41.33374786376953, "learning_rate": 7.779757143895877e-06, - "loss": 0.1483, + "loss": 0.0774, "step": 8150 }, { "epoch": 3.2037691401649, - "grad_norm": 1.0882476568222046, + "grad_norm": 0.41390347480773926, "learning_rate": 7.76885043263288e-06, - "loss": 0.0518, + "loss": 0.0412, "step": 8160 }, { "epoch": 3.2076953278366704, - "grad_norm": 0.24169518053531647, + "grad_norm": 0.2790501117706299, "learning_rate": 7.757943721369884e-06, - "loss": 0.2278, + "loss": 0.2695, "step": 8170 }, { "epoch": 3.2116215155084413, - "grad_norm": 1.0709370374679565, + "grad_norm": 1.4948405027389526, "learning_rate": 7.747037010106885e-06, - "loss": 0.0589, + "loss": 0.0693, "step": 8180 }, { "epoch": 3.215547703180212, - "grad_norm": 9.445268630981445, + "grad_norm": 1.8810807466506958, "learning_rate": 7.736130298843889e-06, - "loss": 0.2113, + "loss": 0.2674, "step": 8190 }, { "epoch": 3.2194738908519827, - "grad_norm": 0.27919629216194153, + "grad_norm": 0.03659799322485924, "learning_rate": 7.725223587580892e-06, - "loss": 0.1967, + "loss": 0.1515, "step": 8200 }, { "epoch": 3.2234000785237535, - "grad_norm": 39.1192626953125, + "grad_norm": 32.701473236083984, "learning_rate": 7.714316876317895e-06, - "loss": 0.1998, + "loss": 0.2401, "step": 8210 }, { "epoch": 3.227326266195524, - "grad_norm": 131.23068237304688, + "grad_norm": 185.47679138183594, "learning_rate": 7.703410165054897e-06, - "loss": 0.3079, + "loss": 0.1573, "step": 8220 }, { "epoch": 3.231252453867295, - "grad_norm": 4.130087852478027, + "grad_norm": 9.424933433532715, "learning_rate": 7.6925034537919e-06, - "loss": 0.0592, + "loss": 0.0609, "step": 8230 }, { "epoch": 3.2351786415390658, - "grad_norm": 41.0843391418457, + "grad_norm": 29.956815719604492, "learning_rate": 7.681596742528902e-06, - "loss": 0.2847, + "loss": 0.4123, "step": 8240 }, { "epoch": 3.239104829210836, - "grad_norm": 0.057517558336257935, + "grad_norm": 0.13973675668239594, "learning_rate": 7.670690031265906e-06, - "loss": 0.2521, + "loss": 0.2585, "step": 8250 }, { "epoch": 3.243031016882607, - "grad_norm": 16.58539390563965, + "grad_norm": 0.6026905179023743, "learning_rate": 7.659783320002909e-06, - "loss": 0.1917, + "loss": 0.1554, "step": 8260 }, { "epoch": 3.2469572045543775, - "grad_norm": 0.0022131644655019045, + "grad_norm": 0.005956938955932856, "learning_rate": 7.64887660873991e-06, - "loss": 0.0357, + "loss": 0.0309, "step": 8270 }, { "epoch": 3.2508833922261484, - "grad_norm": 1.800696849822998, + "grad_norm": 3.0398926734924316, "learning_rate": 7.637969897476914e-06, - "loss": 0.0345, + "loss": 0.0219, "step": 8280 }, { "epoch": 3.2548095798979193, - "grad_norm": 2.5760087966918945, + "grad_norm": 0.10834541916847229, "learning_rate": 7.627063186213918e-06, - "loss": 0.0701, + "loss": 0.0215, "step": 8290 }, { "epoch": 3.2587357675696897, - "grad_norm": 1.6981908082962036, + "grad_norm": 1.7746319770812988, "learning_rate": 7.61615647495092e-06, - "loss": 0.356, + "loss": 0.3307, "step": 8300 }, { "epoch": 3.2626619552414606, - "grad_norm": 0.9137970805168152, + "grad_norm": 0.25790587067604065, "learning_rate": 7.605249763687922e-06, - "loss": 0.17, + "loss": 0.1372, "step": 8310 }, { "epoch": 3.266588142913231, - "grad_norm": 0.26534122228622437, + "grad_norm": 0.06282588839530945, "learning_rate": 7.594343052424925e-06, - "loss": 0.0146, + "loss": 0.0196, "step": 8320 }, { "epoch": 3.270514330585002, - "grad_norm": 107.3065185546875, + "grad_norm": 33.373512268066406, "learning_rate": 7.583436341161929e-06, - "loss": 0.3136, + "loss": 0.1415, "step": 8330 }, { "epoch": 3.274440518256773, - "grad_norm": 0.2451629489660263, + "grad_norm": 0.12356891483068466, "learning_rate": 7.572529629898931e-06, - "loss": 0.0842, + "loss": 0.116, "step": 8340 }, { "epoch": 3.2783667059285433, - "grad_norm": 1.9701683521270752, + "grad_norm": 1.529503583908081, "learning_rate": 7.561622918635935e-06, - "loss": 0.1676, + "loss": 0.1263, "step": 8350 }, { "epoch": 3.282292893600314, - "grad_norm": 1.8848209381103516, + "grad_norm": 0.2636513113975525, "learning_rate": 7.550716207372936e-06, - "loss": 0.1841, + "loss": 0.0863, "step": 8360 }, { "epoch": 3.2862190812720846, - "grad_norm": 0.0860225185751915, + "grad_norm": 0.021591916680336, "learning_rate": 7.53980949610994e-06, - "loss": 0.3694, + "loss": 0.522, "step": 8370 }, { "epoch": 3.2901452689438555, - "grad_norm": 0.4592539966106415, + "grad_norm": 0.07466388493776321, "learning_rate": 7.528902784846943e-06, - "loss": 0.0333, + "loss": 0.0359, "step": 8380 }, { "epoch": 3.2940714566156264, - "grad_norm": 95.80058288574219, + "grad_norm": 26.616647720336914, "learning_rate": 7.517996073583946e-06, - "loss": 0.1507, + "loss": 0.1302, "step": 8390 }, { "epoch": 3.297997644287397, - "grad_norm": 1.3287121057510376, + "grad_norm": 0.6950975656509399, "learning_rate": 7.507089362320949e-06, - "loss": 0.3178, + "loss": 0.2564, "step": 8400 }, { "epoch": 3.3019238319591677, - "grad_norm": 0.19601622223854065, + "grad_norm": 0.2244836688041687, "learning_rate": 7.496182651057952e-06, - "loss": 0.0164, + "loss": 0.011, "step": 8410 }, { "epoch": 3.305850019630938, - "grad_norm": 0.6849249601364136, + "grad_norm": 2.6343295574188232, "learning_rate": 7.485275939794954e-06, - "loss": 0.6023, + "loss": 0.5969, "step": 8420 }, { "epoch": 3.309776207302709, - "grad_norm": 2.2179195880889893, + "grad_norm": 4.250097274780273, "learning_rate": 7.474369228531957e-06, - "loss": 0.1385, + "loss": 0.1313, "step": 8430 }, { "epoch": 3.31370239497448, - "grad_norm": 1.0667563676834106, + "grad_norm": 0.5972789525985718, "learning_rate": 7.463462517268959e-06, - "loss": 0.2082, + "loss": 0.2106, "step": 8440 }, { "epoch": 3.3176285826462504, - "grad_norm": 0.23048140108585358, + "grad_norm": 0.07247540354728699, "learning_rate": 7.452555806005963e-06, - "loss": 0.2151, + "loss": 0.2116, "step": 8450 }, { "epoch": 3.3215547703180213, - "grad_norm": 24.052919387817383, + "grad_norm": 44.81258773803711, "learning_rate": 7.441649094742965e-06, - "loss": 0.0644, + "loss": 0.0833, "step": 8460 }, { "epoch": 3.325480957989792, - "grad_norm": 2.1752066612243652, + "grad_norm": 2.344716787338257, "learning_rate": 7.430742383479968e-06, - "loss": 0.026, + "loss": 0.0249, "step": 8470 }, { "epoch": 3.3294071456615626, - "grad_norm": 3.312345504760742, + "grad_norm": 2.4029715061187744, "learning_rate": 7.419835672216971e-06, - "loss": 0.1497, + "loss": 0.2652, "step": 8480 }, { "epoch": 3.3333333333333335, - "grad_norm": 1.8271385431289673, + "grad_norm": 1.5212286710739136, "learning_rate": 7.408928960953974e-06, - "loss": 0.2253, + "loss": 0.3924, "step": 8490 }, { "epoch": 3.337259521005104, - "grad_norm": 0.3415015637874603, + "grad_norm": 0.5456534028053284, "learning_rate": 7.398022249690977e-06, - "loss": 0.0796, + "loss": 0.036, "step": 8500 }, { "epoch": 3.341185708676875, - "grad_norm": 12.786201477050781, + "grad_norm": 19.842634201049805, "learning_rate": 7.387115538427979e-06, - "loss": 0.1826, + "loss": 0.1543, "step": 8510 }, { "epoch": 3.3451118963486453, - "grad_norm": 1.5180126428604126, + "grad_norm": 0.7682566046714783, "learning_rate": 7.376208827164982e-06, - "loss": 0.2822, + "loss": 0.1203, "step": 8520 }, { "epoch": 3.349038084020416, - "grad_norm": 0.006688406225293875, + "grad_norm": 0.012378252111375332, "learning_rate": 7.365302115901985e-06, - "loss": 0.0324, + "loss": 0.0458, "step": 8530 }, { "epoch": 3.352964271692187, - "grad_norm": 76.7862319946289, + "grad_norm": 82.90477752685547, "learning_rate": 7.354395404638988e-06, - "loss": 0.407, + "loss": 0.3241, "step": 8540 }, { "epoch": 3.3568904593639575, - "grad_norm": 44.041866302490234, + "grad_norm": 55.83918380737305, "learning_rate": 7.343488693375992e-06, - "loss": 0.3348, + "loss": 0.3217, "step": 8550 }, { "epoch": 3.3608166470357284, - "grad_norm": 0.0861259326338768, + "grad_norm": 0.09521665424108505, "learning_rate": 7.332581982112993e-06, - "loss": 0.035, + "loss": 0.0047, "step": 8560 }, { "epoch": 3.3647428347074992, - "grad_norm": 11.63525104522705, + "grad_norm": 10.619758605957031, "learning_rate": 7.321675270849997e-06, - "loss": 0.0227, + "loss": 0.0529, "step": 8570 }, { "epoch": 3.3686690223792697, - "grad_norm": 6.469714164733887, + "grad_norm": 2.0451769828796387, "learning_rate": 7.310768559586999e-06, - "loss": 0.2847, + "loss": 0.1915, "step": 8580 }, { "epoch": 3.3725952100510406, - "grad_norm": 1.1779093742370605, + "grad_norm": 2.3846752643585205, "learning_rate": 7.299861848324003e-06, - "loss": 0.1142, + "loss": 0.086, "step": 8590 }, { "epoch": 3.376521397722811, - "grad_norm": 3.698263645172119, + "grad_norm": 4.591152667999268, "learning_rate": 7.288955137061004e-06, - "loss": 0.3644, + "loss": 0.4382, "step": 8600 }, { "epoch": 3.380447585394582, - "grad_norm": 0.8072194457054138, + "grad_norm": 0.23025058209896088, "learning_rate": 7.278048425798008e-06, - "loss": 0.1614, + "loss": 0.3227, "step": 8610 }, { "epoch": 3.3843737730663523, - "grad_norm": 51.72392272949219, + "grad_norm": 37.376190185546875, "learning_rate": 7.267141714535011e-06, - "loss": 0.2984, + "loss": 0.1436, "step": 8620 }, { "epoch": 3.3882999607381232, - "grad_norm": 1.7757817506790161, + "grad_norm": 1.6544480323791504, "learning_rate": 7.256235003272014e-06, - "loss": 0.2027, + "loss": 0.2683, "step": 8630 }, { "epoch": 3.392226148409894, - "grad_norm": 3.490008592605591, + "grad_norm": 0.8117730021476746, "learning_rate": 7.245328292009016e-06, - "loss": 0.0625, + "loss": 0.0438, "step": 8640 }, { "epoch": 3.3961523360816646, - "grad_norm": 3.9412710666656494, + "grad_norm": 3.6689159870147705, "learning_rate": 7.234421580746019e-06, - "loss": 0.1852, + "loss": 0.1554, "step": 8650 }, { "epoch": 3.4000785237534354, - "grad_norm": 2.3700900077819824, + "grad_norm": 0.3229542076587677, "learning_rate": 7.223514869483022e-06, - "loss": 0.1975, + "loss": 0.1198, "step": 8660 }, { "epoch": 3.4040047114252063, - "grad_norm": 4.911314964294434, + "grad_norm": 5.374237537384033, "learning_rate": 7.212608158220025e-06, - "loss": 0.0355, + "loss": 0.0568, "step": 8670 }, { "epoch": 3.4079308990969768, - "grad_norm": 0.5654276609420776, + "grad_norm": 0.030682506039738655, "learning_rate": 7.201701446957027e-06, - "loss": 0.6256, + "loss": 0.3911, "step": 8680 }, { "epoch": 3.4118570867687477, - "grad_norm": 0.4800680875778198, + "grad_norm": 0.22470061480998993, "learning_rate": 7.190794735694031e-06, - "loss": 0.0681, + "loss": 0.1776, "step": 8690 }, { "epoch": 3.415783274440518, - "grad_norm": 3.1208109855651855, + "grad_norm": 3.251796245574951, "learning_rate": 7.179888024431033e-06, - "loss": 0.3064, + "loss": 0.2772, "step": 8700 }, { "epoch": 3.419709462112289, - "grad_norm": 0.004039533901959658, + "grad_norm": 0.008656243793666363, "learning_rate": 7.168981313168037e-06, - "loss": 0.0277, + "loss": 0.1449, "step": 8710 }, { "epoch": 3.4236356497840594, - "grad_norm": 0.027961621060967445, + "grad_norm": 0.02136993780732155, "learning_rate": 7.158074601905039e-06, - "loss": 0.1336, + "loss": 0.1033, "step": 8720 }, { "epoch": 3.4275618374558303, - "grad_norm": 7.6953911781311035, + "grad_norm": 14.517278671264648, "learning_rate": 7.147167890642042e-06, - "loss": 0.0517, + "loss": 0.0528, "step": 8730 }, { "epoch": 3.431488025127601, - "grad_norm": 1.0221000909805298, + "grad_norm": 0.23135441541671753, "learning_rate": 7.136261179379044e-06, - "loss": 0.2565, + "loss": 0.2594, "step": 8740 }, { "epoch": 3.4354142127993716, - "grad_norm": 0.8163778781890869, + "grad_norm": 1.5432502031326294, "learning_rate": 7.125354468116048e-06, - "loss": 0.0565, + "loss": 0.2524, "step": 8750 }, { "epoch": 3.4393404004711425, - "grad_norm": 6.896636486053467, + "grad_norm": 4.422389030456543, "learning_rate": 7.114447756853051e-06, - "loss": 0.2291, + "loss": 0.2519, "step": 8760 }, { "epoch": 3.4432665881429134, - "grad_norm": 1.921363353729248, + "grad_norm": 0.9874147772789001, "learning_rate": 7.103541045590053e-06, - "loss": 0.0825, + "loss": 0.123, "step": 8770 }, { "epoch": 3.447192775814684, - "grad_norm": 86.53144836425781, + "grad_norm": 33.40144729614258, "learning_rate": 7.092634334327056e-06, - "loss": 0.2492, + "loss": 0.2279, "step": 8780 }, { "epoch": 3.4511189634864547, - "grad_norm": 0.1712948977947235, + "grad_norm": 0.1019761934876442, "learning_rate": 7.081727623064059e-06, - "loss": 0.1445, + "loss": 0.0908, "step": 8790 }, { "epoch": 3.455045151158225, - "grad_norm": 0.23910944163799286, + "grad_norm": 0.13676011562347412, "learning_rate": 7.070820911801062e-06, - "loss": 0.2387, + "loss": 0.0895, "step": 8800 }, { "epoch": 3.458971338829996, - "grad_norm": 0.872718095779419, + "grad_norm": 0.7299799919128418, "learning_rate": 7.059914200538064e-06, - "loss": 0.1554, + "loss": 0.0761, "step": 8810 }, { "epoch": 3.462897526501767, - "grad_norm": 0.9722446203231812, + "grad_norm": 3.8705031871795654, "learning_rate": 7.049007489275067e-06, - "loss": 0.0558, + "loss": 0.0495, "step": 8820 }, { "epoch": 3.4668237141735374, - "grad_norm": 2.0872268676757812, + "grad_norm": 4.899084568023682, "learning_rate": 7.03810077801207e-06, - "loss": 0.0445, + "loss": 0.0612, "step": 8830 }, { "epoch": 3.4707499018453083, - "grad_norm": 0.05669504404067993, + "grad_norm": 0.12403878569602966, "learning_rate": 7.027194066749073e-06, - "loss": 0.2428, + "loss": 0.3844, "step": 8840 }, { "epoch": 3.4746760895170787, - "grad_norm": 0.12508156895637512, + "grad_norm": 0.08584719896316528, "learning_rate": 7.016287355486077e-06, - "loss": 0.1732, + "loss": 0.1869, "step": 8850 }, { "epoch": 3.4786022771888496, - "grad_norm": 55.8948860168457, + "grad_norm": 48.21470642089844, "learning_rate": 7.005380644223078e-06, - "loss": 0.2432, + "loss": 0.2922, "step": 8860 }, { "epoch": 3.4825284648606205, - "grad_norm": 2.9225540161132812, + "grad_norm": 3.4342103004455566, "learning_rate": 6.994473932960082e-06, - "loss": 0.2478, + "loss": 0.2063, "step": 8870 }, { "epoch": 3.486454652532391, - "grad_norm": 61.05860137939453, + "grad_norm": 70.65544891357422, "learning_rate": 6.983567221697084e-06, - "loss": 0.3805, + "loss": 0.2111, "step": 8880 }, { "epoch": 3.490380840204162, - "grad_norm": 0.11319629102945328, + "grad_norm": 1.3855644464492798, "learning_rate": 6.972660510434088e-06, - "loss": 0.2118, + "loss": 0.1269, "step": 8890 }, { "epoch": 3.4943070278759323, - "grad_norm": 0.0022335960529744625, + "grad_norm": 0.0017630542861297727, "learning_rate": 6.9617537991710894e-06, - "loss": 0.1232, + "loss": 0.1417, "step": 8900 }, { "epoch": 3.498233215547703, - "grad_norm": 0.19205982983112335, + "grad_norm": 0.03894443437457085, "learning_rate": 6.950847087908093e-06, - "loss": 0.0631, + "loss": 0.1035, "step": 8910 }, { "epoch": 3.502159403219474, - "grad_norm": 3.112199544906616, + "grad_norm": 4.921162128448486, "learning_rate": 6.939940376645096e-06, - "loss": 0.4327, + "loss": 0.4226, "step": 8920 }, { "epoch": 3.5060855908912445, - "grad_norm": 3.0055389404296875, + "grad_norm": 3.8692731857299805, "learning_rate": 6.929033665382099e-06, - "loss": 0.2056, + "loss": 0.1011, "step": 8930 }, { "epoch": 3.5100117785630154, - "grad_norm": 1.67527437210083, + "grad_norm": 1.3821625709533691, "learning_rate": 6.918126954119101e-06, - "loss": 0.2039, + "loss": 0.1836, "step": 8940 }, { "epoch": 3.513937966234786, - "grad_norm": 38.10768508911133, + "grad_norm": 35.635643005371094, "learning_rate": 6.907220242856104e-06, - "loss": 0.1686, + "loss": 0.2268, "step": 8950 }, { "epoch": 3.5178641539065567, - "grad_norm": 0.34699195623397827, + "grad_norm": 0.4415988624095917, "learning_rate": 6.896313531593107e-06, - "loss": 0.5673, + "loss": 0.5328, "step": 8960 }, { "epoch": 3.5217903415783276, - "grad_norm": 1.4509392976760864, + "grad_norm": 0.5169740319252014, "learning_rate": 6.88540682033011e-06, - "loss": 0.0977, + "loss": 0.1144, "step": 8970 }, { "epoch": 3.525716529250098, - "grad_norm": 0.23722325265407562, + "grad_norm": 1.0992881059646606, "learning_rate": 6.874500109067112e-06, - "loss": 0.199, + "loss": 0.2129, "step": 8980 }, { "epoch": 3.529642716921869, - "grad_norm": 0.3096579313278198, + "grad_norm": 0.15133367478847504, "learning_rate": 6.863593397804116e-06, - "loss": 0.1037, + "loss": 0.3123, "step": 8990 }, { "epoch": 3.53356890459364, - "grad_norm": 2.208101987838745, + "grad_norm": 2.107811212539673, "learning_rate": 6.852686686541118e-06, - "loss": 0.0327, + "loss": 0.0296, "step": 9000 }, { "epoch": 3.5374950922654103, - "grad_norm": 1.1535919904708862, + "grad_norm": 0.9873151183128357, "learning_rate": 6.841779975278122e-06, - "loss": 0.2969, + "loss": 0.4091, "step": 9010 }, { "epoch": 3.541421279937181, - "grad_norm": 0.31663551926612854, + "grad_norm": 0.02958032861351967, "learning_rate": 6.830873264015124e-06, - "loss": 0.059, + "loss": 0.0563, "step": 9020 }, { "epoch": 3.5453474676089516, - "grad_norm": 0.10102329403162003, + "grad_norm": 0.09154967963695526, "learning_rate": 6.819966552752127e-06, - "loss": 0.0362, + "loss": 0.026, "step": 9030 }, { "epoch": 3.5492736552807225, - "grad_norm": 0.29346683621406555, + "grad_norm": 0.7550766468048096, "learning_rate": 6.809059841489129e-06, - "loss": 0.0528, + "loss": 0.0176, "step": 9040 }, { "epoch": 3.553199842952493, - "grad_norm": 9.581647872924805, + "grad_norm": 16.913530349731445, "learning_rate": 6.798153130226133e-06, - "loss": 0.389, + "loss": 0.3559, "step": 9050 }, { "epoch": 3.557126030624264, - "grad_norm": 1.812854528427124, + "grad_norm": 3.0624022483825684, "learning_rate": 6.787246418963136e-06, - "loss": 0.2668, + "loss": 0.4437, "step": 9060 }, { "epoch": 3.5610522182960347, - "grad_norm": 1.9888079166412354, + "grad_norm": 0.7572641968727112, "learning_rate": 6.776339707700138e-06, - "loss": 0.3411, + "loss": 0.3066, "step": 9070 }, { "epoch": 3.564978405967805, - "grad_norm": 0.3859824538230896, + "grad_norm": 0.18037593364715576, "learning_rate": 6.765432996437141e-06, - "loss": 0.0562, + "loss": 0.0239, "step": 9080 }, { "epoch": 3.568904593639576, - "grad_norm": 6.903296947479248, + "grad_norm": 6.928786277770996, "learning_rate": 6.754526285174144e-06, - "loss": 0.4031, + "loss": 0.2712, "step": 9090 }, { "epoch": 3.572830781311347, - "grad_norm": 0.2021486610174179, + "grad_norm": 0.22859081625938416, "learning_rate": 6.743619573911147e-06, - "loss": 0.3314, + "loss": 0.2368, "step": 9100 }, { "epoch": 3.5767569689831173, - "grad_norm": 54.10165786743164, + "grad_norm": 104.52559661865234, "learning_rate": 6.732712862648149e-06, - "loss": 0.2613, + "loss": 0.3187, "step": 9110 }, { "epoch": 3.5806831566548882, - "grad_norm": 2.5160226821899414, + "grad_norm": 3.5120961666107178, "learning_rate": 6.721806151385152e-06, - "loss": 0.0343, + "loss": 0.0742, "step": 9120 }, { "epoch": 3.5846093443266587, - "grad_norm": 0.06641437858343124, + "grad_norm": 0.025100823491811752, "learning_rate": 6.710899440122156e-06, - "loss": 0.0432, + "loss": 0.0212, "step": 9130 }, { "epoch": 3.5885355319984296, - "grad_norm": 0.021912194788455963, + "grad_norm": 0.017144978046417236, "learning_rate": 6.699992728859158e-06, - "loss": 0.0362, + "loss": 0.0551, "step": 9140 }, { "epoch": 3.5924617196702, - "grad_norm": 0.5164591670036316, + "grad_norm": 0.05097698047757149, "learning_rate": 6.689086017596161e-06, - "loss": 0.1446, + "loss": 0.1419, "step": 9150 }, { "epoch": 3.596387907341971, - "grad_norm": 0.08118721097707748, + "grad_norm": 0.03761767968535423, "learning_rate": 6.6781793063331634e-06, - "loss": 0.1533, + "loss": 0.1504, "step": 9160 }, { "epoch": 3.6003140950137418, - "grad_norm": 0.20599114894866943, + "grad_norm": 9.327248573303223, "learning_rate": 6.667272595070167e-06, - "loss": 0.1573, + "loss": 0.231, "step": 9170 }, { "epoch": 3.604240282685512, - "grad_norm": 30.359926223754883, + "grad_norm": 9.968353271484375, "learning_rate": 6.656365883807169e-06, - "loss": 0.5229, + "loss": 0.2463, "step": 9180 }, { "epoch": 3.608166470357283, - "grad_norm": 6.81015157699585, + "grad_norm": 0.8269527554512024, "learning_rate": 6.645459172544173e-06, - "loss": 0.0774, + "loss": 0.1203, "step": 9190 }, { "epoch": 3.612092658029054, - "grad_norm": 6.422562599182129, + "grad_norm": 6.291353225708008, "learning_rate": 6.634552461281175e-06, - "loss": 0.2609, + "loss": 0.261, "step": 9200 }, { "epoch": 3.6160188457008244, - "grad_norm": 0.8523460626602173, + "grad_norm": 4.791230201721191, "learning_rate": 6.623645750018178e-06, - "loss": 0.0676, + "loss": 0.1243, "step": 9210 }, { "epoch": 3.6199450333725953, - "grad_norm": 0.44231483340263367, + "grad_norm": 0.9438167810440063, "learning_rate": 6.612739038755181e-06, - "loss": 0.1681, + "loss": 0.1178, "step": 9220 }, { "epoch": 3.6238712210443658, - "grad_norm": 0.6610956788063049, + "grad_norm": 0.17735238373279572, "learning_rate": 6.601832327492184e-06, - "loss": 0.1633, + "loss": 0.2039, "step": 9230 }, { "epoch": 3.6277974087161367, - "grad_norm": 0.0642637088894844, + "grad_norm": 0.02930208295583725, "learning_rate": 6.590925616229186e-06, - "loss": 0.0459, + "loss": 0.0345, "step": 9240 }, { "epoch": 3.631723596387907, - "grad_norm": 1.4521757364273071, + "grad_norm": 2.7516653537750244, "learning_rate": 6.580018904966189e-06, - "loss": 0.0653, + "loss": 0.026, "step": 9250 }, { "epoch": 3.635649784059678, - "grad_norm": 0.11491431295871735, + "grad_norm": 0.060328368097543716, "learning_rate": 6.569112193703192e-06, - "loss": 0.0637, + "loss": 0.0981, "step": 9260 }, { "epoch": 3.639575971731449, - "grad_norm": 2.7123095989227295, + "grad_norm": 1.6880004405975342, "learning_rate": 6.558205482440195e-06, - "loss": 0.2497, + "loss": 0.3311, "step": 9270 }, { "epoch": 3.6435021594032193, - "grad_norm": 1.7606005668640137, + "grad_norm": 1.8217518329620361, "learning_rate": 6.5472987711771975e-06, - "loss": 0.1453, + "loss": 0.0887, "step": 9280 }, { "epoch": 3.64742834707499, - "grad_norm": 0.39487242698669434, + "grad_norm": 1.2838503122329712, "learning_rate": 6.536392059914201e-06, - "loss": 0.022, + "loss": 0.0153, "step": 9290 }, { "epoch": 3.651354534746761, - "grad_norm": 0.953216552734375, + "grad_norm": 2.0000507831573486, "learning_rate": 6.525485348651203e-06, - "loss": 0.4276, + "loss": 0.2079, "step": 9300 }, { "epoch": 3.6552807224185315, - "grad_norm": 2.0590908527374268, + "grad_norm": 0.24762168526649475, "learning_rate": 6.514578637388207e-06, - "loss": 0.1166, + "loss": 0.2219, "step": 9310 }, { "epoch": 3.6592069100903024, - "grad_norm": 7.356611728668213, + "grad_norm": 6.612955570220947, "learning_rate": 6.503671926125209e-06, - "loss": 0.1384, + "loss": 0.2401, "step": 9320 }, { "epoch": 3.6631330977620733, - "grad_norm": 0.5277950167655945, + "grad_norm": 1.7870872020721436, "learning_rate": 6.492765214862212e-06, - "loss": 0.0266, + "loss": 0.0294, "step": 9330 }, { "epoch": 3.6670592854338437, - "grad_norm": 0.006288713775575161, + "grad_norm": 0.016380473971366882, "learning_rate": 6.4818585035992145e-06, - "loss": 0.1825, + "loss": 0.1319, "step": 9340 }, { "epoch": 3.670985473105614, - "grad_norm": 0.22776515781879425, + "grad_norm": 0.11230547726154327, "learning_rate": 6.470951792336218e-06, - "loss": 0.2692, + "loss": 0.1771, "step": 9350 }, { "epoch": 3.674911660777385, - "grad_norm": 47.333412170410156, + "grad_norm": 5.207958698272705, "learning_rate": 6.460045081073221e-06, - "loss": 0.2632, + "loss": 0.3415, "step": 9360 }, { "epoch": 3.678837848449156, - "grad_norm": 2.111056327819824, + "grad_norm": 0.14589035511016846, "learning_rate": 6.449138369810223e-06, - "loss": 0.0479, + "loss": 0.0437, "step": 9370 }, { "epoch": 3.6827640361209264, - "grad_norm": 2.4685513973236084, + "grad_norm": 0.6507335305213928, "learning_rate": 6.438231658547226e-06, - "loss": 0.0424, + "loss": 0.0223, "step": 9380 }, { "epoch": 3.6866902237926973, - "grad_norm": 0.956051230430603, + "grad_norm": 2.1637909412384033, "learning_rate": 6.427324947284229e-06, - "loss": 0.1642, + "loss": 0.1455, "step": 9390 }, { "epoch": 3.690616411464468, - "grad_norm": 0.009922314435243607, + "grad_norm": 0.006028300151228905, "learning_rate": 6.416418236021232e-06, - "loss": 0.3123, + "loss": 0.1776, "step": 9400 }, { "epoch": 3.6945425991362386, - "grad_norm": 0.007941363379359245, + "grad_norm": 0.0024832827039062977, "learning_rate": 6.405511524758234e-06, - "loss": 0.1728, + "loss": 0.1448, "step": 9410 }, { "epoch": 3.6984687868080095, - "grad_norm": 36.44522476196289, + "grad_norm": 27.421045303344727, "learning_rate": 6.3946048134952374e-06, - "loss": 0.157, + "loss": 0.1525, "step": 9420 }, { "epoch": 3.7023949744797804, - "grad_norm": 3.0550100803375244, + "grad_norm": 11.622103691101074, "learning_rate": 6.383698102232241e-06, - "loss": 0.1541, + "loss": 0.1764, "step": 9430 }, { "epoch": 3.706321162151551, - "grad_norm": 12.711790084838867, + "grad_norm": 0.16820883750915527, "learning_rate": 6.372791390969243e-06, - "loss": 0.0583, + "loss": 0.1259, "step": 9440 }, { "epoch": 3.7102473498233217, - "grad_norm": 0.06454482674598694, + "grad_norm": 0.4640953242778778, "learning_rate": 6.361884679706246e-06, - "loss": 0.0795, + "loss": 0.0485, "step": 9450 }, { "epoch": 3.714173537495092, - "grad_norm": 128.607666015625, + "grad_norm": 65.42041778564453, "learning_rate": 6.3509779684432485e-06, - "loss": 0.3274, + "loss": 0.1823, "step": 9460 }, { "epoch": 3.718099725166863, - "grad_norm": 0.020870935171842575, + "grad_norm": 0.011023514904081821, "learning_rate": 6.340071257180252e-06, - "loss": 0.3496, + "loss": 0.1441, "step": 9470 }, { "epoch": 3.7220259128386335, - "grad_norm": 1.192949652671814, + "grad_norm": 3.3063976764678955, "learning_rate": 6.3291645459172544e-06, - "loss": 0.2843, + "loss": 0.2261, "step": 9480 }, { "epoch": 3.7259521005104044, - "grad_norm": 20.26271629333496, + "grad_norm": 10.478616714477539, "learning_rate": 6.318257834654258e-06, - "loss": 0.3139, + "loss": 0.3548, "step": 9490 }, { "epoch": 3.7298782881821753, - "grad_norm": 56.30705642700195, + "grad_norm": 73.88912963867188, "learning_rate": 6.30735112339126e-06, - "loss": 0.3247, + "loss": 0.2325, "step": 9500 }, { "epoch": 3.7338044758539457, - "grad_norm": 2.0628185272216797, + "grad_norm": 1.4647951126098633, "learning_rate": 6.296444412128263e-06, - "loss": 0.0274, + "loss": 0.0362, "step": 9510 }, { "epoch": 3.7377306635257166, - "grad_norm": 67.16304016113281, + "grad_norm": 46.40589141845703, "learning_rate": 6.285537700865266e-06, - "loss": 0.271, + "loss": 0.3297, "step": 9520 }, { "epoch": 3.7416568511974875, - "grad_norm": 0.9274380207061768, + "grad_norm": 0.9199576377868652, "learning_rate": 6.274630989602269e-06, - "loss": 0.0085, + "loss": 0.0114, "step": 9530 }, { "epoch": 3.745583038869258, - "grad_norm": 26.669519424438477, + "grad_norm": 38.47342300415039, "learning_rate": 6.2637242783392715e-06, - "loss": 0.0543, + "loss": 0.2211, "step": 9540 }, { "epoch": 3.749509226541029, - "grad_norm": 56.41508102416992, + "grad_norm": 96.15496063232422, "learning_rate": 6.252817567076274e-06, - "loss": 0.4007, + "loss": 0.1758, "step": 9550 }, { "epoch": 3.7534354142127992, - "grad_norm": 0.26100969314575195, + "grad_norm": 0.4806312322616577, "learning_rate": 6.241910855813277e-06, - "loss": 0.3593, + "loss": 0.3929, "step": 9560 }, { "epoch": 3.75736160188457, - "grad_norm": 0.24605228006839752, + "grad_norm": 0.07336679100990295, "learning_rate": 6.231004144550281e-06, - "loss": 0.1276, + "loss": 0.1249, "step": 9570 }, { "epoch": 3.7612877895563406, - "grad_norm": 36.539207458496094, + "grad_norm": 36.373573303222656, "learning_rate": 6.2200974332872825e-06, - "loss": 0.048, + "loss": 0.0509, "step": 9580 }, { "epoch": 3.7652139772281115, - "grad_norm": 9.589261054992676, + "grad_norm": 33.07835388183594, "learning_rate": 6.209190722024286e-06, - "loss": 0.1379, + "loss": 0.1806, "step": 9590 }, { "epoch": 3.7691401648998824, - "grad_norm": 0.6312021613121033, + "grad_norm": 1.7554200887680054, "learning_rate": 6.1982840107612885e-06, - "loss": 0.1315, + "loss": 0.1221, "step": 9600 }, { "epoch": 3.773066352571653, - "grad_norm": 1.6168729066848755, + "grad_norm": 1.750022053718567, "learning_rate": 6.187377299498292e-06, - "loss": 0.1211, + "loss": 0.2172, "step": 9610 }, { "epoch": 3.7769925402434237, - "grad_norm": 1.7230174541473389, + "grad_norm": 2.586303234100342, "learning_rate": 6.176470588235294e-06, - "loss": 0.2155, + "loss": 0.2448, "step": 9620 }, { "epoch": 3.7809187279151946, - "grad_norm": 3.2534639835357666, + "grad_norm": 2.4197680950164795, "learning_rate": 6.165563876972297e-06, - "loss": 0.0139, + "loss": 0.0366, "step": 9630 }, { "epoch": 3.784844915586965, - "grad_norm": 4.82680082321167, + "grad_norm": 5.7693586349487305, "learning_rate": 6.1546571657092995e-06, - "loss": 0.1052, + "loss": 0.0842, "step": 9640 }, { "epoch": 3.788771103258736, - "grad_norm": 3.023184299468994, + "grad_norm": 2.4173743724823, "learning_rate": 6.143750454446303e-06, - "loss": 0.1659, + "loss": 0.1784, "step": 9650 }, { "epoch": 3.7926972909305063, - "grad_norm": 81.79749298095703, + "grad_norm": 157.00308227539062, "learning_rate": 6.132843743183306e-06, - "loss": 0.2958, + "loss": 0.3371, "step": 9660 }, { "epoch": 3.7966234786022772, - "grad_norm": 2.416285514831543, + "grad_norm": 1.6241252422332764, "learning_rate": 6.121937031920308e-06, - "loss": 0.189, + "loss": 0.167, "step": 9670 }, { "epoch": 3.8005496662740477, - "grad_norm": 3.828796625137329, + "grad_norm": 11.806884765625, "learning_rate": 6.1110303206573114e-06, - "loss": 0.2789, + "loss": 0.3754, "step": 9680 }, { "epoch": 3.8044758539458186, - "grad_norm": 0.3105577826499939, + "grad_norm": 0.38643673062324524, "learning_rate": 6.100123609394314e-06, - "loss": 0.0992, + "loss": 0.0885, "step": 9690 }, { "epoch": 3.8084020416175894, - "grad_norm": 48.58467483520508, + "grad_norm": 16.490985870361328, "learning_rate": 6.089216898131317e-06, - "loss": 0.1348, + "loss": 0.0337, "step": 9700 }, { "epoch": 3.81232822928936, - "grad_norm": 9.342592239379883, + "grad_norm": 15.258231163024902, "learning_rate": 6.078310186868319e-06, - "loss": 0.1754, + "loss": 0.1415, "step": 9710 }, { "epoch": 3.8162544169611308, - "grad_norm": 69.68245697021484, + "grad_norm": 51.53931427001953, "learning_rate": 6.0674034756053225e-06, - "loss": 0.2722, + "loss": 0.2783, "step": 9720 }, { "epoch": 3.8201806046329017, - "grad_norm": 10.57545280456543, + "grad_norm": 1.3561761379241943, "learning_rate": 6.056496764342326e-06, - "loss": 0.0323, + "loss": 0.0341, "step": 9730 }, { "epoch": 3.824106792304672, - "grad_norm": 37.80369186401367, + "grad_norm": 51.188941955566406, "learning_rate": 6.0455900530793284e-06, - "loss": 0.3336, + "loss": 0.2983, "step": 9740 }, { "epoch": 3.828032979976443, - "grad_norm": 0.6457146406173706, + "grad_norm": 0.10509059578180313, "learning_rate": 6.034683341816331e-06, - "loss": 0.148, + "loss": 0.1843, "step": 9750 }, { "epoch": 3.8319591676482134, - "grad_norm": 1.19339919090271, + "grad_norm": 1.3554174900054932, "learning_rate": 6.0237766305533336e-06, - "loss": 0.0183, + "loss": 0.0189, "step": 9760 }, { "epoch": 3.8358853553199843, - "grad_norm": 43.22340774536133, + "grad_norm": 84.62676239013672, "learning_rate": 6.012869919290337e-06, - "loss": 0.1705, + "loss": 0.177, "step": 9770 }, { "epoch": 3.8398115429917548, - "grad_norm": 2.7199602127075195, + "grad_norm": 1.4999524354934692, "learning_rate": 6.0019632080273395e-06, - "loss": 0.0221, + "loss": 0.0299, "step": 9780 }, { "epoch": 3.8437377306635256, - "grad_norm": 65.54672241210938, + "grad_norm": 13.336804389953613, "learning_rate": 5.991056496764342e-06, - "loss": 0.1357, + "loss": 0.0823, "step": 9790 }, { "epoch": 3.8476639183352965, - "grad_norm": 0.37672173976898193, + "grad_norm": 0.19731977581977844, "learning_rate": 5.9801497855013455e-06, - "loss": 0.3666, + "loss": 0.2347, "step": 9800 }, { "epoch": 3.851590106007067, - "grad_norm": 1.9424209594726562, + "grad_norm": 0.47505125403404236, "learning_rate": 5.969243074238348e-06, - "loss": 0.0479, + "loss": 0.0696, "step": 9810 }, { "epoch": 3.855516293678838, - "grad_norm": 4.423801422119141, + "grad_norm": 2.611741542816162, "learning_rate": 5.958336362975351e-06, - "loss": 0.1945, + "loss": 0.1248, "step": 9820 }, { "epoch": 3.8594424813506087, - "grad_norm": 0.6105039119720459, + "grad_norm": 0.4827256500720978, "learning_rate": 5.947429651712354e-06, - "loss": 0.3908, + "loss": 0.205, "step": 9830 }, { "epoch": 3.863368669022379, - "grad_norm": 0.052522335201501846, + "grad_norm": 0.6738161444664001, "learning_rate": 5.9365229404493565e-06, - "loss": 0.1498, + "loss": 0.1709, "step": 9840 }, { "epoch": 3.86729485669415, - "grad_norm": 44.8753547668457, + "grad_norm": 35.567047119140625, "learning_rate": 5.925616229186359e-06, - "loss": 0.0797, + "loss": 0.044, "step": 9850 }, { "epoch": 3.871221044365921, - "grad_norm": 1.8650813102722168, + "grad_norm": 1.7769533395767212, "learning_rate": 5.9147095179233625e-06, - "loss": 0.1529, + "loss": 0.1656, "step": 9860 }, { "epoch": 3.8751472320376914, - "grad_norm": 0.16531294584274292, + "grad_norm": 0.1485966593027115, "learning_rate": 5.903802806660366e-06, - "loss": 0.0156, + "loss": 0.0518, "step": 9870 }, { "epoch": 3.879073419709462, - "grad_norm": 1.6942330598831177, + "grad_norm": 2.069671392440796, "learning_rate": 5.892896095397368e-06, - "loss": 0.0403, + "loss": 0.034, "step": 9880 }, { "epoch": 3.8829996073812327, - "grad_norm": 0.08303657174110413, + "grad_norm": 0.9944360852241516, "learning_rate": 5.881989384134371e-06, - "loss": 0.4119, + "loss": 0.2451, "step": 9890 }, { "epoch": 3.8869257950530036, - "grad_norm": 2.739856719970703, + "grad_norm": 3.1562142372131348, "learning_rate": 5.8710826728713735e-06, - "loss": 0.223, + "loss": 0.3003, "step": 9900 }, { "epoch": 3.890851982724774, - "grad_norm": 0.2548028528690338, + "grad_norm": 0.6076232194900513, "learning_rate": 5.860175961608377e-06, - "loss": 0.3655, + "loss": 0.3002, "step": 9910 }, { "epoch": 3.894778170396545, - "grad_norm": 4.5191473960876465, + "grad_norm": 3.9306066036224365, "learning_rate": 5.849269250345379e-06, - "loss": 0.1516, + "loss": 0.1058, "step": 9920 }, { "epoch": 3.898704358068316, - "grad_norm": 4.687385559082031, + "grad_norm": 3.2051665782928467, "learning_rate": 5.838362539082382e-06, - "loss": 0.1118, + "loss": 0.2676, "step": 9930 }, { "epoch": 3.9026305457400863, - "grad_norm": 5.724741458892822, + "grad_norm": 5.449111461639404, "learning_rate": 5.8274558278193854e-06, - "loss": 0.5923, + "loss": 0.5856, "step": 9940 }, { "epoch": 3.906556733411857, - "grad_norm": 2.977881669998169, + "grad_norm": 4.242855072021484, "learning_rate": 5.816549116556388e-06, - "loss": 0.0588, + "loss": 0.051, "step": 9950 }, { "epoch": 3.910482921083628, - "grad_norm": 0.08356577903032303, + "grad_norm": 0.2990489602088928, "learning_rate": 5.8056424052933905e-06, - "loss": 0.0909, + "loss": 0.0536, "step": 9960 }, { "epoch": 3.9144091087553985, - "grad_norm": 3.4132349491119385, + "grad_norm": 4.186092376708984, "learning_rate": 5.794735694030393e-06, - "loss": 0.0258, + "loss": 0.0393, "step": 9970 }, { "epoch": 3.9183352964271694, - "grad_norm": 2.5122876167297363, + "grad_norm": 1.7896088361740112, "learning_rate": 5.7838289827673965e-06, - "loss": 0.1671, + "loss": 0.0621, "step": 9980 }, { "epoch": 3.92226148409894, - "grad_norm": 2.8513975143432617, + "grad_norm": 3.6926989555358887, "learning_rate": 5.772922271504399e-06, - "loss": 0.0545, + "loss": 0.0514, "step": 9990 }, { "epoch": 3.9261876717707107, - "grad_norm": 9.997546195983887, + "grad_norm": 7.654712200164795, "learning_rate": 5.7620155602414024e-06, - "loss": 0.323, + "loss": 0.3637, "step": 10000 }, { "epoch": 3.930113859442481, - "grad_norm": 0.003472641808912158, + "grad_norm": 0.002584452275186777, "learning_rate": 5.751108848978404e-06, - "loss": 0.1881, + "loss": 0.2348, "step": 10010 }, { "epoch": 3.934040047114252, - "grad_norm": 6.679306507110596, + "grad_norm": 2.818413734436035, "learning_rate": 5.7402021377154076e-06, - "loss": 0.358, + "loss": 0.3574, "step": 10020 }, { "epoch": 3.937966234786023, - "grad_norm": 2.0606131553649902, + "grad_norm": 5.139772415161133, "learning_rate": 5.729295426452411e-06, - "loss": 0.0454, + "loss": 0.0888, "step": 10030 }, { "epoch": 3.9418924224577934, - "grad_norm": 2.554800271987915, + "grad_norm": 1.490856647491455, "learning_rate": 5.7183887151894135e-06, - "loss": 0.1365, + "loss": 0.0982, "step": 10040 }, { "epoch": 3.9458186101295643, - "grad_norm": 54.287200927734375, + "grad_norm": 51.38816833496094, "learning_rate": 5.707482003926416e-06, - "loss": 0.302, + "loss": 0.2233, "step": 10050 }, { "epoch": 3.949744797801335, - "grad_norm": 4.47686243057251, + "grad_norm": 0.35356801748275757, "learning_rate": 5.696575292663419e-06, - "loss": 0.0313, + "loss": 0.0143, "step": 10060 }, { "epoch": 3.9536709854731056, - "grad_norm": 0.5483373999595642, + "grad_norm": 0.6045596599578857, "learning_rate": 5.685668581400422e-06, - "loss": 0.0785, + "loss": 0.1494, "step": 10070 }, { "epoch": 3.9575971731448765, - "grad_norm": 0.9867703914642334, + "grad_norm": 0.5573551058769226, "learning_rate": 5.6747618701374246e-06, - "loss": 0.1033, + "loss": 0.145, "step": 10080 }, { "epoch": 3.961523360816647, - "grad_norm": 0.2987212836742401, + "grad_norm": 0.9685236811637878, "learning_rate": 5.663855158874427e-06, - "loss": 0.0193, + "loss": 0.0126, "step": 10090 }, { "epoch": 3.965449548488418, - "grad_norm": 41.47883224487305, + "grad_norm": 55.8563117980957, "learning_rate": 5.6529484476114305e-06, - "loss": 0.2009, + "loss": 0.133, "step": 10100 }, { "epoch": 3.9693757361601882, - "grad_norm": 8.49411392211914, + "grad_norm": 5.721922397613525, "learning_rate": 5.642041736348433e-06, - "loss": 0.0119, + "loss": 0.0072, "step": 10110 }, { "epoch": 3.973301923831959, - "grad_norm": 3.3371706008911133, + "grad_norm": 2.285773515701294, "learning_rate": 5.6311350250854365e-06, - "loss": 0.1729, + "loss": 0.1487, "step": 10120 }, { "epoch": 3.97722811150373, - "grad_norm": 2.105215549468994, + "grad_norm": 3.1833279132843018, "learning_rate": 5.620228313822439e-06, - "loss": 0.1653, + "loss": 0.0703, "step": 10130 }, { "epoch": 3.9811542991755005, - "grad_norm": 14.59489917755127, + "grad_norm": 10.577066421508789, "learning_rate": 5.609321602559442e-06, - "loss": 0.0785, + "loss": 0.0569, "step": 10140 }, { "epoch": 3.9850804868472713, - "grad_norm": 0.1921074390411377, + "grad_norm": 0.18988385796546936, "learning_rate": 5.598414891296444e-06, - "loss": 0.2141, + "loss": 0.3631, "step": 10150 }, { "epoch": 3.9890066745190422, - "grad_norm": 5.136042594909668, + "grad_norm": 7.493011474609375, "learning_rate": 5.5875081800334475e-06, - "loss": 0.0432, + "loss": 0.0584, "step": 10160 }, { "epoch": 3.9929328621908127, - "grad_norm": 0.22844859957695007, + "grad_norm": 0.2774406969547272, "learning_rate": 5.576601468770451e-06, - "loss": 0.3779, + "loss": 0.5554, "step": 10170 }, { "epoch": 3.9968590498625836, - "grad_norm": 1.2341351509094238, + "grad_norm": 1.1551597118377686, "learning_rate": 5.565694757507453e-06, - "loss": 0.2305, + "loss": 0.2642, "step": 10180 }, { "epoch": 4.0, - "eval_loss": 0.21103519201278687, - "eval_runtime": 11.4328, - "eval_samples_per_second": 198.026, - "eval_steps_per_second": 24.753, + "eval_loss": 0.23891106247901917, + "eval_runtime": 12.1068, + "eval_samples_per_second": 187.002, + "eval_steps_per_second": 23.375, "step": 10188 } ],