diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -11,6326 +11,6326 @@ "log_history": [ { "epoch": 0.004434589800443459, - "grad_norm": 4.369466304779053, - "learning_rate": 1.098901098901099e-07, - "loss": 1.865166187286377, + "grad_norm": 3.7738454341888428, + "learning_rate": 5.494505494505495e-08, + "loss": 1.8639533519744873, "step": 2 }, { "epoch": 0.008869179600886918, - "grad_norm": 6.864917278289795, - "learning_rate": 3.296703296703297e-07, - "loss": 2.130244493484497, + "grad_norm": 6.427864074707031, + "learning_rate": 1.6483516483516484e-07, + "loss": 2.130033016204834, "step": 4 }, { "epoch": 0.013303769401330377, - "grad_norm": 3.4614903926849365, - "learning_rate": 5.494505494505495e-07, - "loss": 1.9028818607330322, + "grad_norm": 4.859611511230469, + "learning_rate": 2.7472527472527475e-07, + "loss": 1.9051225185394287, "step": 6 }, { "epoch": 0.017738359201773836, - "grad_norm": 1.170404076576233, - "learning_rate": 7.692307692307694e-07, - "loss": 1.815584659576416, + "grad_norm": 2.374562978744507, + "learning_rate": 3.846153846153847e-07, + "loss": 1.8245947360992432, "step": 8 }, { "epoch": 0.022172949002217297, - "grad_norm": 4.44560432434082, - "learning_rate": 9.890109890109891e-07, - "loss": 1.616060733795166, + "grad_norm": 3.0884368419647217, + "learning_rate": 4.945054945054946e-07, + "loss": 1.6384509801864624, "step": 10 }, { "epoch": 0.026607538802660754, - "grad_norm": 3.004225730895996, - "learning_rate": 1.2087912087912089e-06, - "loss": 2.0138046741485596, + "grad_norm": 1.9582329988479614, + "learning_rate": 6.043956043956044e-07, + "loss": 2.195298671722412, "step": 12 }, { "epoch": 0.031042128603104215, - "grad_norm": 6.603450298309326, - "learning_rate": 1.4285714285714286e-06, - "loss": 1.5741312503814697, + "grad_norm": 24.451740264892578, + "learning_rate": 7.142857142857143e-07, + "loss": 1.955391764640808, "step": 14 }, { "epoch": 0.03547671840354767, - "grad_norm": 3.626671552658081, - "learning_rate": 1.6483516483516484e-06, - "loss": 1.1557213068008423, + "grad_norm": 4.774598121643066, + "learning_rate": 8.241758241758242e-07, + "loss": 1.51339852809906, "step": 16 }, { "epoch": 0.03991130820399113, - "grad_norm": 1.9218900203704834, - "learning_rate": 1.8681318681318684e-06, - "loss": 1.167083978652954, + "grad_norm": 3.925572156906128, + "learning_rate": 9.340659340659342e-07, + "loss": 1.449619174003601, "step": 18 }, { "epoch": 0.04434589800443459, - "grad_norm": 27.988502502441406, - "learning_rate": 2.0879120879120883e-06, - "loss": 1.2604156732559204, + "grad_norm": 4.0381388664245605, + "learning_rate": 1.0439560439560442e-06, + "loss": 1.6513968706130981, "step": 20 }, { "epoch": 0.04878048780487805, - "grad_norm": 1.9937968254089355, - "learning_rate": 2.307692307692308e-06, - "loss": 1.300764799118042, + "grad_norm": 8.873974800109863, + "learning_rate": 1.153846153846154e-06, + "loss": 1.4907894134521484, "step": 22 }, { "epoch": 0.05321507760532151, - "grad_norm": 1.050702691078186, - "learning_rate": 2.5274725274725274e-06, - "loss": 1.5187550783157349, + "grad_norm": 1.0349433422088623, + "learning_rate": 1.2637362637362637e-06, + "loss": 1.594868779182434, "step": 24 }, { "epoch": 0.057649667405764965, - "grad_norm": 0.8804575204849243, - "learning_rate": 2.7472527472527476e-06, - "loss": 1.2839621305465698, + "grad_norm": 1.1628096103668213, + "learning_rate": 1.3736263736263738e-06, + "loss": 1.5131434202194214, "step": 26 }, { "epoch": 0.06208425720620843, - "grad_norm": 1.4461461305618286, - "learning_rate": 2.9670329670329673e-06, - "loss": 1.0896079540252686, + "grad_norm": 1.3587442636489868, + "learning_rate": 1.4835164835164837e-06, + "loss": 1.1953634023666382, "step": 28 }, { "epoch": 0.06651884700665188, - "grad_norm": 1.1735917329788208, - "learning_rate": 3.1868131868131867e-06, - "loss": 1.2101552486419678, + "grad_norm": 1.4870824813842773, + "learning_rate": 1.5934065934065933e-06, + "loss": 1.2982568740844727, "step": 30 }, { "epoch": 0.07095343680709534, - "grad_norm": 1.792823076248169, - "learning_rate": 3.406593406593407e-06, - "loss": 1.1142032146453857, + "grad_norm": 3.842538595199585, + "learning_rate": 1.7032967032967034e-06, + "loss": 1.2121697664260864, "step": 32 }, { "epoch": 0.07538802660753881, - "grad_norm": 0.8939670920372009, - "learning_rate": 3.6263736263736266e-06, - "loss": 1.4834434986114502, + "grad_norm": 0.8271878957748413, + "learning_rate": 1.8131868131868133e-06, + "loss": 1.571251630783081, "step": 34 }, { "epoch": 0.07982261640798226, - "grad_norm": 5.579226970672607, - "learning_rate": 3.846153846153847e-06, - "loss": 1.2127963304519653, + "grad_norm": 38.774314880371094, + "learning_rate": 1.9230769230769234e-06, + "loss": 1.330682635307312, "step": 36 }, { "epoch": 0.08425720620842572, - "grad_norm": 11.267181396484375, - "learning_rate": 4.065934065934066e-06, - "loss": 1.535402774810791, + "grad_norm": 6.1416730880737305, + "learning_rate": 2.032967032967033e-06, + "loss": 1.591033935546875, "step": 38 }, { "epoch": 0.08869179600886919, - "grad_norm": 1.9921720027923584, - "learning_rate": 4.2857142857142855e-06, - "loss": 0.7790605425834656, + "grad_norm": 2.0341765880584717, + "learning_rate": 2.1428571428571427e-06, + "loss": 0.8703436851501465, "step": 40 }, { "epoch": 0.09312638580931264, - "grad_norm": 1.5248279571533203, - "learning_rate": 4.505494505494506e-06, - "loss": 1.3986105918884277, + "grad_norm": 1.3116881847381592, + "learning_rate": 2.252747252747253e-06, + "loss": 1.460440754890442, "step": 42 }, { "epoch": 0.0975609756097561, - "grad_norm": 0.9806777238845825, - "learning_rate": 4.725274725274726e-06, - "loss": 1.3404041528701782, + "grad_norm": 0.6786495447158813, + "learning_rate": 2.362637362637363e-06, + "loss": 1.403860330581665, "step": 44 }, { "epoch": 0.10199556541019955, - "grad_norm": 1.6693533658981323, - "learning_rate": 4.945054945054946e-06, - "loss": 1.3681906461715698, + "grad_norm": 1.2536464929580688, + "learning_rate": 2.472527472527473e-06, + "loss": 1.4209010601043701, "step": 46 }, { "epoch": 0.10643015521064302, - "grad_norm": 2.5410993099212646, - "learning_rate": 5.164835164835166e-06, - "loss": 1.6145933866500854, + "grad_norm": 1.742255449295044, + "learning_rate": 2.582417582417583e-06, + "loss": 1.6356741189956665, "step": 48 }, { "epoch": 0.11086474501108648, - "grad_norm": 1.414282202720642, - "learning_rate": 5.384615384615385e-06, - "loss": 1.4454115629196167, + "grad_norm": 1.2730481624603271, + "learning_rate": 2.6923076923076923e-06, + "loss": 1.4965991973876953, "step": 50 }, { "epoch": 0.11529933481152993, - "grad_norm": 1.7944461107254028, - "learning_rate": 5.604395604395605e-06, - "loss": 1.3703702688217163, + "grad_norm": 1.8393515348434448, + "learning_rate": 2.8021978021978024e-06, + "loss": 1.4182051420211792, "step": 52 }, { "epoch": 0.1197339246119734, - "grad_norm": 1.0293529033660889, - "learning_rate": 5.824175824175825e-06, - "loss": 1.368741750717163, + "grad_norm": 1.2401989698410034, + "learning_rate": 2.9120879120879125e-06, + "loss": 1.4068325757980347, "step": 54 }, { "epoch": 0.12416851441241686, - "grad_norm": 2.983600616455078, - "learning_rate": 6.043956043956044e-06, - "loss": 1.1135094165802002, + "grad_norm": 4.087460517883301, + "learning_rate": 3.021978021978022e-06, + "loss": 1.1563267707824707, "step": 56 }, { "epoch": 0.1286031042128603, - "grad_norm": 1.0177669525146484, - "learning_rate": 6.2637362637362645e-06, - "loss": 1.3418025970458984, + "grad_norm": 1.3389958143234253, + "learning_rate": 3.1318681318681323e-06, + "loss": 1.366070032119751, "step": 58 }, { "epoch": 0.13303769401330376, - "grad_norm": 0.7596590518951416, - "learning_rate": 6.483516483516485e-06, - "loss": 1.3871124982833862, + "grad_norm": 0.9454852342605591, + "learning_rate": 3.2417582417582424e-06, + "loss": 1.421806812286377, "step": 60 }, { "epoch": 0.13747228381374724, - "grad_norm": 1.77037513256073, - "learning_rate": 6.703296703296703e-06, - "loss": 1.336590051651001, + "grad_norm": 1.2951329946517944, + "learning_rate": 3.3516483516483516e-06, + "loss": 1.3717670440673828, "step": 62 }, { "epoch": 0.1419068736141907, - "grad_norm": 1.6162333488464355, - "learning_rate": 6.923076923076923e-06, - "loss": 1.3942538499832153, + "grad_norm": 1.5734539031982422, + "learning_rate": 3.4615384615384617e-06, + "loss": 1.4363198280334473, "step": 64 }, { "epoch": 0.14634146341463414, - "grad_norm": 3.2967028617858887, - "learning_rate": 7.1428571428571436e-06, - "loss": 1.4562139511108398, + "grad_norm": 1.5510807037353516, + "learning_rate": 3.5714285714285718e-06, + "loss": 1.4910911321640015, "step": 66 }, { "epoch": 0.15077605321507762, - "grad_norm": 0.9108039140701294, - "learning_rate": 7.362637362637364e-06, - "loss": 1.324294924736023, + "grad_norm": 1.0450427532196045, + "learning_rate": 3.681318681318682e-06, + "loss": 1.355659008026123, "step": 68 }, { "epoch": 0.15521064301552107, - "grad_norm": 3.4899282455444336, - "learning_rate": 7.582417582417583e-06, - "loss": 1.3148343563079834, + "grad_norm": 0.8717012405395508, + "learning_rate": 3.7912087912087915e-06, + "loss": 1.3062938451766968, "step": 70 }, { "epoch": 0.15964523281596452, - "grad_norm": 1.4243505001068115, - "learning_rate": 7.802197802197802e-06, - "loss": 1.3413867950439453, + "grad_norm": 0.879036545753479, + "learning_rate": 3.901098901098901e-06, + "loss": 1.3595796823501587, "step": 72 }, { "epoch": 0.164079822616408, - "grad_norm": 0.9133158922195435, - "learning_rate": 8.021978021978023e-06, - "loss": 1.2512249946594238, + "grad_norm": 0.9112725257873535, + "learning_rate": 4.010989010989012e-06, + "loss": 1.26053786277771, "step": 74 }, { "epoch": 0.16851441241685144, - "grad_norm": 1.0660244226455688, - "learning_rate": 8.241758241758243e-06, - "loss": 1.3262264728546143, + "grad_norm": 1.1469699144363403, + "learning_rate": 4.120879120879121e-06, + "loss": 1.3409998416900635, "step": 76 }, { "epoch": 0.1729490022172949, - "grad_norm": 1.9149298667907715, - "learning_rate": 8.461538461538462e-06, - "loss": 1.3764103651046753, + "grad_norm": 1.1423449516296387, + "learning_rate": 4.230769230769231e-06, + "loss": 1.3864156007766724, "step": 78 }, { "epoch": 0.17738359201773837, - "grad_norm": 1.6447206735610962, - "learning_rate": 8.681318681318681e-06, - "loss": 1.3489444255828857, + "grad_norm": 3.403249502182007, + "learning_rate": 4.340659340659341e-06, + "loss": 1.3426871299743652, "step": 80 }, { "epoch": 0.18181818181818182, - "grad_norm": 1.179309368133545, - "learning_rate": 8.9010989010989e-06, - "loss": 0.8915775418281555, + "grad_norm": 1.019535779953003, + "learning_rate": 4.45054945054945e-06, + "loss": 0.8479611277580261, "step": 82 }, { "epoch": 0.18625277161862527, - "grad_norm": 1.079959511756897, - "learning_rate": 9.120879120879122e-06, - "loss": 1.0530604124069214, + "grad_norm": 0.9872419238090515, + "learning_rate": 4.560439560439561e-06, + "loss": 1.0275837182998657, "step": 84 }, { "epoch": 0.19068736141906872, - "grad_norm": 1.5468087196350098, - "learning_rate": 9.340659340659341e-06, - "loss": 1.0116887092590332, + "grad_norm": 2.4420623779296875, + "learning_rate": 4.6703296703296706e-06, + "loss": 0.9826464653015137, "step": 86 }, { "epoch": 0.1951219512195122, - "grad_norm": 1.0850789546966553, - "learning_rate": 9.560439560439562e-06, - "loss": 1.2681747674942017, + "grad_norm": 1.1877732276916504, + "learning_rate": 4.780219780219781e-06, + "loss": 1.2627744674682617, "step": 88 }, { "epoch": 0.19955654101995565, - "grad_norm": 3.765213966369629, - "learning_rate": 9.780219780219781e-06, - "loss": 1.398827314376831, + "grad_norm": 2.3475279808044434, + "learning_rate": 4.890109890109891e-06, + "loss": 1.4005193710327148, "step": 90 }, { "epoch": 0.2039911308203991, - "grad_norm": 1.0919721126556396, - "learning_rate": 1e-05, - "loss": 1.2438007593154907, + "grad_norm": 1.02353036403656, + "learning_rate": 5e-06, + "loss": 1.2373454570770264, "step": 92 }, { "epoch": 0.20842572062084258, - "grad_norm": 1.347847819328308, - "learning_rate": 9.99996972898091e-06, - "loss": 1.2643123865127563, + "grad_norm": 0.9194503426551819, + "learning_rate": 4.999984864490455e-06, + "loss": 1.2537040710449219, "step": 94 }, { "epoch": 0.21286031042128603, - "grad_norm": 2.1960842609405518, - "learning_rate": 9.999878916330893e-06, - "loss": 1.8062665462493896, + "grad_norm": 2.111903190612793, + "learning_rate": 4.999939458165447e-06, + "loss": 1.7977383136749268, "step": 96 }, { "epoch": 0.21729490022172948, - "grad_norm": 1.4442038536071777, - "learning_rate": 9.999727563271727e-06, - "loss": 0.9751254320144653, + "grad_norm": 1.7523187398910522, + "learning_rate": 4.999863781635863e-06, + "loss": 0.9749844074249268, "step": 98 }, { "epoch": 0.22172949002217296, - "grad_norm": 1.2550246715545654, - "learning_rate": 9.999515671839682e-06, - "loss": 1.1938117742538452, + "grad_norm": 2.5243184566497803, + "learning_rate": 4.999757835919841e-06, + "loss": 1.164440393447876, "step": 100 }, { "epoch": 0.2261640798226164, - "grad_norm": 1.0004876852035522, - "learning_rate": 9.999243244885499e-06, - "loss": 1.3510534763336182, + "grad_norm": 0.87540602684021, + "learning_rate": 4.9996216224427495e-06, + "loss": 1.3278536796569824, "step": 102 }, { "epoch": 0.23059866962305986, - "grad_norm": 1.2936842441558838, - "learning_rate": 9.998910286074355e-06, - "loss": 1.0911916494369507, + "grad_norm": 1.5194096565246582, + "learning_rate": 4.999455143037178e-06, + "loss": 1.0450935363769531, "step": 104 }, { "epoch": 0.23503325942350334, - "grad_norm": 1.3704265356063843, - "learning_rate": 9.998516799885806e-06, - "loss": 1.3209298849105835, + "grad_norm": 0.8938301801681519, + "learning_rate": 4.999258399942903e-06, + "loss": 1.2851738929748535, "step": 106 }, { "epoch": 0.2394678492239468, - "grad_norm": 1.4554086923599243, - "learning_rate": 9.998062791613729e-06, - "loss": 1.513480305671692, + "grad_norm": 1.5863221883773804, + "learning_rate": 4.9990313958068645e-06, + "loss": 1.4904606342315674, "step": 108 }, { "epoch": 0.24390243902439024, - "grad_norm": 1.170039176940918, - "learning_rate": 9.997548267366255e-06, - "loss": 1.0687693357467651, + "grad_norm": 2.0058369636535645, + "learning_rate": 4.998774133683127e-06, + "loss": 1.0308055877685547, "step": 110 }, { "epoch": 0.24833702882483372, - "grad_norm": 3.054081678390503, - "learning_rate": 9.996973234065685e-06, - "loss": 1.310200810432434, + "grad_norm": 1.978546142578125, + "learning_rate": 4.9984866170328426e-06, + "loss": 1.3031495809555054, "step": 112 }, { "epoch": 0.25277161862527714, - "grad_norm": 1.4177216291427612, - "learning_rate": 9.996337699448392e-06, - "loss": 0.8176467418670654, + "grad_norm": 1.6387444734573364, + "learning_rate": 4.998168849724196e-06, + "loss": 0.7833878397941589, "step": 114 }, { "epoch": 0.2572062084257206, - "grad_norm": 1.1188538074493408, - "learning_rate": 9.995641672064726e-06, - "loss": 1.3813865184783936, + "grad_norm": 1.0411269664764404, + "learning_rate": 4.997820836032363e-06, + "loss": 1.3135759830474854, "step": 116 }, { "epoch": 0.2616407982261641, - "grad_norm": 0.9215674996376038, - "learning_rate": 9.994885161278885e-06, - "loss": 1.1077316999435425, + "grad_norm": 0.7238956689834595, + "learning_rate": 4.997442580639443e-06, + "loss": 1.0706069469451904, "step": 118 }, { "epoch": 0.2660753880266075, - "grad_norm": 1.6228851079940796, - "learning_rate": 9.994068177268807e-06, - "loss": 1.2811754941940308, + "grad_norm": 1.1347033977508545, + "learning_rate": 4.997034088634404e-06, + "loss": 1.252564549446106, "step": 120 }, { "epoch": 0.270509977827051, - "grad_norm": 1.9863255023956299, - "learning_rate": 9.993190731026024e-06, - "loss": 1.1968728303909302, + "grad_norm": 1.396546721458435, + "learning_rate": 4.996595365513012e-06, + "loss": 1.1585849523544312, "step": 122 }, { "epoch": 0.2749445676274945, - "grad_norm": 1.8475158214569092, - "learning_rate": 9.992252834355503e-06, - "loss": 1.5891046524047852, + "grad_norm": 2.234487771987915, + "learning_rate": 4.9961264171777515e-06, + "loss": 1.5745362043380737, "step": 124 }, { "epoch": 0.2793791574279379, - "grad_norm": 1.485600471496582, - "learning_rate": 9.99125449987551e-06, - "loss": 0.9440798163414001, + "grad_norm": 1.0086833238601685, + "learning_rate": 4.995627249937755e-06, + "loss": 0.9223954677581787, "step": 126 }, { "epoch": 0.2838137472283814, - "grad_norm": 1.168931484222412, - "learning_rate": 9.990195741017422e-06, - "loss": 1.21555495262146, + "grad_norm": 2.619391679763794, + "learning_rate": 4.995097870508711e-06, + "loss": 1.2081269025802612, "step": 128 }, { "epoch": 0.28824833702882485, - "grad_norm": 1.28008234500885, - "learning_rate": 9.989076572025554e-06, - "loss": 0.9523183703422546, + "grad_norm": 4.811182022094727, + "learning_rate": 4.994538286012777e-06, + "loss": 0.9236148595809937, "step": 130 }, { "epoch": 0.2926829268292683, - "grad_norm": 3.5025172233581543, - "learning_rate": 9.987897007956968e-06, - "loss": 0.9987781047821045, + "grad_norm": 2.1846261024475098, + "learning_rate": 4.993948503978484e-06, + "loss": 0.9870991706848145, "step": 132 }, { "epoch": 0.29711751662971175, - "grad_norm": 2.9309399127960205, - "learning_rate": 9.986657064681267e-06, - "loss": 0.9768642783164978, + "grad_norm": 11.924750328063965, + "learning_rate": 4.993328532340633e-06, + "loss": 0.9675296545028687, "step": 134 }, { "epoch": 0.30155210643015523, - "grad_norm": 6.351615905761719, - "learning_rate": 9.98535675888038e-06, - "loss": 1.2585439682006836, + "grad_norm": 7.58044958114624, + "learning_rate": 4.99267837944019e-06, + "loss": 1.133386254310608, "step": 136 }, { "epoch": 0.30598669623059865, - "grad_norm": 5.213984489440918, - "learning_rate": 9.983996108048345e-06, - "loss": 0.7967538833618164, + "grad_norm": 1.9771738052368164, + "learning_rate": 4.991998054024172e-06, + "loss": 0.792849600315094, "step": 138 }, { "epoch": 0.31042128603104213, - "grad_norm": 1.3048728704452515, - "learning_rate": 9.982575130491068e-06, - "loss": 1.1146520376205444, + "grad_norm": 1.3070610761642456, + "learning_rate": 4.991287565245534e-06, + "loss": 1.035190463066101, "step": 140 }, { "epoch": 0.3148558758314856, - "grad_norm": 0.8629128336906433, - "learning_rate": 9.981093845326079e-06, - "loss": 0.9639315009117126, + "grad_norm": 0.9725500345230103, + "learning_rate": 4.990546922663039e-06, + "loss": 0.9571182727813721, "step": 142 }, { "epoch": 0.31929046563192903, - "grad_norm": 1.43756902217865, - "learning_rate": 9.979552272482268e-06, - "loss": 0.9869639873504639, + "grad_norm": 1.864827036857605, + "learning_rate": 4.989776136241134e-06, + "loss": 0.9656538367271423, "step": 144 }, { "epoch": 0.3237250554323725, - "grad_norm": 2.2892510890960693, - "learning_rate": 9.977950432699629e-06, - "loss": 0.9267846345901489, + "grad_norm": 2.245912551879883, + "learning_rate": 4.988975216349814e-06, + "loss": 0.9354503154754639, "step": 146 }, { "epoch": 0.328159645232816, - "grad_norm": 2.10282301902771, - "learning_rate": 9.976288347528972e-06, - "loss": 1.4183735847473145, + "grad_norm": 1.4899214506149292, + "learning_rate": 4.988144173764486e-06, + "loss": 1.385457992553711, "step": 148 }, { "epoch": 0.3325942350332594, - "grad_norm": 2.502537488937378, - "learning_rate": 9.974566039331634e-06, - "loss": 1.1835788488388062, + "grad_norm": 3.3478896617889404, + "learning_rate": 4.987283019665817e-06, + "loss": 1.1480491161346436, "step": 150 }, { "epoch": 0.3370288248337029, - "grad_norm": 2.951907157897949, - "learning_rate": 9.972783531279184e-06, - "loss": 1.0829112529754639, + "grad_norm": 1.2977336645126343, + "learning_rate": 4.986391765639592e-06, + "loss": 1.0610523223876953, "step": 152 }, { "epoch": 0.34146341463414637, - "grad_norm": 1.5924861431121826, - "learning_rate": 9.970940847353103e-06, - "loss": 1.2782995700836182, + "grad_norm": 0.8803158402442932, + "learning_rate": 4.985470423676551e-06, + "loss": 1.26231050491333, "step": 154 }, { "epoch": 0.3458980044345898, - "grad_norm": 3.426605701446533, - "learning_rate": 9.969038012344465e-06, - "loss": 1.2216734886169434, + "grad_norm": 1.620816946029663, + "learning_rate": 4.984519006172232e-06, + "loss": 1.1941092014312744, "step": 156 }, { "epoch": 0.35033259423503327, - "grad_norm": 2.055274724960327, - "learning_rate": 9.967075051853609e-06, - "loss": 1.3016668558120728, + "grad_norm": 1.560025930404663, + "learning_rate": 4.983537525926804e-06, + "loss": 1.2882779836654663, "step": 158 }, { "epoch": 0.35476718403547675, - "grad_norm": 0.5714741945266724, - "learning_rate": 9.965051992289782e-06, - "loss": 1.135823130607605, + "grad_norm": 0.5345287919044495, + "learning_rate": 4.982525996144891e-06, + "loss": 1.0993400812149048, "step": 160 }, { "epoch": 0.35920177383592017, - "grad_norm": 4.934542655944824, - "learning_rate": 9.962968860870798e-06, - "loss": 0.8752337098121643, + "grad_norm": 0.7228168249130249, + "learning_rate": 4.981484430435399e-06, + "loss": 0.8929040431976318, "step": 162 }, { "epoch": 0.36363636363636365, - "grad_norm": 2.104079008102417, - "learning_rate": 9.96082568562266e-06, - "loss": 0.904317319393158, + "grad_norm": 1.204595685005188, + "learning_rate": 4.98041284281133e-06, + "loss": 0.8954707980155945, "step": 164 }, { "epoch": 0.36807095343680707, - "grad_norm": 1.8156555891036987, - "learning_rate": 9.958622495379193e-06, - "loss": 1.284702181816101, + "grad_norm": 0.7818751931190491, + "learning_rate": 4.979311247689596e-06, + "loss": 1.2652803659439087, "step": 166 }, { "epoch": 0.37250554323725055, - "grad_norm": 1.5921517610549927, - "learning_rate": 9.956359319781642e-06, - "loss": 1.2042418718338013, + "grad_norm": 1.151502013206482, + "learning_rate": 4.978179659890821e-06, + "loss": 1.2041521072387695, "step": 168 }, { "epoch": 0.376940133037694, - "grad_norm": 1.4465107917785645, - "learning_rate": 9.954036189278292e-06, - "loss": 1.2029085159301758, + "grad_norm": 1.6603472232818604, + "learning_rate": 4.977018094639146e-06, + "loss": 1.227750301361084, "step": 170 }, { "epoch": 0.38137472283813745, - "grad_norm": 2.0260727405548096, - "learning_rate": 9.951653135124045e-06, - "loss": 0.7804557681083679, + "grad_norm": 2.9484810829162598, + "learning_rate": 4.975826567562023e-06, + "loss": 0.7771618366241455, "step": 172 }, { "epoch": 0.3858093126385809, - "grad_norm": 3.316241979598999, - "learning_rate": 9.94921018938e-06, - "loss": 1.6452889442443848, + "grad_norm": 2.0457444190979004, + "learning_rate": 4.97460509469e-06, + "loss": 1.6198290586471558, "step": 174 }, { "epoch": 0.3902439024390244, - "grad_norm": 1.2219425439834595, - "learning_rate": 9.946707384913027e-06, - "loss": 1.2721954584121704, + "grad_norm": 1.2279316186904907, + "learning_rate": 4.973353692456513e-06, + "loss": 1.2602885961532593, "step": 176 }, { "epoch": 0.3946784922394678, - "grad_norm": 1.0764427185058594, - "learning_rate": 9.944144755395321e-06, - "loss": 1.325971245765686, + "grad_norm": 1.4660323858261108, + "learning_rate": 4.972072377697661e-06, + "loss": 1.3121440410614014, "step": 178 }, { "epoch": 0.3991130820399113, - "grad_norm": 0.585249662399292, - "learning_rate": 9.941522335303955e-06, - "loss": 1.0814615488052368, + "grad_norm": 0.714910089969635, + "learning_rate": 4.9707611676519775e-06, + "loss": 1.0512202978134155, "step": 180 }, { "epoch": 0.4035476718403548, - "grad_norm": 1.7605838775634766, - "learning_rate": 9.938840159920406e-06, - "loss": 1.2727550268173218, + "grad_norm": 1.5509921312332153, + "learning_rate": 4.969420079960203e-06, + "loss": 1.259682059288025, "step": 182 }, { "epoch": 0.4079822616407982, - "grad_norm": 0.8892675638198853, - "learning_rate": 9.93609826533009e-06, - "loss": 0.8603274822235107, + "grad_norm": 1.5183836221694946, + "learning_rate": 4.968049132665045e-06, + "loss": 0.9290481805801392, "step": 184 }, { "epoch": 0.4124168514412417, - "grad_norm": 1.1300294399261475, - "learning_rate": 9.933296688421872e-06, - "loss": 0.986240565776825, + "grad_norm": 2.068286180496216, + "learning_rate": 4.966648344210936e-06, + "loss": 0.9747883677482605, "step": 186 }, { "epoch": 0.41685144124168516, - "grad_norm": 1.141814947128296, - "learning_rate": 9.930435466887564e-06, - "loss": 0.99045330286026, + "grad_norm": 0.7379282712936401, + "learning_rate": 4.965217733443782e-06, + "loss": 0.9480677843093872, "step": 188 }, { "epoch": 0.4212860310421286, - "grad_norm": 2.3094711303710938, - "learning_rate": 9.927514639221433e-06, - "loss": 1.0205762386322021, + "grad_norm": 1.9371439218521118, + "learning_rate": 4.963757319610716e-06, + "loss": 0.9953845143318176, "step": 190 }, { "epoch": 0.42572062084257206, - "grad_norm": 3.4740211963653564, - "learning_rate": 9.92453424471967e-06, - "loss": 0.8767862319946289, + "grad_norm": 2.3159310817718506, + "learning_rate": 4.962267122359835e-06, + "loss": 0.8490515947341919, "step": 192 }, { "epoch": 0.43015521064301554, - "grad_norm": 1.3851454257965088, - "learning_rate": 9.921494323479862e-06, - "loss": 1.306305170059204, + "grad_norm": 2.2765939235687256, + "learning_rate": 4.960747161739931e-06, + "loss": 1.2720048427581787, "step": 194 }, { "epoch": 0.43458980044345896, - "grad_norm": 2.2007641792297363, - "learning_rate": 9.918394916400465e-06, - "loss": 1.5621771812438965, + "grad_norm": 3.2983458042144775, + "learning_rate": 4.9591974582002324e-06, + "loss": 1.5371360778808594, "step": 196 }, { "epoch": 0.43902439024390244, - "grad_norm": 4.152230262756348, - "learning_rate": 9.915236065180235e-06, - "loss": 1.2867047786712646, + "grad_norm": 1.635307788848877, + "learning_rate": 4.957618032590118e-06, + "loss": 1.2765772342681885, "step": 198 }, { "epoch": 0.4434589800443459, - "grad_norm": 0.7970097064971924, - "learning_rate": 9.912017812317684e-06, - "loss": 1.1508140563964844, + "grad_norm": 0.527726411819458, + "learning_rate": 4.956008906158842e-06, + "loss": 1.1208865642547607, "step": 200 }, { "epoch": 0.44789356984478934, - "grad_norm": 1.0820010900497437, - "learning_rate": 9.908740201110497e-06, - "loss": 1.2738271951675415, + "grad_norm": 1.2475026845932007, + "learning_rate": 4.954370100555249e-06, + "loss": 1.2454034090042114, "step": 202 }, { "epoch": 0.4523281596452328, - "grad_norm": 0.833768367767334, - "learning_rate": 9.905403275654951e-06, - "loss": 1.229623556137085, + "grad_norm": 0.8595089316368103, + "learning_rate": 4.952701637827476e-06, + "loss": 1.209128737449646, "step": 204 }, { "epoch": 0.4567627494456763, - "grad_norm": 1.3860057592391968, - "learning_rate": 9.902007080845336e-06, - "loss": 1.0840635299682617, + "grad_norm": 1.4940823316574097, + "learning_rate": 4.951003540422668e-06, + "loss": 1.0659428834915161, "step": 206 }, { "epoch": 0.4611973392461197, - "grad_norm": 1.5478756427764893, - "learning_rate": 9.898551662373325e-06, - "loss": 1.0657180547714233, + "grad_norm": 2.2497997283935547, + "learning_rate": 4.949275831186663e-06, + "loss": 1.0191223621368408, "step": 208 }, { "epoch": 0.4656319290465632, - "grad_norm": 1.0406877994537354, - "learning_rate": 9.895037066727382e-06, - "loss": 0.7027549743652344, + "grad_norm": 3.156873941421509, + "learning_rate": 4.947518533363691e-06, + "loss": 0.6471428871154785, "step": 210 }, { "epoch": 0.4700665188470067, - "grad_norm": 4.616578578948975, - "learning_rate": 9.891463341192124e-06, - "loss": 0.8683266043663025, + "grad_norm": 9.676581382751465, + "learning_rate": 4.945731670596062e-06, + "loss": 0.849310040473938, "step": 212 }, { "epoch": 0.4745011086474501, - "grad_norm": 0.9766786098480225, - "learning_rate": 9.88783053384769e-06, - "loss": 1.1040089130401611, + "grad_norm": 0.766165018081665, + "learning_rate": 4.943915266923845e-06, + "loss": 1.0647257566452026, "step": 214 }, { "epoch": 0.4789356984478936, - "grad_norm": 1.0400160551071167, - "learning_rate": 9.884138693569095e-06, - "loss": 1.1147682666778564, + "grad_norm": 1.1118712425231934, + "learning_rate": 4.942069346784547e-06, + "loss": 1.082270622253418, "step": 216 }, { "epoch": 0.48337028824833705, - "grad_norm": 1.1635631322860718, - "learning_rate": 9.88038787002557e-06, - "loss": 1.1477664709091187, + "grad_norm": 1.1239439249038696, + "learning_rate": 4.940193935012785e-06, + "loss": 1.113852858543396, "step": 218 }, { "epoch": 0.4878048780487805, - "grad_norm": 3.343935489654541, - "learning_rate": 9.876578113679891e-06, - "loss": 1.24171781539917, + "grad_norm": 1.8485164642333984, + "learning_rate": 4.938289056839946e-06, + "loss": 1.1989140510559082, "step": 220 }, { "epoch": 0.49223946784922396, - "grad_norm": 1.407214879989624, - "learning_rate": 9.872709475787708e-06, - "loss": 1.2408087253570557, + "grad_norm": 1.7778033018112183, + "learning_rate": 4.936354737893854e-06, + "loss": 1.223215103149414, "step": 222 }, { "epoch": 0.49667405764966743, - "grad_norm": 0.8646455407142639, - "learning_rate": 9.868782008396848e-06, - "loss": 1.2195643186569214, + "grad_norm": 0.9021375775337219, + "learning_rate": 4.934391004198424e-06, + "loss": 1.1974425315856934, "step": 224 }, { "epoch": 0.5011086474501109, - "grad_norm": 1.0053114891052246, - "learning_rate": 9.864795764346615e-06, - "loss": 1.2099742889404297, + "grad_norm": 0.6981101036071777, + "learning_rate": 4.932397882173307e-06, + "loss": 1.20035982131958, "step": 226 }, { "epoch": 0.5055432372505543, - "grad_norm": 1.0781813859939575, - "learning_rate": 9.860750797267085e-06, - "loss": 1.2584586143493652, + "grad_norm": 0.9951314330101013, + "learning_rate": 4.930375398633543e-06, + "loss": 1.2479407787322998, "step": 228 }, { "epoch": 0.5099778270509978, - "grad_norm": 2.1706690788269043, - "learning_rate": 9.856647161578384e-06, - "loss": 1.8491344451904297, + "grad_norm": 2.1762142181396484, + "learning_rate": 4.928323580789192e-06, + "loss": 1.8489172458648682, "step": 230 }, { "epoch": 0.5144124168514412, - "grad_norm": 1.5501409769058228, - "learning_rate": 9.852484912489946e-06, - "loss": 0.8673834204673767, + "grad_norm": 1.2985577583312988, + "learning_rate": 4.926242456244973e-06, + "loss": 0.8134359121322632, "step": 232 }, { "epoch": 0.5188470066518847, - "grad_norm": 1.1405872106552124, - "learning_rate": 9.848264105999783e-06, - "loss": 1.287527322769165, + "grad_norm": 1.1328946352005005, + "learning_rate": 4.924132052999892e-06, + "loss": 1.2583706378936768, "step": 234 }, { "epoch": 0.5232815964523282, - "grad_norm": 1.5920919179916382, - "learning_rate": 9.843984798893722e-06, - "loss": 0.9661246538162231, + "grad_norm": 0.6931086182594299, + "learning_rate": 4.921992399446861e-06, + "loss": 0.9323728084564209, "step": 236 }, { "epoch": 0.5277161862527716, - "grad_norm": 1.4155439138412476, - "learning_rate": 9.839647048744645e-06, - "loss": 1.0282375812530518, + "grad_norm": 1.0881880521774292, + "learning_rate": 4.919823524372323e-06, + "loss": 0.9777665734291077, "step": 238 }, { "epoch": 0.532150776053215, - "grad_norm": 1.3770828247070312, - "learning_rate": 9.83525091391172e-06, - "loss": 1.2707240581512451, + "grad_norm": 2.0472190380096436, + "learning_rate": 4.91762545695586e-06, + "loss": 1.2541948556900024, "step": 240 }, { "epoch": 0.5365853658536586, - "grad_norm": 1.1959309577941895, - "learning_rate": 9.8307964535396e-06, - "loss": 1.3354653120040894, + "grad_norm": 1.1526095867156982, + "learning_rate": 4.9153982267698e-06, + "loss": 1.3235599994659424, "step": 242 }, { "epoch": 0.541019955654102, - "grad_norm": 5.0889363288879395, - "learning_rate": 9.826283727557644e-06, - "loss": 0.9864997863769531, + "grad_norm": 2.487705707550049, + "learning_rate": 4.913141863778822e-06, + "loss": 0.9698494672775269, "step": 244 }, { "epoch": 0.5454545454545454, - "grad_norm": 0.7636584639549255, - "learning_rate": 9.821712796679106e-06, - "loss": 1.2685621976852417, + "grad_norm": 0.8203330039978027, + "learning_rate": 4.910856398339553e-06, + "loss": 1.2535961866378784, "step": 246 }, { "epoch": 0.549889135254989, - "grad_norm": 1.781443476676941, - "learning_rate": 9.817083722400309e-06, - "loss": 1.4938619136810303, + "grad_norm": 1.1372792720794678, + "learning_rate": 4.9085418612001545e-06, + "loss": 1.4743397235870361, "step": 248 }, { "epoch": 0.5543237250554324, - "grad_norm": 2.2715258598327637, - "learning_rate": 9.812396566999832e-06, - "loss": 1.2405654191970825, + "grad_norm": 1.7914355993270874, + "learning_rate": 4.906198283499916e-06, + "loss": 1.214085340499878, "step": 250 }, { "epoch": 0.5587583148558758, - "grad_norm": 30.650293350219727, - "learning_rate": 9.807651393537659e-06, - "loss": 0.6915596127510071, + "grad_norm": 3.3823254108428955, + "learning_rate": 4.903825696768829e-06, + "loss": 0.6424598693847656, "step": 252 }, { "epoch": 0.5631929046563193, - "grad_norm": 4.594031810760498, - "learning_rate": 9.802848265854343e-06, - "loss": 1.4810152053833008, + "grad_norm": 1.305862545967102, + "learning_rate": 4.901424132927172e-06, + "loss": 1.470249891281128, "step": 254 }, { "epoch": 0.5676274944567627, - "grad_norm": 8.177288055419922, - "learning_rate": 9.797987248570137e-06, - "loss": 1.3323289155960083, + "grad_norm": 1.1991181373596191, + "learning_rate": 4.898993624285069e-06, + "loss": 1.3120447397232056, "step": 256 }, { "epoch": 0.5720620842572062, - "grad_norm": 1.162073016166687, - "learning_rate": 9.793068407084125e-06, - "loss": 1.2933639287948608, + "grad_norm": 0.9037002921104431, + "learning_rate": 4.896534203542062e-06, + "loss": 1.316646933555603, "step": 258 }, { "epoch": 0.5764966740576497, - "grad_norm": 1.2364484071731567, - "learning_rate": 9.78809180757335e-06, - "loss": 1.2908434867858887, + "grad_norm": 1.464098572731018, + "learning_rate": 4.894045903786675e-06, + "loss": 1.2750022411346436, "step": 260 }, { "epoch": 0.5809312638580931, - "grad_norm": 2.779216766357422, - "learning_rate": 9.783057516991921e-06, - "loss": 0.7582840919494629, + "grad_norm": 2.3139257431030273, + "learning_rate": 4.891528758495961e-06, + "loss": 0.7120662927627563, "step": 262 }, { "epoch": 0.5853658536585366, - "grad_norm": 1.085245966911316, - "learning_rate": 9.777965603070106e-06, - "loss": 1.3473342657089233, + "grad_norm": 1.463733196258545, + "learning_rate": 4.888982801535053e-06, + "loss": 1.3662123680114746, "step": 264 }, { "epoch": 0.5898004434589801, - "grad_norm": 1.5546146631240845, - "learning_rate": 9.772816134313424e-06, - "loss": 1.0741627216339111, + "grad_norm": 1.677898645401001, + "learning_rate": 4.886408067156712e-06, + "loss": 1.0462416410446167, "step": 266 }, { "epoch": 0.5942350332594235, - "grad_norm": 1.2348523139953613, - "learning_rate": 9.76760918000173e-06, - "loss": 1.5512841939926147, + "grad_norm": 0.9817636013031006, + "learning_rate": 4.883804590000865e-06, + "loss": 1.5489835739135742, "step": 268 }, { "epoch": 0.5986696230598669, - "grad_norm": 2.3876912593841553, - "learning_rate": 9.762344810188276e-06, - "loss": 1.1929247379302979, + "grad_norm": 2.1008689403533936, + "learning_rate": 4.881172405094138e-06, + "loss": 1.1634057760238647, "step": 270 }, { "epoch": 0.6031042128603105, - "grad_norm": 1.362686038017273, - "learning_rate": 9.757023095698766e-06, - "loss": 1.2366266250610352, + "grad_norm": 1.2963234186172485, + "learning_rate": 4.878511547849383e-06, + "loss": 1.2225638628005981, "step": 272 }, { "epoch": 0.6075388026607539, - "grad_norm": 1.314932107925415, - "learning_rate": 9.751644108130405e-06, - "loss": 1.230374813079834, + "grad_norm": 1.1285306215286255, + "learning_rate": 4.875822054065203e-06, + "loss": 1.2115212678909302, "step": 274 }, { "epoch": 0.6119733924611973, - "grad_norm": 1.0009852647781372, - "learning_rate": 9.746207919850951e-06, - "loss": 1.230873942375183, + "grad_norm": 1.0443319082260132, + "learning_rate": 4.8731039599254754e-06, + "loss": 1.2144972085952759, "step": 276 }, { "epoch": 0.6164079822616408, - "grad_norm": 2.000821590423584, - "learning_rate": 9.740714603997712e-06, - "loss": 1.241438627243042, + "grad_norm": 9.736445426940918, + "learning_rate": 4.870357301998856e-06, + "loss": 1.210550308227539, "step": 278 }, { "epoch": 0.6208425720620843, - "grad_norm": 1.346825122833252, - "learning_rate": 9.735164234476588e-06, - "loss": 1.3042198419570923, + "grad_norm": 0.8090811967849731, + "learning_rate": 4.867582117238294e-06, + "loss": 1.284433126449585, "step": 280 }, { "epoch": 0.6252771618625277, - "grad_norm": 1.8797986507415771, - "learning_rate": 9.729556885961064e-06, - "loss": 0.9092460870742798, + "grad_norm": 1.3445334434509277, + "learning_rate": 4.864778442980532e-06, + "loss": 0.8542180061340332, "step": 282 }, { "epoch": 0.6297117516629712, - "grad_norm": 0.9287862777709961, - "learning_rate": 9.72389263389121e-06, - "loss": 1.269888162612915, + "grad_norm": 1.9137824773788452, + "learning_rate": 4.861946316945605e-06, + "loss": 1.2416703701019287, "step": 284 }, { "epoch": 0.6341463414634146, - "grad_norm": 1.3805606365203857, - "learning_rate": 9.718171554472662e-06, - "loss": 1.3446077108383179, + "grad_norm": 1.3683984279632568, + "learning_rate": 4.859085777236331e-06, + "loss": 1.3251290321350098, "step": 286 }, { "epoch": 0.6385809312638581, - "grad_norm": 1.057990312576294, - "learning_rate": 9.712393724675597e-06, - "loss": 1.259419322013855, + "grad_norm": 0.9978231191635132, + "learning_rate": 4.8561968623377985e-06, + "loss": 1.2392802238464355, "step": 288 }, { "epoch": 0.6430155210643016, - "grad_norm": 1.940826654434204, - "learning_rate": 9.706559222233704e-06, - "loss": 1.2493295669555664, + "grad_norm": 0.7207977175712585, + "learning_rate": 4.853279611116852e-06, + "loss": 1.2283697128295898, "step": 290 }, { "epoch": 0.647450110864745, - "grad_norm": 1.3292601108551025, - "learning_rate": 9.700668125643132e-06, - "loss": 1.3664789199829102, + "grad_norm": 1.396116018295288, + "learning_rate": 4.850334062821566e-06, + "loss": 1.3465940952301025, "step": 292 }, { "epoch": 0.6518847006651884, - "grad_norm": 1.5756752490997314, - "learning_rate": 9.694720514161437e-06, - "loss": 0.9199124574661255, + "grad_norm": 0.916964590549469, + "learning_rate": 4.8473602570807185e-06, + "loss": 0.9178141951560974, "step": 294 }, { "epoch": 0.656319290465632, - "grad_norm": 0.7564427256584167, - "learning_rate": 9.688716467806508e-06, - "loss": 1.0202033519744873, + "grad_norm": 0.9746919870376587, + "learning_rate": 4.844358233903254e-06, + "loss": 0.9849019050598145, "step": 296 }, { "epoch": 0.6607538802660754, - "grad_norm": 0.8535528182983398, - "learning_rate": 9.682656067355505e-06, - "loss": 1.2138911485671997, + "grad_norm": 1.0369846820831299, + "learning_rate": 4.841328033677753e-06, + "loss": 1.1890286207199097, "step": 298 }, { "epoch": 0.6651884700665188, - "grad_norm": 1.1474356651306152, - "learning_rate": 9.67653939434376e-06, - "loss": 1.2619802951812744, + "grad_norm": 1.0989309549331665, + "learning_rate": 4.83826969717188e-06, + "loss": 1.2377430200576782, "step": 300 }, { "epoch": 0.6696230598669624, - "grad_norm": 1.2519088983535767, - "learning_rate": 9.670366531063686e-06, - "loss": 1.2084006071090698, + "grad_norm": 1.5578566789627075, + "learning_rate": 4.835183265531843e-06, + "loss": 1.1927175521850586, "step": 302 }, { "epoch": 0.6740576496674058, - "grad_norm": 1.4693647623062134, - "learning_rate": 9.664137560563663e-06, - "loss": 1.2800395488739014, + "grad_norm": 1.3345965147018433, + "learning_rate": 4.832068780281831e-06, + "loss": 1.2588074207305908, "step": 304 }, { "epoch": 0.6784922394678492, - "grad_norm": 0.9270733594894409, - "learning_rate": 9.657852566646929e-06, - "loss": 1.2464381456375122, + "grad_norm": 1.7896565198898315, + "learning_rate": 4.828926283323464e-06, + "loss": 1.2271308898925781, "step": 306 }, { "epoch": 0.6829268292682927, - "grad_norm": 0.7677431106567383, - "learning_rate": 9.651511633870451e-06, - "loss": 0.8917385339736938, + "grad_norm": 2.4165589809417725, + "learning_rate": 4.8257558169352254e-06, + "loss": 0.8486894965171814, "step": 308 }, { "epoch": 0.6873614190687362, - "grad_norm": 0.7328879833221436, - "learning_rate": 9.645114847543781e-06, - "loss": 1.248028039932251, + "grad_norm": 1.6142799854278564, + "learning_rate": 4.8225574237718906e-06, + "loss": 1.2312774658203125, "step": 310 }, { "epoch": 0.6917960088691796, - "grad_norm": 0.6110285520553589, - "learning_rate": 9.638662293727916e-06, - "loss": 1.2161647081375122, + "grad_norm": 1.035621166229248, + "learning_rate": 4.819331146863958e-06, + "loss": 1.2014350891113281, "step": 312 }, { "epoch": 0.6962305986696231, - "grad_norm": 0.7204345464706421, - "learning_rate": 9.632154059234137e-06, - "loss": 1.2376904487609863, + "grad_norm": 0.8215665817260742, + "learning_rate": 4.8160770296170685e-06, + "loss": 1.2188996076583862, "step": 314 }, { "epoch": 0.7006651884700665, - "grad_norm": 0.8166297078132629, - "learning_rate": 9.625590231622837e-06, - "loss": 1.3569447994232178, + "grad_norm": 0.8507355451583862, + "learning_rate": 4.812795115811419e-06, + "loss": 1.3254660367965698, "step": 316 }, { "epoch": 0.70509977827051, - "grad_norm": 0.9921203255653381, - "learning_rate": 9.618970899202354e-06, - "loss": 1.0280705690383911, + "grad_norm": 0.8648268580436707, + "learning_rate": 4.809485449601177e-06, + "loss": 0.978560745716095, "step": 318 }, { "epoch": 0.7095343680709535, - "grad_norm": 2.89858078956604, - "learning_rate": 9.612296151027765e-06, - "loss": 0.9978334903717041, + "grad_norm": 1.1685987710952759, + "learning_rate": 4.806148075513883e-06, + "loss": 0.964942991733551, "step": 320 }, { "epoch": 0.7139689578713969, - "grad_norm": 1.732686996459961, - "learning_rate": 9.605566076899714e-06, - "loss": 0.9863637685775757, + "grad_norm": 6.244750499725342, + "learning_rate": 4.802783038449857e-06, + "loss": 0.9973806142807007, "step": 322 }, { "epoch": 0.7184035476718403, - "grad_norm": 9.696839332580566, - "learning_rate": 9.598780767363174e-06, - "loss": 1.1041550636291504, + "grad_norm": 2.577277660369873, + "learning_rate": 4.799390383681587e-06, + "loss": 1.064439058303833, "step": 324 }, { "epoch": 0.7228381374722838, - "grad_norm": 2.4485864639282227, - "learning_rate": 9.591940313706248e-06, - "loss": 1.1134647130966187, + "grad_norm": 1.7076307535171509, + "learning_rate": 4.795970156853124e-06, + "loss": 1.0868260860443115, "step": 326 }, { "epoch": 0.7272727272727273, - "grad_norm": 0.39739716053009033, - "learning_rate": 9.585044807958942e-06, - "loss": 0.8426170349121094, + "grad_norm": 0.3369574248790741, + "learning_rate": 4.792522403979471e-06, + "loss": 0.7968674302101135, "step": 328 }, { "epoch": 0.7317073170731707, - "grad_norm": 1.39254629611969, - "learning_rate": 9.578094342891915e-06, - "loss": 0.8422537446022034, + "grad_norm": 0.9717708230018616, + "learning_rate": 4.789047171445957e-06, + "loss": 0.8146457672119141, "step": 330 }, { "epoch": 0.7361419068736141, - "grad_norm": 1.350459337234497, - "learning_rate": 9.571089012015237e-06, - "loss": 1.4164326190948486, + "grad_norm": 1.6244539022445679, + "learning_rate": 4.785544506007619e-06, + "loss": 1.3726072311401367, "step": 332 }, { "epoch": 0.7405764966740577, - "grad_norm": 1.751500129699707, - "learning_rate": 9.564028909577132e-06, - "loss": 1.2159979343414307, + "grad_norm": 3.454514980316162, + "learning_rate": 4.782014454788566e-06, + "loss": 1.1831202507019043, "step": 334 }, { "epoch": 0.7450110864745011, - "grad_norm": 1.3023916482925415, - "learning_rate": 9.55691413056271e-06, - "loss": 1.2543408870697021, + "grad_norm": 0.8379015922546387, + "learning_rate": 4.778457065281355e-06, + "loss": 1.216602087020874, "step": 336 }, { "epoch": 0.7494456762749445, - "grad_norm": 2.021757125854492, - "learning_rate": 9.54974477069269e-06, - "loss": 0.7136483788490295, + "grad_norm": 2.0432989597320557, + "learning_rate": 4.774872385346345e-06, + "loss": 0.6669434309005737, "step": 338 }, { "epoch": 0.753880266075388, - "grad_norm": 2.4734063148498535, - "learning_rate": 9.542520926422105e-06, - "loss": 0.3343808054924011, + "grad_norm": 2.758876085281372, + "learning_rate": 4.7712604632110524e-06, + "loss": 0.3195689618587494, "step": 340 }, { "epoch": 0.7583148558758315, - "grad_norm": 4.090439796447754, - "learning_rate": 9.535242694939011e-06, - "loss": 0.8977051973342896, + "grad_norm": 7.965346336364746, + "learning_rate": 4.767621347469506e-06, + "loss": 0.8559633493423462, "step": 342 }, { "epoch": 0.7627494456762749, - "grad_norm": 1.0740253925323486, - "learning_rate": 9.527910174163179e-06, - "loss": 1.317376732826233, + "grad_norm": 0.7290642261505127, + "learning_rate": 4.7639550870815895e-06, + "loss": 1.2730909585952759, "step": 344 }, { "epoch": 0.7671840354767184, - "grad_norm": 0.9744492173194885, - "learning_rate": 9.520523462744776e-06, - "loss": 1.249900221824646, + "grad_norm": 0.8464753031730652, + "learning_rate": 4.760261731372388e-06, + "loss": 1.2238872051239014, "step": 346 }, { "epoch": 0.7716186252771619, - "grad_norm": 1.6449717283248901, - "learning_rate": 9.51308266006304e-06, - "loss": 1.1062456369400024, + "grad_norm": 4.048059940338135, + "learning_rate": 4.75654133003152e-06, + "loss": 1.084853172302246, "step": 348 }, { "epoch": 0.7760532150776053, - "grad_norm": 1.0315587520599365, - "learning_rate": 9.505587866224939e-06, - "loss": 1.2459303140640259, + "grad_norm": 0.9353682994842529, + "learning_rate": 4.752793933112469e-06, + "loss": 1.2124656438827515, "step": 350 }, { "epoch": 0.7804878048780488, - "grad_norm": 1.7614156007766724, - "learning_rate": 9.498039182063828e-06, - "loss": 1.1719058752059937, + "grad_norm": 1.724868655204773, + "learning_rate": 4.749019591031914e-06, + "loss": 1.137851595878601, "step": 352 }, { "epoch": 0.7849223946784922, - "grad_norm": 3.0436220169067383, - "learning_rate": 9.49043670913809e-06, - "loss": 1.0381566286087036, + "grad_norm": 1.9649287462234497, + "learning_rate": 4.745218354569045e-06, + "loss": 1.0045366287231445, "step": 354 }, { "epoch": 0.7893569844789357, - "grad_norm": 6.369256019592285, - "learning_rate": 9.48278054972977e-06, - "loss": 1.2438908815383911, + "grad_norm": 2.2398102283477783, + "learning_rate": 4.741390274864885e-06, + "loss": 1.210289716720581, "step": 356 }, { "epoch": 0.7937915742793792, - "grad_norm": 3.2349908351898193, - "learning_rate": 9.475070806843202e-06, - "loss": 1.323697566986084, + "grad_norm": 2.9946892261505127, + "learning_rate": 4.737535403421601e-06, + "loss": 1.250780701637268, "step": 358 }, { "epoch": 0.7982261640798226, - "grad_norm": 1.123029351234436, - "learning_rate": 9.467307584203619e-06, - "loss": 1.2511515617370605, + "grad_norm": 1.1408754587173462, + "learning_rate": 4.733653792101809e-06, + "loss": 1.2131381034851074, "step": 360 }, { "epoch": 0.802660753880266, - "grad_norm": 1.8244556188583374, - "learning_rate": 9.459490986255756e-06, - "loss": 0.6574705839157104, + "grad_norm": 3.3678433895111084, + "learning_rate": 4.729745493127878e-06, + "loss": 0.5941082835197449, "step": 362 }, { "epoch": 0.8070953436807096, - "grad_norm": 1.0850750207901, - "learning_rate": 9.451621118162453e-06, - "loss": 1.3474435806274414, + "grad_norm": 1.4601346254348755, + "learning_rate": 4.725810559081227e-06, + "loss": 1.3125383853912354, "step": 364 }, { "epoch": 0.811529933481153, - "grad_norm": 1.0580451488494873, - "learning_rate": 9.443698085803235e-06, - "loss": 1.2190296649932861, + "grad_norm": 0.950433075428009, + "learning_rate": 4.7218490429016175e-06, + "loss": 1.186044692993164, "step": 366 }, { "epoch": 0.8159645232815964, - "grad_norm": 1.2732537984848022, - "learning_rate": 9.435721995772884e-06, - "loss": 1.001318335533142, + "grad_norm": 1.5122184753417969, + "learning_rate": 4.717860997886442e-06, + "loss": 0.9993484020233154, "step": 368 }, { "epoch": 0.8203991130820399, - "grad_norm": 3.1396641731262207, - "learning_rate": 9.42769295538001e-06, - "loss": 0.8487209677696228, + "grad_norm": 1.5292415618896484, + "learning_rate": 4.713846477690005e-06, + "loss": 0.8163633942604065, "step": 370 }, { "epoch": 0.8248337028824834, - "grad_norm": 1.9196490049362183, - "learning_rate": 9.419611072645608e-06, - "loss": 1.2449017763137817, + "grad_norm": 1.121957540512085, + "learning_rate": 4.709805536322804e-06, + "loss": 1.213725209236145, "step": 372 }, { "epoch": 0.8292682926829268, - "grad_norm": 0.878728449344635, - "learning_rate": 9.4114764563016e-06, - "loss": 1.2437989711761475, + "grad_norm": 0.9955883026123047, + "learning_rate": 4.7057382281508e-06, + "loss": 1.2182434797286987, "step": 374 }, { "epoch": 0.8337028824833703, - "grad_norm": 1.0601072311401367, - "learning_rate": 9.403289215789373e-06, - "loss": 1.205723524093628, + "grad_norm": 1.1312615871429443, + "learning_rate": 4.701644607894687e-06, + "loss": 1.1773383617401123, "step": 376 }, { "epoch": 0.8381374722838137, - "grad_norm": 6.103841304779053, - "learning_rate": 9.395049461258318e-06, - "loss": 1.246256709098816, + "grad_norm": 1.2919820547103882, + "learning_rate": 4.697524730629159e-06, + "loss": 1.1870676279067993, "step": 378 }, { "epoch": 0.8425720620842572, - "grad_norm": 2.8056368827819824, - "learning_rate": 9.386757303564323e-06, - "loss": 0.7605912089347839, + "grad_norm": 1.1439024209976196, + "learning_rate": 4.693378651782162e-06, + "loss": 0.7241402268409729, "step": 380 }, { "epoch": 0.8470066518847007, - "grad_norm": 1.2943381071090698, - "learning_rate": 9.37841285426831e-06, - "loss": 1.3198047876358032, + "grad_norm": 2.4431052207946777, + "learning_rate": 4.689206427134155e-06, + "loss": 1.2909201383590698, "step": 382 }, { "epoch": 0.8514412416851441, - "grad_norm": 1.5381852388381958, - "learning_rate": 9.370016225634719e-06, - "loss": 1.1507357358932495, + "grad_norm": 2.413292169570923, + "learning_rate": 4.6850081128173595e-06, + "loss": 1.1248009204864502, "step": 384 }, { "epoch": 0.8558758314855875, - "grad_norm": 4.027266025543213, - "learning_rate": 9.361567530629988e-06, - "loss": 1.253443956375122, + "grad_norm": 1.0476144552230835, + "learning_rate": 4.680783765314994e-06, + "loss": 1.2368849515914917, "step": 386 }, { "epoch": 0.8603104212860311, - "grad_norm": 0.5111842155456543, - "learning_rate": 9.353066882921063e-06, - "loss": 1.1759639978408813, + "grad_norm": 0.5809259414672852, + "learning_rate": 4.6765334414605315e-06, + "loss": 1.134302020072937, "step": 388 }, { "epoch": 0.8647450110864745, - "grad_norm": 0.8026149868965149, - "learning_rate": 9.344514396873837e-06, - "loss": 1.2608635425567627, + "grad_norm": 0.950793981552124, + "learning_rate": 4.672257198436918e-06, + "loss": 1.2347341775894165, "step": 390 }, { "epoch": 0.8691796008869179, - "grad_norm": 1.007558822631836, - "learning_rate": 9.335910187551628e-06, - "loss": 0.9073533415794373, + "grad_norm": 1.9269758462905884, + "learning_rate": 4.667955093775814e-06, + "loss": 0.9339362978935242, "step": 392 }, { "epoch": 0.8736141906873615, - "grad_norm": 1.0619909763336182, - "learning_rate": 9.327254370713636e-06, - "loss": 1.2199063301086426, + "grad_norm": 2.1874947547912598, + "learning_rate": 4.663627185356818e-06, + "loss": 1.2027480602264404, "step": 394 }, { "epoch": 0.8780487804878049, - "grad_norm": 0.797447681427002, - "learning_rate": 9.31854706281336e-06, - "loss": 1.2162882089614868, + "grad_norm": 0.8487191200256348, + "learning_rate": 4.65927353140668e-06, + "loss": 1.1914976835250854, "step": 396 }, { "epoch": 0.8824833702882483, - "grad_norm": 1.5606045722961426, - "learning_rate": 9.309788380997069e-06, - "loss": 1.208269476890564, + "grad_norm": 1.360918641090393, + "learning_rate": 4.654894190498534e-06, + "loss": 1.194710373878479, "step": 398 }, { "epoch": 0.8869179600886918, - "grad_norm": 0.7051414847373962, - "learning_rate": 9.30097844310219e-06, - "loss": 0.5201588869094849, + "grad_norm": 0.5938453078269958, + "learning_rate": 4.650489221551095e-06, + "loss": 0.43390318751335144, "step": 400 }, { "epoch": 0.8913525498891353, - "grad_norm": 1.7164850234985352, - "learning_rate": 9.292117367655749e-06, - "loss": 1.0428240299224854, + "grad_norm": 1.3284165859222412, + "learning_rate": 4.646058683827874e-06, + "loss": 1.0268417596817017, "step": 402 }, { "epoch": 0.8957871396895787, - "grad_norm": 0.5142672657966614, - "learning_rate": 9.283205273872757e-06, - "loss": 0.9249513745307922, + "grad_norm": 0.35066622495651245, + "learning_rate": 4.641602636936378e-06, + "loss": 0.9203835725784302, "step": 404 }, { "epoch": 0.9002217294900222, - "grad_norm": 2.530928373336792, - "learning_rate": 9.274242281654621e-06, - "loss": 1.2720247507095337, + "grad_norm": 2.698913335800171, + "learning_rate": 4.637121140827311e-06, + "loss": 1.253874659538269, "step": 406 }, { "epoch": 0.9046563192904656, - "grad_norm": 2.1848092079162598, - "learning_rate": 9.265228511587525e-06, - "loss": 1.130611538887024, + "grad_norm": 1.8617788553237915, + "learning_rate": 4.632614255793762e-06, + "loss": 1.1092817783355713, "step": 408 }, { "epoch": 0.9090909090909091, - "grad_norm": 1.4184147119522095, - "learning_rate": 9.2561640849408e-06, - "loss": 1.1104357242584229, + "grad_norm": 1.344133734703064, + "learning_rate": 4.6280820424704e-06, + "loss": 1.053206443786621, "step": 410 }, { "epoch": 0.9135254988913526, - "grad_norm": 2.9592013359069824, - "learning_rate": 9.247049123665306e-06, - "loss": 1.2363438606262207, + "grad_norm": 1.601177453994751, + "learning_rate": 4.623524561832653e-06, + "loss": 1.221002459526062, "step": 412 }, { "epoch": 0.917960088691796, - "grad_norm": 1.4257206916809082, - "learning_rate": 9.237883750391786e-06, - "loss": 1.2583258152008057, + "grad_norm": 0.8422732353210449, + "learning_rate": 4.618941875195893e-06, + "loss": 1.2346315383911133, "step": 414 }, { "epoch": 0.9223946784922394, - "grad_norm": 4.5751776695251465, - "learning_rate": 9.228668088429212e-06, - "loss": 0.9869507551193237, + "grad_norm": 1.5042940378189087, + "learning_rate": 4.614334044214606e-06, + "loss": 0.959531307220459, "step": 416 }, { "epoch": 0.926829268292683, - "grad_norm": 0.9538375735282898, - "learning_rate": 9.219402261763129e-06, - "loss": 1.2694066762924194, + "grad_norm": 30.954843521118164, + "learning_rate": 4.6097011308815645e-06, + "loss": 1.2459607124328613, "step": 418 }, { "epoch": 0.9312638580931264, - "grad_norm": 1.5408073663711548, - "learning_rate": 9.210086395053992e-06, - "loss": 0.7773984670639038, + "grad_norm": 36.690757751464844, + "learning_rate": 4.605043197526996e-06, + "loss": 0.7548654079437256, "step": 420 }, { "epoch": 0.9356984478935698, - "grad_norm": 3.4732754230499268, - "learning_rate": 9.200720613635476e-06, - "loss": 1.4664820432662964, + "grad_norm": 3.826604127883911, + "learning_rate": 4.600360306817738e-06, + "loss": 1.448906421661377, "step": 422 }, { "epoch": 0.9401330376940134, - "grad_norm": 2.461186408996582, - "learning_rate": 9.191305043512806e-06, - "loss": 1.0693186521530151, + "grad_norm": 12.927244186401367, + "learning_rate": 4.595652521756403e-06, + "loss": 1.0475653409957886, "step": 424 }, { "epoch": 0.9445676274944568, - "grad_norm": 3.3022894859313965, - "learning_rate": 9.181839811361048e-06, - "loss": 1.2006292343139648, + "grad_norm": 6.169033527374268, + "learning_rate": 4.590919905680524e-06, + "loss": 1.184989094734192, "step": 426 }, { "epoch": 0.9490022172949002, - "grad_norm": 0.8173018097877502, - "learning_rate": 9.172325044523413e-06, - "loss": 1.100250005722046, + "grad_norm": 0.7973089814186096, + "learning_rate": 4.5861625222617065e-06, + "loss": 1.069338321685791, "step": 428 }, { "epoch": 0.9534368070953437, - "grad_norm": 1.222411870956421, - "learning_rate": 9.16276087100954e-06, - "loss": 0.6654449701309204, + "grad_norm": 2.250502586364746, + "learning_rate": 4.58138043550477e-06, + "loss": 0.6720293164253235, "step": 430 }, { "epoch": 0.9578713968957872, - "grad_norm": 1.2987111806869507, - "learning_rate": 9.153147419493774e-06, - "loss": 1.2298423051834106, + "grad_norm": 0.9973225593566895, + "learning_rate": 4.576573709746887e-06, + "loss": 1.2099367380142212, "step": 432 }, { "epoch": 0.9623059866962306, - "grad_norm": 2.3424274921417236, - "learning_rate": 9.143484819313441e-06, - "loss": 1.1017597913742065, + "grad_norm": 1.777103304862976, + "learning_rate": 4.5717424096567205e-06, + "loss": 1.053672194480896, "step": 434 }, { "epoch": 0.9667405764966741, - "grad_norm": 1.5677627325057983, - "learning_rate": 9.133773200467095e-06, - "loss": 1.2761000394821167, + "grad_norm": 1.1499074697494507, + "learning_rate": 4.566886600233547e-06, + "loss": 1.2612353563308716, "step": 436 }, { "epoch": 0.9711751662971175, - "grad_norm": 1.3605282306671143, - "learning_rate": 9.12401269361278e-06, - "loss": 1.2378090620040894, + "grad_norm": 0.8228787779808044, + "learning_rate": 4.56200634680639e-06, + "loss": 1.2220462560653687, "step": 438 }, { "epoch": 0.975609756097561, - "grad_norm": 6.334904193878174, - "learning_rate": 9.114203430066273e-06, - "loss": 0.9209675788879395, + "grad_norm": 2.36310076713562, + "learning_rate": 4.557101715033136e-06, + "loss": 0.7764140367507935, "step": 440 }, { "epoch": 0.9800443458980045, - "grad_norm": 1.7632060050964355, - "learning_rate": 9.104345541799304e-06, - "loss": 1.0448412895202637, + "grad_norm": 0.8385096192359924, + "learning_rate": 4.552172770899652e-06, + "loss": 1.01227867603302, "step": 442 }, { "epoch": 0.9844789356984479, - "grad_norm": 1.790408730506897, - "learning_rate": 9.094439161437797e-06, - "loss": 1.2859784364700317, + "grad_norm": 1.1155610084533691, + "learning_rate": 4.547219580718899e-06, + "loss": 1.261846661567688, "step": 444 }, { "epoch": 0.9889135254988913, - "grad_norm": 1.3654117584228516, - "learning_rate": 9.084484422260079e-06, - "loss": 1.2079429626464844, + "grad_norm": 1.4040062427520752, + "learning_rate": 4.542242211130039e-06, + "loss": 1.1938276290893555, "step": 446 }, { "epoch": 0.9933481152993349, - "grad_norm": 1.1531587839126587, - "learning_rate": 9.074481458195077e-06, - "loss": 1.2449597120285034, + "grad_norm": 1.3951795101165771, + "learning_rate": 4.537240729097539e-06, + "loss": 1.228947401046753, "step": 448 }, { "epoch": 0.9977827050997783, - "grad_norm": 0.8101630806922913, - "learning_rate": 9.064430403820538e-06, - "loss": 1.0602502822875977, + "grad_norm": 1.0445842742919922, + "learning_rate": 4.532215201910269e-06, + "loss": 1.021051287651062, "step": 450 }, { "epoch": 1.0022172949002217, - "grad_norm": 3.6886227130889893, - "learning_rate": 9.054331394361195e-06, - "loss": 1.0475915670394897, + "grad_norm": 3.247546911239624, + "learning_rate": 4.527165697180598e-06, + "loss": 1.091888189315796, "step": 452 }, { "epoch": 1.0066518847006651, - "grad_norm": 1.2973512411117554, - "learning_rate": 9.044184565686963e-06, - "loss": 1.2166190147399902, + "grad_norm": 1.5735362768173218, + "learning_rate": 4.522092282843481e-06, + "loss": 1.2799968719482422, "step": 454 }, { "epoch": 1.0110864745011086, - "grad_norm": 1.0704854726791382, - "learning_rate": 9.033990054311108e-06, - "loss": 1.214512586593628, + "grad_norm": 37.99199676513672, + "learning_rate": 4.516995027155554e-06, + "loss": 1.2860794067382812, "step": 456 }, { "epoch": 1.0155210643015522, - "grad_norm": 5.711384296417236, - "learning_rate": 9.023747997388409e-06, - "loss": 0.8170270919799805, + "grad_norm": 0.8153271079063416, + "learning_rate": 4.511873998694204e-06, + "loss": 0.8918830156326294, "step": 458 }, { "epoch": 1.0199556541019956, - "grad_norm": 0.5732322931289673, - "learning_rate": 9.013458532713303e-06, - "loss": 0.7988295555114746, + "grad_norm": 0.31422239542007446, + "learning_rate": 4.506729266356651e-06, + "loss": 0.8340937495231628, "step": 460 }, { "epoch": 1.024390243902439, - "grad_norm": 1.6274175643920898, - "learning_rate": 9.003121798718055e-06, - "loss": 0.520905077457428, + "grad_norm": 1.3617857694625854, + "learning_rate": 4.5015608993590276e-06, + "loss": 0.5565242171287537, "step": 462 }, { "epoch": 1.0288248337028825, - "grad_norm": 1.794394612312317, - "learning_rate": 8.992737934470875e-06, - "loss": 0.7766972184181213, + "grad_norm": 1.1211755275726318, + "learning_rate": 4.4963689672354375e-06, + "loss": 0.864283561706543, "step": 464 }, { "epoch": 1.033259423503326, - "grad_norm": 1.3309742212295532, - "learning_rate": 8.982307079674051e-06, - "loss": 0.6682877540588379, + "grad_norm": 1.0255306959152222, + "learning_rate": 4.491153539837026e-06, + "loss": 0.720465898513794, "step": 466 }, { "epoch": 1.0376940133037693, - "grad_norm": 1.3699984550476074, - "learning_rate": 8.971829374662075e-06, - "loss": 0.9524274468421936, + "grad_norm": 1.1541537046432495, + "learning_rate": 4.4859146873310375e-06, + "loss": 1.0676193237304688, "step": 468 }, { "epoch": 1.042128603104213, - "grad_norm": 1.1349494457244873, - "learning_rate": 8.961304960399746e-06, - "loss": 0.610037624835968, + "grad_norm": 1.8817100524902344, + "learning_rate": 4.480652480199873e-06, + "loss": 0.6701173782348633, "step": 470 }, { "epoch": 1.0465631929046564, - "grad_norm": 1.1969398260116577, - "learning_rate": 8.950733978480295e-06, - "loss": 0.9021300077438354, + "grad_norm": 2.1447463035583496, + "learning_rate": 4.475366989240147e-06, + "loss": 1.1118934154510498, "step": 472 }, { "epoch": 1.0509977827050998, - "grad_norm": 1.6304041147232056, - "learning_rate": 8.940116571123442e-06, - "loss": 1.0323752164840698, + "grad_norm": 1.540354609489441, + "learning_rate": 4.470058285561721e-06, + "loss": 1.037009835243225, "step": 474 }, { "epoch": 1.0554323725055432, - "grad_norm": 1.383133888244629, - "learning_rate": 8.929452881173522e-06, - "loss": 1.06901216506958, + "grad_norm": 0.8637385368347168, + "learning_rate": 4.464726440586761e-06, + "loss": 1.051493525505066, "step": 476 }, { "epoch": 1.0598669623059866, - "grad_norm": 1.9538545608520508, - "learning_rate": 8.91874305209754e-06, - "loss": 1.087662696838379, + "grad_norm": 1.036024570465088, + "learning_rate": 4.45937152604877e-06, + "loss": 1.0941234827041626, "step": 478 }, { "epoch": 1.06430155210643, - "grad_norm": 0.9558976888656616, - "learning_rate": 8.907987227983244e-06, - "loss": 0.628630518913269, + "grad_norm": 1.068137526512146, + "learning_rate": 4.453993613991622e-06, + "loss": 0.6281552910804749, "step": 480 }, { "epoch": 1.0687361419068737, - "grad_norm": 0.9924697875976562, - "learning_rate": 8.897185553537199e-06, - "loss": 1.079522728919983, + "grad_norm": 1.500739574432373, + "learning_rate": 4.4485927767685995e-06, + "loss": 1.1264927387237549, "step": 482 }, { "epoch": 1.0731707317073171, - "grad_norm": 0.4461686313152313, - "learning_rate": 8.886338174082818e-06, - "loss": 0.9315227270126343, + "grad_norm": 0.5921480059623718, + "learning_rate": 4.443169087041409e-06, + "loss": 0.9420715570449829, "step": 484 }, { "epoch": 1.0776053215077606, - "grad_norm": 0.8930976986885071, - "learning_rate": 8.875445235558429e-06, - "loss": 1.030774474143982, + "grad_norm": 0.7887612581253052, + "learning_rate": 4.4377226177792145e-06, + "loss": 1.0924605131149292, "step": 486 }, { "epoch": 1.082039911308204, - "grad_norm": 0.9663419723510742, - "learning_rate": 8.864506884515298e-06, - "loss": 0.775393009185791, + "grad_norm": 1.0860843658447266, + "learning_rate": 4.432253442257649e-06, + "loss": 0.7638985514640808, "step": 488 }, { "epoch": 1.0864745011086474, - "grad_norm": 1.8761645555496216, - "learning_rate": 8.853523268115662e-06, - "loss": 0.878838062286377, + "grad_norm": 1.021136999130249, + "learning_rate": 4.426761634057831e-06, + "loss": 0.9648894667625427, "step": 490 }, { "epoch": 1.0909090909090908, - "grad_norm": 1.990370512008667, - "learning_rate": 8.84249453413075e-06, - "loss": 1.1431140899658203, + "grad_norm": 1.4795594215393066, + "learning_rate": 4.421247267065375e-06, + "loss": 1.113416314125061, "step": 492 }, { "epoch": 1.0953436807095343, - "grad_norm": 4.04485559463501, - "learning_rate": 8.831420830938787e-06, - "loss": 1.0441185235977173, + "grad_norm": 4.473145484924316, + "learning_rate": 4.415710415469394e-06, + "loss": 1.06260085105896, "step": 494 }, { "epoch": 1.099778270509978, - "grad_norm": 3.171431303024292, - "learning_rate": 8.820302307523012e-06, - "loss": 0.8327910304069519, + "grad_norm": 2.0926437377929688, + "learning_rate": 4.410151153761506e-06, + "loss": 0.8711175918579102, "step": 496 }, { "epoch": 1.1042128603104213, - "grad_norm": 0.8679511547088623, - "learning_rate": 8.809139113469664e-06, - "loss": 0.9644469618797302, + "grad_norm": 2.1538195610046387, + "learning_rate": 4.404569556734832e-06, + "loss": 1.0097200870513916, "step": 498 }, { "epoch": 1.1086474501108647, - "grad_norm": 1.1949224472045898, - "learning_rate": 8.797931398965968e-06, - "loss": 0.6914905905723572, + "grad_norm": 0.7072110176086426, + "learning_rate": 4.398965699482984e-06, + "loss": 0.7574429512023926, "step": 500 }, { "epoch": 1.1130820399113082, - "grad_norm": 1.3626770973205566, - "learning_rate": 8.78667931479812e-06, - "loss": 1.0657753944396973, + "grad_norm": 0.6986474990844727, + "learning_rate": 4.39333965739906e-06, + "loss": 1.0583008527755737, "step": 502 }, { "epoch": 1.1175166297117516, - "grad_norm": 1.2040343284606934, - "learning_rate": 8.775383012349255e-06, - "loss": 0.8900744915008545, + "grad_norm": 2.5247440338134766, + "learning_rate": 4.3876915061746275e-06, + "loss": 0.9256818294525146, "step": 504 }, { "epoch": 1.1219512195121952, - "grad_norm": 2.972325325012207, - "learning_rate": 8.764042643597413e-06, - "loss": 0.9030492305755615, + "grad_norm": 2.264343738555908, + "learning_rate": 4.382021321798707e-06, + "loss": 1.0718340873718262, "step": 506 }, { "epoch": 1.1263858093126387, - "grad_norm": 2.5267043113708496, - "learning_rate": 8.75265836111349e-06, - "loss": 0.8307876586914062, + "grad_norm": 2.3817667961120605, + "learning_rate": 4.376329180556745e-06, + "loss": 0.9168416857719421, "step": 508 }, { "epoch": 1.130820399113082, - "grad_norm": 1.421155333518982, - "learning_rate": 8.741230318059188e-06, - "loss": 1.1410354375839233, + "grad_norm": 1.3641257286071777, + "learning_rate": 4.370615159029594e-06, + "loss": 1.1087809801101685, "step": 510 }, { "epoch": 1.1352549889135255, - "grad_norm": 1.9269262552261353, - "learning_rate": 8.72975866818496e-06, - "loss": 0.5328395962715149, + "grad_norm": 2.9134786128997803, + "learning_rate": 4.36487933409248e-06, + "loss": 0.5451399683952332, "step": 512 }, { "epoch": 1.139689578713969, - "grad_norm": 1.2638076543807983, - "learning_rate": 8.718243565827927e-06, - "loss": 0.44444531202316284, + "grad_norm": 1.1057777404785156, + "learning_rate": 4.359121782913964e-06, + "loss": 0.4450605511665344, "step": 514 }, { "epoch": 1.1441241685144123, - "grad_norm": 1.3325319290161133, - "learning_rate": 8.706685165909817e-06, - "loss": 0.9025118350982666, + "grad_norm": 1.3963892459869385, + "learning_rate": 4.3533425829549085e-06, + "loss": 0.9792951345443726, "step": 516 }, { "epoch": 1.1485587583148558, - "grad_norm": 1.31704843044281, - "learning_rate": 8.695083623934872e-06, - "loss": 1.0184192657470703, + "grad_norm": 1.4983371496200562, + "learning_rate": 4.347541811967436e-06, + "loss": 1.1121540069580078, "step": 518 }, { "epoch": 1.1529933481152994, - "grad_norm": 0.9442891478538513, - "learning_rate": 8.683439095987758e-06, - "loss": 1.075165033340454, + "grad_norm": 0.9903774261474609, + "learning_rate": 4.341719547993879e-06, + "loss": 1.130444049835205, "step": 520 }, { "epoch": 1.1574279379157428, - "grad_norm": 2.553899049758911, - "learning_rate": 8.671751738731464e-06, - "loss": 0.545891284942627, + "grad_norm": 1.0208805799484253, + "learning_rate": 4.335875869365732e-06, + "loss": 0.5359130501747131, "step": 522 }, { "epoch": 1.1618625277161863, - "grad_norm": 1.0405535697937012, - "learning_rate": 8.660021709405197e-06, - "loss": 0.9202799201011658, + "grad_norm": 1.0656358003616333, + "learning_rate": 4.330010854702598e-06, + "loss": 1.042147159576416, "step": 524 }, { "epoch": 1.1662971175166297, - "grad_norm": 1.757952332496643, - "learning_rate": 8.648249165822265e-06, - "loss": 1.147822380065918, + "grad_norm": 2.328760862350464, + "learning_rate": 4.3241245829111324e-06, + "loss": 1.1488664150238037, "step": 526 }, { "epoch": 1.170731707317073, - "grad_norm": 0.8222943544387817, - "learning_rate": 8.636434266367956e-06, - "loss": 0.6507425904273987, + "grad_norm": 1.1009130477905273, + "learning_rate": 4.318217133183978e-06, + "loss": 0.7168905138969421, "step": 528 }, { "epoch": 1.1751662971175167, - "grad_norm": 0.7118900418281555, - "learning_rate": 8.624577169997394e-06, - "loss": 0.8483346104621887, + "grad_norm": 0.8763664364814758, + "learning_rate": 4.312288584998697e-06, + "loss": 0.8310704827308655, "step": 530 }, { "epoch": 1.1796008869179602, - "grad_norm": 0.7820613384246826, - "learning_rate": 8.612678036233428e-06, - "loss": 0.9108617305755615, + "grad_norm": 0.7050794363021851, + "learning_rate": 4.306339018116714e-06, + "loss": 0.9871019124984741, "step": 532 }, { "epoch": 1.1840354767184036, - "grad_norm": 1.182815670967102, - "learning_rate": 8.600737025164454e-06, - "loss": 1.0241936445236206, + "grad_norm": 1.0369445085525513, + "learning_rate": 4.300368512582227e-06, + "loss": 1.096502423286438, "step": 534 }, { "epoch": 1.188470066518847, - "grad_norm": 0.36487239599227905, - "learning_rate": 8.588754297442288e-06, - "loss": 0.7436657547950745, + "grad_norm": 0.4809003472328186, + "learning_rate": 4.294377148721144e-06, + "loss": 0.8078370094299316, "step": 536 }, { "epoch": 1.1929046563192904, - "grad_norm": 0.996915876865387, - "learning_rate": 8.576730014279982e-06, - "loss": 1.047356128692627, + "grad_norm": 1.3127416372299194, + "learning_rate": 4.288365007139991e-06, + "loss": 1.0696966648101807, "step": 538 }, { "epoch": 1.1973392461197339, - "grad_norm": 0.9109376072883606, - "learning_rate": 8.564664337449677e-06, - "loss": 0.6739315986633301, + "grad_norm": 1.225541591644287, + "learning_rate": 4.2823321687248386e-06, + "loss": 0.6737713813781738, "step": 540 }, { "epoch": 1.2017738359201773, - "grad_norm": 1.8329638242721558, - "learning_rate": 8.552557429280407e-06, - "loss": 0.6435797214508057, + "grad_norm": 4.297253608703613, + "learning_rate": 4.276278714640203e-06, + "loss": 0.7530418634414673, "step": 542 }, { "epoch": 1.206208425720621, - "grad_norm": 1.6585458517074585, - "learning_rate": 8.540409452655927e-06, - "loss": 1.0112941265106201, + "grad_norm": 2.4006433486938477, + "learning_rate": 4.270204726327963e-06, + "loss": 1.1115124225616455, "step": 544 }, { "epoch": 1.2106430155210643, - "grad_norm": 0.4716590940952301, - "learning_rate": 8.528220571012518e-06, - "loss": 0.7702317237854004, + "grad_norm": 0.5028179883956909, + "learning_rate": 4.264110285506259e-06, + "loss": 0.8297025561332703, "step": 546 }, { "epoch": 1.2150776053215078, - "grad_norm": 1.1359302997589111, - "learning_rate": 8.51599094833679e-06, - "loss": 1.208444356918335, + "grad_norm": 1.1112703084945679, + "learning_rate": 4.257995474168395e-06, + "loss": 1.2000458240509033, "step": 548 }, { "epoch": 1.2195121951219512, - "grad_norm": 1.7960258722305298, - "learning_rate": 8.503720749163472e-06, - "loss": 0.6087892055511475, + "grad_norm": 7.750529766082764, + "learning_rate": 4.251860374581736e-06, + "loss": 0.5883951783180237, "step": 550 }, { "epoch": 1.2239467849223946, - "grad_norm": 1.232657551765442, - "learning_rate": 8.491410138573201e-06, - "loss": 1.0305365324020386, + "grad_norm": 1.483959436416626, + "learning_rate": 4.245705069286601e-06, + "loss": 1.0737543106079102, "step": 552 }, { "epoch": 1.2283813747228383, - "grad_norm": 1.60177481174469, - "learning_rate": 8.479059282190298e-06, - "loss": 0.7768429517745972, + "grad_norm": 4.382837772369385, + "learning_rate": 4.239529641095149e-06, + "loss": 0.7496063709259033, "step": 554 }, { "epoch": 1.2328159645232817, - "grad_norm": 0.3955824077129364, - "learning_rate": 8.466668346180548e-06, - "loss": 0.7353140115737915, + "grad_norm": 0.482736200094223, + "learning_rate": 4.233334173090274e-06, + "loss": 0.7637969255447388, "step": 556 }, { "epoch": 1.237250554323725, - "grad_norm": 2.8532001972198486, - "learning_rate": 8.454237497248956e-06, - "loss": 0.6795215010643005, + "grad_norm": 1.713169813156128, + "learning_rate": 4.227118748624478e-06, + "loss": 0.73581862449646, "step": 558 }, { "epoch": 1.2416851441241685, - "grad_norm": 2.764127016067505, - "learning_rate": 8.441766902637506e-06, - "loss": 1.2555410861968994, + "grad_norm": 2.0154945850372314, + "learning_rate": 4.220883451318753e-06, + "loss": 1.2916665077209473, "step": 560 }, { "epoch": 1.246119733924612, - "grad_norm": 2.7635672092437744, - "learning_rate": 8.429256730122909e-06, - "loss": 0.9878131747245789, + "grad_norm": 1.1945637464523315, + "learning_rate": 4.2146283650614545e-06, + "loss": 0.9394684433937073, "step": 562 }, { "epoch": 1.2505543237250554, - "grad_norm": 0.5536887645721436, - "learning_rate": 8.416707148014358e-06, - "loss": 0.6607757806777954, + "grad_norm": 0.8182294368743896, + "learning_rate": 4.208353574007179e-06, + "loss": 0.6380811333656311, "step": 564 }, { "epoch": 1.2549889135254988, - "grad_norm": 2.1173784732818604, - "learning_rate": 8.404118325151245e-06, - "loss": 0.9502476453781128, + "grad_norm": 1.0067088603973389, + "learning_rate": 4.202059162575622e-06, + "loss": 0.9561277627944946, "step": 566 }, { "epoch": 1.2594235033259422, - "grad_norm": 1.0017777681350708, - "learning_rate": 8.391490430900902e-06, - "loss": 0.8671638369560242, + "grad_norm": 1.1284490823745728, + "learning_rate": 4.195745215450451e-06, + "loss": 0.9579231142997742, "step": 568 }, { "epoch": 1.2638580931263859, - "grad_norm": 7.101733207702637, - "learning_rate": 8.378823635156319e-06, - "loss": 1.1196980476379395, + "grad_norm": 0.6526016592979431, + "learning_rate": 4.189411817578159e-06, + "loss": 1.1114004850387573, "step": 570 }, { "epoch": 1.2682926829268293, - "grad_norm": 0.7950246930122375, - "learning_rate": 8.366118108333861e-06, - "loss": 0.7276540398597717, + "grad_norm": 0.7916569113731384, + "learning_rate": 4.1830590541669304e-06, + "loss": 0.7596962451934814, "step": 572 }, { "epoch": 1.2727272727272727, - "grad_norm": 2.6005003452301025, - "learning_rate": 8.353374021370967e-06, - "loss": 1.2642651796340942, + "grad_norm": 1.7166008949279785, + "learning_rate": 4.176687010685484e-06, + "loss": 1.2456161975860596, "step": 574 }, { "epoch": 1.2771618625277161, - "grad_norm": 1.111757516860962, - "learning_rate": 8.340591545723861e-06, - "loss": 0.9772793650627136, + "grad_norm": 0.9254941344261169, + "learning_rate": 4.170295772861931e-06, + "loss": 0.9804297685623169, "step": 576 }, { "epoch": 1.2815964523281598, - "grad_norm": 1.9483938217163086, - "learning_rate": 8.327770853365238e-06, - "loss": 1.144858479499817, + "grad_norm": 0.8669252395629883, + "learning_rate": 4.163885426682619e-06, + "loss": 1.1345466375350952, "step": 578 }, { "epoch": 1.2860310421286032, - "grad_norm": 1.3433754444122314, - "learning_rate": 8.314912116781954e-06, - "loss": 1.0109100341796875, + "grad_norm": 0.9402420520782471, + "learning_rate": 4.157456058390977e-06, + "loss": 1.0704562664031982, "step": 580 }, { "epoch": 1.2904656319290466, - "grad_norm": 3.104252815246582, - "learning_rate": 8.302015508972702e-06, - "loss": 1.3024400472640991, + "grad_norm": 6.258746147155762, + "learning_rate": 4.151007754486351e-06, + "loss": 1.2947583198547363, "step": 582 }, { "epoch": 1.29490022172949, - "grad_norm": 0.7278043031692505, - "learning_rate": 8.289081203445686e-06, - "loss": 0.6852630376815796, + "grad_norm": 0.737083375453949, + "learning_rate": 4.144540601722843e-06, + "loss": 0.782565712928772, "step": 584 }, { "epoch": 1.2993348115299335, - "grad_norm": 0.9191624522209167, - "learning_rate": 8.276109374216286e-06, - "loss": 0.7753503322601318, + "grad_norm": 1.295681118965149, + "learning_rate": 4.138054687108143e-06, + "loss": 0.829471230506897, "step": 586 }, { "epoch": 1.3037694013303769, - "grad_norm": 2.7969019412994385, - "learning_rate": 8.263100195804722e-06, - "loss": 0.5715383291244507, + "grad_norm": 2.530069351196289, + "learning_rate": 4.131550097902361e-06, + "loss": 0.5070698857307434, "step": 588 }, { "epoch": 1.3082039911308203, - "grad_norm": 3.702993392944336, - "learning_rate": 8.250053843233704e-06, - "loss": 1.0089302062988281, + "grad_norm": 4.558071613311768, + "learning_rate": 4.125026921616852e-06, + "loss": 1.0417896509170532, "step": 590 }, { "epoch": 1.3126385809312637, - "grad_norm": 0.8864567279815674, - "learning_rate": 8.236970492026063e-06, - "loss": 0.9880189895629883, + "grad_norm": 1.0858033895492554, + "learning_rate": 4.118485246013031e-06, + "loss": 1.0292699337005615, "step": 592 }, { "epoch": 1.3170731707317074, - "grad_norm": 0.795543372631073, - "learning_rate": 8.223850318202415e-06, - "loss": 1.063581109046936, + "grad_norm": 2.0192573070526123, + "learning_rate": 4.111925159101208e-06, + "loss": 1.0664730072021484, "step": 594 }, { "epoch": 1.3215077605321508, - "grad_norm": 2.8183748722076416, - "learning_rate": 8.210693498278773e-06, - "loss": 1.100631833076477, + "grad_norm": 0.6075845956802368, + "learning_rate": 4.1053467491393864e-06, + "loss": 1.077016830444336, "step": 596 }, { "epoch": 1.3259423503325942, - "grad_norm": 0.9428039789199829, - "learning_rate": 8.197500209264181e-06, - "loss": 1.1460652351379395, + "grad_norm": 1.1085582971572876, + "learning_rate": 4.098750104632091e-06, + "loss": 1.0554149150848389, "step": 598 }, { "epoch": 1.3303769401330376, - "grad_norm": 2.286390781402588, - "learning_rate": 8.18427062865833e-06, - "loss": 0.4320100247859955, + "grad_norm": 0.8516935110092163, + "learning_rate": 4.092135314329165e-06, + "loss": 0.41985565423965454, "step": 600 }, { "epoch": 1.3348115299334813, - "grad_norm": 2.263921022415161, - "learning_rate": 8.171004934449166e-06, - "loss": 0.9320815205574036, + "grad_norm": 1.4308011531829834, + "learning_rate": 4.085502467224583e-06, + "loss": 1.0022186040878296, "step": 602 }, { "epoch": 1.3392461197339247, - "grad_norm": 2.219259023666382, - "learning_rate": 8.157703305110508e-06, - "loss": 0.8366701602935791, + "grad_norm": 0.866374135017395, + "learning_rate": 4.078851652555254e-06, + "loss": 0.8239214420318604, "step": 604 }, { "epoch": 1.3436807095343681, - "grad_norm": 0.45662444829940796, - "learning_rate": 8.144365919599632e-06, - "loss": 0.7252450585365295, + "grad_norm": 0.39041706919670105, + "learning_rate": 4.072182959799816e-06, + "loss": 0.7566671967506409, "step": 606 }, { "epoch": 1.3481152993348116, - "grad_norm": 3.525024890899658, - "learning_rate": 8.130992957354872e-06, - "loss": 1.1409834623336792, + "grad_norm": 3.046255588531494, + "learning_rate": 4.065496478677436e-06, + "loss": 1.0992430448532104, "step": 608 }, { "epoch": 1.352549889135255, - "grad_norm": 1.893685221672058, - "learning_rate": 8.117584598293204e-06, - "loss": 1.1217989921569824, + "grad_norm": 2.365644693374634, + "learning_rate": 4.058792299146602e-06, + "loss": 1.1112772226333618, "step": 610 }, { "epoch": 1.3569844789356984, - "grad_norm": 19.174959182739258, - "learning_rate": 8.104141022807824e-06, - "loss": 0.6694931983947754, + "grad_norm": 0.9957023859024048, + "learning_rate": 4.052070511403912e-06, + "loss": 0.6697097420692444, "step": 612 }, { "epoch": 1.3614190687361418, - "grad_norm": 0.9400426745414734, - "learning_rate": 8.090662411765726e-06, - "loss": 1.087446689605713, + "grad_norm": 1.1376887559890747, + "learning_rate": 4.045331205882863e-06, + "loss": 1.117611050605774, "step": 614 }, { "epoch": 1.3658536585365852, - "grad_norm": 0.36289501190185547, - "learning_rate": 8.077148946505258e-06, - "loss": 0.6697701215744019, + "grad_norm": 0.6812132596969604, + "learning_rate": 4.038574473252629e-06, + "loss": 0.7285422086715698, "step": 616 }, { "epoch": 1.370288248337029, - "grad_norm": 0.8704696893692017, - "learning_rate": 8.063600808833698e-06, - "loss": 0.8936916589736938, + "grad_norm": 1.371950626373291, + "learning_rate": 4.031800404416849e-06, + "loss": 0.9932656288146973, "step": 618 }, { "epoch": 1.3747228381374723, - "grad_norm": 0.25890159606933594, - "learning_rate": 8.050018181024788e-06, - "loss": 0.08715429157018661, + "grad_norm": 0.4800332188606262, + "learning_rate": 4.025009090512394e-06, + "loss": 0.05759064108133316, "step": 620 }, { "epoch": 1.3791574279379157, - "grad_norm": 0.6486095190048218, - "learning_rate": 8.036401245816306e-06, - "loss": 0.991193413734436, + "grad_norm": 0.9680930972099304, + "learning_rate": 4.018200622908153e-06, + "loss": 1.0663511753082275, "step": 622 }, { "epoch": 1.3835920177383592, - "grad_norm": 1.103645920753479, - "learning_rate": 8.022750186407586e-06, - "loss": 0.943727970123291, + "grad_norm": 1.1375492811203003, + "learning_rate": 4.011375093203793e-06, + "loss": 0.9414310455322266, "step": 624 }, { "epoch": 1.3880266075388026, - "grad_norm": 2.296745538711548, - "learning_rate": 8.009065186457061e-06, - "loss": 0.9769091606140137, + "grad_norm": 1.8018168210983276, + "learning_rate": 4.004532593228531e-06, + "loss": 0.9270746111869812, "step": 626 }, { "epoch": 1.3924611973392462, - "grad_norm": 2.5007238388061523, - "learning_rate": 7.995346430079799e-06, - "loss": 0.976685106754303, + "grad_norm": 0.7995503544807434, + "learning_rate": 3.997673215039899e-06, + "loss": 1.022778868675232, "step": 628 }, { "epoch": 1.3968957871396896, - "grad_norm": 1.3621313571929932, - "learning_rate": 7.981594101845012e-06, - "loss": 1.103744387626648, + "grad_norm": 0.6482570171356201, + "learning_rate": 3.990797050922506e-06, + "loss": 1.0561681985855103, "step": 630 }, { "epoch": 1.401330376940133, - "grad_norm": 0.9651110172271729, - "learning_rate": 7.967808386773591e-06, - "loss": 0.7250331044197083, + "grad_norm": 0.9474309086799622, + "learning_rate": 3.9839041933867954e-06, + "loss": 0.7767112851142883, "step": 632 }, { "epoch": 1.4057649667405765, - "grad_norm": 1.190347671508789, - "learning_rate": 7.953989470335592e-06, - "loss": 0.9121408462524414, + "grad_norm": 0.9471772313117981, + "learning_rate": 3.976994735167796e-06, + "loss": 0.9378006458282471, "step": 634 }, { "epoch": 1.41019955654102, - "grad_norm": 0.8217507004737854, - "learning_rate": 7.940137538447769e-06, - "loss": 1.0396496057510376, + "grad_norm": 3.042051315307617, + "learning_rate": 3.970068769223884e-06, + "loss": 1.0586607456207275, "step": 636 }, { "epoch": 1.4146341463414633, - "grad_norm": 7.098917484283447, - "learning_rate": 7.92625277747105e-06, - "loss": 0.8050640225410461, + "grad_norm": 1.6717044115066528, + "learning_rate": 3.963126388735525e-06, + "loss": 0.7560147047042847, "step": 638 }, { "epoch": 1.4190687361419068, - "grad_norm": 1.0582122802734375, - "learning_rate": 7.912335374208043e-06, - "loss": 0.6821240186691284, + "grad_norm": 0.8269588351249695, + "learning_rate": 3.956167687104021e-06, + "loss": 0.6895716190338135, "step": 640 }, { "epoch": 1.4235033259423504, - "grad_norm": 1.3478533029556274, - "learning_rate": 7.898385515900517e-06, - "loss": 0.8159670829772949, + "grad_norm": 2.3829591274261475, + "learning_rate": 3.9491927579502584e-06, + "loss": 0.8010018467903137, "step": 642 }, { "epoch": 1.4279379157427938, - "grad_norm": 1.2756816148757935, - "learning_rate": 7.884403390226883e-06, - "loss": 0.7149834036827087, + "grad_norm": 5.464539527893066, + "learning_rate": 3.9422016951134415e-06, + "loss": 0.6167224645614624, "step": 644 }, { "epoch": 1.4323725055432373, - "grad_norm": 0.8082099556922913, - "learning_rate": 7.870389185299672e-06, - "loss": 1.2317858934402466, + "grad_norm": 0.9182257056236267, + "learning_rate": 3.935194592649836e-06, + "loss": 1.2061957120895386, "step": 646 }, { "epoch": 1.4368070953436807, - "grad_norm": 0.9018802046775818, - "learning_rate": 7.856343089663002e-06, - "loss": 1.0831472873687744, + "grad_norm": 1.2997310161590576, + "learning_rate": 3.928171544831501e-06, + "loss": 1.0815800428390503, "step": 648 }, { "epoch": 1.441241685144124, - "grad_norm": 1.004470944404602, - "learning_rate": 7.842265292290039e-06, - "loss": 1.1530892848968506, + "grad_norm": 0.9832943081855774, + "learning_rate": 3.921132646145019e-06, + "loss": 1.1462043523788452, "step": 650 }, { "epoch": 1.4456762749445677, - "grad_norm": 2.8486859798431396, - "learning_rate": 7.828155982580465e-06, - "loss": 0.8907480835914612, + "grad_norm": 4.495982646942139, + "learning_rate": 3.914077991290232e-06, + "loss": 0.9017251133918762, "step": 652 }, { "epoch": 1.4501108647450112, - "grad_norm": 3.3158531188964844, - "learning_rate": 7.814015350357912e-06, - "loss": 1.065255880355835, + "grad_norm": 3.023244619369507, + "learning_rate": 3.907007675178956e-06, + "loss": 1.071535348892212, "step": 654 }, { "epoch": 1.4545454545454546, - "grad_norm": 2.0944631099700928, - "learning_rate": 7.799843585867426e-06, - "loss": 0.9423489570617676, + "grad_norm": 1.0500205755233765, + "learning_rate": 3.899921792933713e-06, + "loss": 0.8729349374771118, "step": 656 }, { "epoch": 1.458980044345898, - "grad_norm": 1.330924391746521, - "learning_rate": 7.785640879772897e-06, - "loss": 0.797570526599884, + "grad_norm": 0.5229496359825134, + "learning_rate": 3.892820439886448e-06, + "loss": 0.7276408076286316, "step": 658 }, { "epoch": 1.4634146341463414, - "grad_norm": 9.296934127807617, - "learning_rate": 7.771407423154498e-06, - "loss": 1.0860297679901123, + "grad_norm": 1.74760901927948, + "learning_rate": 3.885703711577249e-06, + "loss": 1.0792577266693115, "step": 660 }, { "epoch": 1.4678492239467849, - "grad_norm": 1.0748159885406494, - "learning_rate": 7.757143407506111e-06, - "loss": 1.0053012371063232, + "grad_norm": 0.6744844913482666, + "learning_rate": 3.8785717037530555e-06, + "loss": 1.04912269115448, "step": 662 }, { "epoch": 1.4722838137472283, - "grad_norm": 1.2716701030731201, - "learning_rate": 7.742849024732754e-06, - "loss": 0.7912719249725342, + "grad_norm": 0.9498162865638733, + "learning_rate": 3.871424512366377e-06, + "loss": 0.7877933382987976, "step": 664 }, { "epoch": 1.476718403547672, - "grad_norm": 0.5906314849853516, - "learning_rate": 7.728524467148e-06, - "loss": 0.5228022933006287, + "grad_norm": 0.9392883777618408, + "learning_rate": 3.864262233574e-06, + "loss": 0.5171116590499878, "step": 666 }, { "epoch": 1.4811529933481153, - "grad_norm": 12.187054634094238, - "learning_rate": 7.714169927471379e-06, - "loss": 0.5079471468925476, + "grad_norm": 2.990640640258789, + "learning_rate": 3.857084963735689e-06, + "loss": 0.6348138451576233, "step": 668 }, { "epoch": 1.4855875831485588, - "grad_norm": 1.7716224193572998, - "learning_rate": 7.699785598825805e-06, - "loss": 0.903016209602356, + "grad_norm": 1.4360038042068481, + "learning_rate": 3.849892799412902e-06, + "loss": 0.9652891159057617, "step": 670 }, { "epoch": 1.4900221729490022, - "grad_norm": 2.2982170581817627, - "learning_rate": 7.68537167473496e-06, - "loss": 1.0076159238815308, + "grad_norm": 0.7203241586685181, + "learning_rate": 3.84268583736748e-06, + "loss": 1.0772919654846191, "step": 672 }, { "epoch": 1.4944567627494456, - "grad_norm": 0.5563678741455078, - "learning_rate": 7.670928349120699e-06, - "loss": 0.6532096266746521, + "grad_norm": 0.7038840055465698, + "learning_rate": 3.835464174560349e-06, + "loss": 0.6485112905502319, "step": 674 }, { "epoch": 1.4988913525498893, - "grad_norm": 0.7698723077774048, - "learning_rate": 7.656455816300434e-06, - "loss": 0.7211450934410095, + "grad_norm": 5.711329460144043, + "learning_rate": 3.828227908150217e-06, + "loss": 0.7235292196273804, "step": 676 }, { "epoch": 1.5033259423503327, - "grad_norm": 1.3669579029083252, - "learning_rate": 7.641954270984532e-06, - "loss": 1.0425605773925781, + "grad_norm": 0.828093409538269, + "learning_rate": 3.820977135492266e-06, + "loss": 1.063615083694458, "step": 678 }, { "epoch": 1.507760532150776, - "grad_norm": 0.6051411628723145, - "learning_rate": 7.627423908273683e-06, - "loss": 0.5795204639434814, + "grad_norm": 0.7944739460945129, + "learning_rate": 3.8137119541368415e-06, + "loss": 0.659883975982666, "step": 680 }, { "epoch": 1.5121951219512195, - "grad_norm": 1.3261148929595947, - "learning_rate": 7.61286492365628e-06, - "loss": 0.5524274110794067, + "grad_norm": 7.873995304107666, + "learning_rate": 3.80643246182814e-06, + "loss": 0.5578336715698242, "step": 682 }, { "epoch": 1.516629711751663, - "grad_norm": 2.459198474884033, - "learning_rate": 7.598277513005793e-06, - "loss": 1.0282782316207886, + "grad_norm": 1.194551706314087, + "learning_rate": 3.7991387565028963e-06, + "loss": 1.11123526096344, "step": 684 }, { "epoch": 1.5210643015521064, - "grad_norm": 0.8493002653121948, - "learning_rate": 7.583661872578124e-06, - "loss": 0.961588978767395, + "grad_norm": 1.1621936559677124, + "learning_rate": 3.791830936289062e-06, + "loss": 0.9926000833511353, "step": 686 }, { "epoch": 1.5254988913525498, - "grad_norm": 1.2923295497894287, - "learning_rate": 7.569018199008976e-06, - "loss": 0.5933117270469666, + "grad_norm": 1.093554139137268, + "learning_rate": 3.784509099504488e-06, + "loss": 0.5783771872520447, "step": 688 }, { "epoch": 1.5299334811529932, - "grad_norm": 1.0922276973724365, - "learning_rate": 7.554346689311205e-06, - "loss": 0.4136204123497009, + "grad_norm": 0.998589277267456, + "learning_rate": 3.7771733446556025e-06, + "loss": 0.25470170378685, "step": 690 }, { "epoch": 1.5343680709534369, - "grad_norm": 0.827107310295105, - "learning_rate": 7.539647540872165e-06, - "loss": 0.724818766117096, + "grad_norm": 1.2918857336044312, + "learning_rate": 3.7698237704360826e-06, + "loss": 0.8159583806991577, "step": 692 }, { "epoch": 1.5388026607538803, - "grad_norm": 10.415434837341309, - "learning_rate": 7.5249209514510595e-06, - "loss": 0.9134396314620972, + "grad_norm": 3.156062364578247, + "learning_rate": 3.7624604757255297e-06, + "loss": 0.914296567440033, "step": 694 }, { "epoch": 1.5432372505543237, - "grad_norm": 1.308587908744812, - "learning_rate": 7.510167119176273e-06, - "loss": 0.5761569738388062, + "grad_norm": 1.473464012145996, + "learning_rate": 3.7550835595881365e-06, + "loss": 0.6349583268165588, "step": 696 }, { "epoch": 1.5476718403547673, - "grad_norm": 0.7201342582702637, - "learning_rate": 7.49538624254271e-06, - "loss": 1.1129498481750488, + "grad_norm": 0.6629686951637268, + "learning_rate": 3.747693121271355e-06, + "loss": 1.1163161993026733, "step": 698 }, { "epoch": 1.5521064301552108, - "grad_norm": 0.7750831842422485, - "learning_rate": 7.48057852040913e-06, - "loss": 1.0308855772018433, + "grad_norm": 1.4136548042297363, + "learning_rate": 3.740289260204565e-06, + "loss": 1.0595505237579346, "step": 700 }, { "epoch": 1.5565410199556542, - "grad_norm": 2.1023459434509277, - "learning_rate": 7.465744151995458e-06, - "loss": 1.0691447257995605, + "grad_norm": 0.9893305897712708, + "learning_rate": 3.732872075997729e-06, + "loss": 1.0329627990722656, "step": 702 }, { "epoch": 1.5609756097560976, - "grad_norm": 0.7603687047958374, - "learning_rate": 7.450883336880116e-06, - "loss": 0.8402605652809143, + "grad_norm": 1.2765549421310425, + "learning_rate": 3.725441668440058e-06, + "loss": 0.8997711539268494, "step": 704 }, { "epoch": 1.565410199556541, - "grad_norm": 2.021561861038208, - "learning_rate": 7.435996274997337e-06, - "loss": 0.43568840622901917, + "grad_norm": 25.827316284179688, + "learning_rate": 3.7179981374986683e-06, + "loss": 0.47515010833740234, "step": 706 }, { "epoch": 1.5698447893569845, - "grad_norm": 0.7875716090202332, - "learning_rate": 7.421083166634466e-06, - "loss": 1.1289795637130737, + "grad_norm": 1.0069860219955444, + "learning_rate": 3.710541583317233e-06, + "loss": 1.101843237876892, "step": 708 }, { "epoch": 1.5742793791574279, - "grad_norm": 3.5039725303649902, - "learning_rate": 7.40614421242928e-06, - "loss": 0.9707282781600952, + "grad_norm": 1.7606300115585327, + "learning_rate": 3.70307210621464e-06, + "loss": 0.8734806776046753, "step": 710 }, { "epoch": 1.5787139689578713, - "grad_norm": 1.424716591835022, - "learning_rate": 7.391179613367272e-06, - "loss": 0.6375144720077515, + "grad_norm": 4.4100141525268555, + "learning_rate": 3.695589806683636e-06, + "loss": 0.6428739428520203, "step": 712 }, { "epoch": 1.5831485587583147, - "grad_norm": 0.6600134968757629, - "learning_rate": 7.37618957077896e-06, - "loss": 0.9794768691062927, + "grad_norm": 0.7855361104011536, + "learning_rate": 3.68809478538948e-06, + "loss": 1.0313284397125244, "step": 714 }, { "epoch": 1.5875831485587582, - "grad_norm": 1.2803093194961548, - "learning_rate": 7.361174286337175e-06, - "loss": 1.127861738204956, + "grad_norm": 1.0333298444747925, + "learning_rate": 3.6805871431685875e-06, + "loss": 1.0764654874801636, "step": 716 }, { "epoch": 1.5920177383592018, - "grad_norm": 3.5021018981933594, - "learning_rate": 7.346133962054341e-06, - "loss": 1.2201976776123047, + "grad_norm": 3.6170754432678223, + "learning_rate": 3.6730669810271707e-06, + "loss": 1.3448973894119263, "step": 718 }, { "epoch": 1.5964523281596452, - "grad_norm": 1.4853342771530151, - "learning_rate": 7.33106880027977e-06, - "loss": 0.5727948546409607, + "grad_norm": 1.6662654876708984, + "learning_rate": 3.665534400139885e-06, + "loss": 0.5263369679450989, "step": 720 }, { "epoch": 1.6008869179600886, - "grad_norm": 0.3484899401664734, - "learning_rate": 7.315979003696927e-06, - "loss": 0.5201311111450195, + "grad_norm": 0.41445672512054443, + "learning_rate": 3.6579895018484635e-06, + "loss": 0.5438380837440491, "step": 722 }, { "epoch": 1.6053215077605323, - "grad_norm": 0.6640962958335876, - "learning_rate": 7.300864775320708e-06, - "loss": 1.0324370861053467, + "grad_norm": 4.3432183265686035, + "learning_rate": 3.650432387660354e-06, + "loss": 1.0684770345687866, "step": 724 }, { "epoch": 1.6097560975609757, - "grad_norm": 0.6866331696510315, - "learning_rate": 7.285726318494717e-06, - "loss": 1.0323240756988525, + "grad_norm": 0.9178838133811951, + "learning_rate": 3.6428631592473584e-06, + "loss": 1.0635815858840942, "step": 726 }, { "epoch": 1.6141906873614191, - "grad_norm": 0.8838914632797241, - "learning_rate": 7.2705638368885105e-06, - "loss": 0.708541989326477, + "grad_norm": 1.0494548082351685, + "learning_rate": 3.6352819184442552e-06, + "loss": 0.7950181365013123, "step": 728 }, { "epoch": 1.6186252771618626, - "grad_norm": 0.6660059690475464, - "learning_rate": 7.255377534494875e-06, - "loss": 1.0379067659378052, + "grad_norm": 0.6856399178504944, + "learning_rate": 3.6276887672474374e-06, + "loss": 1.0625864267349243, "step": 730 }, { "epoch": 1.623059866962306, - "grad_norm": 0.61259925365448, - "learning_rate": 7.240167615627082e-06, - "loss": 1.0389152765274048, + "grad_norm": 0.8747734427452087, + "learning_rate": 3.620083807813541e-06, + "loss": 1.0343061685562134, "step": 732 }, { "epoch": 1.6274944567627494, - "grad_norm": 0.6811412572860718, - "learning_rate": 7.224934284916127e-06, - "loss": 0.9704182744026184, + "grad_norm": 0.7687981128692627, + "learning_rate": 3.6124671424580633e-06, + "loss": 1.0561656951904297, "step": 734 }, { "epoch": 1.6319290465631928, - "grad_norm": 0.36625775694847107, - "learning_rate": 7.209677747307982e-06, - "loss": 0.6105785965919495, + "grad_norm": 0.4724279046058655, + "learning_rate": 3.604838873653991e-06, + "loss": 0.7322379350662231, "step": 736 }, { "epoch": 1.6363636363636362, - "grad_norm": 0.6093775033950806, - "learning_rate": 7.194398208060848e-06, - "loss": 1.1938239336013794, + "grad_norm": 0.8618527054786682, + "learning_rate": 3.597199104030424e-06, + "loss": 1.174364447593689, "step": 738 }, { "epoch": 1.6407982261640797, - "grad_norm": 0.6593520045280457, - "learning_rate": 7.179095872742378e-06, - "loss": 1.0288550853729248, + "grad_norm": 1.1735891103744507, + "learning_rate": 3.589547936371189e-06, + "loss": 1.0586884021759033, "step": 740 }, { "epoch": 1.6452328159645233, - "grad_norm": 2.0731823444366455, - "learning_rate": 7.16377094722692e-06, - "loss": 1.107767939567566, + "grad_norm": 1.6278800964355469, + "learning_rate": 3.58188547361346e-06, + "loss": 1.2099087238311768, "step": 742 }, { "epoch": 1.6496674057649667, - "grad_norm": 1.3170353174209595, - "learning_rate": 7.148423637692748e-06, - "loss": 0.8513399362564087, + "grad_norm": 8.375886917114258, + "learning_rate": 3.574211818846374e-06, + "loss": 0.8451070189476013, "step": 744 }, { "epoch": 1.6541019955654102, - "grad_norm": 0.9616439342498779, - "learning_rate": 7.133054150619282e-06, - "loss": 0.9469323754310608, + "grad_norm": 4.998831272125244, + "learning_rate": 3.566527075309641e-06, + "loss": 0.9085059762001038, "step": 746 }, { "epoch": 1.6585365853658538, - "grad_norm": 1.0744072198867798, - "learning_rate": 7.117662692784318e-06, - "loss": 0.6854807734489441, + "grad_norm": 2.622978687286377, + "learning_rate": 3.558831346392159e-06, + "loss": 0.7893009185791016, "step": 748 }, { "epoch": 1.6629711751662972, - "grad_norm": 0.8329254984855652, - "learning_rate": 7.102249471261241e-06, - "loss": 1.0346763134002686, + "grad_norm": 1.087239384651184, + "learning_rate": 3.5511247356306205e-06, + "loss": 1.0533859729766846, "step": 750 }, { "epoch": 1.6674057649667406, - "grad_norm": 1.6085177659988403, - "learning_rate": 7.0868146934162365e-06, - "loss": 0.9381062388420105, + "grad_norm": 0.8943765163421631, + "learning_rate": 3.5434073467081183e-06, + "loss": 0.9193379282951355, "step": 752 }, { "epoch": 1.671840354767184, - "grad_norm": 0.6383267641067505, - "learning_rate": 7.071358566905507e-06, - "loss": 0.6153408885002136, + "grad_norm": 0.7251817584037781, + "learning_rate": 3.5356792834527533e-06, + "loss": 0.555644154548645, "step": 754 }, { "epoch": 1.6762749445676275, - "grad_norm": 0.7473850846290588, - "learning_rate": 7.055881299672476e-06, - "loss": 0.9577728509902954, + "grad_norm": 0.9888861775398254, + "learning_rate": 3.527940649836238e-06, + "loss": 0.9255459308624268, "step": 756 }, { "epoch": 1.680709534368071, - "grad_norm": 0.7026475071907043, - "learning_rate": 7.040383099944988e-06, - "loss": 0.9313692450523376, + "grad_norm": 0.9948540925979614, + "learning_rate": 3.520191549972494e-06, + "loss": 1.0330082178115845, "step": 758 }, { "epoch": 1.6851441241685143, - "grad_norm": 13.826199531555176, - "learning_rate": 7.02486417623251e-06, - "loss": 0.8703269362449646, + "grad_norm": 1.4498493671417236, + "learning_rate": 3.512432088116255e-06, + "loss": 0.9195749759674072, "step": 760 }, { "epoch": 1.6895787139689578, - "grad_norm": 1.2666261196136475, - "learning_rate": 7.009324737323325e-06, - "loss": 1.1479053497314453, + "grad_norm": 2.5919599533081055, + "learning_rate": 3.5046623686616627e-06, + "loss": 1.0231900215148926, "step": 762 }, { "epoch": 1.6940133037694012, - "grad_norm": 1.4821723699569702, - "learning_rate": 6.993764992281722e-06, - "loss": 1.0106499195098877, + "grad_norm": 1.6839133501052856, + "learning_rate": 3.496882496140861e-06, + "loss": 1.0705158710479736, "step": 764 }, { "epoch": 1.6984478935698448, - "grad_norm": 2.4803268909454346, - "learning_rate": 6.978185150445187e-06, - "loss": 0.9789531230926514, + "grad_norm": 2.3896384239196777, + "learning_rate": 3.4890925752225935e-06, + "loss": 0.9851964116096497, "step": 766 }, { "epoch": 1.7028824833702882, - "grad_norm": 1.579527497291565, - "learning_rate": 6.96258542142158e-06, - "loss": 0.6890674233436584, + "grad_norm": 1.7617751359939575, + "learning_rate": 3.48129271071079e-06, + "loss": 0.8785147666931152, "step": 768 }, { "epoch": 1.7073170731707317, - "grad_norm": 1.3111584186553955, - "learning_rate": 6.946966015086321e-06, - "loss": 1.0892305374145508, + "grad_norm": 0.7179524302482605, + "learning_rate": 3.4734830075431605e-06, + "loss": 1.0462019443511963, "step": 770 }, { "epoch": 1.7117516629711753, - "grad_norm": 3.6061043739318848, - "learning_rate": 6.931327141579565e-06, - "loss": 1.0220389366149902, + "grad_norm": 99.32469177246094, + "learning_rate": 3.4656635707897823e-06, + "loss": 1.0003533363342285, "step": 772 }, { "epoch": 1.7161862527716187, - "grad_norm": 0.8147404789924622, - "learning_rate": 6.915669011303374e-06, - "loss": 0.9009866118431091, + "grad_norm": 1.4013028144836426, + "learning_rate": 3.457834505651687e-06, + "loss": 0.9330251216888428, "step": 774 }, { "epoch": 1.7206208425720622, - "grad_norm": 0.8462866544723511, - "learning_rate": 6.899991834918884e-06, - "loss": 1.1094865798950195, + "grad_norm": 1.3153235912322998, + "learning_rate": 3.449995917459442e-06, + "loss": 1.068703532218933, "step": 776 }, { "epoch": 1.7250554323725056, - "grad_norm": 0.7154016494750977, - "learning_rate": 6.884295823343479e-06, - "loss": 1.0449351072311401, + "grad_norm": 1.7329002618789673, + "learning_rate": 3.4421479116717394e-06, + "loss": 1.0447511672973633, "step": 778 }, { "epoch": 1.729490022172949, - "grad_norm": 1.2640376091003418, - "learning_rate": 6.868581187747941e-06, - "loss": 0.6298045516014099, + "grad_norm": 1.7924240827560425, + "learning_rate": 3.4342905938739707e-06, + "loss": 0.6974395513534546, "step": 780 }, { "epoch": 1.7339246119733924, - "grad_norm": 0.6618129014968872, - "learning_rate": 6.852848139553619e-06, - "loss": 0.9618121981620789, + "grad_norm": 0.8617153763771057, + "learning_rate": 3.4264240697768096e-06, + "loss": 1.005650281906128, "step": 782 }, { "epoch": 1.7383592017738358, - "grad_norm": 0.4092734158039093, - "learning_rate": 6.837096890429582e-06, - "loss": 0.6855942606925964, + "grad_norm": 2.041933298110962, + "learning_rate": 3.418548445214791e-06, + "loss": 0.7548515200614929, "step": 784 }, { "epoch": 1.7427937915742793, - "grad_norm": 0.6238394379615784, - "learning_rate": 6.821327652289768e-06, - "loss": 0.7413522005081177, + "grad_norm": 1.0399845838546753, + "learning_rate": 3.410663826144884e-06, + "loss": 0.7364926338195801, "step": 786 }, { "epoch": 1.7472283813747227, - "grad_norm": 1.0907618999481201, - "learning_rate": 6.8055406372901344e-06, - "loss": 0.6723021268844604, + "grad_norm": 1.0632048845291138, + "learning_rate": 3.4027703186450672e-06, + "loss": 0.6422973871231079, "step": 788 }, { "epoch": 1.7516629711751663, - "grad_norm": 3.868008613586426, - "learning_rate": 6.789736057825812e-06, - "loss": 0.6211203932762146, + "grad_norm": 2.727888822555542, + "learning_rate": 3.394868028912906e-06, + "loss": 0.6838661432266235, "step": 790 }, { "epoch": 1.7560975609756098, - "grad_norm": 0.46340852975845337, - "learning_rate": 6.77391412652823e-06, - "loss": 0.6684106588363647, + "grad_norm": 0.47422733902931213, + "learning_rate": 3.386957063264115e-06, + "loss": 0.6767208576202393, "step": 792 }, { "epoch": 1.7605321507760532, - "grad_norm": 0.6993800401687622, - "learning_rate": 6.758075056262271e-06, - "loss": 0.8443524837493896, + "grad_norm": 0.7666518688201904, + "learning_rate": 3.3790375281311355e-06, + "loss": 0.8840131759643555, "step": 794 }, { "epoch": 1.7649667405764968, - "grad_norm": 0.9882437586784363, - "learning_rate": 6.742219060123403e-06, - "loss": 0.7883599400520325, + "grad_norm": 1.755710244178772, + "learning_rate": 3.3711095300617015e-06, + "loss": 0.9250625371932983, "step": 796 }, { "epoch": 1.7694013303769403, - "grad_norm": 0.7495700716972351, - "learning_rate": 6.7263463514348095e-06, - "loss": 0.9688935875892639, + "grad_norm": 0.984292209148407, + "learning_rate": 3.3631731757174048e-06, + "loss": 1.0221278667449951, "step": 798 }, { "epoch": 1.7738359201773837, - "grad_norm": 0.7344732880592346, - "learning_rate": 6.710457143744519e-06, - "loss": 1.0964137315750122, + "grad_norm": 2.9353697299957275, + "learning_rate": 3.3552285718722593e-06, + "loss": 1.0566647052764893, "step": 800 }, { "epoch": 1.778270509977827, - "grad_norm": 1.3385992050170898, - "learning_rate": 6.6945516508225325e-06, - "loss": 0.9830183982849121, + "grad_norm": 2.145174980163574, + "learning_rate": 3.3472758254112662e-06, + "loss": 1.1013846397399902, "step": 802 }, { "epoch": 1.7827050997782705, - "grad_norm": 1.4036580324172974, - "learning_rate": 6.678630086657959e-06, - "loss": 1.1154334545135498, + "grad_norm": 1.3626816272735596, + "learning_rate": 3.3393150433289795e-06, + "loss": 1.1163952350616455, "step": 804 }, { "epoch": 1.787139689578714, - "grad_norm": 1.4150375127792358, - "learning_rate": 6.662692665456115e-06, - "loss": 0.7647712230682373, + "grad_norm": 1.1192476749420166, + "learning_rate": 3.3313463327280576e-06, + "loss": 0.7770203948020935, "step": 806 }, { "epoch": 1.7915742793791574, - "grad_norm": 1.0048019886016846, - "learning_rate": 6.646739601635661e-06, - "loss": 1.0117907524108887, + "grad_norm": 1.5108779668807983, + "learning_rate": 3.3233698008178306e-06, + "loss": 1.042984962463379, "step": 808 }, { "epoch": 1.7960088691796008, - "grad_norm": 0.8310498595237732, - "learning_rate": 6.6307711098257074e-06, - "loss": 0.5957526564598083, + "grad_norm": 1.3956575393676758, + "learning_rate": 3.3153855549128537e-06, + "loss": 0.5932509899139404, "step": 810 }, { "epoch": 1.8004434589800442, - "grad_norm": 1.8356163501739502, - "learning_rate": 6.6147874048629294e-06, - "loss": 0.6535270810127258, + "grad_norm": 5.858518600463867, + "learning_rate": 3.3073937024314647e-06, + "loss": 0.5802476406097412, "step": 812 }, { "epoch": 1.8048780487804879, - "grad_norm": 2.6936192512512207, - "learning_rate": 6.598788701788677e-06, - "loss": 1.2300708293914795, + "grad_norm": 1.5974634885787964, + "learning_rate": 3.2993943508943386e-06, + "loss": 1.1237906217575073, "step": 814 }, { "epoch": 1.8093126385809313, - "grad_norm": 1.4539717435836792, - "learning_rate": 6.582775215846082e-06, - "loss": 0.9219212532043457, + "grad_norm": 2.2132883071899414, + "learning_rate": 3.291387607923041e-06, + "loss": 0.9052677154541016, "step": 816 }, { "epoch": 1.8137472283813747, - "grad_norm": 1.1613236665725708, - "learning_rate": 6.566747162477164e-06, - "loss": 0.4706512689590454, + "grad_norm": 1.916467547416687, + "learning_rate": 3.283373581238582e-06, + "loss": 0.4801388084888458, "step": 818 }, { "epoch": 1.8181818181818183, - "grad_norm": 0.9962729811668396, - "learning_rate": 6.5507047573199235e-06, - "loss": 1.0331782102584839, + "grad_norm": 0.9333745241165161, + "learning_rate": 3.2753523786599618e-06, + "loss": 1.098004698753357, "step": 820 }, { "epoch": 1.8226164079822618, - "grad_norm": 1.4629065990447998, - "learning_rate": 6.5346482162054526e-06, - "loss": 1.0425474643707275, + "grad_norm": 0.9473229050636292, + "learning_rate": 3.2673241081027263e-06, + "loss": 1.0623825788497925, "step": 822 }, { "epoch": 1.8270509977827052, - "grad_norm": 0.9501014351844788, - "learning_rate": 6.518577755155024e-06, - "loss": 1.1319053173065186, + "grad_norm": 1.0046288967132568, + "learning_rate": 3.259288877577512e-06, + "loss": 1.1435964107513428, "step": 824 }, { "epoch": 1.8314855875831486, - "grad_norm": 2.260728120803833, - "learning_rate": 6.502493590377184e-06, - "loss": 0.9275112748146057, + "grad_norm": 1.3672471046447754, + "learning_rate": 3.251246795188592e-06, + "loss": 0.9531443119049072, "step": 826 }, { "epoch": 1.835920177383592, - "grad_norm": 1.412589192390442, - "learning_rate": 6.48639593826485e-06, - "loss": 0.8285998106002808, + "grad_norm": 4.6819353103637695, + "learning_rate": 3.243197969132425e-06, + "loss": 0.785484790802002, "step": 828 }, { "epoch": 1.8403547671840355, - "grad_norm": 0.8061251640319824, - "learning_rate": 6.4702850153923915e-06, - "loss": 0.9728037714958191, + "grad_norm": 0.6413378715515137, + "learning_rate": 3.2351425076961957e-06, + "loss": 1.0066115856170654, "step": 830 }, { "epoch": 1.8447893569844789, - "grad_norm": 3.3840270042419434, - "learning_rate": 6.45416103851272e-06, - "loss": 0.9926842451095581, + "grad_norm": 0.9913519620895386, + "learning_rate": 3.22708051925636e-06, + "loss": 1.0463634729385376, "step": 832 }, { "epoch": 1.8492239467849223, - "grad_norm": 0.75401771068573, - "learning_rate": 6.438024224554378e-06, - "loss": 0.9589172005653381, + "grad_norm": 0.8798125386238098, + "learning_rate": 3.219012112277189e-06, + "loss": 1.014890432357788, "step": 834 }, { "epoch": 1.8536585365853657, - "grad_norm": 0.8730403184890747, - "learning_rate": 6.421874790618608e-06, - "loss": 0.9601749181747437, + "grad_norm": 0.8148372769355774, + "learning_rate": 3.210937395309304e-06, + "loss": 1.0661628246307373, "step": 836 }, { "epoch": 1.8580931263858091, - "grad_norm": 7.889377593994141, - "learning_rate": 6.405712953976444e-06, - "loss": 0.9879204034805298, + "grad_norm": 3.0656192302703857, + "learning_rate": 3.202856476988222e-06, + "loss": 1.011720061302185, "step": 838 }, { "epoch": 1.8625277161862528, - "grad_norm": 1.303679347038269, - "learning_rate": 6.389538932065783e-06, - "loss": 1.1837224960327148, + "grad_norm": 0.9508707523345947, + "learning_rate": 3.1947694660328914e-06, + "loss": 1.1549465656280518, "step": 840 }, { "epoch": 1.8669623059866962, - "grad_norm": 2.8485257625579834, - "learning_rate": 6.373352942488455e-06, - "loss": 0.7557274103164673, + "grad_norm": 1.2188172340393066, + "learning_rate": 3.1866764712442273e-06, + "loss": 0.6235454082489014, "step": 842 }, { "epoch": 1.8713968957871396, - "grad_norm": 1.183737874031067, - "learning_rate": 6.357155203007307e-06, - "loss": 0.7274500131607056, + "grad_norm": 1.0734593868255615, + "learning_rate": 3.1785776015036533e-06, + "loss": 0.7762396335601807, "step": 844 }, { "epoch": 1.8758314855875833, - "grad_norm": 1.8683289289474487, - "learning_rate": 6.340945931543263e-06, - "loss": 0.8478662371635437, + "grad_norm": 8.548871994018555, + "learning_rate": 3.1704729657716314e-06, + "loss": 0.8318262696266174, "step": 846 }, { "epoch": 1.8802660753880267, - "grad_norm": 2.6959264278411865, - "learning_rate": 6.324725346172399e-06, - "loss": 0.8910980820655823, + "grad_norm": 1.0601602792739868, + "learning_rate": 3.1623626730861996e-06, + "loss": 1.0204439163208008, "step": 848 }, { "epoch": 1.8847006651884701, - "grad_norm": 1.2880501747131348, - "learning_rate": 6.308493665123e-06, - "loss": 1.2397483587265015, + "grad_norm": 0.9268496632575989, + "learning_rate": 3.1542468325615e-06, + "loss": 1.2579779624938965, "step": 850 }, { "epoch": 1.8891352549889135, - "grad_norm": 2.6425297260284424, - "learning_rate": 6.2922511067726365e-06, - "loss": 0.6860370635986328, + "grad_norm": 1.1618844270706177, + "learning_rate": 3.1461255533863183e-06, + "loss": 0.7151064872741699, "step": 852 }, { "epoch": 1.893569844789357, - "grad_norm": 1.1020407676696777, - "learning_rate": 6.2759978896452155e-06, - "loss": 1.0302581787109375, + "grad_norm": 1.2911403179168701, + "learning_rate": 3.1379989448226077e-06, + "loss": 1.0588139295578003, "step": 854 }, { "epoch": 1.8980044345898004, - "grad_norm": 1.1593939065933228, - "learning_rate": 6.259734232408047e-06, - "loss": 0.9292960166931152, + "grad_norm": 1.2581367492675781, + "learning_rate": 3.1298671162040236e-06, + "loss": 0.9134914875030518, "step": 856 }, { "epoch": 1.9024390243902438, - "grad_norm": 0.899362325668335, - "learning_rate": 6.2434603538688975e-06, - "loss": 0.726291835308075, + "grad_norm": 0.9134765267372131, + "learning_rate": 3.1217301769344488e-06, + "loss": 0.723232090473175, "step": 858 }, { "epoch": 1.9068736141906872, - "grad_norm": 2.7879903316497803, - "learning_rate": 6.2271764729730525e-06, - "loss": 0.7539620995521545, + "grad_norm": 1.5111923217773438, + "learning_rate": 3.1135882364865262e-06, + "loss": 0.7724399566650391, "step": 860 }, { "epoch": 1.9113082039911307, - "grad_norm": 2.1077184677124023, - "learning_rate": 6.210882808800366e-06, - "loss": 1.0768671035766602, + "grad_norm": 2.6118454933166504, + "learning_rate": 3.105441404400183e-06, + "loss": 1.0999490022659302, "step": 862 }, { "epoch": 1.9157427937915743, - "grad_norm": 0.9502280354499817, - "learning_rate": 6.19457958056231e-06, - "loss": 0.7523056864738464, + "grad_norm": 0.951991617679596, + "learning_rate": 3.097289790281155e-06, + "loss": 0.7594307661056519, "step": 864 }, { "epoch": 1.9201773835920177, - "grad_norm": 1.5630991458892822, - "learning_rate": 6.178267007599034e-06, - "loss": 0.6150217652320862, + "grad_norm": 5.733903884887695, + "learning_rate": 3.089133503799517e-06, + "loss": 0.6187224388122559, "step": 866 }, { "epoch": 1.9246119733924612, - "grad_norm": 1.0712734460830688, - "learning_rate": 6.161945309376409e-06, - "loss": 1.0981650352478027, + "grad_norm": 0.7913883924484253, + "learning_rate": 3.0809726546882045e-06, + "loss": 1.1264593601226807, "step": 868 }, { "epoch": 1.9290465631929048, - "grad_norm": 0.8349732160568237, - "learning_rate": 6.145614705483075e-06, - "loss": 1.0270382165908813, + "grad_norm": 0.7077911496162415, + "learning_rate": 3.0728073527415376e-06, + "loss": 1.0579288005828857, "step": 870 }, { "epoch": 1.9334811529933482, - "grad_norm": 0.5500652194023132, - "learning_rate": 6.129275415627485e-06, - "loss": 0.7593640685081482, + "grad_norm": 0.4359738826751709, + "learning_rate": 3.0646377078137424e-06, + "loss": 0.693384051322937, "step": 872 }, { "epoch": 1.9379157427937916, - "grad_norm": 1.514795184135437, - "learning_rate": 6.11292765963495e-06, - "loss": 0.6513434052467346, + "grad_norm": 1.4608731269836426, + "learning_rate": 3.056463829817475e-06, + "loss": 0.6210131049156189, "step": 874 }, { "epoch": 1.942350332594235, - "grad_norm": 0.9090774655342102, - "learning_rate": 6.09657165744469e-06, - "loss": 0.6522112488746643, + "grad_norm": 1.902843952178955, + "learning_rate": 3.048285828722345e-06, + "loss": 0.6315315961837769, "step": 876 }, { "epoch": 1.9467849223946785, - "grad_norm": 0.8675610423088074, - "learning_rate": 6.080207629106859e-06, - "loss": 0.740065336227417, + "grad_norm": 0.7998486757278442, + "learning_rate": 3.0401038145534297e-06, + "loss": 0.8021946549415588, "step": 878 }, { "epoch": 1.951219512195122, - "grad_norm": 1.5948195457458496, - "learning_rate": 6.063835794779598e-06, - "loss": 0.8240799307823181, + "grad_norm": 1.0074536800384521, + "learning_rate": 3.031917897389799e-06, + "loss": 0.8259488344192505, "step": 880 }, { "epoch": 1.9556541019955653, - "grad_norm": 3.2498433589935303, - "learning_rate": 6.047456374726067e-06, - "loss": 0.5737804174423218, + "grad_norm": 2.5570080280303955, + "learning_rate": 3.0237281873630335e-06, + "loss": 0.6797860860824585, "step": 882 }, { "epoch": 1.9600886917960088, - "grad_norm": 0.8827385902404785, - "learning_rate": 6.031069589311481e-06, - "loss": 0.7567857503890991, + "grad_norm": 0.7867917418479919, + "learning_rate": 3.0155347946557407e-06, + "loss": 0.8724421262741089, "step": 884 }, { "epoch": 1.9645232815964522, - "grad_norm": 1.2192054986953735, - "learning_rate": 6.01467565900015e-06, - "loss": 0.9545295238494873, + "grad_norm": 0.7262649536132812, + "learning_rate": 3.007337829500075e-06, + "loss": 1.0460387468338013, "step": 886 }, { "epoch": 1.9689578713968958, - "grad_norm": 0.7741813659667969, - "learning_rate": 5.99827480435251e-06, - "loss": 0.9531633257865906, + "grad_norm": 0.5823050737380981, + "learning_rate": 2.999137402176255e-06, + "loss": 0.9942973852157593, "step": 888 }, { "epoch": 1.9733924611973392, - "grad_norm": 0.8368670344352722, - "learning_rate": 5.981867246022149e-06, - "loss": 1.0802265405654907, + "grad_norm": 0.9097557067871094, + "learning_rate": 2.9909336230110747e-06, + "loss": 1.0488195419311523, "step": 890 }, { "epoch": 1.9778270509977827, - "grad_norm": 1.4058384895324707, - "learning_rate": 5.965453204752855e-06, - "loss": 1.0436638593673706, + "grad_norm": 0.9082350730895996, + "learning_rate": 2.9827266023764274e-06, + "loss": 1.0950303077697754, "step": 892 }, { "epoch": 1.9822616407982263, - "grad_norm": 1.1106218099594116, - "learning_rate": 5.949032901375627e-06, - "loss": 1.013320803642273, + "grad_norm": 1.0125060081481934, + "learning_rate": 2.9745164506878134e-06, + "loss": 1.032450556755066, "step": 894 }, { "epoch": 1.9866962305986697, - "grad_norm": 5.496564865112305, - "learning_rate": 5.932606556805719e-06, - "loss": 0.8702308535575867, + "grad_norm": 2.043154001235962, + "learning_rate": 2.9663032784028596e-06, + "loss": 0.8691053986549377, "step": 896 }, { "epoch": 1.9911308203991132, - "grad_norm": 0.7846924066543579, - "learning_rate": 5.916174392039659e-06, - "loss": 0.9531219005584717, + "grad_norm": 1.362161636352539, + "learning_rate": 2.9580871960198297e-06, + "loss": 1.0347143411636353, "step": 898 }, { "epoch": 1.9955654101995566, - "grad_norm": 1.4032496213912964, - "learning_rate": 5.899736628152284e-06, - "loss": 0.7455130815505981, + "grad_norm": 0.7828076481819153, + "learning_rate": 2.949868314076142e-06, + "loss": 0.7763835191726685, "step": 900 }, { "epoch": 2.0, - "grad_norm": 0.851958155632019, - "learning_rate": 5.88329348629375e-06, - "loss": 0.9732981324195862, + "grad_norm": 0.5598412752151489, + "learning_rate": 2.941646743146875e-06, + "loss": 1.0524572134017944, "step": 902 }, { "epoch": 2.0044345898004434, - "grad_norm": 0.8989659547805786, - "learning_rate": 5.8668451876865736e-06, - "loss": 0.6848430633544922, + "grad_norm": 0.6863189339637756, + "learning_rate": 2.9334225938432868e-06, + "loss": 0.8581877946853638, "step": 904 }, { "epoch": 2.008869179600887, - "grad_norm": 0.9057672023773193, - "learning_rate": 5.850391953622652e-06, - "loss": 0.7127602100372314, + "grad_norm": 0.7125548720359802, + "learning_rate": 2.925195976811326e-06, + "loss": 0.8792709708213806, "step": 906 }, { "epoch": 2.0133037694013303, - "grad_norm": 0.8921846747398376, - "learning_rate": 5.8339340054602775e-06, - "loss": 0.851362943649292, + "grad_norm": 0.6354836821556091, + "learning_rate": 2.9169670027301387e-06, + "loss": 1.0257556438446045, "step": 908 }, { "epoch": 2.0177383592017737, - "grad_norm": 0.8631959557533264, - "learning_rate": 5.817471564621169e-06, - "loss": 0.7096256017684937, + "grad_norm": 0.9187865853309631, + "learning_rate": 2.9087357823105843e-06, + "loss": 0.8405720591545105, "step": 910 }, { "epoch": 2.022172949002217, - "grad_norm": 1.8130816221237183, - "learning_rate": 5.801004852587485e-06, - "loss": 0.5044680237770081, + "grad_norm": 1.2554885149002075, + "learning_rate": 2.9005024262937427e-06, + "loss": 0.5283271074295044, "step": 912 }, { "epoch": 2.0266075388026605, - "grad_norm": 1.5005384683609009, - "learning_rate": 5.784534090898849e-06, - "loss": 0.5559933185577393, + "grad_norm": 1.8454920053482056, + "learning_rate": 2.8922670454494247e-06, + "loss": 0.6945717334747314, "step": 914 }, { "epoch": 2.0310421286031044, - "grad_norm": 1.5633256435394287, - "learning_rate": 5.768059501149369e-06, - "loss": 0.6281445622444153, + "grad_norm": 1.0446429252624512, + "learning_rate": 2.8840297505746843e-06, + "loss": 0.5962017774581909, "step": 916 }, { "epoch": 2.035476718403548, - "grad_norm": 1.614864706993103, - "learning_rate": 5.751581304984657e-06, - "loss": 0.6711671948432922, + "grad_norm": 0.8583077192306519, + "learning_rate": 2.8757906524923286e-06, + "loss": 0.9059169888496399, "step": 918 }, { "epoch": 2.0399113082039912, - "grad_norm": 1.0288786888122559, - "learning_rate": 5.735099724098838e-06, - "loss": 0.5363720655441284, + "grad_norm": 0.7749403715133667, + "learning_rate": 2.867549862049419e-06, + "loss": 0.6539976000785828, "step": 920 }, { "epoch": 2.0443458980044347, - "grad_norm": 6.116771221160889, - "learning_rate": 5.718614980231582e-06, - "loss": 0.5760122537612915, + "grad_norm": 0.7740585207939148, + "learning_rate": 2.859307490115791e-06, + "loss": 0.7305862307548523, "step": 922 }, { "epoch": 2.048780487804878, - "grad_norm": 0.6346896290779114, - "learning_rate": 5.702127295165107e-06, - "loss": 0.1978059560060501, + "grad_norm": 0.5781393051147461, + "learning_rate": 2.8510636475825533e-06, + "loss": 0.24691088497638702, "step": 924 }, { "epoch": 2.0532150776053215, - "grad_norm": 1.3448708057403564, - "learning_rate": 5.685636890721205e-06, - "loss": 0.8290249705314636, + "grad_norm": 0.9956364035606384, + "learning_rate": 2.8428184453606027e-06, + "loss": 0.8243362307548523, "step": 926 }, { "epoch": 2.057649667405765, - "grad_norm": 2.6560189723968506, - "learning_rate": 5.669143988758253e-06, - "loss": 0.5688458681106567, + "grad_norm": 6.8051276206970215, + "learning_rate": 2.8345719943791266e-06, + "loss": 0.6835483312606812, "step": 928 }, { "epoch": 2.0620842572062084, - "grad_norm": 0.2830648124217987, - "learning_rate": 5.652648811168228e-06, - "loss": 0.4656969904899597, + "grad_norm": 0.4145214855670929, + "learning_rate": 2.826324405584114e-06, + "loss": 0.5555226802825928, "step": 930 }, { "epoch": 2.066518847006652, - "grad_norm": 2.9565589427948, - "learning_rate": 5.636151579873726e-06, - "loss": 0.5648703575134277, + "grad_norm": 15.418578147888184, + "learning_rate": 2.818075789936863e-06, + "loss": 0.6951206922531128, "step": 932 }, { "epoch": 2.070953436807095, - "grad_norm": 0.8213310837745667, - "learning_rate": 5.619652516824967e-06, - "loss": 0.6612739562988281, + "grad_norm": 2.4657557010650635, + "learning_rate": 2.8098262584124834e-06, + "loss": 0.9028570652008057, "step": 934 }, { "epoch": 2.0753880266075386, - "grad_norm": 1.1572480201721191, - "learning_rate": 5.603151843996822e-06, - "loss": 0.7529350519180298, + "grad_norm": 1.7383790016174316, + "learning_rate": 2.801575921998411e-06, + "loss": 0.8572009205818176, "step": 936 }, { "epoch": 2.079822616407982, - "grad_norm": 1.7981088161468506, - "learning_rate": 5.586649783385813e-06, - "loss": 0.5522570610046387, + "grad_norm": 0.8971472382545471, + "learning_rate": 2.7933248916929066e-06, + "loss": 0.6242398023605347, "step": 938 }, { "epoch": 2.084257206208426, - "grad_norm": 1.3611361980438232, - "learning_rate": 5.570146557007141e-06, - "loss": 0.48875561356544495, + "grad_norm": 0.8894023299217224, + "learning_rate": 2.7850732785035705e-06, + "loss": 0.5844857692718506, "step": 940 }, { "epoch": 2.0886917960088693, - "grad_norm": 1.683354139328003, - "learning_rate": 5.553642386891683e-06, - "loss": 0.811143159866333, + "grad_norm": 1.6542750597000122, + "learning_rate": 2.7768211934458417e-06, + "loss": 0.8088274002075195, "step": 942 }, { "epoch": 2.0931263858093128, - "grad_norm": 1.7977195978164673, - "learning_rate": 5.537137495083018e-06, - "loss": 0.45032718777656555, + "grad_norm": 1.1526122093200684, + "learning_rate": 2.768568747541509e-06, + "loss": 0.48685911297798157, "step": 944 }, { "epoch": 2.097560975609756, - "grad_norm": 1.0323246717453003, - "learning_rate": 5.5206321036344304e-06, - "loss": 0.703310489654541, + "grad_norm": 0.9583385586738586, + "learning_rate": 2.7603160518172152e-06, + "loss": 0.8856199979782104, "step": 946 }, { "epoch": 2.1019955654101996, - "grad_norm": 1.2375712394714355, - "learning_rate": 5.504126434605932e-06, - "loss": 0.7233847379684448, + "grad_norm": 1.043729543685913, + "learning_rate": 2.752063217302966e-06, + "loss": 0.8684977293014526, "step": 948 }, { "epoch": 2.106430155210643, - "grad_norm": 1.348511815071106, - "learning_rate": 5.487620710061262e-06, - "loss": 0.4606630802154541, + "grad_norm": 1.6375235319137573, + "learning_rate": 2.743810355030631e-06, + "loss": 0.5918598175048828, "step": 950 }, { "epoch": 2.1108647450110865, - "grad_norm": 1.3615354299545288, - "learning_rate": 5.471115152064916e-06, - "loss": 0.5963136553764343, + "grad_norm": 0.9976754188537598, + "learning_rate": 2.735557576032458e-06, + "loss": 0.7557308673858643, "step": 952 }, { "epoch": 2.11529933481153, - "grad_norm": 3.1463687419891357, - "learning_rate": 5.454609982679138e-06, - "loss": 0.5811668038368225, + "grad_norm": 1.8317605257034302, + "learning_rate": 2.727304991339569e-06, + "loss": 0.7505866289138794, "step": 954 }, { "epoch": 2.1197339246119733, - "grad_norm": 1.2822504043579102, - "learning_rate": 5.4381054239609525e-06, - "loss": 0.7643156051635742, + "grad_norm": 0.8461515307426453, + "learning_rate": 2.7190527119804762e-06, + "loss": 0.8690387010574341, "step": 956 }, { "epoch": 2.1241685144124167, - "grad_norm": 1.20485258102417, - "learning_rate": 5.421601697959164e-06, - "loss": 0.5839754343032837, + "grad_norm": 1.2172333002090454, + "learning_rate": 2.710800848979582e-06, + "loss": 0.7549737691879272, "step": 958 }, { "epoch": 2.12860310421286, - "grad_norm": 2.490534782409668, - "learning_rate": 5.405099026711374e-06, - "loss": 0.5331164598464966, + "grad_norm": 2.6961116790771484, + "learning_rate": 2.702549513355687e-06, + "loss": 0.6273566484451294, "step": 960 }, { "epoch": 2.1330376940133036, - "grad_norm": 3.638983726501465, - "learning_rate": 5.388597632240994e-06, - "loss": 0.40006810426712036, + "grad_norm": 2.817370653152466, + "learning_rate": 2.694298816120497e-06, + "loss": 0.507022500038147, "step": 962 }, { "epoch": 2.1374722838137474, - "grad_norm": 6.238394737243652, - "learning_rate": 5.372097736554261e-06, - "loss": 0.6906276941299438, + "grad_norm": 3.4701592922210693, + "learning_rate": 2.6860488682771306e-06, + "loss": 0.9122434854507446, "step": 964 }, { "epoch": 2.141906873614191, - "grad_norm": 0.839712917804718, - "learning_rate": 5.35559956163724e-06, - "loss": 0.6319162249565125, + "grad_norm": 1.0491613149642944, + "learning_rate": 2.67779978081862e-06, + "loss": 0.8000502586364746, "step": 966 }, { "epoch": 2.1463414634146343, - "grad_norm": 0.7551127672195435, - "learning_rate": 5.339103329452856e-06, - "loss": 0.41347965598106384, + "grad_norm": 0.8666864037513733, + "learning_rate": 2.669551664726428e-06, + "loss": 0.5371144413948059, "step": 968 }, { "epoch": 2.1507760532150777, - "grad_norm": 0.8068141937255859, - "learning_rate": 5.322609261937887e-06, - "loss": 0.5399714112281799, + "grad_norm": 0.929885983467102, + "learning_rate": 2.6613046309689433e-06, + "loss": 0.7838866710662842, "step": 970 }, { "epoch": 2.155210643015521, - "grad_norm": 1.4304999113082886, - "learning_rate": 5.306117580999993e-06, - "loss": 0.4987761676311493, + "grad_norm": 1.449052095413208, + "learning_rate": 2.6530587904999966e-06, + "loss": 0.6714781522750854, "step": 972 }, { "epoch": 2.1596452328159645, - "grad_norm": 1.4152289628982544, - "learning_rate": 5.289628508514725e-06, - "loss": 0.7639641165733337, + "grad_norm": 1.6018487215042114, + "learning_rate": 2.6448142542573624e-06, + "loss": 0.8650703430175781, "step": 974 }, { "epoch": 2.164079822616408, - "grad_norm": 1.2822636365890503, - "learning_rate": 5.2731422663225385e-06, - "loss": 0.7585563659667969, + "grad_norm": 1.1995538473129272, + "learning_rate": 2.6365711331612692e-06, + "loss": 0.9092912077903748, "step": 976 }, { "epoch": 2.1685144124168514, - "grad_norm": 2.3158578872680664, - "learning_rate": 5.256659076225813e-06, - "loss": 0.3914712071418762, + "grad_norm": 4.522655010223389, + "learning_rate": 2.6283295381129066e-06, + "loss": 0.623193621635437, "step": 978 }, { "epoch": 2.172949002217295, - "grad_norm": 1.916115164756775, - "learning_rate": 5.240179159985866e-06, - "loss": 0.8097031712532043, + "grad_norm": 1.066388487815857, + "learning_rate": 2.620089579992933e-06, + "loss": 0.8715465664863586, "step": 980 }, { "epoch": 2.1773835920177382, - "grad_norm": 1.6557332277297974, - "learning_rate": 5.2237027393199645e-06, - "loss": 0.37278667092323303, + "grad_norm": 0.7982610464096069, + "learning_rate": 2.6118513696599823e-06, + "loss": 0.4557168781757355, "step": 982 }, { "epoch": 2.1818181818181817, - "grad_norm": 1.2614706754684448, - "learning_rate": 5.207230035898356e-06, - "loss": 0.2516429126262665, + "grad_norm": 2.2363100051879883, + "learning_rate": 2.603615017949178e-06, + "loss": 0.2402995377779007, "step": 984 }, { "epoch": 2.186252771618625, - "grad_norm": 1.9887027740478516, - "learning_rate": 5.190761271341268e-06, - "loss": 0.659031331539154, + "grad_norm": 1.7150230407714844, + "learning_rate": 2.595380635670634e-06, + "loss": 0.6125180721282959, "step": 986 }, { "epoch": 2.1906873614190685, - "grad_norm": 3.152275323867798, - "learning_rate": 5.174296667215939e-06, - "loss": 0.28620240092277527, + "grad_norm": 2.03562331199646, + "learning_rate": 2.5871483336079694e-06, + "loss": 0.3789454698562622, "step": 988 }, { "epoch": 2.1951219512195124, - "grad_norm": 0.960594654083252, - "learning_rate": 5.157836445033636e-06, - "loss": 0.702060878276825, + "grad_norm": 0.9788453578948975, + "learning_rate": 2.578918222516818e-06, + "loss": 0.8006269335746765, "step": 990 }, { "epoch": 2.199556541019956, - "grad_norm": 2.6779539585113525, - "learning_rate": 5.141380826246667e-06, - "loss": 0.9569138288497925, + "grad_norm": 3.577479600906372, + "learning_rate": 2.5706904131233336e-06, + "loss": 1.1745080947875977, "step": 992 }, { "epoch": 2.203991130820399, - "grad_norm": 1.3519855737686157, - "learning_rate": 5.124930032245415e-06, - "loss": 0.5782943964004517, + "grad_norm": 0.9117245078086853, + "learning_rate": 2.5624650161227073e-06, + "loss": 0.752780556678772, "step": 994 }, { "epoch": 2.2084257206208426, - "grad_norm": 1.4382355213165283, - "learning_rate": 5.108484284355339e-06, - "loss": 0.7067066431045532, + "grad_norm": 1.0597925186157227, + "learning_rate": 2.5542421421776696e-06, + "loss": 0.92555171251297, "step": 996 }, { "epoch": 2.212860310421286, - "grad_norm": 1.1686522960662842, - "learning_rate": 5.0920438038340194e-06, - "loss": 0.6596247553825378, + "grad_norm": 2.012413740158081, + "learning_rate": 2.5460219019170097e-06, + "loss": 0.7486417293548584, "step": 998 }, { "epoch": 2.2172949002217295, - "grad_norm": 1.4124189615249634, - "learning_rate": 5.075608811868169e-06, - "loss": 0.6456693410873413, + "grad_norm": 1.388223648071289, + "learning_rate": 2.5378044059340845e-06, + "loss": 0.6380313038825989, "step": 1000 }, { "epoch": 2.221729490022173, - "grad_norm": 4.106961727142334, - "learning_rate": 5.059179529570657e-06, - "loss": 0.3303482234477997, + "grad_norm": 2.087331533432007, + "learning_rate": 2.5295897647853283e-06, + "loss": 0.42272722721099854, "step": 1002 }, { "epoch": 2.2261640798226163, - "grad_norm": 0.9121060371398926, - "learning_rate": 5.042756177977534e-06, - "loss": 0.6765180230140686, + "grad_norm": 1.6069846153259277, + "learning_rate": 2.521378088988767e-06, + "loss": 0.9431757926940918, "step": 1004 }, { "epoch": 2.2305986696230597, - "grad_norm": 1.4012293815612793, - "learning_rate": 5.026338978045062e-06, - "loss": 0.5389603972434998, + "grad_norm": 1.133226990699768, + "learning_rate": 2.513169489022531e-06, + "loss": 0.7965492010116577, "step": 1006 }, { "epoch": 2.235033259423503, - "grad_norm": 0.968221127986908, - "learning_rate": 5.009928150646741e-06, - "loss": 0.6899822950363159, + "grad_norm": 2.120872735977173, + "learning_rate": 2.5049640753233705e-06, + "loss": 0.8802080750465393, "step": 1008 }, { "epoch": 2.2394678492239466, - "grad_norm": 1.1386181116104126, - "learning_rate": 4.993523916570334e-06, - "loss": 0.6064386367797852, + "grad_norm": 0.9329570531845093, + "learning_rate": 2.496761958285167e-06, + "loss": 0.5040686130523682, "step": 1010 }, { "epoch": 2.2439024390243905, - "grad_norm": 1.1713107824325562, - "learning_rate": 4.977126496514902e-06, - "loss": 0.6847143769264221, + "grad_norm": 0.7371305227279663, + "learning_rate": 2.488563248257451e-06, + "loss": 0.8810028433799744, "step": 1012 }, { "epoch": 2.248337028824834, - "grad_norm": 1.6454007625579834, - "learning_rate": 4.960736111087827e-06, - "loss": 0.6909704804420471, + "grad_norm": 5.761690139770508, + "learning_rate": 2.4803680555439136e-06, + "loss": 0.7554865479469299, "step": 1014 }, { "epoch": 2.2527716186252773, - "grad_norm": 0.9556133151054382, - "learning_rate": 4.9443529808018545e-06, - "loss": 0.8876364231109619, + "grad_norm": 1.0782852172851562, + "learning_rate": 2.4721764904009272e-06, + "loss": 0.8898205161094666, "step": 1016 }, { "epoch": 2.2572062084257207, - "grad_norm": 1.2127729654312134, - "learning_rate": 4.927977326072115e-06, - "loss": 0.36061155796051025, + "grad_norm": 1.2129498720169067, + "learning_rate": 2.4639886630360574e-06, + "loss": 0.40009406208992004, "step": 1018 }, { "epoch": 2.261640798226164, - "grad_norm": 0.925119936466217, - "learning_rate": 4.911609367213168e-06, - "loss": 0.8120240569114685, + "grad_norm": 0.9777927994728088, + "learning_rate": 2.455804683606584e-06, + "loss": 0.8841888904571533, "step": 1020 }, { "epoch": 2.2660753880266076, - "grad_norm": 0.3137432932853699, - "learning_rate": 4.895249324436035e-06, - "loss": 0.4972486197948456, + "grad_norm": 0.3243866562843323, + "learning_rate": 2.4476246622180174e-06, + "loss": 0.44640085101127625, "step": 1022 }, { "epoch": 2.270509977827051, - "grad_norm": 1.6952173709869385, - "learning_rate": 4.8788974178452316e-06, - "loss": 0.9327743053436279, + "grad_norm": 2.328533411026001, + "learning_rate": 2.4394487089226158e-06, + "loss": 1.2326172590255737, "step": 1024 }, { "epoch": 2.2749445676274944, - "grad_norm": 0.680587112903595, - "learning_rate": 4.86255386743582e-06, - "loss": 0.524124026298523, + "grad_norm": 0.8293269872665405, + "learning_rate": 2.43127693371791e-06, + "loss": 0.6043848395347595, "step": 1026 }, { "epoch": 2.279379157427938, - "grad_norm": 1.0751386880874634, - "learning_rate": 4.846218893090426e-06, - "loss": 0.7662097215652466, + "grad_norm": 1.0006300210952759, + "learning_rate": 2.423109446545213e-06, + "loss": 0.9160785675048828, "step": 1028 }, { "epoch": 2.2838137472283813, - "grad_norm": 0.8584238290786743, - "learning_rate": 4.829892714576307e-06, - "loss": 0.7474344968795776, + "grad_norm": 1.3495242595672607, + "learning_rate": 2.4149463572881537e-06, + "loss": 1.0066651105880737, "step": 1030 }, { "epoch": 2.2882483370288247, - "grad_norm": 2.667839765548706, - "learning_rate": 4.813575551542381e-06, - "loss": 0.6120243072509766, + "grad_norm": 3.227001190185547, + "learning_rate": 2.4067877757711907e-06, + "loss": 0.6590787172317505, "step": 1032 }, { "epoch": 2.292682926829268, - "grad_norm": 2.76969313621521, - "learning_rate": 4.7972676235162714e-06, - "loss": 0.7319304347038269, + "grad_norm": 1.3093198537826538, + "learning_rate": 2.3986338117581357e-06, + "loss": 0.6363720893859863, "step": 1034 }, { "epoch": 2.2971175166297115, - "grad_norm": 1.307244896888733, - "learning_rate": 4.780969149901354e-06, - "loss": 0.7238577604293823, + "grad_norm": 8.721003532409668, + "learning_rate": 2.390484574950677e-06, + "loss": 1.0154914855957031, "step": 1036 }, { "epoch": 2.3015521064301554, - "grad_norm": 2.190412998199463, - "learning_rate": 4.764680349973812e-06, - "loss": 0.732725977897644, + "grad_norm": 2.5220930576324463, + "learning_rate": 2.382340174986906e-06, + "loss": 0.7293195724487305, "step": 1038 }, { "epoch": 2.305986696230599, - "grad_norm": 1.9367870092391968, - "learning_rate": 4.748401442879674e-06, - "loss": 0.6513870358467102, + "grad_norm": 1.6561472415924072, + "learning_rate": 2.374200721439837e-06, + "loss": 0.8729753494262695, "step": 1040 }, { "epoch": 2.3104212860310422, - "grad_norm": 1.6586291790008545, - "learning_rate": 4.732132647631881e-06, - "loss": 0.9142364263534546, + "grad_norm": 1.198063611984253, + "learning_rate": 2.3660663238159405e-06, + "loss": 0.8993839621543884, "step": 1042 }, { "epoch": 2.3148558758314857, - "grad_norm": 0.5376819968223572, - "learning_rate": 4.715874183107324e-06, - "loss": 0.6070502996444702, + "grad_norm": 0.6974464654922485, + "learning_rate": 2.357937091553662e-06, + "loss": 0.6564696431159973, "step": 1044 }, { "epoch": 2.319290465631929, - "grad_norm": 1.9359222650527954, - "learning_rate": 4.699626268043911e-06, - "loss": 0.7449045181274414, + "grad_norm": 2.7228505611419678, + "learning_rate": 2.3498131340219554e-06, + "loss": 0.8685024380683899, "step": 1046 }, { "epoch": 2.3237250554323725, - "grad_norm": 1.128960132598877, - "learning_rate": 4.683389121037618e-06, - "loss": 0.6669731736183167, + "grad_norm": 1.095363974571228, + "learning_rate": 2.341694560518809e-06, + "loss": 0.8607962727546692, "step": 1048 }, { "epoch": 2.328159645232816, - "grad_norm": 0.9983499646186829, - "learning_rate": 4.667162960539552e-06, - "loss": 0.7283903360366821, + "grad_norm": 3.3042118549346924, + "learning_rate": 2.333581480269776e-06, + "loss": 0.8638893365859985, "step": 1050 }, { "epoch": 2.3325942350332594, - "grad_norm": 2.680569887161255, - "learning_rate": 4.650948004853006e-06, - "loss": 0.614159107208252, + "grad_norm": 2.431647539138794, + "learning_rate": 2.325474002426503e-06, + "loss": 0.7261286973953247, "step": 1052 }, { "epoch": 2.337028824833703, - "grad_norm": 2.4752540588378906, - "learning_rate": 4.634744472130529e-06, - "loss": 0.4821033477783203, + "grad_norm": 1.1630088090896606, + "learning_rate": 2.3173722360652644e-06, + "loss": 0.6455481052398682, "step": 1054 }, { "epoch": 2.341463414634146, - "grad_norm": 0.9671631455421448, - "learning_rate": 4.618552580370988e-06, - "loss": 0.6265279054641724, + "grad_norm": 0.9199745655059814, + "learning_rate": 2.309276290185494e-06, + "loss": 0.79603511095047, "step": 1056 }, { "epoch": 2.3458980044345896, - "grad_norm": 0.278334379196167, - "learning_rate": 4.6023725474166324e-06, - "loss": 0.3953332304954529, + "grad_norm": 0.4536246955394745, + "learning_rate": 2.3011862737083162e-06, + "loss": 0.508949875831604, "step": 1058 }, { "epoch": 2.3503325942350335, - "grad_norm": 1.0874994993209839, - "learning_rate": 4.586204590950169e-06, - "loss": 0.8931505084037781, + "grad_norm": 0.858386218547821, + "learning_rate": 2.2931022954750843e-06, + "loss": 0.9388585090637207, "step": 1060 }, { "epoch": 2.354767184035477, - "grad_norm": 3.5059680938720703, - "learning_rate": 4.570048928491824e-06, - "loss": 0.37978875637054443, + "grad_norm": 6.024104118347168, + "learning_rate": 2.285024464245912e-06, + "loss": 0.5697469115257263, "step": 1062 }, { "epoch": 2.3592017738359203, - "grad_norm": 1.128374695777893, - "learning_rate": 4.5539057773964316e-06, - "loss": 0.4617552161216736, + "grad_norm": 0.5417823195457458, + "learning_rate": 2.2769528886982158e-06, + "loss": 0.48910415172576904, "step": 1064 }, { "epoch": 2.3636363636363638, - "grad_norm": 2.8418750762939453, - "learning_rate": 4.537775354850496e-06, - "loss": 0.6248428821563721, + "grad_norm": 3.6564247608184814, + "learning_rate": 2.268887677425248e-06, + "loss": 0.706069827079773, "step": 1066 }, { "epoch": 2.368070953436807, - "grad_norm": 1.0952420234680176, - "learning_rate": 4.5216578778692725e-06, - "loss": 0.7397058010101318, + "grad_norm": 1.85207998752594, + "learning_rate": 2.2608289389346362e-06, + "loss": 0.9581363797187805, "step": 1068 }, { "epoch": 2.3725055432372506, - "grad_norm": 1.1010463237762451, - "learning_rate": 4.5055535632938526e-06, - "loss": 0.6921043395996094, + "grad_norm": 1.0115586519241333, + "learning_rate": 2.2527767816469263e-06, + "loss": 0.847273051738739, "step": 1070 }, { "epoch": 2.376940133037694, - "grad_norm": 1.206845998764038, - "learning_rate": 4.489462627788242e-06, - "loss": 0.8171138167381287, + "grad_norm": 1.160576343536377, + "learning_rate": 2.244731313894121e-06, + "loss": 0.9410176873207092, "step": 1072 }, { "epoch": 2.3813747228381374, - "grad_norm": 0.2211584895849228, - "learning_rate": 4.473385287836448e-06, - "loss": 0.034517209976911545, + "grad_norm": 0.41236647963523865, + "learning_rate": 2.236692643918224e-06, + "loss": 0.03157289698719978, "step": 1074 }, { "epoch": 2.385809312638581, - "grad_norm": 1.39876389503479, - "learning_rate": 4.457321759739567e-06, - "loss": 0.4455287456512451, + "grad_norm": 1.3738679885864258, + "learning_rate": 2.2286608798697834e-06, + "loss": 0.652180016040802, "step": 1076 }, { "epoch": 2.3902439024390243, - "grad_norm": 0.5344479084014893, - "learning_rate": 4.4412722596128686e-06, - "loss": 0.2573848068714142, + "grad_norm": 1.2516071796417236, + "learning_rate": 2.2206361298064343e-06, + "loss": 0.2624179422855377, "step": 1078 }, { "epoch": 2.3946784922394677, - "grad_norm": 3.3447608947753906, - "learning_rate": 4.425237003382903e-06, - "loss": 0.6614237427711487, + "grad_norm": 1.7923249006271362, + "learning_rate": 2.2126185016914515e-06, + "loss": 0.7395876049995422, "step": 1080 }, { "epoch": 2.399113082039911, - "grad_norm": 0.2269127368927002, - "learning_rate": 4.409216206784577e-06, - "loss": 0.4690076410770416, + "grad_norm": 0.4336431622505188, + "learning_rate": 2.2046081033922884e-06, + "loss": 0.5255239009857178, "step": 1082 }, { "epoch": 2.4035476718403546, - "grad_norm": 0.7799621820449829, - "learning_rate": 4.393210085358265e-06, - "loss": 0.5260664224624634, + "grad_norm": 2.713783025741577, + "learning_rate": 2.1966050426791325e-06, + "loss": 0.7811592817306519, "step": 1084 }, { "epoch": 2.4079822616407984, - "grad_norm": 0.9041287899017334, - "learning_rate": 4.3772188544469016e-06, - "loss": 0.71802818775177, + "grad_norm": 1.2868162393569946, + "learning_rate": 2.1886094272234508e-06, + "loss": 0.8133082985877991, "step": 1086 }, { "epoch": 2.412416851441242, - "grad_norm": 0.303363174200058, - "learning_rate": 4.3612427291930915e-06, - "loss": 0.20643645524978638, + "grad_norm": 0.6888259053230286, + "learning_rate": 2.1806213645965457e-06, + "loss": 0.2685484290122986, "step": 1088 }, { "epoch": 2.4168514412416853, - "grad_norm": 2.20377779006958, - "learning_rate": 4.345281924536208e-06, - "loss": 0.7628622651100159, + "grad_norm": 3.5159847736358643, + "learning_rate": 2.172640962268104e-06, + "loss": 0.8924703598022461, "step": 1090 }, { "epoch": 2.4212860310421287, - "grad_norm": 1.9586279392242432, - "learning_rate": 4.329336655209505e-06, - "loss": 0.6242840886116028, + "grad_norm": 1.7184735536575317, + "learning_rate": 2.1646683276047525e-06, + "loss": 0.8745633959770203, "step": 1092 }, { "epoch": 2.425720620842572, - "grad_norm": 2.349154233932495, - "learning_rate": 4.31340713573723e-06, - "loss": 0.4106002748012543, + "grad_norm": 2.238689661026001, + "learning_rate": 2.156703567868615e-06, + "loss": 0.40802738070487976, "step": 1094 }, { "epoch": 2.4301552106430155, - "grad_norm": 4.811046600341797, - "learning_rate": 4.297493580431732e-06, - "loss": 0.4525107741355896, + "grad_norm": 3.6044087409973145, + "learning_rate": 2.148746790215866e-06, + "loss": 0.538159191608429, "step": 1096 }, { "epoch": 2.434589800443459, - "grad_norm": 1.139159917831421, - "learning_rate": 4.281596203390582e-06, - "loss": 0.43235841393470764, + "grad_norm": 1.0334011316299438, + "learning_rate": 2.140798101695291e-06, + "loss": 0.5009183287620544, "step": 1098 }, { "epoch": 2.4390243902439024, - "grad_norm": 2.029642343521118, - "learning_rate": 4.265715218493695e-06, - "loss": 0.6632136702537537, + "grad_norm": 3.447432518005371, + "learning_rate": 2.1328576092468476e-06, + "loss": 0.8873782157897949, "step": 1100 }, { "epoch": 2.443458980044346, - "grad_norm": 0.7346834540367126, - "learning_rate": 4.249850839400446e-06, - "loss": 0.8561656475067139, + "grad_norm": 19.567584991455078, + "learning_rate": 2.124925419700223e-06, + "loss": 0.9391869902610779, "step": 1102 }, { "epoch": 2.4478935698447892, - "grad_norm": 1.7342432737350464, - "learning_rate": 4.2340032795468e-06, - "loss": 0.3280484080314636, + "grad_norm": 1.3884862661361694, + "learning_rate": 2.1170016397734e-06, + "loss": 0.4363320469856262, "step": 1104 }, { "epoch": 2.4523281596452327, - "grad_norm": 1.2294172048568726, - "learning_rate": 4.218172752142442e-06, - "loss": 0.8532360792160034, + "grad_norm": 0.8786958456039429, + "learning_rate": 2.109086376071221e-06, + "loss": 0.8789792060852051, "step": 1106 }, { "epoch": 2.4567627494456765, - "grad_norm": 2.7901058197021484, - "learning_rate": 4.202359470167903e-06, - "loss": 0.6427351236343384, + "grad_norm": 1.4285457134246826, + "learning_rate": 2.1011797350839513e-06, + "loss": 0.7075852751731873, "step": 1108 }, { "epoch": 2.4611973392461195, - "grad_norm": 1.3930901288986206, - "learning_rate": 4.186563646371696e-06, - "loss": 0.7979812622070312, + "grad_norm": 1.5161508321762085, + "learning_rate": 2.093281823185848e-06, + "loss": 0.9112823009490967, "step": 1110 }, { "epoch": 2.4656319290465634, - "grad_norm": 0.6308720111846924, - "learning_rate": 4.170785493267463e-06, - "loss": 0.5055820345878601, + "grad_norm": 0.9512194991111755, + "learning_rate": 2.0853927466337315e-06, + "loss": 0.6104337573051453, "step": 1112 }, { "epoch": 2.470066518847007, - "grad_norm": 1.5060687065124512, - "learning_rate": 4.155025223131102e-06, - "loss": 0.7073782086372375, + "grad_norm": 2.540562391281128, + "learning_rate": 2.077512611565551e-06, + "loss": 0.8170069456100464, "step": 1114 }, { "epoch": 2.47450110864745, - "grad_norm": 1.2661110162734985, - "learning_rate": 4.139283047997919e-06, - "loss": 0.2692304849624634, + "grad_norm": 3.007708787918091, + "learning_rate": 2.0696415239989593e-06, + "loss": 0.3202068507671356, "step": 1116 }, { "epoch": 2.4789356984478936, - "grad_norm": 0.8920889496803284, - "learning_rate": 4.123559179659771e-06, - "loss": 0.6808326840400696, + "grad_norm": 1.8201653957366943, + "learning_rate": 2.0617795898298855e-06, + "loss": 0.8330751657485962, "step": 1118 }, { "epoch": 2.483370288248337, - "grad_norm": 1.1632484197616577, - "learning_rate": 4.107853829662224e-06, - "loss": 0.6864634156227112, + "grad_norm": 2.480975389480591, + "learning_rate": 2.053926914831112e-06, + "loss": 0.7699635028839111, "step": 1120 }, { "epoch": 2.4878048780487805, - "grad_norm": 1.0989418029785156, - "learning_rate": 4.0921672093017e-06, - "loss": 0.6558045148849487, + "grad_norm": 1.417551040649414, + "learning_rate": 2.04608360465085e-06, + "loss": 0.8326176404953003, "step": 1122 }, { "epoch": 2.492239467849224, - "grad_norm": 2.8047521114349365, - "learning_rate": 4.076499529622636e-06, - "loss": 0.8816790580749512, + "grad_norm": 3.1235289573669434, + "learning_rate": 2.038249764811318e-06, + "loss": 1.0917719602584839, "step": 1124 }, { "epoch": 2.4966740576496673, - "grad_norm": 0.8200728297233582, - "learning_rate": 4.0608510014146455e-06, - "loss": 0.7856003642082214, + "grad_norm": 1.4231172800064087, + "learning_rate": 2.0304255007073227e-06, + "loss": 0.924064040184021, "step": 1126 }, { "epoch": 2.5011086474501107, - "grad_norm": 1.6482559442520142, - "learning_rate": 4.045221835209684e-06, - "loss": 0.5203614234924316, + "grad_norm": 1.0138386487960815, + "learning_rate": 2.022610917604842e-06, + "loss": 0.5847758650779724, "step": 1128 }, { "epoch": 2.505543237250554, - "grad_norm": 1.5657063722610474, - "learning_rate": 4.02961224127921e-06, - "loss": 0.5360028147697449, + "grad_norm": 12.797927856445312, + "learning_rate": 2.014806120639605e-06, + "loss": 0.6378868818283081, "step": 1130 }, { "epoch": 2.5099778270509976, - "grad_norm": 1.1746268272399902, - "learning_rate": 4.014022429631368e-06, - "loss": 0.6573871970176697, + "grad_norm": 1.234104037284851, + "learning_rate": 2.007011214815684e-06, + "loss": 0.7710060477256775, "step": 1132 }, { "epoch": 2.5144124168514415, - "grad_norm": 1.0907959938049316, - "learning_rate": 3.998452610008147e-06, - "loss": 0.3955955505371094, + "grad_norm": 1.5984342098236084, + "learning_rate": 1.9992263050040737e-06, + "loss": 0.5080645680427551, "step": 1134 }, { "epoch": 2.5188470066518844, - "grad_norm": 0.861308217048645, - "learning_rate": 3.982902991882578e-06, - "loss": 0.7564470767974854, + "grad_norm": 0.7567741274833679, + "learning_rate": 1.991451495941289e-06, + "loss": 0.9128345847129822, "step": 1136 }, { "epoch": 2.5232815964523283, - "grad_norm": 0.7468408346176147, - "learning_rate": 3.967373784455896e-06, - "loss": 0.6149483919143677, + "grad_norm": 0.9700871706008911, + "learning_rate": 1.983686892227948e-06, + "loss": 0.7003995776176453, "step": 1138 }, { "epoch": 2.5277161862527717, - "grad_norm": 0.710243284702301, - "learning_rate": 3.951865196654738e-06, - "loss": 0.8047510385513306, + "grad_norm": 1.3000694513320923, + "learning_rate": 1.975932598327369e-06, + "loss": 0.8861632943153381, "step": 1140 }, { "epoch": 2.532150776053215, - "grad_norm": 2.8993804454803467, - "learning_rate": 3.936377437128329e-06, - "loss": 0.41506367921829224, + "grad_norm": 1.9337971210479736, + "learning_rate": 1.9681887185641646e-06, + "loss": 0.4743580222129822, "step": 1142 }, { "epoch": 2.5365853658536586, - "grad_norm": 0.2720961272716522, - "learning_rate": 3.920910714245679e-06, - "loss": 0.44911813735961914, + "grad_norm": 3.3465616703033447, + "learning_rate": 1.9604553571228395e-06, + "loss": 0.5982359647750854, "step": 1144 }, { "epoch": 2.541019955654102, - "grad_norm": 0.7515284419059753, - "learning_rate": 3.905465236092771e-06, - "loss": 0.7769864201545715, + "grad_norm": 0.9957235455513, + "learning_rate": 1.9527326180463855e-06, + "loss": 0.8827557563781738, "step": 1146 }, { "epoch": 2.5454545454545454, - "grad_norm": 0.7892196178436279, - "learning_rate": 3.890041210469765e-06, - "loss": 0.7185046076774597, + "grad_norm": 0.8949941992759705, + "learning_rate": 1.9450206052348823e-06, + "loss": 0.9071930646896362, "step": 1148 }, { "epoch": 2.549889135254989, - "grad_norm": 0.7974555492401123, - "learning_rate": 3.8746388448882055e-06, - "loss": 0.6233813166618347, + "grad_norm": 0.8225301504135132, + "learning_rate": 1.9373194224441028e-06, + "loss": 0.8257545232772827, "step": 1150 }, { "epoch": 2.5543237250554323, - "grad_norm": 0.8632022142410278, - "learning_rate": 3.859258346568228e-06, - "loss": 0.44528669118881226, + "grad_norm": 0.8032196760177612, + "learning_rate": 1.929629173284114e-06, + "loss": 0.5126382112503052, "step": 1152 }, { "epoch": 2.5587583148558757, - "grad_norm": 0.21908880770206451, - "learning_rate": 3.843899922435767e-06, - "loss": 0.2759099304676056, + "grad_norm": 0.5240678191184998, + "learning_rate": 1.9219499612178836e-06, + "loss": 0.3800676763057709, "step": 1154 }, { "epoch": 2.5631929046563195, - "grad_norm": 0.6866997480392456, - "learning_rate": 3.8285637791197815e-06, - "loss": 0.5508578419685364, + "grad_norm": 0.8303525447845459, + "learning_rate": 1.9142818895598908e-06, + "loss": 0.49963274598121643, "step": 1156 }, { "epoch": 2.5676274944567625, - "grad_norm": 2.519130229949951, - "learning_rate": 3.8132501229494635e-06, - "loss": 0.5549399852752686, + "grad_norm": 3.7417755126953125, + "learning_rate": 1.9066250614747317e-06, + "loss": 0.4743386507034302, "step": 1158 }, { "epoch": 2.5720620842572064, - "grad_norm": 2.8596558570861816, - "learning_rate": 3.7979591599514696e-06, - "loss": 0.6041897535324097, + "grad_norm": 2.17212176322937, + "learning_rate": 1.8989795799757348e-06, + "loss": 0.7726760506629944, "step": 1160 }, { "epoch": 2.57649667405765, - "grad_norm": 0.7755621671676636, - "learning_rate": 3.782691095847151e-06, - "loss": 0.7620603442192078, + "grad_norm": 0.7126960158348083, + "learning_rate": 1.8913455479235754e-06, + "loss": 0.9239242076873779, "step": 1162 }, { "epoch": 2.5809312638580932, - "grad_norm": 0.9750792980194092, - "learning_rate": 3.767446136049775e-06, - "loss": 0.939260721206665, + "grad_norm": 0.8875038027763367, + "learning_rate": 1.8837230680248874e-06, + "loss": 0.9501176476478577, "step": 1164 }, { "epoch": 2.5853658536585367, - "grad_norm": 1.4371932744979858, - "learning_rate": 3.752224485661775e-06, - "loss": 0.4603317975997925, + "grad_norm": 0.9675799012184143, + "learning_rate": 1.8761122428308875e-06, + "loss": 0.6145892143249512, "step": 1166 }, { "epoch": 2.58980044345898, - "grad_norm": 1.9136115312576294, - "learning_rate": 3.7370263494719805e-06, - "loss": 0.9075461626052856, + "grad_norm": 0.7325379252433777, + "learning_rate": 1.8685131747359902e-06, + "loss": 0.8216499090194702, "step": 1168 }, { "epoch": 2.5942350332594235, - "grad_norm": 1.7198249101638794, - "learning_rate": 3.721851931952869e-06, - "loss": 0.8085947632789612, + "grad_norm": 5.725553035736084, + "learning_rate": 1.8609259659764345e-06, + "loss": 0.8852095007896423, "step": 1170 }, { "epoch": 2.598669623059867, - "grad_norm": 2.1932644844055176, - "learning_rate": 3.706701437257808e-06, - "loss": 0.28652137517929077, + "grad_norm": 3.2101731300354004, + "learning_rate": 1.853350718628904e-06, + "loss": 0.42097488045692444, "step": 1172 }, { "epoch": 2.6031042128603104, - "grad_norm": 0.7438820600509644, - "learning_rate": 3.691575069218314e-06, - "loss": 0.5113945007324219, + "grad_norm": 2.247046947479248, + "learning_rate": 1.845787534609157e-06, + "loss": 0.6113148331642151, "step": 1174 }, { "epoch": 2.6075388026607538, - "grad_norm": 0.860270619392395, - "learning_rate": 3.676473031341313e-06, - "loss": 0.6308207511901855, + "grad_norm": 4.695516586303711, + "learning_rate": 1.8382365156706566e-06, + "loss": 0.6265615224838257, "step": 1176 }, { "epoch": 2.611973392461197, - "grad_norm": 1.0259983539581299, - "learning_rate": 3.661395526806395e-06, - "loss": 0.4299200773239136, + "grad_norm": 1.2384305000305176, + "learning_rate": 1.8306977634031976e-06, + "loss": 0.627465546131134, "step": 1178 }, { "epoch": 2.6164079822616406, - "grad_norm": 2.737011194229126, - "learning_rate": 3.6463427584630806e-06, - "loss": 0.5992394089698792, + "grad_norm": 9.685011863708496, + "learning_rate": 1.8231713792315403e-06, + "loss": 0.7403496503829956, "step": 1180 }, { "epoch": 2.6208425720620845, - "grad_norm": 0.8241181373596191, - "learning_rate": 3.631314928828099e-06, - "loss": 0.7023974061012268, + "grad_norm": 0.927742063999176, + "learning_rate": 1.8156574644140495e-06, + "loss": 0.85722416639328, "step": 1182 }, { "epoch": 2.6252771618625275, - "grad_norm": 0.6770392656326294, - "learning_rate": 3.616312240082659e-06, - "loss": 0.7103127241134644, + "grad_norm": 1.9018654823303223, + "learning_rate": 1.8081561200413295e-06, + "loss": 0.853569507598877, "step": 1184 }, { "epoch": 2.6297117516629713, - "grad_norm": 1.7923256158828735, - "learning_rate": 3.601334894069728e-06, - "loss": 0.8160955309867859, + "grad_norm": 0.8076664805412292, + "learning_rate": 1.800667447034864e-06, + "loss": 0.8907285332679749, "step": 1186 }, { "epoch": 2.6341463414634148, - "grad_norm": 2.411703586578369, - "learning_rate": 3.5863830922913147e-06, - "loss": 0.8449252247810364, + "grad_norm": 2.0217840671539307, + "learning_rate": 1.7931915461456573e-06, + "loss": 0.9535523653030396, "step": 1188 }, { "epoch": 2.638580931263858, - "grad_norm": 0.7744673490524292, - "learning_rate": 3.5714570359057676e-06, - "loss": 0.4491943418979645, + "grad_norm": 0.8926487565040588, + "learning_rate": 1.7857285179528838e-06, + "loss": 0.489310622215271, "step": 1190 }, { "epoch": 2.6430155210643016, - "grad_norm": 2.5860702991485596, - "learning_rate": 3.556556925725061e-06, - "loss": 0.5912431478500366, + "grad_norm": 3.029690742492676, + "learning_rate": 1.7782784628625305e-06, + "loss": 0.6923867464065552, "step": 1192 }, { "epoch": 2.647450110864745, - "grad_norm": 3.4122977256774902, - "learning_rate": 3.5416829622120875e-06, - "loss": 0.5446506142616272, + "grad_norm": 2.82142972946167, + "learning_rate": 1.7708414811060437e-06, + "loss": 0.582843542098999, "step": 1194 }, { "epoch": 2.6518847006651884, - "grad_norm": 1.6620879173278809, - "learning_rate": 3.526835345477978e-06, - "loss": 0.6308864951133728, + "grad_norm": 1.2220163345336914, + "learning_rate": 1.763417672738989e-06, + "loss": 0.7117786407470703, "step": 1196 }, { "epoch": 2.656319290465632, - "grad_norm": 0.7548476457595825, - "learning_rate": 3.5120142752793907e-06, - "loss": 0.21907749772071838, + "grad_norm": 0.4822169244289398, + "learning_rate": 1.7560071376396953e-06, + "loss": 0.2628706693649292, "step": 1198 }, { "epoch": 2.6607538802660753, - "grad_norm": 3.578019380569458, - "learning_rate": 3.4972199510158393e-06, - "loss": 0.9215325117111206, + "grad_norm": 1.2988340854644775, + "learning_rate": 1.7486099755079197e-06, + "loss": 0.9527356624603271, "step": 1200 }, { "epoch": 2.6651884700665187, - "grad_norm": 0.7092931270599365, - "learning_rate": 3.4824525717269975e-06, - "loss": 0.8297696709632874, + "grad_norm": 1.3238375186920166, + "learning_rate": 1.7412262858634987e-06, + "loss": 0.8897156119346619, "step": 1202 }, { "epoch": 2.6696230598669626, - "grad_norm": 0.9896040558815002, - "learning_rate": 3.4677123360900342e-06, - "loss": 0.38038522005081177, + "grad_norm": 1.0240246057510376, + "learning_rate": 1.7338561680450171e-06, + "loss": 0.46362125873565674, "step": 1204 }, { "epoch": 2.6740576496674056, - "grad_norm": 2.033034563064575, - "learning_rate": 3.4529994424169233e-06, - "loss": 0.5968571901321411, + "grad_norm": 1.75541353225708, + "learning_rate": 1.7264997212084616e-06, + "loss": 0.7587183713912964, "step": 1206 }, { "epoch": 2.6784922394678494, - "grad_norm": 2.7922661304473877, - "learning_rate": 3.4383140886517953e-06, - "loss": 0.7829728722572327, + "grad_norm": 1.5657193660736084, + "learning_rate": 1.7191570443258976e-06, + "loss": 0.9052755832672119, "step": 1208 }, { "epoch": 2.682926829268293, - "grad_norm": 1.3708895444869995, - "learning_rate": 3.423656472368262e-06, - "loss": 0.4674299359321594, + "grad_norm": 1.5885916948318481, + "learning_rate": 1.711828236184131e-06, + "loss": 0.4789600968360901, "step": 1210 }, { "epoch": 2.6873614190687363, - "grad_norm": 1.8650954961776733, - "learning_rate": 3.409026790766756e-06, - "loss": 0.25274747610092163, + "grad_norm": 71.83616638183594, + "learning_rate": 1.704513395383378e-06, + "loss": 0.2857840955257416, "step": 1212 }, { "epoch": 2.6917960088691797, - "grad_norm": 2.8033974170684814, - "learning_rate": 3.394425240671891e-06, - "loss": 0.4370385706424713, + "grad_norm": 1.3978385925292969, + "learning_rate": 1.6972126203359454e-06, + "loss": 0.5244172215461731, "step": 1214 }, { "epoch": 2.696230598669623, - "grad_norm": 2.1162285804748535, - "learning_rate": 3.379852018529799e-06, - "loss": 0.5205950736999512, + "grad_norm": 6.41267204284668, + "learning_rate": 1.6899260092648995e-06, + "loss": 0.5827531814575195, "step": 1216 }, { "epoch": 2.7006651884700665, - "grad_norm": 1.2731273174285889, - "learning_rate": 3.3653073204054942e-06, - "loss": 0.5236338973045349, + "grad_norm": 3.540893316268921, + "learning_rate": 1.6826536602027471e-06, + "loss": 0.5931687355041504, "step": 1218 }, { "epoch": 2.70509977827051, - "grad_norm": 1.8009387254714966, - "learning_rate": 3.3507913419802403e-06, - "loss": 0.7941880822181702, + "grad_norm": 1.408761978149414, + "learning_rate": 1.6753956709901202e-06, + "loss": 0.9201699495315552, "step": 1220 }, { "epoch": 2.7095343680709534, - "grad_norm": 1.7554905414581299, - "learning_rate": 3.336304278548903e-06, - "loss": 0.7005539536476135, + "grad_norm": 3.0229973793029785, + "learning_rate": 1.6681521392744515e-06, + "loss": 0.7630390524864197, "step": 1222 }, { "epoch": 2.713968957871397, - "grad_norm": 1.5743012428283691, - "learning_rate": 3.321846325017342e-06, - "loss": 0.7519204616546631, + "grad_norm": 3.136180877685547, + "learning_rate": 1.660923162508671e-06, + "loss": 0.8200241923332214, "step": 1224 }, { "epoch": 2.7184035476718402, - "grad_norm": 3.2630934715270996, - "learning_rate": 3.3074176758997744e-06, - "loss": 0.37882906198501587, + "grad_norm": 5.015387535095215, + "learning_rate": 1.6537088379498872e-06, + "loss": 0.41038981080055237, "step": 1226 }, { "epoch": 2.7228381374722836, - "grad_norm": 0.9136330485343933, - "learning_rate": 3.2930185253161574e-06, - "loss": 0.8159320950508118, + "grad_norm": 0.8245871067047119, + "learning_rate": 1.6465092626580787e-06, + "loss": 0.967170238494873, "step": 1228 }, { "epoch": 2.7272727272727275, - "grad_norm": 1.3966491222381592, - "learning_rate": 3.2786490669895883e-06, - "loss": 0.6657707095146179, + "grad_norm": 1.9440391063690186, + "learning_rate": 1.6393245334947942e-06, + "loss": 0.6494452357292175, "step": 1230 }, { "epoch": 2.7317073170731705, - "grad_norm": 0.7242875099182129, - "learning_rate": 3.2643094942436865e-06, - "loss": 0.7183330655097961, + "grad_norm": 0.9693624973297119, + "learning_rate": 1.6321547471218432e-06, + "loss": 0.8679260611534119, "step": 1232 }, { "epoch": 2.7361419068736144, - "grad_norm": 2.514469623565674, - "learning_rate": 3.2500000000000015e-06, - "loss": 0.4106196463108063, + "grad_norm": 6.900158882141113, + "learning_rate": 1.6250000000000007e-06, + "loss": 0.5012823343276978, "step": 1234 }, { "epoch": 2.740576496674058, - "grad_norm": 1.027852177619934, - "learning_rate": 3.2357207767754063e-06, - "loss": 0.6766651272773743, + "grad_norm": 0.9100331664085388, + "learning_rate": 1.6178603883877032e-06, + "loss": 0.8535019755363464, "step": 1236 }, { "epoch": 2.745011086474501, - "grad_norm": 1.0152777433395386, - "learning_rate": 3.221472016679521e-06, - "loss": 0.47056448459625244, + "grad_norm": 0.8146524429321289, + "learning_rate": 1.6107360083397604e-06, + "loss": 0.5447841286659241, "step": 1238 }, { "epoch": 2.7494456762749446, - "grad_norm": 0.7261103987693787, - "learning_rate": 3.2072539114121188e-06, - "loss": 0.46467670798301697, + "grad_norm": 0.8178804516792297, + "learning_rate": 1.6036269557060594e-06, + "loss": 0.5961492657661438, "step": 1240 }, { "epoch": 2.753880266075388, - "grad_norm": 0.899706244468689, - "learning_rate": 3.193066652260547e-06, - "loss": 0.8382993340492249, + "grad_norm": 1.5546802282333374, + "learning_rate": 1.5965333261302735e-06, + "loss": 0.8809974193572998, "step": 1242 }, { "epoch": 2.7583148558758315, - "grad_norm": 0.9996641874313354, - "learning_rate": 3.1789104300971603e-06, - "loss": 0.7458208203315735, + "grad_norm": 2.939823865890503, + "learning_rate": 1.5894552150485801e-06, + "loss": 0.9211047291755676, "step": 1244 }, { "epoch": 2.762749445676275, - "grad_norm": 0.36415642499923706, - "learning_rate": 3.164785435376745e-06, - "loss": 0.23638926446437836, + "grad_norm": 0.6600274443626404, + "learning_rate": 1.5823927176883725e-06, + "loss": 0.3387180268764496, "step": 1246 }, { "epoch": 2.7671840354767183, - "grad_norm": 1.592871904373169, - "learning_rate": 3.1506918581339583e-06, - "loss": 0.47278061509132385, + "grad_norm": 10.310997009277344, + "learning_rate": 1.5753459290669792e-06, + "loss": 0.6843352317810059, "step": 1248 }, { "epoch": 2.7716186252771617, - "grad_norm": 1.0408028364181519, - "learning_rate": 3.136629887980781e-06, - "loss": 0.5122473835945129, + "grad_norm": 1.5545812845230103, + "learning_rate": 1.5683149439903905e-06, + "loss": 0.5795391201972961, "step": 1250 }, { "epoch": 2.776053215077605, - "grad_norm": 0.9867532849311829, - "learning_rate": 3.122599714103949e-06, - "loss": 0.8818725347518921, + "grad_norm": 0.9581854939460754, + "learning_rate": 1.5612998570519746e-06, + "loss": 0.9656073451042175, "step": 1252 }, { "epoch": 2.7804878048780486, - "grad_norm": 1.422898769378662, - "learning_rate": 3.1086015252624257e-06, - "loss": 0.8071056604385376, + "grad_norm": 1.066094160079956, + "learning_rate": 1.5543007626312129e-06, + "loss": 0.9749020338058472, "step": 1254 }, { "epoch": 2.7849223946784925, - "grad_norm": 1.5934648513793945, - "learning_rate": 3.0946355097848535e-06, - "loss": 0.7926267385482788, + "grad_norm": 2.5097603797912598, + "learning_rate": 1.5473177548924267e-06, + "loss": 0.9311152696609497, "step": 1256 }, { "epoch": 2.7893569844789354, - "grad_norm": 0.5067124962806702, - "learning_rate": 3.0807018555670153e-06, - "loss": 0.13775405287742615, + "grad_norm": 1.2054038047790527, + "learning_rate": 1.5403509277835077e-06, + "loss": 0.2651883065700531, "step": 1258 }, { "epoch": 2.7937915742793793, - "grad_norm": 1.111436128616333, - "learning_rate": 3.0668007500693216e-06, - "loss": 0.7149184346199036, + "grad_norm": 1.640947699546814, + "learning_rate": 1.5334003750346608e-06, + "loss": 0.873029887676239, "step": 1260 }, { "epoch": 2.7982261640798227, - "grad_norm": 2.614287853240967, - "learning_rate": 3.0529323803142697e-06, - "loss": 0.5375425815582275, + "grad_norm": 3.502584218978882, + "learning_rate": 1.5264661901571349e-06, + "loss": 0.6039642691612244, "step": 1262 }, { "epoch": 2.802660753880266, - "grad_norm": 1.180129885673523, - "learning_rate": 3.0390969328839464e-06, - "loss": 0.48145541548728943, + "grad_norm": 0.6926102042198181, + "learning_rate": 1.5195484664419732e-06, + "loss": 0.485037237405777, "step": 1264 }, { "epoch": 2.8070953436807096, - "grad_norm": 1.0282138586044312, - "learning_rate": 3.0252945939175004e-06, - "loss": 0.7159358859062195, + "grad_norm": 1.0856491327285767, + "learning_rate": 1.5126472969587502e-06, + "loss": 0.810798704624176, "step": 1266 }, { "epoch": 2.811529933481153, - "grad_norm": 3.18182635307312, - "learning_rate": 3.0115255491086537e-06, - "loss": 0.8956208229064941, + "grad_norm": 5.212289810180664, + "learning_rate": 1.5057627745543269e-06, + "loss": 0.8525525331497192, "step": 1268 }, { "epoch": 2.8159645232815964, - "grad_norm": 0.7246714234352112, - "learning_rate": 2.9977899837031895e-06, - "loss": 0.7317441701889038, + "grad_norm": 0.6678111553192139, + "learning_rate": 1.4988949918515947e-06, + "loss": 0.8990007638931274, "step": 1270 }, { "epoch": 2.82039911308204, - "grad_norm": 1.5273503065109253, - "learning_rate": 2.984088082496469e-06, - "loss": 0.2754761874675751, + "grad_norm": 6.717563152313232, + "learning_rate": 1.4920440412482345e-06, + "loss": 0.3461105525493622, "step": 1272 }, { "epoch": 2.8248337028824833, - "grad_norm": 0.9987423419952393, - "learning_rate": 2.970420029830946e-06, - "loss": 0.4880000650882721, + "grad_norm": 0.8497708439826965, + "learning_rate": 1.485210014915473e-06, + "loss": 0.5556339025497437, "step": 1274 }, { "epoch": 2.8292682926829267, - "grad_norm": 0.7692528367042542, - "learning_rate": 2.9567860095936775e-06, - "loss": 0.9671233892440796, + "grad_norm": 1.1711769104003906, + "learning_rate": 1.4783930047968388e-06, + "loss": 0.9120653867721558, "step": 1276 }, { "epoch": 2.8337028824833705, - "grad_norm": 0.9331156015396118, - "learning_rate": 2.9431862052138545e-06, - "loss": 0.8612651824951172, + "grad_norm": 1.5863559246063232, + "learning_rate": 1.4715931026069273e-06, + "loss": 0.8935397267341614, "step": 1278 }, { "epoch": 2.8381374722838135, - "grad_norm": 0.9668262600898743, - "learning_rate": 2.929620799660343e-06, - "loss": 0.3867911994457245, + "grad_norm": 1.5529717206954956, + "learning_rate": 1.4648103998301716e-06, + "loss": 0.5035147070884705, "step": 1280 }, { "epoch": 2.8425720620842574, - "grad_norm": 2.516414165496826, - "learning_rate": 2.916089975439207e-06, - "loss": 0.47361209988594055, + "grad_norm": 5.072021484375, + "learning_rate": 1.4580449877196035e-06, + "loss": 0.6653071045875549, "step": 1282 }, { "epoch": 2.847006651884701, - "grad_norm": 0.9498918056488037, - "learning_rate": 2.9025939145912655e-06, - "loss": 0.4672809839248657, + "grad_norm": 0.8926497101783752, + "learning_rate": 1.4512969572956328e-06, + "loss": 0.6203804612159729, "step": 1284 }, { "epoch": 2.8514412416851442, - "grad_norm": 0.8674972653388977, - "learning_rate": 2.8891327986896345e-06, - "loss": 0.8502570390701294, + "grad_norm": 4.661787986755371, + "learning_rate": 1.4445663993448173e-06, + "loss": 0.9532814621925354, "step": 1286 }, { "epoch": 2.8558758314855877, - "grad_norm": 2.823939085006714, - "learning_rate": 2.875706808837292e-06, - "loss": 0.1998748630285263, + "grad_norm": 3.233835458755493, + "learning_rate": 1.437853404418646e-06, + "loss": 0.32333725690841675, "step": 1288 }, { "epoch": 2.860310421286031, - "grad_norm": 1.9392937421798706, - "learning_rate": 2.862316125664636e-06, - "loss": 0.8196284770965576, + "grad_norm": 1.3348687887191772, + "learning_rate": 1.431158062832318e-06, + "loss": 0.8358311653137207, "step": 1290 }, { "epoch": 2.8647450110864745, - "grad_norm": 8.883353233337402, - "learning_rate": 2.848960929327053e-06, - "loss": 0.7396450042724609, + "grad_norm": 2.6443517208099365, + "learning_rate": 1.4244804646635266e-06, + "loss": 0.8612353205680847, "step": 1292 }, { "epoch": 2.869179600886918, - "grad_norm": 1.4906433820724487, - "learning_rate": 2.8356413995025044e-06, - "loss": 0.717079758644104, + "grad_norm": 0.6927613615989685, + "learning_rate": 1.4178206997512522e-06, + "loss": 0.8790969252586365, "step": 1294 }, { "epoch": 2.8736141906873613, - "grad_norm": 0.8910164833068848, - "learning_rate": 2.8223577153890934e-06, - "loss": 0.7069391012191772, + "grad_norm": 0.8676153421401978, + "learning_rate": 1.4111788576945467e-06, + "loss": 0.8564894795417786, "step": 1296 }, { "epoch": 2.8780487804878048, - "grad_norm": 3.354166030883789, - "learning_rate": 2.8091100557026702e-06, - "loss": 0.4838540852069855, + "grad_norm": 14.382169723510742, + "learning_rate": 1.4045550278513351e-06, + "loss": 0.5821776986122131, "step": 1298 }, { "epoch": 2.882483370288248, - "grad_norm": 0.841211199760437, - "learning_rate": 2.795898598674415e-06, - "loss": 0.6780248284339905, + "grad_norm": 0.7985261678695679, + "learning_rate": 1.3979492993372074e-06, + "loss": 0.8649250268936157, "step": 1300 }, { "epoch": 2.8869179600886916, - "grad_norm": 1.9939450025558472, - "learning_rate": 2.782723522048444e-06, - "loss": 0.2901532053947449, + "grad_norm": 1.3638763427734375, + "learning_rate": 1.391361761024222e-06, + "loss": 0.23123106360435486, "step": 1302 }, { "epoch": 2.8913525498891355, - "grad_norm": 1.6043404340744019, - "learning_rate": 2.7695850030794293e-06, - "loss": 0.7313271760940552, + "grad_norm": 1.07351815700531, + "learning_rate": 1.3847925015397146e-06, + "loss": 0.8127425909042358, "step": 1304 }, { "epoch": 2.8957871396895785, - "grad_norm": 2.5174219608306885, - "learning_rate": 2.7564832185301915e-06, - "loss": 0.6357086896896362, + "grad_norm": 1.6091636419296265, + "learning_rate": 1.3782416092650957e-06, + "loss": 0.9045838117599487, "step": 1306 }, { "epoch": 2.9002217294900223, - "grad_norm": 0.29980650544166565, - "learning_rate": 2.7434183446693397e-06, - "loss": 0.2599072754383087, + "grad_norm": 0.37830138206481934, + "learning_rate": 1.3717091723346699e-06, + "loss": 0.36935049295425415, "step": 1308 }, { "epoch": 2.9046563192904657, - "grad_norm": 1.3467071056365967, - "learning_rate": 2.730390557268897e-06, - "loss": 0.33962565660476685, + "grad_norm": 0.7938516736030579, + "learning_rate": 1.3651952786344485e-06, + "loss": 0.44842761754989624, "step": 1310 }, { "epoch": 2.909090909090909, - "grad_norm": 0.9841307401657104, - "learning_rate": 2.7174000316019277e-06, - "loss": 0.6833657622337341, + "grad_norm": 1.4822460412979126, + "learning_rate": 1.3587000158009638e-06, + "loss": 0.8571957349777222, "step": 1312 }, { "epoch": 2.9135254988913526, - "grad_norm": 1.2662699222564697, - "learning_rate": 2.704446942440191e-06, - "loss": 0.7377205491065979, + "grad_norm": 0.6906547546386719, + "learning_rate": 1.3522234712200954e-06, + "loss": 0.8675597906112671, "step": 1314 }, { "epoch": 2.917960088691796, - "grad_norm": 2.137340784072876, - "learning_rate": 2.6915314640517755e-06, - "loss": 0.6367099285125732, + "grad_norm": 2.601785898208618, + "learning_rate": 1.3457657320258878e-06, + "loss": 0.7463323473930359, "step": 1316 }, { "epoch": 2.9223946784922394, - "grad_norm": 1.5056116580963135, - "learning_rate": 2.6786537701987703e-06, - "loss": 0.8529772162437439, + "grad_norm": 2.53776216506958, + "learning_rate": 1.3393268850993852e-06, + "loss": 0.8853073716163635, "step": 1318 }, { "epoch": 2.926829268292683, - "grad_norm": 1.2501715421676636, - "learning_rate": 2.665814034134916e-06, - "loss": 0.5121623873710632, + "grad_norm": 2.7623400688171387, + "learning_rate": 1.332907017067458e-06, + "loss": 0.5930368304252625, "step": 1320 }, { "epoch": 2.9312638580931263, - "grad_norm": 1.0894711017608643, - "learning_rate": 2.6530124286032755e-06, - "loss": 0.7275009155273438, + "grad_norm": 1.027145504951477, + "learning_rate": 1.3265062143016378e-06, + "loss": 0.8478403687477112, "step": 1322 }, { "epoch": 2.9356984478935697, - "grad_norm": 3.4206230640411377, - "learning_rate": 2.640249125833915e-06, - "loss": 0.7550503611564636, + "grad_norm": 0.7706930041313171, + "learning_rate": 1.3201245629169574e-06, + "loss": 0.8749006390571594, "step": 1324 }, { "epoch": 2.9401330376940136, - "grad_norm": 0.8373146653175354, - "learning_rate": 2.6275242975415804e-06, - "loss": 0.8755195736885071, + "grad_norm": 1.0428484678268433, + "learning_rate": 1.3137621487707902e-06, + "loss": 0.9431027173995972, "step": 1326 }, { "epoch": 2.9445676274944566, - "grad_norm": 1.1842820644378662, - "learning_rate": 2.614838114923394e-06, - "loss": 0.7242047786712646, + "grad_norm": 1.0761144161224365, + "learning_rate": 1.307419057461697e-06, + "loss": 1.0074454545974731, "step": 1328 }, { "epoch": 2.9490022172949004, - "grad_norm": 0.9061135053634644, - "learning_rate": 2.6021907486565447e-06, - "loss": 0.7160645127296448, + "grad_norm": 0.6524636149406433, + "learning_rate": 1.3010953743282724e-06, + "loss": 0.844873309135437, "step": 1330 }, { "epoch": 2.953436807095344, - "grad_norm": 1.9043796062469482, - "learning_rate": 2.589582368895992e-06, - "loss": 0.7991137504577637, + "grad_norm": 1.394280195236206, + "learning_rate": 1.294791184447996e-06, + "loss": 0.9639885425567627, "step": 1332 }, { "epoch": 2.9578713968957873, - "grad_norm": 0.7662011384963989, - "learning_rate": 2.577013145272185e-06, - "loss": 0.42025405168533325, + "grad_norm": 0.7140282988548279, + "learning_rate": 1.2885065726360925e-06, + "loss": 0.48626944422721863, "step": 1334 }, { "epoch": 2.9623059866962307, - "grad_norm": 0.9441994428634644, - "learning_rate": 2.564483246888772e-06, - "loss": 0.6747118830680847, + "grad_norm": 0.707069456577301, + "learning_rate": 1.282241623444386e-06, + "loss": 0.8297737836837769, "step": 1336 }, { "epoch": 2.966740576496674, - "grad_norm": 1.0354760885238647, - "learning_rate": 2.5519928423203266e-06, - "loss": 0.7479525208473206, + "grad_norm": 0.9522484540939331, + "learning_rate": 1.2759964211601633e-06, + "loss": 0.8412789106369019, "step": 1338 }, { "epoch": 2.9711751662971175, - "grad_norm": 0.9520703554153442, - "learning_rate": 2.539542099610084e-06, - "loss": 0.5493751168251038, + "grad_norm": 0.9063006043434143, + "learning_rate": 1.269771049805042e-06, + "loss": 0.7291353940963745, "step": 1340 }, { "epoch": 2.975609756097561, - "grad_norm": 2.1422736644744873, - "learning_rate": 2.5271311862676727e-06, - "loss": 0.5847128033638, + "grad_norm": 2.06269907951355, + "learning_rate": 1.2635655931338364e-06, + "loss": 0.5531010627746582, "step": 1342 }, { "epoch": 2.9800443458980044, - "grad_norm": 0.5576608777046204, - "learning_rate": 2.514760269266871e-06, - "loss": 0.1638008952140808, + "grad_norm": 0.36816778779029846, + "learning_rate": 1.2573801346334355e-06, + "loss": 0.16619227826595306, "step": 1344 }, { "epoch": 2.984478935698448, - "grad_norm": 1.6136894226074219, - "learning_rate": 2.50242951504335e-06, - "loss": 0.5379747748374939, + "grad_norm": 0.8877193331718445, + "learning_rate": 1.251214757521675e-06, + "loss": 0.6579598784446716, "step": 1346 }, { "epoch": 2.988913525498891, - "grad_norm": 0.31877100467681885, - "learning_rate": 2.490139089492443e-06, - "loss": 0.42808306217193604, + "grad_norm": 0.42023998498916626, + "learning_rate": 1.2450695447462214e-06, + "loss": 0.5335787534713745, "step": 1348 }, { "epoch": 2.9933481152993346, - "grad_norm": 2.9408724308013916, - "learning_rate": 2.4778891579669067e-06, - "loss": 0.42848342657089233, + "grad_norm": 1.5073217153549194, + "learning_rate": 1.2389445789834534e-06, + "loss": 0.5976958870887756, "step": 1350 }, { "epoch": 2.9977827050997785, - "grad_norm": 1.016264796257019, - "learning_rate": 2.4656798852747023e-06, - "loss": 0.5269397497177124, + "grad_norm": 2.1122334003448486, + "learning_rate": 1.2328399426373511e-06, + "loss": 0.6611562967300415, "step": 1352 }, { "epoch": 3.002217294900222, - "grad_norm": 1.1126813888549805, - "learning_rate": 2.453511435676777e-06, - "loss": 0.557039737701416, + "grad_norm": 0.7240795493125916, + "learning_rate": 1.2267557178383886e-06, + "loss": 0.783865749835968, "step": 1354 }, { "epoch": 3.0066518847006654, - "grad_norm": 1.4688986539840698, - "learning_rate": 2.441383972884848e-06, - "loss": 0.2774271070957184, + "grad_norm": 0.8436267375946045, + "learning_rate": 1.220691986442424e-06, + "loss": 0.394011527299881, "step": 1356 }, { "epoch": 3.011086474501109, - "grad_norm": 1.5381768941879272, - "learning_rate": 2.4292976600592095e-06, - "loss": 0.45656082034111023, + "grad_norm": 2.886348009109497, + "learning_rate": 1.2146488300296047e-06, + "loss": 0.6734086871147156, "step": 1358 }, { "epoch": 3.015521064301552, - "grad_norm": 0.8177427649497986, - "learning_rate": 2.4172526598065304e-06, - "loss": 0.5471428036689758, + "grad_norm": 0.9459258913993835, + "learning_rate": 1.2086263299032652e-06, + "loss": 0.7187186479568481, "step": 1360 }, { "epoch": 3.0199556541019956, - "grad_norm": 1.1968615055084229, - "learning_rate": 2.4052491341776686e-06, - "loss": 0.3854435086250305, + "grad_norm": 0.9276299476623535, + "learning_rate": 1.2026245670888343e-06, + "loss": 0.6644557118415833, "step": 1362 }, { "epoch": 3.024390243902439, - "grad_norm": 0.8419128656387329, - "learning_rate": 2.393287244665494e-06, - "loss": 0.4754073917865753, + "grad_norm": 1.1041805744171143, + "learning_rate": 1.196643622332747e-06, + "loss": 0.78998863697052, "step": 1364 }, { "epoch": 3.0288248337028825, - "grad_norm": 0.12323111295700073, - "learning_rate": 2.3813671522027094e-06, - "loss": 0.22163067758083344, + "grad_norm": 0.25377586483955383, + "learning_rate": 1.1906835761013547e-06, + "loss": 0.3418872356414795, "step": 1366 }, { "epoch": 3.033259423503326, - "grad_norm": 1.2366307973861694, - "learning_rate": 2.369489017159692e-06, - "loss": 0.20418155193328857, + "grad_norm": 1.3526285886764526, + "learning_rate": 1.184744508579846e-06, + "loss": 0.423952579498291, "step": 1368 }, { "epoch": 3.0376940133037693, - "grad_norm": 2.100736141204834, - "learning_rate": 2.357652999342334e-06, - "loss": 0.2910291850566864, + "grad_norm": 1.1649706363677979, + "learning_rate": 1.178826499671167e-06, + "loss": 0.4472143352031708, "step": 1370 }, { "epoch": 3.0421286031042127, - "grad_norm": 4.004879951477051, - "learning_rate": 2.345859257989886e-06, - "loss": 0.47884419560432434, + "grad_norm": 1.4715780019760132, + "learning_rate": 1.172929628994943e-06, + "loss": 0.8062811493873596, "step": 1372 }, { "epoch": 3.046563192904656, - "grad_norm": 1.1280056238174438, - "learning_rate": 2.334107951772826e-06, - "loss": 0.15896357595920563, + "grad_norm": 1.3881590366363525, + "learning_rate": 1.167053975886413e-06, + "loss": 0.3142853379249573, "step": 1374 }, { "epoch": 3.0509977827050996, - "grad_norm": 3.1216893196105957, - "learning_rate": 2.3223992387907137e-06, - "loss": 0.23939193785190582, + "grad_norm": 3.416389226913452, + "learning_rate": 1.1611996193953569e-06, + "loss": 0.43006041646003723, "step": 1376 }, { "epoch": 3.0554323725055434, - "grad_norm": 1.0042521953582764, - "learning_rate": 2.3107332765700733e-06, - "loss": 0.1343676894903183, + "grad_norm": 1.048660159111023, + "learning_rate": 1.1553666382850366e-06, + "loss": 0.3392511308193207, "step": 1378 }, { "epoch": 3.059866962305987, - "grad_norm": 0.09163781255483627, - "learning_rate": 2.2991102220622647e-06, - "loss": 0.16645547747612, + "grad_norm": 0.3364250659942627, + "learning_rate": 1.1495551110311324e-06, + "loss": 0.2998298704624176, "step": 1380 }, { "epoch": 3.0643015521064303, - "grad_norm": 2.4105827808380127, - "learning_rate": 2.2875302316413807e-06, - "loss": 0.1257064789533615, + "grad_norm": 6.17432165145874, + "learning_rate": 1.1437651158206904e-06, + "loss": 0.17229698598384857, "step": 1382 }, { "epoch": 3.0687361419068737, - "grad_norm": 1.94559645652771, - "learning_rate": 2.275993461102138e-06, - "loss": 0.3437502384185791, + "grad_norm": 1.7204508781433105, + "learning_rate": 1.137996730551069e-06, + "loss": 0.4303905963897705, "step": 1384 }, { "epoch": 3.073170731707317, - "grad_norm": 0.2491624653339386, - "learning_rate": 2.2645000656577793e-06, - "loss": 0.1117410808801651, + "grad_norm": 0.39900439977645874, + "learning_rate": 1.1322500328288897e-06, + "loss": 0.2888520658016205, "step": 1386 }, { "epoch": 3.0776053215077606, - "grad_norm": 0.1076858639717102, - "learning_rate": 2.2530501999379932e-06, - "loss": 0.17695897817611694, + "grad_norm": 0.15358883142471313, + "learning_rate": 1.1265250999689966e-06, + "loss": 0.32026374340057373, "step": 1388 }, { "epoch": 3.082039911308204, - "grad_norm": 0.8501622080802917, - "learning_rate": 2.2416440179868236e-06, - "loss": 0.3754989504814148, + "grad_norm": 0.947392463684082, + "learning_rate": 1.1208220089934118e-06, + "loss": 0.6729350090026855, "step": 1390 }, { "epoch": 3.0864745011086474, - "grad_norm": 1.7676734924316406, - "learning_rate": 2.230281673260605e-06, - "loss": 0.15238375961780548, + "grad_norm": 3.0764870643615723, + "learning_rate": 1.1151408366303024e-06, + "loss": 0.325135737657547, "step": 1392 }, { "epoch": 3.090909090909091, - "grad_norm": 6.170825958251953, - "learning_rate": 2.218963318625895e-06, - "loss": 0.22011463344097137, + "grad_norm": 4.003721237182617, + "learning_rate": 1.1094816593129475e-06, + "loss": 0.4179095923900604, "step": 1394 }, { "epoch": 3.0953436807095343, - "grad_norm": 2.093130350112915, - "learning_rate": 2.2076891063574167e-06, - "loss": 0.5483108162879944, + "grad_norm": 1.155935287475586, + "learning_rate": 1.1038445531787083e-06, + "loss": 0.715002715587616, "step": 1396 }, { "epoch": 3.0997782705099777, - "grad_norm": 0.9943727850914001, - "learning_rate": 2.196459188136014e-06, - "loss": 0.39382025599479675, + "grad_norm": 1.3893831968307495, + "learning_rate": 1.098229594068007e-06, + "loss": 0.6728289127349854, "step": 1398 }, { "epoch": 3.104212860310421, - "grad_norm": 0.4041390120983124, - "learning_rate": 2.1852737150466064e-06, - "loss": 0.37452182173728943, + "grad_norm": 1.6952826976776123, + "learning_rate": 1.0926368575233032e-06, + "loss": 0.6240461468696594, "step": 1400 }, { "epoch": 3.1086474501108645, - "grad_norm": 0.41770124435424805, - "learning_rate": 2.174132837576156e-06, - "loss": 0.1812993884086609, + "grad_norm": 0.4904409945011139, + "learning_rate": 1.087066418788078e-06, + "loss": 0.19736936688423157, "step": 1402 }, { "epoch": 3.1130820399113084, - "grad_norm": 3.0523788928985596, - "learning_rate": 2.1630367056116496e-06, - "loss": 0.3220471739768982, + "grad_norm": 1.5871460437774658, + "learning_rate": 1.0815183528058248e-06, + "loss": 0.38163045048713684, "step": 1404 }, { "epoch": 3.117516629711752, - "grad_norm": 2.8227522373199463, - "learning_rate": 2.1519854684380724e-06, - "loss": 0.5665332674980164, + "grad_norm": 1.395402193069458, + "learning_rate": 1.0759927342190362e-06, + "loss": 0.7312334179878235, "step": 1406 }, { "epoch": 3.1219512195121952, - "grad_norm": 1.215554118156433, - "learning_rate": 2.1409792747364103e-06, - "loss": 0.6941906809806824, + "grad_norm": 1.0713874101638794, + "learning_rate": 1.0704896373682052e-06, + "loss": 0.7383747696876526, "step": 1408 }, { "epoch": 3.1263858093126387, - "grad_norm": 0.9595901370048523, - "learning_rate": 2.1300182725816378e-06, - "loss": 0.34386202692985535, + "grad_norm": 1.2227498292922974, + "learning_rate": 1.0650091362908189e-06, + "loss": 0.6887333393096924, "step": 1410 }, { "epoch": 3.130820399113082, - "grad_norm": 1.20671808719635, - "learning_rate": 2.1191026094407386e-06, - "loss": 0.40727710723876953, + "grad_norm": 1.1990351676940918, + "learning_rate": 1.0595513047203693e-06, + "loss": 0.6441534757614136, "step": 1412 }, { "epoch": 3.1352549889135255, - "grad_norm": 0.9301721453666687, - "learning_rate": 2.1082324321707075e-06, - "loss": 0.24828168749809265, + "grad_norm": 1.021954894065857, + "learning_rate": 1.0541162160853538e-06, + "loss": 0.4102858901023865, "step": 1414 }, { "epoch": 3.139689578713969, - "grad_norm": 0.29069337248802185, - "learning_rate": 2.0974078870165882e-06, - "loss": 0.02273000217974186, + "grad_norm": 0.33726537227630615, + "learning_rate": 1.0487039435082941e-06, + "loss": 0.056999024003744125, "step": 1416 }, { "epoch": 3.1441241685144123, - "grad_norm": 1.1463552713394165, - "learning_rate": 2.086629119609499e-06, - "loss": 0.4149464964866638, + "grad_norm": 1.8328347206115723, + "learning_rate": 1.0433145598047495e-06, + "loss": 0.6035170555114746, "step": 1418 }, { "epoch": 3.1485587583148558, - "grad_norm": 0.8834893107414246, - "learning_rate": 2.0758962749646716e-06, - "loss": 0.47038036584854126, + "grad_norm": 1.524192214012146, + "learning_rate": 1.0379481374823358e-06, + "loss": 0.7395703196525574, "step": 1420 }, { "epoch": 3.152993348115299, - "grad_norm": 3.7323062419891357, - "learning_rate": 2.065209497479502e-06, - "loss": 0.34102943539619446, + "grad_norm": 1.5557146072387695, + "learning_rate": 1.032604748739751e-06, + "loss": 0.6392835974693298, "step": 1422 }, { "epoch": 3.1574279379157426, - "grad_norm": 0.32775676250457764, - "learning_rate": 2.0545689309316138e-06, - "loss": 0.06301730126142502, + "grad_norm": 8.86196231842041, + "learning_rate": 1.0272844654658069e-06, + "loss": 0.1004699245095253, "step": 1424 }, { "epoch": 3.1618625277161865, - "grad_norm": 1.2714399099349976, - "learning_rate": 2.043974718476911e-06, - "loss": 0.4001501798629761, + "grad_norm": 1.7001556158065796, + "learning_rate": 1.0219873592384556e-06, + "loss": 0.6165364384651184, "step": 1426 }, { "epoch": 3.16629711751663, - "grad_norm": 1.573042631149292, - "learning_rate": 2.033427002647668e-06, - "loss": 0.37112024426460266, + "grad_norm": 2.5815086364746094, + "learning_rate": 1.016713501323834e-06, + "loss": 0.7140083312988281, "step": 1428 }, { "epoch": 3.1707317073170733, - "grad_norm": 1.6631622314453125, - "learning_rate": 2.0229259253505946e-06, - "loss": 0.2901914417743683, + "grad_norm": 2.0112009048461914, + "learning_rate": 1.0114629626752973e-06, + "loss": 0.3635685443878174, "step": 1430 }, { "epoch": 3.1751662971175167, - "grad_norm": 1.1970975399017334, - "learning_rate": 2.012471627864943e-06, - "loss": 0.29324379563331604, + "grad_norm": 0.9459992051124573, + "learning_rate": 1.0062358139324715e-06, + "loss": 0.34838780760765076, "step": 1432 }, { "epoch": 3.17960088691796, - "grad_norm": 1.13413667678833, - "learning_rate": 2.0020642508405984e-06, - "loss": 0.3578546643257141, + "grad_norm": 1.4729267358779907, + "learning_rate": 1.0010321254202992e-06, + "loss": 0.5929923057556152, "step": 1434 }, { "epoch": 3.1840354767184036, - "grad_norm": 2.5074386596679688, - "learning_rate": 1.9917039342961837e-06, - "loss": 0.0840587466955185, + "grad_norm": 2.3258347511291504, + "learning_rate": 9.958519671480919e-07, + "loss": 0.2461414337158203, "step": 1436 }, { "epoch": 3.188470066518847, - "grad_norm": 0.9287826418876648, - "learning_rate": 1.9813908176171857e-06, - "loss": 0.27323436737060547, + "grad_norm": 0.8266603946685791, + "learning_rate": 9.906954088085929e-07, + "loss": 0.421371191740036, "step": 1438 }, { "epoch": 3.1929046563192904, - "grad_norm": 2.366941213607788, - "learning_rate": 1.97112503955407e-06, - "loss": 0.43178707361221313, + "grad_norm": 4.340923309326172, + "learning_rate": 9.85562519777035e-07, + "loss": 0.49519577622413635, "step": 1440 }, { "epoch": 3.197339246119734, - "grad_norm": 1.5955890417099, - "learning_rate": 1.9609067382204224e-06, - "loss": 0.47529783844947815, + "grad_norm": 1.8006068468093872, + "learning_rate": 9.804533691102112e-07, + "loss": 0.6714183688163757, "step": 1442 }, { "epoch": 3.2017738359201773, - "grad_norm": 1.0811793804168701, - "learning_rate": 1.950736051091084e-06, - "loss": 0.4838941693305969, + "grad_norm": 1.2195959091186523, + "learning_rate": 9.75368025545542e-07, + "loss": 0.6853227615356445, "step": 1444 }, { "epoch": 3.2062084257206207, - "grad_norm": 1.8544070720672607, - "learning_rate": 1.9406131150003036e-06, - "loss": 0.37333235144615173, + "grad_norm": 1.3917747735977173, + "learning_rate": 9.703065575001518e-07, + "loss": 0.7487331628799438, "step": 1446 }, { "epoch": 3.210643015521064, - "grad_norm": 7.789588451385498, - "learning_rate": 1.930538066139904e-06, - "loss": 0.1161646842956543, + "grad_norm": 7.102336883544922, + "learning_rate": 9.65269033069952e-07, + "loss": 0.45779871940612793, "step": 1448 }, { "epoch": 3.2150776053215075, - "grad_norm": 0.8985663056373596, - "learning_rate": 1.9205110400574368e-06, - "loss": 0.5245546698570251, + "grad_norm": 1.3083984851837158, + "learning_rate": 9.602555200287184e-07, + "loss": 0.7010431289672852, "step": 1450 }, { "epoch": 3.2195121951219514, - "grad_norm": 1.785593867301941, - "learning_rate": 1.910532171654367e-06, - "loss": 0.23958845436573029, + "grad_norm": 5.842024803161621, + "learning_rate": 9.552660858271835e-07, + "loss": 0.4435151517391205, "step": 1452 }, { "epoch": 3.223946784922395, - "grad_norm": 1.4889003038406372, - "learning_rate": 1.9006015951842587e-06, - "loss": 0.3209075331687927, + "grad_norm": 0.8788203001022339, + "learning_rate": 9.503007975921294e-07, + "loss": 0.45844289660453796, "step": 1454 }, { "epoch": 3.2283813747228383, - "grad_norm": 3.653893232345581, - "learning_rate": 1.8907194442509642e-06, - "loss": 0.43527886271476746, + "grad_norm": 3.6687796115875244, + "learning_rate": 9.453597221254821e-07, + "loss": 0.8120266795158386, "step": 1456 }, { "epoch": 3.2328159645232817, - "grad_norm": 2.4832544326782227, - "learning_rate": 1.8808858518068312e-06, - "loss": 0.2721869647502899, + "grad_norm": 2.3230605125427246, + "learning_rate": 9.404429259034156e-07, + "loss": 0.39245831966400146, "step": 1458 }, { "epoch": 3.237250554323725, - "grad_norm": 1.3052841424942017, - "learning_rate": 1.8711009501509087e-06, - "loss": 0.44211310148239136, + "grad_norm": 1.0407835245132446, + "learning_rate": 9.355504750754543e-07, + "loss": 0.7990567684173584, "step": 1460 }, { "epoch": 3.2416851441241685, - "grad_norm": 1.3836771249771118, - "learning_rate": 1.8613648709271732e-06, - "loss": 0.26933524012565613, + "grad_norm": 1.1758776903152466, + "learning_rate": 9.306824354635866e-07, + "loss": 0.4867308437824249, "step": 1462 }, { "epoch": 3.246119733924612, - "grad_norm": 3.1456692218780518, - "learning_rate": 1.8516777451227552e-06, - "loss": 0.39066338539123535, + "grad_norm": 1.7548372745513916, + "learning_rate": 9.258388725613776e-07, + "loss": 0.5504351258277893, "step": 1464 }, { "epoch": 3.2505543237250554, - "grad_norm": 1.4456260204315186, - "learning_rate": 1.842039703066172e-06, - "loss": 0.47105956077575684, + "grad_norm": 1.3209401369094849, + "learning_rate": 9.21019851533086e-07, + "loss": 0.7579896450042725, "step": 1466 }, { "epoch": 3.254988913525499, - "grad_norm": 0.20494961738586426, - "learning_rate": 1.8324508744255842e-06, - "loss": 0.039846230298280716, + "grad_norm": 0.2684461176395416, + "learning_rate": 9.162254372127921e-07, + "loss": 0.07358714938163757, "step": 1468 }, { "epoch": 3.259423503325942, - "grad_norm": 2.348210573196411, - "learning_rate": 1.8229113882070398e-06, - "loss": 0.310930460691452, + "grad_norm": 2.430288791656494, + "learning_rate": 9.114556941035199e-07, + "loss": 0.6224230527877808, "step": 1470 }, { "epoch": 3.2638580931263856, - "grad_norm": 1.5028283596038818, - "learning_rate": 1.8134213727527504e-06, - "loss": 0.420907586812973, + "grad_norm": 1.1718429327011108, + "learning_rate": 9.067106863763752e-07, + "loss": 0.9210297465324402, "step": 1472 }, { "epoch": 3.2682926829268295, - "grad_norm": 0.636045515537262, - "learning_rate": 1.803980955739354e-06, - "loss": 0.08325402438640594, + "grad_norm": 1.4180784225463867, + "learning_rate": 9.01990477869677e-07, + "loss": 0.30042320489883423, "step": 1474 }, { "epoch": 3.2727272727272725, - "grad_norm": 0.7118128538131714, - "learning_rate": 1.7945902641762027e-06, - "loss": 0.3166371285915375, + "grad_norm": 1.123668909072876, + "learning_rate": 8.972951320881014e-07, + "loss": 0.5489499568939209, "step": 1476 }, { "epoch": 3.2771618625277164, - "grad_norm": 1.0103552341461182, - "learning_rate": 1.785249424403654e-06, - "loss": 0.4005421996116638, + "grad_norm": 0.8793950080871582, + "learning_rate": 8.92624712201827e-07, + "loss": 0.3639003336429596, "step": 1478 }, { "epoch": 3.2815964523281598, - "grad_norm": 0.3991997539997101, - "learning_rate": 1.7759585620913723e-06, - "loss": 0.27822190523147583, + "grad_norm": 1.9588091373443604, + "learning_rate": 8.879792810456861e-07, + "loss": 0.4639153778553009, "step": 1480 }, { "epoch": 3.286031042128603, - "grad_norm": 0.9653980731964111, - "learning_rate": 1.7667178022366294e-06, - "loss": 0.4057963192462921, + "grad_norm": 1.0029566287994385, + "learning_rate": 8.833589011183147e-07, + "loss": 0.7829899191856384, "step": 1482 }, { "epoch": 3.2904656319290466, - "grad_norm": 8.63097095489502, - "learning_rate": 1.757527269162636e-06, - "loss": 0.327663779258728, + "grad_norm": 0.9496995210647583, + "learning_rate": 8.78763634581318e-07, + "loss": 0.5258970260620117, "step": 1484 }, { "epoch": 3.29490022172949, - "grad_norm": 0.9608578681945801, - "learning_rate": 1.7483870865168585e-06, - "loss": 0.3130677342414856, + "grad_norm": 0.913833737373352, + "learning_rate": 8.741935432584292e-07, + "loss": 0.47597554326057434, "step": 1486 }, { "epoch": 3.2993348115299335, - "grad_norm": 2.4934980869293213, - "learning_rate": 1.739297377269361e-06, - "loss": 0.18216699361801147, + "grad_norm": 2.7132110595703125, + "learning_rate": 8.696486886346805e-07, + "loss": 0.43635129928588867, "step": 1488 }, { "epoch": 3.303769401330377, - "grad_norm": 2.835766553878784, - "learning_rate": 1.730258263711149e-06, - "loss": 0.1408637911081314, + "grad_norm": 2.0939266681671143, + "learning_rate": 8.651291318555745e-07, + "loss": 0.23065295815467834, "step": 1490 }, { "epoch": 3.3082039911308203, - "grad_norm": 1.3449338674545288, - "learning_rate": 1.7212698674525246e-06, - "loss": 0.3613136410713196, + "grad_norm": 0.9052222967147827, + "learning_rate": 8.606349337262623e-07, + "loss": 0.8228150606155396, "step": 1492 }, { "epoch": 3.3126385809312637, - "grad_norm": 1.2930020093917847, - "learning_rate": 1.7123323094214485e-06, - "loss": 0.3133498728275299, + "grad_norm": 1.733860731124878, + "learning_rate": 8.561661547107243e-07, + "loss": 0.47642073035240173, "step": 1494 }, { "epoch": 3.317073170731707, - "grad_norm": 2.178227663040161, - "learning_rate": 1.7034457098619176e-06, - "loss": 0.329238623380661, + "grad_norm": 1.4319210052490234, + "learning_rate": 8.517228549309588e-07, + "loss": 0.6294840574264526, "step": 1496 }, { "epoch": 3.3215077605321506, - "grad_norm": 0.9758614301681519, - "learning_rate": 1.6946101883323435e-06, - "loss": 0.5130437016487122, + "grad_norm": 3.192908525466919, + "learning_rate": 8.473050941661717e-07, + "loss": 0.767174482345581, "step": 1498 }, { "epoch": 3.3259423503325944, - "grad_norm": 0.9882797002792358, - "learning_rate": 1.6858258637039421e-06, - "loss": 0.3322954475879669, + "grad_norm": 1.09794020652771, + "learning_rate": 8.429129318519711e-07, + "loss": 0.3739165961742401, "step": 1500 }, { "epoch": 3.330376940133038, - "grad_norm": 1.1139705181121826, - "learning_rate": 1.677092854159142e-06, - "loss": 0.37578579783439636, + "grad_norm": 1.1555298566818237, + "learning_rate": 8.38546427079571e-07, + "loss": 0.6726783514022827, "step": 1502 }, { "epoch": 3.3348115299334813, - "grad_norm": 1.0699121952056885, - "learning_rate": 1.6684112771899858e-06, - "loss": 0.3910093903541565, + "grad_norm": 0.9871981739997864, + "learning_rate": 8.342056385949929e-07, + "loss": 0.7142524123191833, "step": 1504 }, { "epoch": 3.3392461197339247, - "grad_norm": 0.19254817068576813, - "learning_rate": 1.6597812495965537e-06, - "loss": 0.17204663157463074, + "grad_norm": 0.46822428703308105, + "learning_rate": 8.298906247982768e-07, + "loss": 0.3226020932197571, "step": 1506 }, { "epoch": 3.343680709534368, - "grad_norm": 1.2403380870819092, - "learning_rate": 1.651202887485394e-06, - "loss": 0.21663136780261993, + "grad_norm": 1.3958603143692017, + "learning_rate": 8.25601443742697e-07, + "loss": 0.48669373989105225, "step": 1508 }, { "epoch": 3.3481152993348116, - "grad_norm": 2.0894668102264404, - "learning_rate": 1.6426763062679553e-06, - "loss": 0.47299203276634216, + "grad_norm": 2.5492188930511475, + "learning_rate": 8.213381531339776e-07, + "loss": 0.732366681098938, "step": 1510 }, { "epoch": 3.352549889135255, - "grad_norm": 0.15021318197250366, - "learning_rate": 1.63420162065904e-06, - "loss": 0.20796214044094086, + "grad_norm": 0.35939931869506836, + "learning_rate": 8.1710081032952e-07, + "loss": 0.46142342686653137, "step": 1512 }, { "epoch": 3.3569844789356984, - "grad_norm": 1.074012041091919, - "learning_rate": 1.625778944675257e-06, - "loss": 0.4941790997982025, + "grad_norm": 1.1625959873199463, + "learning_rate": 8.128894723376285e-07, + "loss": 0.8541035056114197, "step": 1514 }, { "epoch": 3.361419068736142, - "grad_norm": 6.77968168258667, - "learning_rate": 1.6174083916334877e-06, - "loss": 0.3023959994316101, + "grad_norm": 3.446476697921753, + "learning_rate": 8.087041958167438e-07, + "loss": 0.47667139768600464, "step": 1516 }, { "epoch": 3.3658536585365852, - "grad_norm": 1.5364946126937866, - "learning_rate": 1.609090074149366e-06, - "loss": 0.30802658200263977, + "grad_norm": 0.9094996452331543, + "learning_rate": 8.04545037074683e-07, + "loss": 0.5068199634552002, "step": 1518 }, { "epoch": 3.3702882483370287, - "grad_norm": 2.5270133018493652, - "learning_rate": 1.6008241041357535e-06, - "loss": 0.5083972811698914, + "grad_norm": 0.8286429047584534, + "learning_rate": 8.004120520678768e-07, + "loss": 0.703849732875824, "step": 1520 }, { "epoch": 3.374722838137472, - "grad_norm": 0.16544599831104279, - "learning_rate": 1.5926105928012486e-06, - "loss": 0.2425152212381363, + "grad_norm": 0.2944241166114807, + "learning_rate": 7.963052964006243e-07, + "loss": 0.4450077414512634, "step": 1522 }, { "epoch": 3.3791574279379155, - "grad_norm": 1.5787891149520874, - "learning_rate": 1.5844496506486734e-06, - "loss": 0.5767493844032288, + "grad_norm": 2.379061698913574, + "learning_rate": 7.922248253243367e-07, + "loss": 0.6920251250267029, "step": 1524 }, { "epoch": 3.3835920177383594, - "grad_norm": 24.345401763916016, - "learning_rate": 1.576341387473601e-06, - "loss": 0.3439426124095917, + "grad_norm": 6.14056921005249, + "learning_rate": 7.881706937368005e-07, + "loss": 0.6898224353790283, "step": 1526 }, { "epoch": 3.388026607538803, - "grad_norm": 1.3219624757766724, - "learning_rate": 1.568285912362872e-06, - "loss": 0.21882264316082, + "grad_norm": 1.5987882614135742, + "learning_rate": 7.84142956181436e-07, + "loss": 0.45382753014564514, "step": 1528 }, { "epoch": 3.3924611973392462, - "grad_norm": 0.47698870301246643, - "learning_rate": 1.5602833336931242e-06, - "loss": 0.209283709526062, + "grad_norm": 0.6767929196357727, + "learning_rate": 7.801416668465621e-07, + "loss": 0.3358471691608429, "step": 1530 }, { "epoch": 3.3968957871396896, - "grad_norm": 0.9787905812263489, - "learning_rate": 1.552333759129344e-06, - "loss": 0.09470443427562714, + "grad_norm": 5.22633695602417, + "learning_rate": 7.76166879564672e-07, + "loss": 0.14293606579303741, "step": 1532 }, { "epoch": 3.401330376940133, - "grad_norm": 1.2374992370605469, - "learning_rate": 1.5444372956234062e-06, - "loss": 0.3461211919784546, + "grad_norm": 1.0749104022979736, + "learning_rate": 7.722186478117031e-07, + "loss": 0.6599565148353577, "step": 1534 }, { "epoch": 3.4057649667405765, - "grad_norm": 1.2397676706314087, - "learning_rate": 1.5365940494126424e-06, - "loss": 0.46922361850738525, + "grad_norm": 1.8686710596084595, + "learning_rate": 7.682970247063212e-07, + "loss": 0.7604563236236572, "step": 1536 }, { "epoch": 3.41019955654102, - "grad_norm": 1.3618249893188477, - "learning_rate": 1.5288041260184132e-06, - "loss": 0.3622947037220001, + "grad_norm": 1.1680110692977905, + "learning_rate": 7.644020630092066e-07, + "loss": 0.7063665986061096, "step": 1538 }, { "epoch": 3.4146341463414633, - "grad_norm": 1.0507458448410034, - "learning_rate": 1.5210676302446801e-06, - "loss": 0.39398759603500366, + "grad_norm": 1.0432260036468506, + "learning_rate": 7.605338151223401e-07, + "loss": 0.727014422416687, "step": 1540 }, { "epoch": 3.4190687361419068, - "grad_norm": 1.3180983066558838, - "learning_rate": 1.5133846661766058e-06, - "loss": 0.3517080545425415, + "grad_norm": 1.5552973747253418, + "learning_rate": 7.566923330883029e-07, + "loss": 0.4836811125278473, "step": 1542 }, { "epoch": 3.42350332594235, - "grad_norm": 1.1892797946929932, - "learning_rate": 1.5057553371791461e-06, - "loss": 0.3794390559196472, + "grad_norm": 1.57466459274292, + "learning_rate": 7.528776685895731e-07, + "loss": 0.5649837255477905, "step": 1544 }, { "epoch": 3.4279379157427936, - "grad_norm": 34.94332504272461, - "learning_rate": 1.4981797458956624e-06, - "loss": 0.10474438220262527, + "grad_norm": 4.2533650398254395, + "learning_rate": 7.490898729478312e-07, + "loss": 0.23879516124725342, "step": 1546 }, { "epoch": 3.4323725055432375, - "grad_norm": 0.027800027281045914, - "learning_rate": 1.490657994246542e-06, - "loss": 0.0768439918756485, + "grad_norm": 0.1743544042110443, + "learning_rate": 7.45328997123271e-07, + "loss": 0.1827031672000885, "step": 1548 }, { "epoch": 3.436807095343681, - "grad_norm": 1.083954095840454, - "learning_rate": 1.4831901834278212e-06, - "loss": 0.42769551277160645, + "grad_norm": 0.9885973930358887, + "learning_rate": 7.415950917139106e-07, + "loss": 0.7187482118606567, "step": 1550 }, { "epoch": 3.4412416851441243, - "grad_norm": 1.0862568616867065, - "learning_rate": 1.4757764139098332e-06, - "loss": 0.3436740040779114, + "grad_norm": 1.0436469316482544, + "learning_rate": 7.378882069549166e-07, + "loss": 0.6883783936500549, "step": 1552 }, { "epoch": 3.4456762749445677, - "grad_norm": 0.10507642477750778, - "learning_rate": 1.468416785435847e-06, - "loss": 0.29862260818481445, + "grad_norm": 0.17162276804447174, + "learning_rate": 7.342083927179235e-07, + "loss": 0.3528411388397217, "step": 1554 }, { "epoch": 3.450110864745011, - "grad_norm": 1.1534576416015625, - "learning_rate": 1.461111397020732e-06, - "loss": 0.4363846778869629, + "grad_norm": 0.9677134156227112, + "learning_rate": 7.30555698510366e-07, + "loss": 0.7020009756088257, "step": 1556 }, { "epoch": 3.4545454545454546, - "grad_norm": 1.3515652418136597, - "learning_rate": 1.4538603469496215e-06, - "loss": 0.4772418439388275, + "grad_norm": 2.290982961654663, + "learning_rate": 7.269301734748107e-07, + "loss": 0.6964924931526184, "step": 1558 }, { "epoch": 3.458980044345898, - "grad_norm": 0.9060417413711548, - "learning_rate": 1.4466637327765937e-06, - "loss": 0.5036817789077759, + "grad_norm": 0.9703812003135681, + "learning_rate": 7.233318663882968e-07, + "loss": 0.7445743083953857, "step": 1560 }, { "epoch": 3.4634146341463414, - "grad_norm": 1.3972153663635254, - "learning_rate": 1.4395216513233584e-06, - "loss": 0.0940362960100174, + "grad_norm": 2.6579227447509766, + "learning_rate": 7.197608256616792e-07, + "loss": 0.19093935191631317, "step": 1562 }, { "epoch": 3.467849223946785, - "grad_norm": 0.9557284712791443, - "learning_rate": 1.4324341986779527e-06, - "loss": 0.6766175627708435, + "grad_norm": 1.445381999015808, + "learning_rate": 7.162170993389763e-07, + "loss": 0.7774098515510559, "step": 1564 }, { "epoch": 3.4722838137472283, - "grad_norm": 0.8374095559120178, - "learning_rate": 1.4254014701934481e-06, - "loss": 0.21213091909885406, + "grad_norm": 1.2807323932647705, + "learning_rate": 7.127007350967241e-07, + "loss": 0.3930266201496124, "step": 1566 }, { "epoch": 3.4767184035476717, - "grad_norm": 1.0304781198501587, - "learning_rate": 1.4184235604866725e-06, - "loss": 0.3508773744106293, + "grad_norm": 1.6947154998779297, + "learning_rate": 7.092117802433362e-07, + "loss": 0.8776201009750366, "step": 1568 }, { "epoch": 3.481152993348115, - "grad_norm": 2.0591909885406494, - "learning_rate": 1.4115005634369296e-06, - "loss": 0.34335634112358093, + "grad_norm": 1.315631628036499, + "learning_rate": 7.057502817184648e-07, + "loss": 0.4978081285953522, "step": 1570 }, { "epoch": 3.4855875831485585, - "grad_norm": 0.2248295098543167, - "learning_rate": 1.4046325721847443e-06, - "loss": 0.3259221315383911, + "grad_norm": 0.36691853404045105, + "learning_rate": 7.023162860923722e-07, + "loss": 0.5448426604270935, "step": 1572 }, { "epoch": 3.4900221729490024, - "grad_norm": 1.0798416137695312, - "learning_rate": 1.397819679130601e-06, - "loss": 0.5986089110374451, + "grad_norm": 1.9715549945831299, + "learning_rate": 6.989098395653005e-07, + "loss": 0.8805798292160034, "step": 1574 }, { "epoch": 3.494456762749446, - "grad_norm": 3.3881709575653076, - "learning_rate": 1.3910619759337074e-06, - "loss": 0.26687368750572205, + "grad_norm": 2.96600341796875, + "learning_rate": 6.955309879668537e-07, + "loss": 0.3866577744483948, "step": 1576 }, { "epoch": 3.4988913525498893, - "grad_norm": 0.6919270157814026, - "learning_rate": 1.3843595535107587e-06, - "loss": 0.23883309960365295, + "grad_norm": 0.8945562839508057, + "learning_rate": 6.921797767553794e-07, + "loss": 0.413793683052063, "step": 1578 }, { "epoch": 3.5033259423503327, - "grad_norm": 1.5870050191879272, - "learning_rate": 1.377712502034712e-06, - "loss": 0.5271674394607544, + "grad_norm": 1.501956582069397, + "learning_rate": 6.88856251017356e-07, + "loss": 0.6843544244766235, "step": 1580 }, { "epoch": 3.507760532150776, - "grad_norm": 1.3132179975509644, - "learning_rate": 1.3711209109335793e-06, - "loss": 0.48455068469047546, + "grad_norm": 3.895604372024536, + "learning_rate": 6.855604554667897e-07, + "loss": 0.8031338453292847, "step": 1582 }, { "epoch": 3.5121951219512195, - "grad_norm": 0.9779510498046875, - "learning_rate": 1.3645848688892162e-06, - "loss": 0.2770904302597046, + "grad_norm": 1.0929995775222778, + "learning_rate": 6.822924344446081e-07, + "loss": 0.46741926670074463, "step": 1584 }, { "epoch": 3.516629711751663, - "grad_norm": 1.488776445388794, - "learning_rate": 1.3581044638361373e-06, - "loss": 0.2443387657403946, + "grad_norm": 1.4208186864852905, + "learning_rate": 6.790522319180687e-07, + "loss": 0.4863869249820709, "step": 1586 }, { "epoch": 3.5210643015521064, - "grad_norm": 1.3030271530151367, - "learning_rate": 1.3516797829603256e-06, - "loss": 0.44791534543037415, + "grad_norm": 1.2825591564178467, + "learning_rate": 6.758398914801628e-07, + "loss": 0.8202866315841675, "step": 1588 }, { "epoch": 3.52549889135255, - "grad_norm": 2.1599605083465576, - "learning_rate": 1.3453109126980643e-06, - "loss": 0.19074156880378723, + "grad_norm": 2.9333906173706055, + "learning_rate": 6.726554563490321e-07, + "loss": 0.4186065196990967, "step": 1590 }, { "epoch": 3.529933481152993, - "grad_norm": 1.5765827894210815, - "learning_rate": 1.3389979387347743e-06, - "loss": 0.23592326045036316, + "grad_norm": 4.726576805114746, + "learning_rate": 6.694989693673872e-07, + "loss": 0.4944823384284973, "step": 1592 }, { "epoch": 3.5343680709534366, - "grad_norm": 1.3213473558425903, - "learning_rate": 1.332740946003857e-06, - "loss": 0.4785956144332886, + "grad_norm": 1.305874228477478, + "learning_rate": 6.663704730019285e-07, + "loss": 0.8017009496688843, "step": 1594 }, { "epoch": 3.5388026607538805, - "grad_norm": 1.8811619281768799, - "learning_rate": 1.3265400186855548e-06, - "loss": 0.07135710120201111, + "grad_norm": 4.202098846435547, + "learning_rate": 6.632700093427774e-07, + "loss": 0.23011818528175354, "step": 1596 }, { "epoch": 3.5432372505543235, - "grad_norm": 3.579979181289673, - "learning_rate": 1.320395240205819e-06, - "loss": 0.4762045741081238, + "grad_norm": 2.405488967895508, + "learning_rate": 6.601976201029095e-07, + "loss": 0.44181007146835327, "step": 1598 }, { "epoch": 3.5476718403547673, - "grad_norm": 1.0917787551879883, - "learning_rate": 1.3143066932351856e-06, - "loss": 0.2512458860874176, + "grad_norm": 1.209696888923645, + "learning_rate": 6.571533466175928e-07, + "loss": 0.5328426957130432, "step": 1600 }, { "epoch": 3.5521064301552108, - "grad_norm": 1.354770302772522, - "learning_rate": 1.308274459687665e-06, - "loss": 0.2989339232444763, + "grad_norm": 0.9200423359870911, + "learning_rate": 6.541372298438325e-07, + "loss": 0.7708749175071716, "step": 1602 }, { "epoch": 3.556541019955654, - "grad_norm": 0.9256249070167542, - "learning_rate": 1.3022986207196367e-06, - "loss": 0.5872430205345154, + "grad_norm": 0.8987447023391724, + "learning_rate": 6.511493103598184e-07, + "loss": 0.8243938088417053, "step": 1604 }, { "epoch": 3.5609756097560976, - "grad_norm": 1.0018621683120728, - "learning_rate": 1.2963792567287617e-06, - "loss": 0.5670958757400513, + "grad_norm": 1.0883064270019531, + "learning_rate": 6.481896283643808e-07, + "loss": 0.7721865177154541, "step": 1606 }, { "epoch": 3.565410199556541, - "grad_norm": 0.1671750247478485, - "learning_rate": 1.290516447352899e-06, - "loss": 0.034741759300231934, + "grad_norm": 0.14681853353977203, + "learning_rate": 6.452582236764495e-07, + "loss": 0.08870165795087814, "step": 1608 }, { "epoch": 3.5698447893569845, - "grad_norm": 1.0988103151321411, - "learning_rate": 1.2847102714690308e-06, - "loss": 0.3017559051513672, + "grad_norm": 7.72481632232666, + "learning_rate": 6.423551357345154e-07, + "loss": 0.5422983169555664, "step": 1610 }, { "epoch": 3.574279379157428, - "grad_norm": 2.024768352508545, - "learning_rate": 1.2789608071922076e-06, - "loss": 0.09087596833705902, + "grad_norm": 6.495584487915039, + "learning_rate": 6.394804035961038e-07, + "loss": 0.14715789258480072, "step": 1612 }, { "epoch": 3.5787139689578713, - "grad_norm": 2.462632656097412, - "learning_rate": 1.2732681318744923e-06, - "loss": 0.3814306855201721, + "grad_norm": 2.7080962657928467, + "learning_rate": 6.366340659372462e-07, + "loss": 0.5568990111351013, "step": 1614 }, { "epoch": 3.5831485587583147, - "grad_norm": 1.586369276046753, - "learning_rate": 1.2676323221039236e-06, - "loss": 0.7159979939460754, + "grad_norm": 1.5873769521713257, + "learning_rate": 6.338161610519618e-07, + "loss": 0.7860218286514282, "step": 1616 }, { "epoch": 3.587583148558758, - "grad_norm": 3.622847557067871, - "learning_rate": 1.2620534537034795e-06, - "loss": 0.24280951917171478, + "grad_norm": 0.8671674132347107, + "learning_rate": 6.310267268517397e-07, + "loss": 0.4535370171070099, "step": 1618 }, { "epoch": 3.5920177383592016, - "grad_norm": 2.6759791374206543, - "learning_rate": 1.2565316017300635e-06, - "loss": 0.40450718998908997, + "grad_norm": 5.442802906036377, + "learning_rate": 6.282658008650318e-07, + "loss": 0.5291122198104858, "step": 1620 }, { "epoch": 3.5964523281596454, - "grad_norm": 0.979674220085144, - "learning_rate": 1.2510668404734924e-06, - "loss": 0.3859134614467621, + "grad_norm": 0.8540799021720886, + "learning_rate": 6.255334202367462e-07, + "loss": 0.647901713848114, "step": 1622 }, { "epoch": 3.6008869179600884, - "grad_norm": 1.1429744958877563, - "learning_rate": 1.2456592434554963e-06, - "loss": 0.4199633002281189, + "grad_norm": 1.4981876611709595, + "learning_rate": 6.228296217277481e-07, + "loss": 0.5466744899749756, "step": 1624 }, { "epoch": 3.6053215077605323, - "grad_norm": 10.887653350830078, - "learning_rate": 1.2403088834287282e-06, - "loss": 0.11880503594875336, + "grad_norm": 0.7501009106636047, + "learning_rate": 6.201544417143641e-07, + "loss": 0.20443859696388245, "step": 1626 }, { "epoch": 3.6097560975609757, - "grad_norm": 1.1529648303985596, - "learning_rate": 1.2350158323757903e-06, - "loss": 0.3755669593811035, + "grad_norm": 2.2188422679901123, + "learning_rate": 6.175079161878951e-07, + "loss": 0.6764265894889832, "step": 1628 }, { "epoch": 3.614190687361419, - "grad_norm": 2.1950061321258545, - "learning_rate": 1.229780161508259e-06, - "loss": 0.30650004744529724, + "grad_norm": 3.0544168949127197, + "learning_rate": 6.148900807541295e-07, + "loss": 0.5957894325256348, "step": 1630 }, { "epoch": 3.6186252771618626, - "grad_norm": 1.1274499893188477, - "learning_rate": 1.2246019412657319e-06, - "loss": 0.49355947971343994, + "grad_norm": 1.2711377143859863, + "learning_rate": 6.123009706328659e-07, + "loss": 0.5789898037910461, "step": 1632 }, { "epoch": 3.623059866962306, - "grad_norm": 1.7049529552459717, - "learning_rate": 1.2194812413148756e-06, - "loss": 0.49852749705314636, + "grad_norm": 3.054331064224243, + "learning_rate": 6.097406206574378e-07, + "loss": 0.6961408257484436, "step": 1634 }, { "epoch": 3.6274944567627494, - "grad_norm": 1.2080436944961548, - "learning_rate": 1.214418130548495e-06, - "loss": 0.3345094621181488, + "grad_norm": 1.0726757049560547, + "learning_rate": 6.072090652742475e-07, + "loss": 0.47810229659080505, "step": 1636 }, { "epoch": 3.631929046563193, - "grad_norm": 2.6900432109832764, - "learning_rate": 1.2094126770845986e-06, - "loss": 0.38614609837532043, + "grad_norm": 2.5502026081085205, + "learning_rate": 6.047063385422993e-07, + "loss": 0.6304081678390503, "step": 1638 }, { "epoch": 3.6363636363636362, - "grad_norm": 0.9296088814735413, - "learning_rate": 1.2044649482654876e-06, - "loss": 0.5239455103874207, + "grad_norm": 2.4343056678771973, + "learning_rate": 6.022324741327438e-07, + "loss": 0.6360555291175842, "step": 1640 }, { "epoch": 3.6407982261640797, - "grad_norm": 0.42654258012771606, - "learning_rate": 1.1995750106568496e-06, - "loss": 0.24880681931972504, + "grad_norm": 0.8209331035614014, + "learning_rate": 5.997875053284248e-07, + "loss": 0.32580727338790894, "step": 1642 }, { "epoch": 3.6452328159645235, - "grad_norm": 0.9114068746566772, - "learning_rate": 1.1947429300468575e-06, - "loss": 0.42800384759902954, + "grad_norm": 1.0483583211898804, + "learning_rate": 5.973714650234287e-07, + "loss": 0.563210666179657, "step": 1644 }, { "epoch": 3.6496674057649665, - "grad_norm": 0.22093704342842102, - "learning_rate": 1.1899687714452932e-06, - "loss": 0.24181388318538666, + "grad_norm": 0.1659860759973526, + "learning_rate": 5.949843857226466e-07, + "loss": 0.376314640045166, "step": 1646 }, { "epoch": 3.6541019955654104, - "grad_norm": 2.1429035663604736, - "learning_rate": 1.1852525990826658e-06, - "loss": 0.16454415023326874, + "grad_norm": 1.0978350639343262, + "learning_rate": 5.926262995413329e-07, + "loss": 0.36298975348472595, "step": 1648 }, { "epoch": 3.658536585365854, - "grad_norm": 1.6719216108322144, - "learning_rate": 1.1805944764093484e-06, - "loss": 0.34569817781448364, + "grad_norm": 5.020273685455322, + "learning_rate": 5.902972382046742e-07, + "loss": 0.4144408106803894, "step": 1650 }, { "epoch": 3.662971175166297, - "grad_norm": 2.202148199081421, - "learning_rate": 1.1759944660947301e-06, - "loss": 0.44416671991348267, + "grad_norm": 0.9348329901695251, + "learning_rate": 5.879972330473651e-07, + "loss": 0.677159309387207, "step": 1652 }, { "epoch": 3.6674057649667406, - "grad_norm": 1.0591462850570679, - "learning_rate": 1.171452630026365e-06, - "loss": 0.1786390095949173, + "grad_norm": 0.8802472949028015, + "learning_rate": 5.857263150131825e-07, + "loss": 0.3901694416999817, "step": 1654 }, { "epoch": 3.671840354767184, - "grad_norm": 1.1581944227218628, - "learning_rate": 1.1669690293091452e-06, - "loss": 0.4971603453159332, + "grad_norm": 0.9511557817459106, + "learning_rate": 5.834845146545726e-07, + "loss": 0.7068908214569092, "step": 1656 }, { "epoch": 3.6762749445676275, - "grad_norm": 2.9780452251434326, - "learning_rate": 1.1625437242644772e-06, - "loss": 0.28031206130981445, + "grad_norm": 2.1888368129730225, + "learning_rate": 5.812718621322386e-07, + "loss": 0.5586342215538025, "step": 1658 }, { "epoch": 3.680709534368071, - "grad_norm": 0.9189567565917969, - "learning_rate": 1.1581767744294682e-06, - "loss": 0.325800359249115, + "grad_norm": 1.1138534545898438, + "learning_rate": 5.790883872147341e-07, + "loss": 0.4197966456413269, "step": 1660 }, { "epoch": 3.6851441241685143, - "grad_norm": 0.1943116933107376, - "learning_rate": 1.1538682385561286e-06, - "loss": 0.2070523202419281, + "grad_norm": 0.3132419288158417, + "learning_rate": 5.769341192780643e-07, + "loss": 0.3502754867076874, "step": 1662 }, { "epoch": 3.6895787139689578, - "grad_norm": 1.0158610343933105, - "learning_rate": 1.1496181746105784e-06, - "loss": 0.23334433138370514, + "grad_norm": 1.2125017642974854, + "learning_rate": 5.748090873052892e-07, + "loss": 0.4432962238788605, "step": 1664 }, { "epoch": 3.694013303769401, - "grad_norm": 0.8135693073272705, - "learning_rate": 1.1454266397722707e-06, - "loss": 0.40261054039001465, + "grad_norm": 1.3100497722625732, + "learning_rate": 5.727133198861353e-07, + "loss": 0.6520885229110718, "step": 1666 }, { "epoch": 3.6984478935698446, - "grad_norm": 1.025733470916748, - "learning_rate": 1.1412936904332181e-06, - "loss": 0.3658636212348938, + "grad_norm": 1.8288265466690063, + "learning_rate": 5.706468452166091e-07, + "loss": 0.5817002654075623, "step": 1668 }, { "epoch": 3.7028824833702885, - "grad_norm": 1.5712085962295532, - "learning_rate": 1.1372193821972379e-06, - "loss": 0.4164735674858093, + "grad_norm": 0.997337818145752, + "learning_rate": 5.686096910986189e-07, + "loss": 0.6981693506240845, "step": 1670 }, { "epoch": 3.7073170731707314, - "grad_norm": 1.251022458076477, - "learning_rate": 1.1332037698792033e-06, - "loss": 0.32476893067359924, + "grad_norm": 1.4828516244888306, + "learning_rate": 5.666018849396016e-07, + "loss": 0.6814447045326233, "step": 1672 }, { "epoch": 3.7117516629711753, - "grad_norm": 1.0968090295791626, - "learning_rate": 1.1292469075043026e-06, - "loss": 0.5027676224708557, + "grad_norm": 0.9741377234458923, + "learning_rate": 5.646234537521513e-07, + "loss": 0.6926964521408081, "step": 1674 }, { "epoch": 3.7161862527716187, - "grad_norm": 1.1787587404251099, - "learning_rate": 1.1253488483073177e-06, - "loss": 0.4444116950035095, + "grad_norm": 1.1306507587432861, + "learning_rate": 5.626744241536589e-07, + "loss": 0.7342678308486938, "step": 1676 }, { "epoch": 3.720620842572062, - "grad_norm": 1.1607528924942017, - "learning_rate": 1.1215096447319038e-06, - "loss": 0.41037842631340027, + "grad_norm": 1.6748311519622803, + "learning_rate": 5.607548223659519e-07, + "loss": 0.7321768999099731, "step": 1678 }, { "epoch": 3.7250554323725056, - "grad_norm": 2.8341286182403564, - "learning_rate": 1.117729348429884e-06, - "loss": 0.19149872660636902, + "grad_norm": 4.892649173736572, + "learning_rate": 5.58864674214942e-07, + "loss": 0.5290915369987488, "step": 1680 }, { "epoch": 3.729490022172949, - "grad_norm": 1.3583526611328125, - "learning_rate": 1.114008010260558e-06, - "loss": 0.4621056318283081, + "grad_norm": 0.8138172030448914, + "learning_rate": 5.57004005130279e-07, + "loss": 0.5308825969696045, "step": 1682 }, { "epoch": 3.7339246119733924, - "grad_norm": 0.825949490070343, - "learning_rate": 1.1103456802900134e-06, - "loss": 0.18589909374713898, + "grad_norm": 1.0458836555480957, + "learning_rate": 5.551728401450067e-07, + "loss": 0.35428744554519653, "step": 1684 }, { "epoch": 3.738359201773836, - "grad_norm": 2.031763792037964, - "learning_rate": 1.1067424077904555e-06, - "loss": 0.3091331422328949, + "grad_norm": 2.807513475418091, + "learning_rate": 5.533712038952278e-07, + "loss": 0.6030918955802917, "step": 1686 }, { "epoch": 3.7427937915742793, - "grad_norm": 0.24129338562488556, - "learning_rate": 1.103198241239542e-06, - "loss": 0.07289690524339676, + "grad_norm": 0.6136502623558044, + "learning_rate": 5.51599120619771e-07, + "loss": 0.17880572378635406, "step": 1688 }, { "epoch": 3.7472283813747227, - "grad_norm": 1.1850625276565552, - "learning_rate": 1.0997132283197324e-06, - "loss": 0.5156506896018982, + "grad_norm": 1.1024833917617798, + "learning_rate": 5.498566141598662e-07, + "loss": 0.6866117715835571, "step": 1690 }, { "epoch": 3.7516629711751666, - "grad_norm": 2.6560683250427246, - "learning_rate": 1.0962874159176454e-06, - "loss": 0.5042511820793152, + "grad_norm": 1.260596513748169, + "learning_rate": 5.481437079588227e-07, + "loss": 0.6682636737823486, "step": 1692 }, { "epoch": 3.7560975609756095, - "grad_norm": 1.1266878843307495, - "learning_rate": 1.0929208501234286e-06, - "loss": 0.5055519938468933, + "grad_norm": 0.8481225371360779, + "learning_rate": 5.464604250617143e-07, + "loss": 0.7785466909408569, "step": 1694 }, { "epoch": 3.7605321507760534, - "grad_norm": 1.4622995853424072, - "learning_rate": 1.0896135762301393e-06, - "loss": 0.46531805396080017, + "grad_norm": 1.0920374393463135, + "learning_rate": 5.448067881150697e-07, + "loss": 0.7681268453598022, "step": 1696 }, { "epoch": 3.764966740576497, - "grad_norm": 1.787441372871399, - "learning_rate": 1.0863656387331328e-06, - "loss": 0.21703627705574036, + "grad_norm": 1.2000559568405151, + "learning_rate": 5.431828193665664e-07, + "loss": 0.3771549463272095, "step": 1698 }, { "epoch": 3.7694013303769403, - "grad_norm": 1.8496445417404175, - "learning_rate": 1.0831770813294668e-06, - "loss": 0.3597804009914398, + "grad_norm": 4.952986240386963, + "learning_rate": 5.415885406647334e-07, + "loss": 0.5442360639572144, "step": 1700 }, { "epoch": 3.7738359201773837, - "grad_norm": 0.731442928314209, - "learning_rate": 1.0800479469173101e-06, - "loss": 0.6957812309265137, + "grad_norm": 1.6961767673492432, + "learning_rate": 5.400239734586551e-07, + "loss": 0.9097031950950623, "step": 1702 }, { "epoch": 3.778270509977827, - "grad_norm": 0.28417858481407166, - "learning_rate": 1.076978277595369e-06, - "loss": 0.05468475818634033, + "grad_norm": 0.585956871509552, + "learning_rate": 5.384891387976845e-07, + "loss": 0.11114199459552765, "step": 1704 }, { "epoch": 3.7827050997782705, - "grad_norm": 0.22050651907920837, - "learning_rate": 1.0739681146623185e-06, - "loss": 0.26601287722587585, + "grad_norm": 0.36832162737846375, + "learning_rate": 5.369840573311593e-07, + "loss": 0.41135963797569275, "step": 1706 }, { "epoch": 3.787139689578714, - "grad_norm": 1.0851820707321167, - "learning_rate": 1.0710174986162471e-06, - "loss": 0.2386590987443924, + "grad_norm": 1.0942734479904175, + "learning_rate": 5.355087493081236e-07, + "loss": 0.5182826519012451, "step": 1708 }, { "epoch": 3.7915742793791574, - "grad_norm": 0.8813753724098206, - "learning_rate": 1.0681264691541127e-06, - "loss": 0.6136298775672913, + "grad_norm": 0.9384496212005615, + "learning_rate": 5.340632345770564e-07, + "loss": 0.8278499841690063, "step": 1710 }, { "epoch": 3.796008869179601, - "grad_norm": 2.6262285709381104, - "learning_rate": 1.0652950651712072e-06, - "loss": 0.2965908348560333, + "grad_norm": 1.8050825595855713, + "learning_rate": 5.326475325856036e-07, + "loss": 0.4890661835670471, "step": 1712 }, { "epoch": 3.800443458980044, - "grad_norm": 2.0973598957061768, - "learning_rate": 1.0625233247606348e-06, - "loss": 0.2669585049152374, + "grad_norm": 4.0279364585876465, + "learning_rate": 5.312616623803174e-07, + "loss": 0.4427688717842102, "step": 1714 }, { "epoch": 3.8048780487804876, - "grad_norm": 1.7458832263946533, - "learning_rate": 1.059811285212799e-06, - "loss": 0.5059341192245483, + "grad_norm": 1.8224685192108154, + "learning_rate": 5.299056426063995e-07, + "loss": 0.7666689157485962, "step": 1716 }, { "epoch": 3.8093126385809315, - "grad_norm": 1.2826629877090454, - "learning_rate": 1.0571589830149e-06, - "loss": 0.27312329411506653, + "grad_norm": 1.169655203819275, + "learning_rate": 5.2857949150745e-07, + "loss": 0.5377134084701538, "step": 1718 }, { "epoch": 3.8137472283813745, - "grad_norm": 3.2964842319488525, - "learning_rate": 1.054566453850444e-06, - "loss": 0.3187982738018036, + "grad_norm": 1.5467808246612549, + "learning_rate": 5.27283226925222e-07, + "loss": 0.5351519584655762, "step": 1720 }, { "epoch": 3.8181818181818183, - "grad_norm": 2.765103340148926, - "learning_rate": 1.0520337325987649e-06, - "loss": 0.5375199913978577, + "grad_norm": 0.9303249716758728, + "learning_rate": 5.260168662993824e-07, + "loss": 0.7153096199035645, "step": 1722 }, { "epoch": 3.8226164079822618, - "grad_norm": 5.0600361824035645, - "learning_rate": 1.049560853334553e-06, - "loss": 0.4722135066986084, + "grad_norm": 1.7566769123077393, + "learning_rate": 5.247804266672765e-07, + "loss": 0.7039221525192261, "step": 1724 }, { "epoch": 3.827050997782705, - "grad_norm": 1.3732205629348755, - "learning_rate": 1.0471478493273976e-06, - "loss": 0.4118424654006958, + "grad_norm": 1.3865876197814941, + "learning_rate": 5.235739246636988e-07, + "loss": 0.6029395461082458, "step": 1726 }, { "epoch": 3.8314855875831486, - "grad_norm": 1.0661771297454834, - "learning_rate": 1.0447947530413389e-06, - "loss": 0.2754386365413666, + "grad_norm": 1.0234519243240356, + "learning_rate": 5.223973765206694e-07, + "loss": 0.3769378960132599, "step": 1728 }, { "epoch": 3.835920177383592, - "grad_norm": 1.2804239988327026, - "learning_rate": 1.042501596134431e-06, - "loss": 0.2026994377374649, + "grad_norm": 1.2475616931915283, + "learning_rate": 5.212507980672155e-07, + "loss": 0.4270702302455902, "step": 1730 }, { "epoch": 3.8403547671840355, - "grad_norm": 1.4092806577682495, - "learning_rate": 1.0402684094583173e-06, - "loss": 0.4652438163757324, + "grad_norm": 1.0097404718399048, + "learning_rate": 5.201342047291587e-07, + "loss": 0.7340813279151917, "step": 1732 }, { "epoch": 3.844789356984479, - "grad_norm": 4.014774322509766, - "learning_rate": 1.0380952230578125e-06, - "loss": 0.379792720079422, + "grad_norm": 3.0468151569366455, + "learning_rate": 5.190476115289063e-07, + "loss": 0.8035828471183777, "step": 1734 }, { "epoch": 3.8492239467849223, - "grad_norm": 1.469107985496521, - "learning_rate": 1.0359820661705042e-06, - "loss": 0.3514306843280792, + "grad_norm": 1.1387958526611328, + "learning_rate": 5.179910330852521e-07, + "loss": 0.7475385069847107, "step": 1736 }, { "epoch": 3.8536585365853657, - "grad_norm": 1.1409002542495728, - "learning_rate": 1.0339289672263519e-06, - "loss": 0.44202250242233276, + "grad_norm": 1.7286393642425537, + "learning_rate": 5.169644836131759e-07, + "loss": 0.6196325421333313, "step": 1738 }, { "epoch": 3.858093126385809, - "grad_norm": 0.8285048604011536, - "learning_rate": 1.0319359538473107e-06, - "loss": 0.23279811441898346, + "grad_norm": 1.653084397315979, + "learning_rate": 5.159679769236553e-07, + "loss": 0.37859052419662476, "step": 1740 }, { "epoch": 3.8625277161862526, - "grad_norm": 1.168976902961731, - "learning_rate": 1.0300030528469564e-06, - "loss": 0.20990443229675293, + "grad_norm": 2.3914763927459717, + "learning_rate": 5.150015264234782e-07, + "loss": 0.4192189574241638, "step": 1742 }, { "epoch": 3.8669623059866964, - "grad_norm": 1.5003247261047363, - "learning_rate": 1.0281302902301254e-06, - "loss": 0.4064357280731201, + "grad_norm": 1.2099565267562866, + "learning_rate": 5.140651451150627e-07, + "loss": 0.6012130379676819, "step": 1744 }, { "epoch": 3.8713968957871394, - "grad_norm": 0.8279268741607666, - "learning_rate": 1.026317691192567e-06, - "loss": 0.4411630630493164, + "grad_norm": 1.0337554216384888, + "learning_rate": 5.131588455962835e-07, + "loss": 0.6700254678726196, "step": 1746 }, { "epoch": 3.8758314855875833, - "grad_norm": 1.1625535488128662, - "learning_rate": 1.0245652801205999e-06, - "loss": 0.2272336483001709, + "grad_norm": 1.2129027843475342, + "learning_rate": 5.122826400602999e-07, + "loss": 0.3560533821582794, "step": 1748 }, { "epoch": 3.8802660753880267, - "grad_norm": 12.195842742919922, - "learning_rate": 1.0228730805907891e-06, - "loss": 0.3394715189933777, + "grad_norm": 1.0586822032928467, + "learning_rate": 5.114365402953946e-07, + "loss": 0.4538826048374176, "step": 1750 }, { "epoch": 3.88470066518847, - "grad_norm": 3.731518507003784, - "learning_rate": 1.0212411153696247e-06, - "loss": 0.34466421604156494, + "grad_norm": 2.09010910987854, + "learning_rate": 5.106205576848123e-07, + "loss": 0.6869809031486511, "step": 1752 }, { "epoch": 3.8891352549889135, - "grad_norm": 1.2109719514846802, - "learning_rate": 1.019669406413218e-06, - "loss": 0.3292272388935089, + "grad_norm": 1.5580908060073853, + "learning_rate": 5.09834703206609e-07, + "loss": 0.7175853848457336, "step": 1754 }, { "epoch": 3.893569844789357, - "grad_norm": 3.1141412258148193, - "learning_rate": 1.0181579748670054e-06, - "loss": 0.3200131058692932, + "grad_norm": 3.137685537338257, + "learning_rate": 5.090789874335027e-07, + "loss": 0.5131061673164368, "step": 1756 }, { "epoch": 3.8980044345898004, - "grad_norm": 0.9708091020584106, - "learning_rate": 1.0167068410654643e-06, - "loss": 0.4250810444355011, + "grad_norm": 1.276068091392517, + "learning_rate": 5.083534205327321e-07, + "loss": 0.8281271457672119, "step": 1758 }, { "epoch": 3.902439024390244, - "grad_norm": 0.030687185004353523, - "learning_rate": 1.0153160245318384e-06, - "loss": 0.002097110729664564, + "grad_norm": 0.07891691476106644, + "learning_rate": 5.076580122659192e-07, + "loss": 0.0017504242714494467, "step": 1760 }, { "epoch": 3.9068736141906872, - "grad_norm": 1.3904731273651123, - "learning_rate": 1.0139855439778766e-06, - "loss": 0.08549664914608002, + "grad_norm": 3.17092227935791, + "learning_rate": 5.069927719889383e-07, + "loss": 0.37903013825416565, "step": 1762 }, { "epoch": 3.9113082039911307, - "grad_norm": 0.10882235318422318, - "learning_rate": 1.0127154173035787e-06, - "loss": 0.28717437386512756, + "grad_norm": 0.18284766376018524, + "learning_rate": 5.063577086517894e-07, + "loss": 0.3342430591583252, "step": 1764 }, { "epoch": 3.9157427937915745, - "grad_norm": 1.5540599822998047, - "learning_rate": 1.0115056615969584e-06, - "loss": 0.2964329421520233, + "grad_norm": 7.076814651489258, + "learning_rate": 5.057528307984792e-07, + "loss": 0.4555712938308716, "step": 1766 }, { "epoch": 3.9201773835920175, - "grad_norm": 1.1777206659317017, - "learning_rate": 1.0103562931338105e-06, - "loss": 0.619647741317749, + "grad_norm": 1.618480920791626, + "learning_rate": 5.051781465669053e-07, + "loss": 0.8074356913566589, "step": 1768 }, { "epoch": 3.9246119733924614, - "grad_norm": 2.035280704498291, - "learning_rate": 1.009267327377492e-06, - "loss": 0.48704907298088074, + "grad_norm": 1.1312144994735718, + "learning_rate": 5.04633663688746e-07, + "loss": 0.6972216963768005, "step": 1770 }, { "epoch": 3.929046563192905, - "grad_norm": 0.8825568556785583, - "learning_rate": 1.008238778978716e-06, - "loss": 0.18097802996635437, + "grad_norm": 1.7633752822875977, + "learning_rate": 5.04119389489358e-07, + "loss": 0.2154180407524109, "step": 1772 }, { "epoch": 3.933481152993348, - "grad_norm": 0.8677657246589661, - "learning_rate": 1.0072706617753528e-06, - "loss": 0.39700010418891907, + "grad_norm": 1.3447540998458862, + "learning_rate": 5.036353308876764e-07, + "loss": 0.637127697467804, "step": 1774 }, { "epoch": 3.9379157427937916, - "grad_norm": 1.6046772003173828, - "learning_rate": 1.0063629887922441e-06, - "loss": 0.5214118361473083, + "grad_norm": 1.6075332164764404, + "learning_rate": 5.031814943961221e-07, + "loss": 0.7520142197608948, "step": 1776 }, { "epoch": 3.942350332594235, - "grad_norm": 0.15087999403476715, - "learning_rate": 1.0055157722410279e-06, - "loss": 0.03213101252913475, + "grad_norm": 0.8913193941116333, + "learning_rate": 5.027578861205139e-07, + "loss": 0.08660762012004852, "step": 1778 }, { "epoch": 3.9467849223946785, - "grad_norm": 3.6360549926757812, - "learning_rate": 1.0047290235199753e-06, - "loss": 0.32588061690330505, + "grad_norm": 2.475306510925293, + "learning_rate": 5.023645117599877e-07, + "loss": 0.42116767168045044, "step": 1780 }, { "epoch": 3.951219512195122, - "grad_norm": 0.1825929582118988, - "learning_rate": 1.0040027532138351e-06, - "loss": 0.22568024694919586, + "grad_norm": 0.3929775357246399, + "learning_rate": 5.020013766069176e-07, + "loss": 0.3883530795574188, "step": 1782 }, { "epoch": 3.9556541019955653, - "grad_norm": 0.905255138874054, - "learning_rate": 1.0033369710936928e-06, - "loss": 0.3658754527568817, + "grad_norm": 1.1025030612945557, + "learning_rate": 5.016684855468464e-07, + "loss": 0.44370949268341064, "step": 1784 }, { "epoch": 3.9600886917960088, - "grad_norm": 0.8106067180633545, - "learning_rate": 1.0027316861168388e-06, - "loss": 0.357939213514328, + "grad_norm": 1.1644660234451294, + "learning_rate": 5.013658430584194e-07, + "loss": 0.651877224445343, "step": 1786 }, { "epoch": 3.964523281596452, - "grad_norm": 0.7470359206199646, - "learning_rate": 1.0021869064266472e-06, - "loss": 0.30442333221435547, + "grad_norm": 0.9477538466453552, + "learning_rate": 5.010934532133236e-07, + "loss": 0.43285292387008667, "step": 1788 }, { "epoch": 3.9689578713968956, - "grad_norm": 0.6582420468330383, - "learning_rate": 1.0017026393524684e-06, - "loss": 0.23345550894737244, + "grad_norm": 0.9940587878227234, + "learning_rate": 5.008513196762342e-07, + "loss": 0.5851073861122131, "step": 1790 }, { "epoch": 3.9733924611973395, - "grad_norm": 1.1834862232208252, - "learning_rate": 1.0012788914095275e-06, - "loss": 0.5164616703987122, + "grad_norm": 1.0942143201828003, + "learning_rate": 5.006394457047638e-07, + "loss": 0.5996626615524292, "step": 1792 }, { "epoch": 3.9778270509977824, - "grad_norm": 0.7614670395851135, - "learning_rate": 1.0009156682988395e-06, - "loss": 0.36194929480552673, + "grad_norm": 1.2127927541732788, + "learning_rate": 5.004578341494197e-07, + "loss": 0.3695821464061737, "step": 1794 }, { "epoch": 3.9822616407982263, - "grad_norm": 1.1429847478866577, - "learning_rate": 1.0006129749071298e-06, - "loss": 0.3539275825023651, + "grad_norm": 1.090599536895752, + "learning_rate": 5.003064874535649e-07, + "loss": 0.7109208106994629, "step": 1796 }, { "epoch": 3.9866962305986697, - "grad_norm": 1.9152874946594238, - "learning_rate": 1.00037081530677e-06, - "loss": 0.4812496304512024, + "grad_norm": 1.2731367349624634, + "learning_rate": 5.00185407653385e-07, + "loss": 0.7177985906600952, "step": 1798 }, { "epoch": 3.991130820399113, - "grad_norm": 4.806298732757568, - "learning_rate": 1.0001891927557255e-06, - "loss": 0.4314287304878235, + "grad_norm": 2.5619304180145264, + "learning_rate": 5.000945963778627e-07, + "loss": 0.9041726589202881, "step": 1800 }, { "epoch": 3.9955654101995566, - "grad_norm": 1.1714129447937012, - "learning_rate": 1.0000681096975056e-06, - "loss": 0.37641072273254395, + "grad_norm": 1.3043817281723022, + "learning_rate": 5.000340548487528e-07, + "loss": 0.6277958154678345, "step": 1802 }, { "epoch": 4.0, - "grad_norm": 0.90379798412323, - "learning_rate": 1.0000075677611364e-06, - "loss": 0.12868885695934296, + "grad_norm": 1.6461491584777832, + "learning_rate": 5.000037838805682e-07, + "loss": 0.2714325189590454, "step": 1804 }, { "epoch": 4.0, "step": 1804, "total_flos": 3.4175049861232067e+18, - "train_loss": 0.7562091423920304, - "train_runtime": 9085.3052, - "train_samples_per_second": 5.957, - "train_steps_per_second": 0.199 + "train_loss": 0.8386258969696222, + "train_runtime": 7909.6987, + "train_samples_per_second": 6.842, + "train_steps_per_second": 0.228 } ], "logging_steps": 2,