{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 2844, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004219409282700422, "grad_norm": 59.86221694946289, "learning_rate": 5.594405594405594e-08, "loss": 2.1981945037841797, "step": 2 }, { "epoch": 0.008438818565400843, "grad_norm": 12.374919891357422, "learning_rate": 1.6783216783216782e-07, "loss": 1.7811565399169922, "step": 4 }, { "epoch": 0.012658227848101266, "grad_norm": 2.8306868076324463, "learning_rate": 2.7972027972027973e-07, "loss": 1.9376487731933594, "step": 6 }, { "epoch": 0.016877637130801686, "grad_norm": 5.625478267669678, "learning_rate": 3.916083916083916e-07, "loss": 1.9494853019714355, "step": 8 }, { "epoch": 0.02109704641350211, "grad_norm": 15.797261238098145, "learning_rate": 5.034965034965036e-07, "loss": 1.849827766418457, "step": 10 }, { "epoch": 0.02531645569620253, "grad_norm": 2.746943950653076, "learning_rate": 6.153846153846154e-07, "loss": 1.3138155937194824, "step": 12 }, { "epoch": 0.029535864978902954, "grad_norm": 7.311520576477051, "learning_rate": 7.272727272727272e-07, "loss": 1.650458574295044, "step": 14 }, { "epoch": 0.03375527426160337, "grad_norm": 10.263240814208984, "learning_rate": 8.391608391608391e-07, "loss": 2.136387825012207, "step": 16 }, { "epoch": 0.0379746835443038, "grad_norm": 1.834839940071106, "learning_rate": 9.51048951048951e-07, "loss": 1.8011322021484375, "step": 18 }, { "epoch": 0.04219409282700422, "grad_norm": 3.437499761581421, "learning_rate": 1.0629370629370628e-06, "loss": 1.872532606124878, "step": 20 }, { "epoch": 0.046413502109704644, "grad_norm": 2.469942808151245, "learning_rate": 1.1748251748251746e-06, "loss": 1.5344078540802002, "step": 22 }, { "epoch": 0.05063291139240506, "grad_norm": 3.802064895629883, "learning_rate": 1.2867132867132867e-06, "loss": 1.7150638103485107, "step": 24 }, { "epoch": 0.05485232067510549, "grad_norm": 3.20348858833313, "learning_rate": 1.3986013986013985e-06, "loss": 1.5234665870666504, "step": 26 }, { "epoch": 0.05907172995780591, "grad_norm": 3.4241185188293457, "learning_rate": 1.5104895104895103e-06, "loss": 1.8226149082183838, "step": 28 }, { "epoch": 0.06329113924050633, "grad_norm": 9.308449745178223, "learning_rate": 1.6223776223776222e-06, "loss": 1.499394178390503, "step": 30 }, { "epoch": 0.06751054852320675, "grad_norm": 2.2547056674957275, "learning_rate": 1.734265734265734e-06, "loss": 1.6274735927581787, "step": 32 }, { "epoch": 0.07172995780590717, "grad_norm": 43.91905212402344, "learning_rate": 1.8461538461538462e-06, "loss": 1.1456708908081055, "step": 34 }, { "epoch": 0.0759493670886076, "grad_norm": 3.9160234928131104, "learning_rate": 1.958041958041958e-06, "loss": 1.6436142921447754, "step": 36 }, { "epoch": 0.08016877637130802, "grad_norm": 4.995796203613281, "learning_rate": 2.06993006993007e-06, "loss": 1.1828241348266602, "step": 38 }, { "epoch": 0.08438818565400844, "grad_norm": 1.9018964767456055, "learning_rate": 2.1818181818181815e-06, "loss": 1.6038843393325806, "step": 40 }, { "epoch": 0.08860759493670886, "grad_norm": 70.95392608642578, "learning_rate": 2.2937062937062938e-06, "loss": 1.1521950960159302, "step": 42 }, { "epoch": 0.09282700421940929, "grad_norm": 5.062403202056885, "learning_rate": 2.405594405594405e-06, "loss": 1.8195090293884277, "step": 44 }, { "epoch": 0.0970464135021097, "grad_norm": 7.01928186416626, "learning_rate": 2.5174825174825174e-06, "loss": 1.6717772483825684, "step": 46 }, { "epoch": 0.10126582278481013, "grad_norm": 2.7374989986419678, "learning_rate": 2.629370629370629e-06, "loss": 1.5755090713500977, "step": 48 }, { "epoch": 0.10548523206751055, "grad_norm": 3.8747036457061768, "learning_rate": 2.741258741258741e-06, "loss": 0.8314967751502991, "step": 50 }, { "epoch": 0.10970464135021098, "grad_norm": 5.7753753662109375, "learning_rate": 2.8531468531468534e-06, "loss": 0.8825576305389404, "step": 52 }, { "epoch": 0.11392405063291139, "grad_norm": 2.804755449295044, "learning_rate": 2.9650349650349648e-06, "loss": 0.8038457632064819, "step": 54 }, { "epoch": 0.11814345991561181, "grad_norm": 2.4263148307800293, "learning_rate": 3.076923076923077e-06, "loss": 1.4315626621246338, "step": 56 }, { "epoch": 0.12236286919831224, "grad_norm": 8.127001762390137, "learning_rate": 3.1888111888111884e-06, "loss": 0.7465399503707886, "step": 58 }, { "epoch": 0.12658227848101267, "grad_norm": 2.8598272800445557, "learning_rate": 3.3006993006993007e-06, "loss": 1.4817099571228027, "step": 60 }, { "epoch": 0.1308016877637131, "grad_norm": 3.184314489364624, "learning_rate": 3.4125874125874125e-06, "loss": 1.5089186429977417, "step": 62 }, { "epoch": 0.1350210970464135, "grad_norm": 1.7058652639389038, "learning_rate": 3.5244755244755243e-06, "loss": 1.0980231761932373, "step": 64 }, { "epoch": 0.13924050632911392, "grad_norm": 4.485511779785156, "learning_rate": 3.636363636363636e-06, "loss": 0.8224247694015503, "step": 66 }, { "epoch": 0.14345991561181434, "grad_norm": 2.326599359512329, "learning_rate": 3.748251748251748e-06, "loss": 1.1156786680221558, "step": 68 }, { "epoch": 0.14767932489451477, "grad_norm": 5.480278491973877, "learning_rate": 3.860139860139859e-06, "loss": 0.9959129095077515, "step": 70 }, { "epoch": 0.1518987341772152, "grad_norm": 2.004271984100342, "learning_rate": 3.972027972027972e-06, "loss": 1.2949004173278809, "step": 72 }, { "epoch": 0.15611814345991562, "grad_norm": 5.011590957641602, "learning_rate": 4.083916083916084e-06, "loss": 1.046140193939209, "step": 74 }, { "epoch": 0.16033755274261605, "grad_norm": 5.78662633895874, "learning_rate": 4.195804195804196e-06, "loss": 0.86857008934021, "step": 76 }, { "epoch": 0.16455696202531644, "grad_norm": 2.636563539505005, "learning_rate": 4.3076923076923076e-06, "loss": 1.3720424175262451, "step": 78 }, { "epoch": 0.16877637130801687, "grad_norm": 2.0765841007232666, "learning_rate": 4.4195804195804185e-06, "loss": 1.3632028102874756, "step": 80 }, { "epoch": 0.1729957805907173, "grad_norm": 2.3837273120880127, "learning_rate": 4.531468531468531e-06, "loss": 1.4058136940002441, "step": 82 }, { "epoch": 0.17721518987341772, "grad_norm": 4.226639747619629, "learning_rate": 4.643356643356643e-06, "loss": 0.7853094339370728, "step": 84 }, { "epoch": 0.18143459915611815, "grad_norm": 4.09359884262085, "learning_rate": 4.755244755244755e-06, "loss": 1.1315921545028687, "step": 86 }, { "epoch": 0.18565400843881857, "grad_norm": 1.9499998092651367, "learning_rate": 4.8671328671328676e-06, "loss": 1.323297381401062, "step": 88 }, { "epoch": 0.189873417721519, "grad_norm": 7.248386383056641, "learning_rate": 4.9790209790209785e-06, "loss": 0.6489843726158142, "step": 90 }, { "epoch": 0.1940928270042194, "grad_norm": 3.945362091064453, "learning_rate": 5.09090909090909e-06, "loss": 1.7082250118255615, "step": 92 }, { "epoch": 0.19831223628691982, "grad_norm": 9.237424850463867, "learning_rate": 5.202797202797202e-06, "loss": 0.9587538242340088, "step": 94 }, { "epoch": 0.20253164556962025, "grad_norm": 2.5106499195098877, "learning_rate": 5.314685314685315e-06, "loss": 1.4145572185516357, "step": 96 }, { "epoch": 0.20675105485232068, "grad_norm": 2.280298948287964, "learning_rate": 5.426573426573427e-06, "loss": 1.4730861186981201, "step": 98 }, { "epoch": 0.2109704641350211, "grad_norm": 4.468693256378174, "learning_rate": 5.538461538461538e-06, "loss": 1.24980628490448, "step": 100 }, { "epoch": 0.21518987341772153, "grad_norm": 2.57384991645813, "learning_rate": 5.6503496503496495e-06, "loss": 1.0641834735870361, "step": 102 }, { "epoch": 0.21940928270042195, "grad_norm": 2.2377758026123047, "learning_rate": 5.762237762237762e-06, "loss": 0.9983944892883301, "step": 104 }, { "epoch": 0.22362869198312235, "grad_norm": 7.83008337020874, "learning_rate": 5.874125874125874e-06, "loss": 0.9789789319038391, "step": 106 }, { "epoch": 0.22784810126582278, "grad_norm": 2.1825568675994873, "learning_rate": 5.986013986013986e-06, "loss": 0.9948168992996216, "step": 108 }, { "epoch": 0.2320675105485232, "grad_norm": 1.7740533351898193, "learning_rate": 6.097902097902097e-06, "loss": 1.0290191173553467, "step": 110 }, { "epoch": 0.23628691983122363, "grad_norm": 1.6431820392608643, "learning_rate": 6.2097902097902095e-06, "loss": 1.3816218376159668, "step": 112 }, { "epoch": 0.24050632911392406, "grad_norm": 4.050329208374023, "learning_rate": 6.321678321678321e-06, "loss": 1.2858781814575195, "step": 114 }, { "epoch": 0.24472573839662448, "grad_norm": 4.519939422607422, "learning_rate": 6.433566433566433e-06, "loss": 1.6122548580169678, "step": 116 }, { "epoch": 0.2489451476793249, "grad_norm": 1.7163703441619873, "learning_rate": 6.545454545454546e-06, "loss": 1.2705044746398926, "step": 118 }, { "epoch": 0.25316455696202533, "grad_norm": 1.1608729362487793, "learning_rate": 6.657342657342657e-06, "loss": 1.0020270347595215, "step": 120 }, { "epoch": 0.25738396624472576, "grad_norm": 4.328707695007324, "learning_rate": 6.769230769230769e-06, "loss": 1.2712280750274658, "step": 122 }, { "epoch": 0.2616033755274262, "grad_norm": 1.8052810430526733, "learning_rate": 6.8811188811188805e-06, "loss": 1.2797789573669434, "step": 124 }, { "epoch": 0.26582278481012656, "grad_norm": 2.120347023010254, "learning_rate": 6.993006993006993e-06, "loss": 1.3641468286514282, "step": 126 }, { "epoch": 0.270042194092827, "grad_norm": 7.924063682556152, "learning_rate": 7.104895104895105e-06, "loss": 0.8769274950027466, "step": 128 }, { "epoch": 0.2742616033755274, "grad_norm": 2.9971201419830322, "learning_rate": 7.216783216783216e-06, "loss": 1.1519945859909058, "step": 130 }, { "epoch": 0.27848101265822783, "grad_norm": 4.976275444030762, "learning_rate": 7.328671328671328e-06, "loss": 1.217698335647583, "step": 132 }, { "epoch": 0.28270042194092826, "grad_norm": 3.9333672523498535, "learning_rate": 7.4405594405594405e-06, "loss": 0.6807541847229004, "step": 134 }, { "epoch": 0.2869198312236287, "grad_norm": 4.12578821182251, "learning_rate": 7.552447552447552e-06, "loss": 0.8635811805725098, "step": 136 }, { "epoch": 0.2911392405063291, "grad_norm": 4.128167629241943, "learning_rate": 7.664335664335663e-06, "loss": 1.3738093376159668, "step": 138 }, { "epoch": 0.29535864978902954, "grad_norm": 4.789083957672119, "learning_rate": 7.776223776223776e-06, "loss": 0.9322667717933655, "step": 140 }, { "epoch": 0.29957805907172996, "grad_norm": 5.845694541931152, "learning_rate": 7.888111888111889e-06, "loss": 1.2719149589538574, "step": 142 }, { "epoch": 0.3037974683544304, "grad_norm": 4.548868656158447, "learning_rate": 8e-06, "loss": 1.0755615234375, "step": 144 }, { "epoch": 0.3080168776371308, "grad_norm": 2.8906826972961426, "learning_rate": 7.99999025946351e-06, "loss": 1.3829221725463867, "step": 146 }, { "epoch": 0.31223628691983124, "grad_norm": 8.330571174621582, "learning_rate": 7.999961037906754e-06, "loss": 1.3621151447296143, "step": 148 }, { "epoch": 0.31645569620253167, "grad_norm": 3.5093352794647217, "learning_rate": 7.999912335487857e-06, "loss": 1.2308037281036377, "step": 150 }, { "epoch": 0.3206751054852321, "grad_norm": 6.054520606994629, "learning_rate": 7.999844152470372e-06, "loss": 1.2870557308197021, "step": 152 }, { "epoch": 0.32489451476793246, "grad_norm": 1.853664755821228, "learning_rate": 7.999756489223264e-06, "loss": 1.465219259262085, "step": 154 }, { "epoch": 0.3291139240506329, "grad_norm": 3.744748592376709, "learning_rate": 7.999649346220915e-06, "loss": 1.2533340454101562, "step": 156 }, { "epoch": 0.3333333333333333, "grad_norm": 3.5540201663970947, "learning_rate": 7.999522724043118e-06, "loss": 1.3192243576049805, "step": 158 }, { "epoch": 0.33755274261603374, "grad_norm": 1.734734058380127, "learning_rate": 7.999376623375078e-06, "loss": 1.302985429763794, "step": 160 }, { "epoch": 0.34177215189873417, "grad_norm": 4.0876383781433105, "learning_rate": 7.999211045007407e-06, "loss": 0.8754786849021912, "step": 162 }, { "epoch": 0.3459915611814346, "grad_norm": 4.771523952484131, "learning_rate": 7.999025989836115e-06, "loss": 1.2280066013336182, "step": 164 }, { "epoch": 0.350210970464135, "grad_norm": 8.509572982788086, "learning_rate": 7.998821458862613e-06, "loss": 0.8188046813011169, "step": 166 }, { "epoch": 0.35443037974683544, "grad_norm": 4.890702724456787, "learning_rate": 7.998597453193701e-06, "loss": 1.2213170528411865, "step": 168 }, { "epoch": 0.35864978902953587, "grad_norm": 1.8209432363510132, "learning_rate": 7.998353974041564e-06, "loss": 1.312690019607544, "step": 170 }, { "epoch": 0.3628691983122363, "grad_norm": 7.146384239196777, "learning_rate": 7.998091022723772e-06, "loss": 1.2072352170944214, "step": 172 }, { "epoch": 0.3670886075949367, "grad_norm": 3.6562139987945557, "learning_rate": 7.997808600663259e-06, "loss": 1.0147764682769775, "step": 174 }, { "epoch": 0.37130801687763715, "grad_norm": 1.787636637687683, "learning_rate": 7.997506709388324e-06, "loss": 1.3151808977127075, "step": 176 }, { "epoch": 0.3755274261603376, "grad_norm": 3.019120454788208, "learning_rate": 7.997185350532626e-06, "loss": 1.3140928745269775, "step": 178 }, { "epoch": 0.379746835443038, "grad_norm": 1.2904940843582153, "learning_rate": 7.996844525835172e-06, "loss": 1.0001540184020996, "step": 180 }, { "epoch": 0.38396624472573837, "grad_norm": 1.6466349363327026, "learning_rate": 7.9964842371403e-06, "loss": 1.2761380672454834, "step": 182 }, { "epoch": 0.3881856540084388, "grad_norm": 2.2190961837768555, "learning_rate": 7.996104486397683e-06, "loss": 1.272679090499878, "step": 184 }, { "epoch": 0.3924050632911392, "grad_norm": 8.958863258361816, "learning_rate": 7.995705275662305e-06, "loss": 0.5206277966499329, "step": 186 }, { "epoch": 0.39662447257383965, "grad_norm": 1.6779208183288574, "learning_rate": 7.995286607094459e-06, "loss": 0.9622843265533447, "step": 188 }, { "epoch": 0.4008438818565401, "grad_norm": 1.6733808517456055, "learning_rate": 7.994848482959734e-06, "loss": 1.0529744625091553, "step": 190 }, { "epoch": 0.4050632911392405, "grad_norm": 6.742412567138672, "learning_rate": 7.994390905628996e-06, "loss": 1.2907187938690186, "step": 192 }, { "epoch": 0.4092827004219409, "grad_norm": 5.04363489151001, "learning_rate": 7.993913877578386e-06, "loss": 1.062695026397705, "step": 194 }, { "epoch": 0.41350210970464135, "grad_norm": 1.8267443180084229, "learning_rate": 7.993417401389293e-06, "loss": 1.2746732234954834, "step": 196 }, { "epoch": 0.4177215189873418, "grad_norm": 2.2780373096466064, "learning_rate": 7.99290147974836e-06, "loss": 1.0202131271362305, "step": 198 }, { "epoch": 0.4219409282700422, "grad_norm": 2.0691215991973877, "learning_rate": 7.992366115447445e-06, "loss": 1.1179842948913574, "step": 200 }, { "epoch": 0.42616033755274263, "grad_norm": 4.217780590057373, "learning_rate": 7.991811311383625e-06, "loss": 1.1258949041366577, "step": 202 }, { "epoch": 0.43037974683544306, "grad_norm": 2.5018441677093506, "learning_rate": 7.991237070559173e-06, "loss": 0.8922556638717651, "step": 204 }, { "epoch": 0.4345991561181435, "grad_norm": 2.525747776031494, "learning_rate": 7.990643396081536e-06, "loss": 1.4427449703216553, "step": 206 }, { "epoch": 0.4388185654008439, "grad_norm": 2.50166916847229, "learning_rate": 7.990030291163336e-06, "loss": 0.723818838596344, "step": 208 }, { "epoch": 0.4430379746835443, "grad_norm": 1.4910839796066284, "learning_rate": 7.98939775912233e-06, "loss": 1.2674278020858765, "step": 210 }, { "epoch": 0.4472573839662447, "grad_norm": 4.068655490875244, "learning_rate": 7.98874580338141e-06, "loss": 0.7890869379043579, "step": 212 }, { "epoch": 0.45147679324894513, "grad_norm": 2.0325798988342285, "learning_rate": 7.988074427468575e-06, "loss": 1.1955333948135376, "step": 214 }, { "epoch": 0.45569620253164556, "grad_norm": 2.587684154510498, "learning_rate": 7.987383635016914e-06, "loss": 1.2449276447296143, "step": 216 }, { "epoch": 0.459915611814346, "grad_norm": 2.6565630435943604, "learning_rate": 7.986673429764587e-06, "loss": 1.294593334197998, "step": 218 }, { "epoch": 0.4641350210970464, "grad_norm": 3.8166139125823975, "learning_rate": 7.985943815554808e-06, "loss": 1.2401716709136963, "step": 220 }, { "epoch": 0.46835443037974683, "grad_norm": 5.548577785491943, "learning_rate": 7.985194796335814e-06, "loss": 1.1999175548553467, "step": 222 }, { "epoch": 0.47257383966244726, "grad_norm": 2.5439255237579346, "learning_rate": 7.98442637616086e-06, "loss": 1.0728691816329956, "step": 224 }, { "epoch": 0.4767932489451477, "grad_norm": 12.275174140930176, "learning_rate": 7.983638559188175e-06, "loss": 1.2714494466781616, "step": 226 }, { "epoch": 0.4810126582278481, "grad_norm": 23.638010025024414, "learning_rate": 7.982831349680965e-06, "loss": 0.7866320610046387, "step": 228 }, { "epoch": 0.48523206751054854, "grad_norm": 4.956063270568848, "learning_rate": 7.982004752007367e-06, "loss": 0.921814501285553, "step": 230 }, { "epoch": 0.48945147679324896, "grad_norm": 3.645993232727051, "learning_rate": 7.98115877064044e-06, "loss": 0.8924152851104736, "step": 232 }, { "epoch": 0.4936708860759494, "grad_norm": 2.041003704071045, "learning_rate": 7.980293410158139e-06, "loss": 1.2708659172058105, "step": 234 }, { "epoch": 0.4978902953586498, "grad_norm": 4.042453289031982, "learning_rate": 7.979408675243278e-06, "loss": 1.3152391910552979, "step": 236 }, { "epoch": 0.5021097046413502, "grad_norm": 4.269692420959473, "learning_rate": 7.978504570683523e-06, "loss": 0.980125367641449, "step": 238 }, { "epoch": 0.5063291139240507, "grad_norm": 3.801175355911255, "learning_rate": 7.977581101371354e-06, "loss": 0.4545478820800781, "step": 240 }, { "epoch": 0.510548523206751, "grad_norm": 4.7520060539245605, "learning_rate": 7.97663827230404e-06, "loss": 1.4193034172058105, "step": 242 }, { "epoch": 0.5147679324894515, "grad_norm": 3.5392112731933594, "learning_rate": 7.975676088583614e-06, "loss": 0.8708986043930054, "step": 244 }, { "epoch": 0.5189873417721519, "grad_norm": 2.672224283218384, "learning_rate": 7.974694555416848e-06, "loss": 1.4755961894989014, "step": 246 }, { "epoch": 0.5232067510548524, "grad_norm": 1.7014986276626587, "learning_rate": 7.973693678115218e-06, "loss": 1.186201810836792, "step": 248 }, { "epoch": 0.5274261603375527, "grad_norm": 1.939778208732605, "learning_rate": 7.97267346209488e-06, "loss": 1.3046202659606934, "step": 250 }, { "epoch": 0.5316455696202531, "grad_norm": 9.32943344116211, "learning_rate": 7.971633912876644e-06, "loss": 1.1170387268066406, "step": 252 }, { "epoch": 0.5358649789029536, "grad_norm": 1.9259498119354248, "learning_rate": 7.97057503608593e-06, "loss": 1.2260360717773438, "step": 254 }, { "epoch": 0.540084388185654, "grad_norm": 2.0438666343688965, "learning_rate": 7.969496837452762e-06, "loss": 1.1931499242782593, "step": 256 }, { "epoch": 0.5443037974683544, "grad_norm": 5.915661811828613, "learning_rate": 7.968399322811707e-06, "loss": 1.0251163244247437, "step": 258 }, { "epoch": 0.5485232067510548, "grad_norm": 5.656018257141113, "learning_rate": 7.967282498101866e-06, "loss": 0.9710787534713745, "step": 260 }, { "epoch": 0.5527426160337553, "grad_norm": 1.6091376543045044, "learning_rate": 7.966146369366839e-06, "loss": 1.2647578716278076, "step": 262 }, { "epoch": 0.5569620253164557, "grad_norm": 2.3578081130981445, "learning_rate": 7.96499094275468e-06, "loss": 1.2789413928985596, "step": 264 }, { "epoch": 0.5611814345991561, "grad_norm": 2.2791264057159424, "learning_rate": 7.963816224517875e-06, "loss": 1.2268846035003662, "step": 266 }, { "epoch": 0.5654008438818565, "grad_norm": 4.264853000640869, "learning_rate": 7.962622221013308e-06, "loss": 1.4937443733215332, "step": 268 }, { "epoch": 0.569620253164557, "grad_norm": 3.884678602218628, "learning_rate": 7.961408938702217e-06, "loss": 1.050868034362793, "step": 270 }, { "epoch": 0.5738396624472574, "grad_norm": 3.3875067234039307, "learning_rate": 7.96017638415017e-06, "loss": 1.3021985292434692, "step": 272 }, { "epoch": 0.5780590717299579, "grad_norm": 2.571835994720459, "learning_rate": 7.958924564027025e-06, "loss": 1.1042567491531372, "step": 274 }, { "epoch": 0.5822784810126582, "grad_norm": 2.283672571182251, "learning_rate": 7.957653485106894e-06, "loss": 1.2543787956237793, "step": 276 }, { "epoch": 0.5864978902953587, "grad_norm": 3.0379388332366943, "learning_rate": 7.956363154268103e-06, "loss": 1.393994688987732, "step": 278 }, { "epoch": 0.5907172995780591, "grad_norm": 4.789394378662109, "learning_rate": 7.95505357849316e-06, "loss": 0.9629275798797607, "step": 280 }, { "epoch": 0.5949367088607594, "grad_norm": 7.168961524963379, "learning_rate": 7.953724764868716e-06, "loss": 1.330991268157959, "step": 282 }, { "epoch": 0.5991561181434599, "grad_norm": 2.166527271270752, "learning_rate": 7.952376720585524e-06, "loss": 1.54081130027771, "step": 284 }, { "epoch": 0.6033755274261603, "grad_norm": 0.7811316847801208, "learning_rate": 7.951009452938407e-06, "loss": 1.1209747791290283, "step": 286 }, { "epoch": 0.6075949367088608, "grad_norm": 1.7322338819503784, "learning_rate": 7.949622969326205e-06, "loss": 1.248884916305542, "step": 288 }, { "epoch": 0.6118143459915611, "grad_norm": 31.41063690185547, "learning_rate": 7.94821727725175e-06, "loss": 0.8109699487686157, "step": 290 }, { "epoch": 0.6160337552742616, "grad_norm": 2.779398202896118, "learning_rate": 7.946792384321818e-06, "loss": 0.6689173579216003, "step": 292 }, { "epoch": 0.620253164556962, "grad_norm": 2.087568521499634, "learning_rate": 7.945348298247087e-06, "loss": 1.2270828485488892, "step": 294 }, { "epoch": 0.6244725738396625, "grad_norm": 5.774115085601807, "learning_rate": 7.943885026842097e-06, "loss": 0.7052218317985535, "step": 296 }, { "epoch": 0.6286919831223629, "grad_norm": 8.402689933776855, "learning_rate": 7.94240257802521e-06, "loss": 0.6502029895782471, "step": 298 }, { "epoch": 0.6329113924050633, "grad_norm": 6.8001556396484375, "learning_rate": 7.94090095981856e-06, "loss": 1.7823140621185303, "step": 300 }, { "epoch": 0.6371308016877637, "grad_norm": 1.8385719060897827, "learning_rate": 7.939380180348018e-06, "loss": 1.2579293251037598, "step": 302 }, { "epoch": 0.6413502109704642, "grad_norm": 3.531500816345215, "learning_rate": 7.937840247843148e-06, "loss": 0.8740752339363098, "step": 304 }, { "epoch": 0.6455696202531646, "grad_norm": 4.576239109039307, "learning_rate": 7.93628117063715e-06, "loss": 1.0002870559692383, "step": 306 }, { "epoch": 0.6497890295358649, "grad_norm": 1.4867647886276245, "learning_rate": 7.934702957166833e-06, "loss": 1.2564589977264404, "step": 308 }, { "epoch": 0.6540084388185654, "grad_norm": 2.41863751411438, "learning_rate": 7.933105615972553e-06, "loss": 1.1982673406600952, "step": 310 }, { "epoch": 0.6582278481012658, "grad_norm": 8.212380409240723, "learning_rate": 7.931489155698178e-06, "loss": 0.9985597729682922, "step": 312 }, { "epoch": 0.6624472573839663, "grad_norm": 1.5422508716583252, "learning_rate": 7.929853585091034e-06, "loss": 1.2045118808746338, "step": 314 }, { "epoch": 0.6666666666666666, "grad_norm": 2.346829652786255, "learning_rate": 7.928198913001865e-06, "loss": 1.0920261144638062, "step": 316 }, { "epoch": 0.6708860759493671, "grad_norm": 2.013681173324585, "learning_rate": 7.926525148384776e-06, "loss": 1.0814929008483887, "step": 318 }, { "epoch": 0.6751054852320675, "grad_norm": 6.084042072296143, "learning_rate": 7.924832300297197e-06, "loss": 1.0774112939834595, "step": 320 }, { "epoch": 0.679324894514768, "grad_norm": 2.2710366249084473, "learning_rate": 7.923120377899818e-06, "loss": 0.872334897518158, "step": 322 }, { "epoch": 0.6835443037974683, "grad_norm": 4.1272969245910645, "learning_rate": 7.921389390456549e-06, "loss": 0.866448163986206, "step": 324 }, { "epoch": 0.6877637130801688, "grad_norm": 1.4605779647827148, "learning_rate": 7.919639347334477e-06, "loss": 0.8561316132545471, "step": 326 }, { "epoch": 0.6919831223628692, "grad_norm": 1.4929612874984741, "learning_rate": 7.917870258003798e-06, "loss": 0.8531728982925415, "step": 328 }, { "epoch": 0.6962025316455697, "grad_norm": 3.4534809589385986, "learning_rate": 7.916082132037782e-06, "loss": 1.5728954076766968, "step": 330 }, { "epoch": 0.70042194092827, "grad_norm": 6.895410537719727, "learning_rate": 7.914274979112704e-06, "loss": 1.0785008668899536, "step": 332 }, { "epoch": 0.7046413502109705, "grad_norm": 2.275595188140869, "learning_rate": 7.912448809007812e-06, "loss": 1.3434700965881348, "step": 334 }, { "epoch": 0.7088607594936709, "grad_norm": 2.5494587421417236, "learning_rate": 7.910603631605259e-06, "loss": 1.2570360898971558, "step": 336 }, { "epoch": 0.7130801687763713, "grad_norm": 1.906052589416504, "learning_rate": 7.908739456890056e-06, "loss": 1.2807261943817139, "step": 338 }, { "epoch": 0.7172995780590717, "grad_norm": 8.606728553771973, "learning_rate": 7.906856294950012e-06, "loss": 1.254488468170166, "step": 340 }, { "epoch": 0.7215189873417721, "grad_norm": 1.998100757598877, "learning_rate": 7.90495415597569e-06, "loss": 1.249230146408081, "step": 342 }, { "epoch": 0.7257383966244726, "grad_norm": 1.9510746002197266, "learning_rate": 7.90303305026034e-06, "loss": 1.07149076461792, "step": 344 }, { "epoch": 0.729957805907173, "grad_norm": 4.211492538452148, "learning_rate": 7.901092988199852e-06, "loss": 0.8842002153396606, "step": 346 }, { "epoch": 0.7341772151898734, "grad_norm": 2.758723497390747, "learning_rate": 7.899133980292698e-06, "loss": 1.2383522987365723, "step": 348 }, { "epoch": 0.7383966244725738, "grad_norm": 10.058371543884277, "learning_rate": 7.897156037139865e-06, "loss": 1.1752148866653442, "step": 350 }, { "epoch": 0.7426160337552743, "grad_norm": 13.896164894104004, "learning_rate": 7.89515916944482e-06, "loss": 1.0859854221343994, "step": 352 }, { "epoch": 0.7468354430379747, "grad_norm": 4.250057697296143, "learning_rate": 7.893143388013425e-06, "loss": 0.6642742156982422, "step": 354 }, { "epoch": 0.7510548523206751, "grad_norm": 9.338140487670898, "learning_rate": 7.891108703753902e-06, "loss": 1.047743320465088, "step": 356 }, { "epoch": 0.7552742616033755, "grad_norm": 3.788804054260254, "learning_rate": 7.88905512767676e-06, "loss": 1.0082635879516602, "step": 358 }, { "epoch": 0.759493670886076, "grad_norm": 2.8348827362060547, "learning_rate": 7.886982670894736e-06, "loss": 1.210444688796997, "step": 360 }, { "epoch": 0.7637130801687764, "grad_norm": 4.0919718742370605, "learning_rate": 7.884891344622746e-06, "loss": 0.98717200756073, "step": 362 }, { "epoch": 0.7679324894514767, "grad_norm": 2.804220676422119, "learning_rate": 7.88278116017781e-06, "loss": 1.5726983547210693, "step": 364 }, { "epoch": 0.7721518987341772, "grad_norm": 11.296149253845215, "learning_rate": 7.880652128978999e-06, "loss": 0.7079776525497437, "step": 366 }, { "epoch": 0.7763713080168776, "grad_norm": 2.3584940433502197, "learning_rate": 7.878504262547373e-06, "loss": 1.1683130264282227, "step": 368 }, { "epoch": 0.7805907172995781, "grad_norm": 4.991513252258301, "learning_rate": 7.876337572505914e-06, "loss": 1.472283124923706, "step": 370 }, { "epoch": 0.7848101265822784, "grad_norm": 7.023402214050293, "learning_rate": 7.87415207057947e-06, "loss": 0.9845293760299683, "step": 372 }, { "epoch": 0.7890295358649789, "grad_norm": 5.190583229064941, "learning_rate": 7.871947768594688e-06, "loss": 1.0484483242034912, "step": 374 }, { "epoch": 0.7932489451476793, "grad_norm": 2.0012083053588867, "learning_rate": 7.869724678479944e-06, "loss": 1.152682900428772, "step": 376 }, { "epoch": 0.7974683544303798, "grad_norm": 0.8431169986724854, "learning_rate": 7.86748281226529e-06, "loss": 1.119347333908081, "step": 378 }, { "epoch": 0.8016877637130801, "grad_norm": 1.5772409439086914, "learning_rate": 7.865222182082384e-06, "loss": 0.7247622013092041, "step": 380 }, { "epoch": 0.8059071729957806, "grad_norm": 4.462345123291016, "learning_rate": 7.862942800164416e-06, "loss": 1.2693476676940918, "step": 382 }, { "epoch": 0.810126582278481, "grad_norm": 18.4215145111084, "learning_rate": 7.860644678846057e-06, "loss": 1.0132197141647339, "step": 384 }, { "epoch": 0.8143459915611815, "grad_norm": 2.6663124561309814, "learning_rate": 7.858327830563384e-06, "loss": 1.2443773746490479, "step": 386 }, { "epoch": 0.8185654008438819, "grad_norm": 7.3652448654174805, "learning_rate": 7.855992267853806e-06, "loss": 0.9400072693824768, "step": 388 }, { "epoch": 0.8227848101265823, "grad_norm": 33.57978820800781, "learning_rate": 7.85363800335601e-06, "loss": 1.216599941253662, "step": 390 }, { "epoch": 0.8270042194092827, "grad_norm": 4.298940181732178, "learning_rate": 7.851265049809886e-06, "loss": 0.9645065069198608, "step": 392 }, { "epoch": 0.8312236286919831, "grad_norm": 3.8016936779022217, "learning_rate": 7.848873420056456e-06, "loss": 0.9074147939682007, "step": 394 }, { "epoch": 0.8354430379746836, "grad_norm": 2.417340040206909, "learning_rate": 7.846463127037807e-06, "loss": 1.2312746047973633, "step": 396 }, { "epoch": 0.8396624472573839, "grad_norm": 0.5479583144187927, "learning_rate": 7.844034183797021e-06, "loss": 1.0866131782531738, "step": 398 }, { "epoch": 0.8438818565400844, "grad_norm": 1.3444585800170898, "learning_rate": 7.841586603478105e-06, "loss": 0.9458938837051392, "step": 400 }, { "epoch": 0.8481012658227848, "grad_norm": 5.396076679229736, "learning_rate": 7.839120399325913e-06, "loss": 0.7105859518051147, "step": 402 }, { "epoch": 0.8523206751054853, "grad_norm": 3.8646159172058105, "learning_rate": 7.836635584686089e-06, "loss": 1.21824049949646, "step": 404 }, { "epoch": 0.8565400843881856, "grad_norm": 1.9759020805358887, "learning_rate": 7.834132173004981e-06, "loss": 0.9442010521888733, "step": 406 }, { "epoch": 0.8607594936708861, "grad_norm": 2.555657148361206, "learning_rate": 7.831610177829574e-06, "loss": 0.9720205664634705, "step": 408 }, { "epoch": 0.8649789029535865, "grad_norm": 15.906916618347168, "learning_rate": 7.829069612807413e-06, "loss": 1.23225998878479, "step": 410 }, { "epoch": 0.869198312236287, "grad_norm": 2.261281967163086, "learning_rate": 7.826510491686538e-06, "loss": 0.8622678518295288, "step": 412 }, { "epoch": 0.8734177215189873, "grad_norm": 30.68505859375, "learning_rate": 7.823932828315398e-06, "loss": 1.210330605506897, "step": 414 }, { "epoch": 0.8776371308016878, "grad_norm": 4.517366409301758, "learning_rate": 7.82133663664279e-06, "loss": 1.0655572414398193, "step": 416 }, { "epoch": 0.8818565400843882, "grad_norm": 8.33056926727295, "learning_rate": 7.81872193071776e-06, "loss": 1.163268804550171, "step": 418 }, { "epoch": 0.8860759493670886, "grad_norm": 1.1660995483398438, "learning_rate": 7.81608872468956e-06, "loss": 0.9473620653152466, "step": 420 }, { "epoch": 0.890295358649789, "grad_norm": 1.9505749940872192, "learning_rate": 7.813437032807541e-06, "loss": 1.3407762050628662, "step": 422 }, { "epoch": 0.8945147679324894, "grad_norm": 13.3614501953125, "learning_rate": 7.810766869421092e-06, "loss": 0.9824624061584473, "step": 424 }, { "epoch": 0.8987341772151899, "grad_norm": 4.366639137268066, "learning_rate": 7.808078248979564e-06, "loss": 1.1881823539733887, "step": 426 }, { "epoch": 0.9029535864978903, "grad_norm": 1.7050063610076904, "learning_rate": 7.805371186032176e-06, "loss": 1.0488433837890625, "step": 428 }, { "epoch": 0.9071729957805907, "grad_norm": 7.406988143920898, "learning_rate": 7.80264569522796e-06, "loss": 1.0144481658935547, "step": 430 }, { "epoch": 0.9113924050632911, "grad_norm": 3.4340415000915527, "learning_rate": 7.799901791315658e-06, "loss": 1.0802500247955322, "step": 432 }, { "epoch": 0.9156118143459916, "grad_norm": 3.148068428039551, "learning_rate": 7.797139489143655e-06, "loss": 1.2489020824432373, "step": 434 }, { "epoch": 0.919831223628692, "grad_norm": 4.874792098999023, "learning_rate": 7.794358803659903e-06, "loss": 0.8715201616287231, "step": 436 }, { "epoch": 0.9240506329113924, "grad_norm": 1.5414555072784424, "learning_rate": 7.791559749911826e-06, "loss": 1.2029755115509033, "step": 438 }, { "epoch": 0.9282700421940928, "grad_norm": 1.9815410375595093, "learning_rate": 7.788742343046248e-06, "loss": 0.9946187138557434, "step": 440 }, { "epoch": 0.9324894514767933, "grad_norm": 10.352864265441895, "learning_rate": 7.785906598309314e-06, "loss": 0.9312165975570679, "step": 442 }, { "epoch": 0.9367088607594937, "grad_norm": 8.163040161132812, "learning_rate": 7.783052531046397e-06, "loss": 1.0982768535614014, "step": 444 }, { "epoch": 0.9409282700421941, "grad_norm": 6.510847091674805, "learning_rate": 7.780180156702023e-06, "loss": 1.3022956848144531, "step": 446 }, { "epoch": 0.9451476793248945, "grad_norm": 1.658566951751709, "learning_rate": 7.777289490819783e-06, "loss": 1.0020906925201416, "step": 448 }, { "epoch": 0.9493670886075949, "grad_norm": 3.4121336936950684, "learning_rate": 7.774380549042255e-06, "loss": 0.9293044209480286, "step": 450 }, { "epoch": 0.9535864978902954, "grad_norm": 1.8673920631408691, "learning_rate": 7.771453347110913e-06, "loss": 1.2867658138275146, "step": 452 }, { "epoch": 0.9578059071729957, "grad_norm": 1.841113805770874, "learning_rate": 7.768507900866044e-06, "loss": 0.982481062412262, "step": 454 }, { "epoch": 0.9620253164556962, "grad_norm": 4.477797985076904, "learning_rate": 7.765544226246663e-06, "loss": 1.1560728549957275, "step": 456 }, { "epoch": 0.9662447257383966, "grad_norm": 2.086230993270874, "learning_rate": 7.762562339290425e-06, "loss": 0.8860993981361389, "step": 458 }, { "epoch": 0.9704641350210971, "grad_norm": 27.607803344726562, "learning_rate": 7.759562256133541e-06, "loss": 0.7165157794952393, "step": 460 }, { "epoch": 0.9746835443037974, "grad_norm": 9.253504753112793, "learning_rate": 7.75654399301069e-06, "loss": 0.9001080989837646, "step": 462 }, { "epoch": 0.9789029535864979, "grad_norm": 1.412550449371338, "learning_rate": 7.753507566254927e-06, "loss": 1.168654441833496, "step": 464 }, { "epoch": 0.9831223628691983, "grad_norm": 4.56351900100708, "learning_rate": 7.750452992297599e-06, "loss": 0.7248488664627075, "step": 466 }, { "epoch": 0.9873417721518988, "grad_norm": 2.234978199005127, "learning_rate": 7.747380287668257e-06, "loss": 1.3152525424957275, "step": 468 }, { "epoch": 0.9915611814345991, "grad_norm": 7.138265609741211, "learning_rate": 7.744289468994562e-06, "loss": 0.8874726891517639, "step": 470 }, { "epoch": 0.9957805907172996, "grad_norm": 3.042675256729126, "learning_rate": 7.741180553002199e-06, "loss": 1.2144908905029297, "step": 472 }, { "epoch": 1.0, "grad_norm": 1.7988324165344238, "learning_rate": 7.738053556514784e-06, "loss": 1.2585757970809937, "step": 474 }, { "epoch": 1.0042194092827004, "grad_norm": 6.92104434967041, "learning_rate": 7.734908496453774e-06, "loss": 1.060208797454834, "step": 476 }, { "epoch": 1.0084388185654007, "grad_norm": 1.8207498788833618, "learning_rate": 7.73174538983838e-06, "loss": 1.1424498558044434, "step": 478 }, { "epoch": 1.0126582278481013, "grad_norm": 2.2936971187591553, "learning_rate": 7.72856425378546e-06, "loss": 1.1568377017974854, "step": 480 }, { "epoch": 1.0168776371308017, "grad_norm": 48.475311279296875, "learning_rate": 7.725365105509444e-06, "loss": 0.8294604420661926, "step": 482 }, { "epoch": 1.021097046413502, "grad_norm": 4.676328659057617, "learning_rate": 7.722147962322236e-06, "loss": 1.0818572044372559, "step": 484 }, { "epoch": 1.0253164556962024, "grad_norm": 5.833449840545654, "learning_rate": 7.718912841633112e-06, "loss": 0.5055439472198486, "step": 486 }, { "epoch": 1.029535864978903, "grad_norm": 4.804643630981445, "learning_rate": 7.715659760948632e-06, "loss": 0.9713239073753357, "step": 488 }, { "epoch": 1.0337552742616034, "grad_norm": 3.2628095149993896, "learning_rate": 7.71238873787255e-06, "loss": 0.8403753042221069, "step": 490 }, { "epoch": 1.0379746835443038, "grad_norm": 2.2691550254821777, "learning_rate": 7.709099790105707e-06, "loss": 1.1320157051086426, "step": 492 }, { "epoch": 1.0421940928270041, "grad_norm": 19.545082092285156, "learning_rate": 7.705792935445948e-06, "loss": 0.8306432962417603, "step": 494 }, { "epoch": 1.0464135021097047, "grad_norm": 4.1693949699401855, "learning_rate": 7.702468191788014e-06, "loss": 0.9293802976608276, "step": 496 }, { "epoch": 1.0506329113924051, "grad_norm": 2.5364606380462646, "learning_rate": 7.699125577123455e-06, "loss": 1.2761287689208984, "step": 498 }, { "epoch": 1.0548523206751055, "grad_norm": 11.509333610534668, "learning_rate": 7.695765109540526e-06, "loss": 1.0367153882980347, "step": 500 }, { "epoch": 1.0590717299578059, "grad_norm": 2.0936784744262695, "learning_rate": 7.692386807224092e-06, "loss": 1.1410118341445923, "step": 502 }, { "epoch": 1.0632911392405062, "grad_norm": 1.661012053489685, "learning_rate": 7.68899068845553e-06, "loss": 1.2624824047088623, "step": 504 }, { "epoch": 1.0675105485232068, "grad_norm": 3.211272716522217, "learning_rate": 7.685576771612624e-06, "loss": 0.727929413318634, "step": 506 }, { "epoch": 1.0717299578059072, "grad_norm": 2.976832151412964, "learning_rate": 7.682145075169482e-06, "loss": 0.9856585264205933, "step": 508 }, { "epoch": 1.0759493670886076, "grad_norm": 1.821731448173523, "learning_rate": 7.678695617696413e-06, "loss": 1.0898807048797607, "step": 510 }, { "epoch": 1.080168776371308, "grad_norm": 3.3283071517944336, "learning_rate": 7.675228417859842e-06, "loss": 1.0197210311889648, "step": 512 }, { "epoch": 1.0843881856540085, "grad_norm": 2.766814708709717, "learning_rate": 7.67174349442221e-06, "loss": 1.07771897315979, "step": 514 }, { "epoch": 1.0886075949367089, "grad_norm": 2.035024881362915, "learning_rate": 7.66824086624186e-06, "loss": 1.0527117252349854, "step": 516 }, { "epoch": 1.0928270042194093, "grad_norm": 2.086318254470825, "learning_rate": 7.664720552272948e-06, "loss": 0.9818480014801025, "step": 518 }, { "epoch": 1.0970464135021096, "grad_norm": 2.671875, "learning_rate": 7.661182571565332e-06, "loss": 0.9276726245880127, "step": 520 }, { "epoch": 1.1012658227848102, "grad_norm": 8.45070743560791, "learning_rate": 7.657626943264474e-06, "loss": 0.8790248036384583, "step": 522 }, { "epoch": 1.1054852320675106, "grad_norm": 2.1058099269866943, "learning_rate": 7.654053686611334e-06, "loss": 1.0137805938720703, "step": 524 }, { "epoch": 1.109704641350211, "grad_norm": 8.899941444396973, "learning_rate": 7.650462820942264e-06, "loss": 0.8656354546546936, "step": 526 }, { "epoch": 1.1139240506329113, "grad_norm": 2.253026247024536, "learning_rate": 7.64685436568891e-06, "loss": 1.2973110675811768, "step": 528 }, { "epoch": 1.1181434599156117, "grad_norm": 1.9157750606536865, "learning_rate": 7.643228340378098e-06, "loss": 1.208802342414856, "step": 530 }, { "epoch": 1.1223628691983123, "grad_norm": 3.9860875606536865, "learning_rate": 7.639584764631736e-06, "loss": 0.6784745454788208, "step": 532 }, { "epoch": 1.1265822784810127, "grad_norm": 4.096652984619141, "learning_rate": 7.6359236581667e-06, "loss": 0.969964861869812, "step": 534 }, { "epoch": 1.130801687763713, "grad_norm": 2.2989354133605957, "learning_rate": 7.632245040794737e-06, "loss": 1.0640895366668701, "step": 536 }, { "epoch": 1.1350210970464134, "grad_norm": 2.0458743572235107, "learning_rate": 7.6285489324223534e-06, "loss": 0.8687632083892822, "step": 538 }, { "epoch": 1.139240506329114, "grad_norm": 3.1451354026794434, "learning_rate": 7.6248353530507e-06, "loss": 0.8472435474395752, "step": 540 }, { "epoch": 1.1434599156118144, "grad_norm": 1.7947264909744263, "learning_rate": 7.621104322775477e-06, "loss": 0.7688232660293579, "step": 542 }, { "epoch": 1.1476793248945147, "grad_norm": 1.6323504447937012, "learning_rate": 7.617355861786813e-06, "loss": 0.7883434891700745, "step": 544 }, { "epoch": 1.1518987341772151, "grad_norm": 3.807711601257324, "learning_rate": 7.613589990369167e-06, "loss": 1.1249892711639404, "step": 546 }, { "epoch": 1.1561181434599157, "grad_norm": 3.2176730632781982, "learning_rate": 7.6098067289012086e-06, "loss": 1.1086885929107666, "step": 548 }, { "epoch": 1.160337552742616, "grad_norm": 8.372694969177246, "learning_rate": 7.606006097855713e-06, "loss": 0.8941718339920044, "step": 550 }, { "epoch": 1.1645569620253164, "grad_norm": 3.027027130126953, "learning_rate": 7.602188117799451e-06, "loss": 1.2869350910186768, "step": 552 }, { "epoch": 1.1687763713080168, "grad_norm": 3.2669458389282227, "learning_rate": 7.598352809393074e-06, "loss": 1.1333280801773071, "step": 554 }, { "epoch": 1.1729957805907172, "grad_norm": 30.656925201416016, "learning_rate": 7.594500193391006e-06, "loss": 0.6378011703491211, "step": 556 }, { "epoch": 1.1772151898734178, "grad_norm": 4.539504528045654, "learning_rate": 7.590630290641327e-06, "loss": 1.1045113801956177, "step": 558 }, { "epoch": 1.1814345991561181, "grad_norm": 26.276714324951172, "learning_rate": 7.586743122085666e-06, "loss": 0.5744074583053589, "step": 560 }, { "epoch": 1.1856540084388185, "grad_norm": 1.9147850275039673, "learning_rate": 7.582838708759082e-06, "loss": 0.7490895986557007, "step": 562 }, { "epoch": 1.189873417721519, "grad_norm": 3.1463582515716553, "learning_rate": 7.5789170717899516e-06, "loss": 1.3113162517547607, "step": 564 }, { "epoch": 1.1940928270042195, "grad_norm": 2.1219849586486816, "learning_rate": 7.57497823239986e-06, "loss": 1.0440800189971924, "step": 566 }, { "epoch": 1.1983122362869199, "grad_norm": 4.773017406463623, "learning_rate": 7.571022211903475e-06, "loss": 0.8006106615066528, "step": 568 }, { "epoch": 1.2025316455696202, "grad_norm": 1.4802496433258057, "learning_rate": 7.567049031708445e-06, "loss": 1.0503010749816895, "step": 570 }, { "epoch": 1.2067510548523206, "grad_norm": 6.079049587249756, "learning_rate": 7.563058713315273e-06, "loss": 1.0806069374084473, "step": 572 }, { "epoch": 1.2109704641350212, "grad_norm": 2.4842324256896973, "learning_rate": 7.559051278317204e-06, "loss": 1.1085004806518555, "step": 574 }, { "epoch": 1.2151898734177216, "grad_norm": 1.6897258758544922, "learning_rate": 7.5550267484001084e-06, "loss": 0.9017348289489746, "step": 576 }, { "epoch": 1.219409282700422, "grad_norm": 3.8387186527252197, "learning_rate": 7.5509851453423665e-06, "loss": 0.6278250813484192, "step": 578 }, { "epoch": 1.2236286919831223, "grad_norm": 4.397797107696533, "learning_rate": 7.546926491014742e-06, "loss": 0.9347223043441772, "step": 580 }, { "epoch": 1.2278481012658227, "grad_norm": 6.560842514038086, "learning_rate": 7.5428508073802765e-06, "loss": 0.8352513313293457, "step": 582 }, { "epoch": 1.2320675105485233, "grad_norm": 2.411914825439453, "learning_rate": 7.538758116494163e-06, "loss": 0.8624718189239502, "step": 584 }, { "epoch": 1.2362869198312236, "grad_norm": 13.554781913757324, "learning_rate": 7.534648440503624e-06, "loss": 0.9081147909164429, "step": 586 }, { "epoch": 1.240506329113924, "grad_norm": 4.150012493133545, "learning_rate": 7.530521801647799e-06, "loss": 0.8333830833435059, "step": 588 }, { "epoch": 1.2447257383966246, "grad_norm": 2.9597084522247314, "learning_rate": 7.52637822225762e-06, "loss": 1.1118628978729248, "step": 590 }, { "epoch": 1.248945147679325, "grad_norm": 1.8890756368637085, "learning_rate": 7.522217724755688e-06, "loss": 0.58323734998703, "step": 592 }, { "epoch": 1.2531645569620253, "grad_norm": 1.4861680269241333, "learning_rate": 7.51804033165616e-06, "loss": 0.8740505576133728, "step": 594 }, { "epoch": 1.2573839662447257, "grad_norm": 7.2344560623168945, "learning_rate": 7.513846065564618e-06, "loss": 0.7560818195343018, "step": 596 }, { "epoch": 1.261603375527426, "grad_norm": 5.018416404724121, "learning_rate": 7.509634949177952e-06, "loss": 0.664783239364624, "step": 598 }, { "epoch": 1.2658227848101267, "grad_norm": 1.6520812511444092, "learning_rate": 7.505407005284236e-06, "loss": 0.9736641645431519, "step": 600 }, { "epoch": 1.270042194092827, "grad_norm": 6.091807842254639, "learning_rate": 7.5011622567626055e-06, "loss": 1.3401249647140503, "step": 602 }, { "epoch": 1.2742616033755274, "grad_norm": 6.131025791168213, "learning_rate": 7.4969007265831284e-06, "loss": 1.0688127279281616, "step": 604 }, { "epoch": 1.2784810126582278, "grad_norm": 5.798713207244873, "learning_rate": 7.4926224378066905e-06, "loss": 0.7948801517486572, "step": 606 }, { "epoch": 1.2827004219409281, "grad_norm": 10.305594444274902, "learning_rate": 7.488327413584863e-06, "loss": 0.8427482843399048, "step": 608 }, { "epoch": 1.2869198312236287, "grad_norm": 8.171891212463379, "learning_rate": 7.484015677159779e-06, "loss": 0.9117364883422852, "step": 610 }, { "epoch": 1.2911392405063291, "grad_norm": 5.7218804359436035, "learning_rate": 7.479687251864008e-06, "loss": 0.6430226564407349, "step": 612 }, { "epoch": 1.2953586497890295, "grad_norm": 1.4333503246307373, "learning_rate": 7.47534216112043e-06, "loss": 0.6988530158996582, "step": 614 }, { "epoch": 1.29957805907173, "grad_norm": 2.1469638347625732, "learning_rate": 7.4709804284421096e-06, "loss": 1.0747710466384888, "step": 616 }, { "epoch": 1.3037974683544304, "grad_norm": 2.6359219551086426, "learning_rate": 7.466602077432167e-06, "loss": 1.0839927196502686, "step": 618 }, { "epoch": 1.3080168776371308, "grad_norm": 2.7480428218841553, "learning_rate": 7.4622071317836495e-06, "loss": 0.6828069090843201, "step": 620 }, { "epoch": 1.3122362869198312, "grad_norm": 3.1572508811950684, "learning_rate": 7.45779561527941e-06, "loss": 0.7725319862365723, "step": 622 }, { "epoch": 1.3164556962025316, "grad_norm": 4.627418518066406, "learning_rate": 7.453367551791965e-06, "loss": 0.8618891835212708, "step": 624 }, { "epoch": 1.3206751054852321, "grad_norm": 13.366847038269043, "learning_rate": 7.448922965283379e-06, "loss": 1.0350444316864014, "step": 626 }, { "epoch": 1.3248945147679325, "grad_norm": 9.86931324005127, "learning_rate": 7.44446187980513e-06, "loss": 1.5988271236419678, "step": 628 }, { "epoch": 1.3291139240506329, "grad_norm": 2.421264171600342, "learning_rate": 7.439984319497975e-06, "loss": 0.8888686895370483, "step": 630 }, { "epoch": 1.3333333333333333, "grad_norm": 2.454784870147705, "learning_rate": 7.435490308591826e-06, "loss": 0.9130518436431885, "step": 632 }, { "epoch": 1.3375527426160336, "grad_norm": 1.1312861442565918, "learning_rate": 7.4309798714056145e-06, "loss": 0.7504403591156006, "step": 634 }, { "epoch": 1.3417721518987342, "grad_norm": 6.462110996246338, "learning_rate": 7.4264530323471605e-06, "loss": 0.6684986352920532, "step": 636 }, { "epoch": 1.3459915611814346, "grad_norm": 34.968597412109375, "learning_rate": 7.421909815913044e-06, "loss": 0.7958526611328125, "step": 638 }, { "epoch": 1.350210970464135, "grad_norm": 2.127056121826172, "learning_rate": 7.4173502466884655e-06, "loss": 1.2176686525344849, "step": 640 }, { "epoch": 1.3544303797468356, "grad_norm": 2.8063347339630127, "learning_rate": 7.412774349347123e-06, "loss": 0.781902015209198, "step": 642 }, { "epoch": 1.358649789029536, "grad_norm": 2.1645452976226807, "learning_rate": 7.408182148651068e-06, "loss": 1.2542736530303955, "step": 644 }, { "epoch": 1.3628691983122363, "grad_norm": 2.789949417114258, "learning_rate": 7.4035736694505765e-06, "loss": 1.1398190259933472, "step": 646 }, { "epoch": 1.3670886075949367, "grad_norm": 3.9663474559783936, "learning_rate": 7.398948936684016e-06, "loss": 0.8999311923980713, "step": 648 }, { "epoch": 1.371308016877637, "grad_norm": 3.7580132484436035, "learning_rate": 7.394307975377705e-06, "loss": 1.3543846607208252, "step": 650 }, { "epoch": 1.3755274261603376, "grad_norm": 2.3560047149658203, "learning_rate": 7.389650810645788e-06, "loss": 1.067474365234375, "step": 652 }, { "epoch": 1.379746835443038, "grad_norm": 8.556138038635254, "learning_rate": 7.384977467690088e-06, "loss": 0.7700819373130798, "step": 654 }, { "epoch": 1.3839662447257384, "grad_norm": 2.5057106018066406, "learning_rate": 7.380287971799974e-06, "loss": 1.1854264736175537, "step": 656 }, { "epoch": 1.3881856540084387, "grad_norm": 3.844391107559204, "learning_rate": 7.37558234835223e-06, "loss": 0.6306626796722412, "step": 658 }, { "epoch": 1.3924050632911391, "grad_norm": 5.850677013397217, "learning_rate": 7.370860622810906e-06, "loss": 1.1226918697357178, "step": 660 }, { "epoch": 1.3966244725738397, "grad_norm": 3.9669106006622314, "learning_rate": 7.3661228207271954e-06, "loss": 0.7194290161132812, "step": 662 }, { "epoch": 1.40084388185654, "grad_norm": 1.6219054460525513, "learning_rate": 7.3613689677392795e-06, "loss": 1.117922306060791, "step": 664 }, { "epoch": 1.4050632911392404, "grad_norm": 2.222062826156616, "learning_rate": 7.356599089572203e-06, "loss": 0.7130357027053833, "step": 666 }, { "epoch": 1.409282700421941, "grad_norm": 2.1885979175567627, "learning_rate": 7.35181321203773e-06, "loss": 1.0384951829910278, "step": 668 }, { "epoch": 1.4135021097046414, "grad_norm": 1.6668306589126587, "learning_rate": 7.3470113610342025e-06, "loss": 1.1504696607589722, "step": 670 }, { "epoch": 1.4177215189873418, "grad_norm": 3.1233675479888916, "learning_rate": 7.342193562546399e-06, "loss": 1.2578648328781128, "step": 672 }, { "epoch": 1.4219409282700421, "grad_norm": 1.6057863235473633, "learning_rate": 7.337359842645397e-06, "loss": 0.6594195365905762, "step": 674 }, { "epoch": 1.4261603375527425, "grad_norm": 4.513533592224121, "learning_rate": 7.332510227488436e-06, "loss": 0.6804168224334717, "step": 676 }, { "epoch": 1.4303797468354431, "grad_norm": 3.0312631130218506, "learning_rate": 7.327644743318766e-06, "loss": 0.9876019954681396, "step": 678 }, { "epoch": 1.4345991561181435, "grad_norm": 5.478196144104004, "learning_rate": 7.322763416465513e-06, "loss": 1.087882399559021, "step": 680 }, { "epoch": 1.4388185654008439, "grad_norm": 4.24491024017334, "learning_rate": 7.317866273343534e-06, "loss": 0.8171271085739136, "step": 682 }, { "epoch": 1.4430379746835442, "grad_norm": 1.7392427921295166, "learning_rate": 7.312953340453274e-06, "loss": 1.1222814321517944, "step": 684 }, { "epoch": 1.4472573839662446, "grad_norm": 6.056474208831787, "learning_rate": 7.308024644380625e-06, "loss": 0.7576450109481812, "step": 686 }, { "epoch": 1.4514767932489452, "grad_norm": 3.2466626167297363, "learning_rate": 7.303080211796774e-06, "loss": 0.8898618221282959, "step": 688 }, { "epoch": 1.4556962025316456, "grad_norm": 8.08736515045166, "learning_rate": 7.298120069458071e-06, "loss": 1.0252546072006226, "step": 690 }, { "epoch": 1.459915611814346, "grad_norm": 6.501577377319336, "learning_rate": 7.293144244205875e-06, "loss": 0.6603308916091919, "step": 692 }, { "epoch": 1.4641350210970465, "grad_norm": 2.7896158695220947, "learning_rate": 7.288152762966415e-06, "loss": 1.225158452987671, "step": 694 }, { "epoch": 1.4683544303797469, "grad_norm": 4.285532474517822, "learning_rate": 7.283145652750635e-06, "loss": 0.5561915636062622, "step": 696 }, { "epoch": 1.4725738396624473, "grad_norm": 1.7746226787567139, "learning_rate": 7.27812294065406e-06, "loss": 1.1419634819030762, "step": 698 }, { "epoch": 1.4767932489451476, "grad_norm": 1.805708408355713, "learning_rate": 7.2730846538566375e-06, "loss": 1.178218126296997, "step": 700 }, { "epoch": 1.481012658227848, "grad_norm": 5.747068405151367, "learning_rate": 7.2680308196226e-06, "loss": 0.989362359046936, "step": 702 }, { "epoch": 1.4852320675105486, "grad_norm": 7.557291030883789, "learning_rate": 7.262961465300312e-06, "loss": 0.7833366394042969, "step": 704 }, { "epoch": 1.489451476793249, "grad_norm": 1.8191746473312378, "learning_rate": 7.257876618322125e-06, "loss": 1.1235054731369019, "step": 706 }, { "epoch": 1.4936708860759493, "grad_norm": 2.4673242568969727, "learning_rate": 7.252776306204226e-06, "loss": 1.1172146797180176, "step": 708 }, { "epoch": 1.49789029535865, "grad_norm": 1.062907099723816, "learning_rate": 7.247660556546489e-06, "loss": 0.984178900718689, "step": 710 }, { "epoch": 1.50210970464135, "grad_norm": 2.470454692840576, "learning_rate": 7.242529397032332e-06, "loss": 0.467578649520874, "step": 712 }, { "epoch": 1.5063291139240507, "grad_norm": 4.536571979522705, "learning_rate": 7.237382855428555e-06, "loss": 1.0994584560394287, "step": 714 }, { "epoch": 1.510548523206751, "grad_norm": 1.8687479496002197, "learning_rate": 7.232220959585203e-06, "loss": 1.0004863739013672, "step": 716 }, { "epoch": 1.5147679324894514, "grad_norm": 1.6920982599258423, "learning_rate": 7.227043737435406e-06, "loss": 1.0185657739639282, "step": 718 }, { "epoch": 1.518987341772152, "grad_norm": 3.6787755489349365, "learning_rate": 7.221851216995229e-06, "loss": 1.0142695903778076, "step": 720 }, { "epoch": 1.5232067510548524, "grad_norm": 3.2205076217651367, "learning_rate": 7.216643426363528e-06, "loss": 1.115187644958496, "step": 722 }, { "epoch": 1.5274261603375527, "grad_norm": 2.2605745792388916, "learning_rate": 7.211420393721787e-06, "loss": 1.204482078552246, "step": 724 }, { "epoch": 1.5316455696202531, "grad_norm": 4.8364667892456055, "learning_rate": 7.206182147333974e-06, "loss": 0.6358588933944702, "step": 726 }, { "epoch": 1.5358649789029535, "grad_norm": 6.24644136428833, "learning_rate": 7.200928715546382e-06, "loss": 0.3853702247142792, "step": 728 }, { "epoch": 1.540084388185654, "grad_norm": 2.5594065189361572, "learning_rate": 7.1956601267874806e-06, "loss": 0.8702763319015503, "step": 730 }, { "epoch": 1.5443037974683544, "grad_norm": 1.8882993459701538, "learning_rate": 7.1903764095677595e-06, "loss": 0.958168625831604, "step": 732 }, { "epoch": 1.5485232067510548, "grad_norm": 2.2798001766204834, "learning_rate": 7.185077592479573e-06, "loss": 1.0963058471679688, "step": 734 }, { "epoch": 1.5527426160337554, "grad_norm": 3.2573153972625732, "learning_rate": 7.17976370419699e-06, "loss": 0.851632833480835, "step": 736 }, { "epoch": 1.5569620253164556, "grad_norm": 3.9381816387176514, "learning_rate": 7.174434773475635e-06, "loss": 0.9015741944313049, "step": 738 }, { "epoch": 1.5611814345991561, "grad_norm": 2.2864365577697754, "learning_rate": 7.169090829152531e-06, "loss": 1.0464608669281006, "step": 740 }, { "epoch": 1.5654008438818565, "grad_norm": 1.112131118774414, "learning_rate": 7.163731900145947e-06, "loss": 0.6916845440864563, "step": 742 }, { "epoch": 1.5696202531645569, "grad_norm": 4.296708583831787, "learning_rate": 7.158358015455243e-06, "loss": 0.5111595392227173, "step": 744 }, { "epoch": 1.5738396624472575, "grad_norm": 3.6669418811798096, "learning_rate": 7.152969204160704e-06, "loss": 0.6755394339561462, "step": 746 }, { "epoch": 1.5780590717299579, "grad_norm": 14.373259544372559, "learning_rate": 7.147565495423394e-06, "loss": 0.6762098073959351, "step": 748 }, { "epoch": 1.5822784810126582, "grad_norm": 1.3357782363891602, "learning_rate": 7.142146918484996e-06, "loss": 1.0881752967834473, "step": 750 }, { "epoch": 1.5864978902953588, "grad_norm": 2.055518627166748, "learning_rate": 7.13671350266764e-06, "loss": 0.6121246814727783, "step": 752 }, { "epoch": 1.590717299578059, "grad_norm": 4.617133617401123, "learning_rate": 7.131265277373768e-06, "loss": 0.7100765705108643, "step": 754 }, { "epoch": 1.5949367088607596, "grad_norm": 2.564394950866699, "learning_rate": 7.125802272085954e-06, "loss": 0.5700541734695435, "step": 756 }, { "epoch": 1.59915611814346, "grad_norm": 3.551905393600464, "learning_rate": 7.120324516366754e-06, "loss": 0.8716294765472412, "step": 758 }, { "epoch": 1.6033755274261603, "grad_norm": 2.745734930038452, "learning_rate": 7.114832039858547e-06, "loss": 1.1156964302062988, "step": 760 }, { "epoch": 1.6075949367088609, "grad_norm": 2.40625262260437, "learning_rate": 7.109324872283371e-06, "loss": 0.6260151863098145, "step": 762 }, { "epoch": 1.611814345991561, "grad_norm": 1.5272423028945923, "learning_rate": 7.10380304344276e-06, "loss": 1.1335524320602417, "step": 764 }, { "epoch": 1.6160337552742616, "grad_norm": 4.155389785766602, "learning_rate": 7.098266583217592e-06, "loss": 1.0015907287597656, "step": 766 }, { "epoch": 1.620253164556962, "grad_norm": 0.7669569849967957, "learning_rate": 7.0927155215679175e-06, "loss": 0.8719363212585449, "step": 768 }, { "epoch": 1.6244725738396624, "grad_norm": 2.079810380935669, "learning_rate": 7.087149888532803e-06, "loss": 1.0807125568389893, "step": 770 }, { "epoch": 1.628691983122363, "grad_norm": 1.8407090902328491, "learning_rate": 7.081569714230168e-06, "loss": 1.1782212257385254, "step": 772 }, { "epoch": 1.6329113924050633, "grad_norm": 2.1639020442962646, "learning_rate": 7.075975028856614e-06, "loss": 1.0721291303634644, "step": 774 }, { "epoch": 1.6371308016877637, "grad_norm": 4.351590633392334, "learning_rate": 7.070365862687276e-06, "loss": 1.0304412841796875, "step": 776 }, { "epoch": 1.6413502109704643, "grad_norm": 3.4083518981933594, "learning_rate": 7.064742246075647e-06, "loss": 0.8413809537887573, "step": 778 }, { "epoch": 1.6455696202531644, "grad_norm": 4.749937534332275, "learning_rate": 7.059104209453417e-06, "loss": 0.3687572479248047, "step": 780 }, { "epoch": 1.649789029535865, "grad_norm": 9.06319808959961, "learning_rate": 7.0534517833303085e-06, "loss": 1.0481884479522705, "step": 782 }, { "epoch": 1.6540084388185654, "grad_norm": 4.062221527099609, "learning_rate": 7.047784998293913e-06, "loss": 0.8559701442718506, "step": 784 }, { "epoch": 1.6582278481012658, "grad_norm": 10.564360618591309, "learning_rate": 7.0421038850095235e-06, "loss": 1.074246883392334, "step": 786 }, { "epoch": 1.6624472573839664, "grad_norm": 12.672545433044434, "learning_rate": 7.036408474219966e-06, "loss": 0.5824824571609497, "step": 788 }, { "epoch": 1.6666666666666665, "grad_norm": 2.3040852546691895, "learning_rate": 7.03069879674544e-06, "loss": 1.0848541259765625, "step": 790 }, { "epoch": 1.6708860759493671, "grad_norm": 4.974740982055664, "learning_rate": 7.024974883483347e-06, "loss": 0.5032600164413452, "step": 792 }, { "epoch": 1.6751054852320675, "grad_norm": 0.5375344753265381, "learning_rate": 7.019236765408122e-06, "loss": 0.7874377965927124, "step": 794 }, { "epoch": 1.6793248945147679, "grad_norm": 2.140566349029541, "learning_rate": 7.013484473571073e-06, "loss": 1.0540302991867065, "step": 796 }, { "epoch": 1.6835443037974684, "grad_norm": 6.119294166564941, "learning_rate": 7.007718039100201e-06, "loss": 0.8562701940536499, "step": 798 }, { "epoch": 1.6877637130801688, "grad_norm": 11.767963409423828, "learning_rate": 7.001937493200045e-06, "loss": 1.2052388191223145, "step": 800 }, { "epoch": 1.6919831223628692, "grad_norm": 17.296886444091797, "learning_rate": 6.996142867151502e-06, "loss": 0.6549183130264282, "step": 802 }, { "epoch": 1.6962025316455698, "grad_norm": 2.24831485748291, "learning_rate": 6.990334192311668e-06, "loss": 1.2283351421356201, "step": 804 }, { "epoch": 1.70042194092827, "grad_norm": 2.4578514099121094, "learning_rate": 6.9845115001136545e-06, "loss": 1.1071836948394775, "step": 806 }, { "epoch": 1.7046413502109705, "grad_norm": 10.491120338439941, "learning_rate": 6.978674822066434e-06, "loss": 0.7744203805923462, "step": 808 }, { "epoch": 1.7088607594936709, "grad_norm": 9.302081108093262, "learning_rate": 6.97282418975466e-06, "loss": 0.8782643675804138, "step": 810 }, { "epoch": 1.7130801687763713, "grad_norm": 1.8191728591918945, "learning_rate": 6.966959634838495e-06, "loss": 1.128312349319458, "step": 812 }, { "epoch": 1.7172995780590719, "grad_norm": 5.150999069213867, "learning_rate": 6.961081189053449e-06, "loss": 1.454809546470642, "step": 814 }, { "epoch": 1.721518987341772, "grad_norm": 1.7257297039031982, "learning_rate": 6.955188884210195e-06, "loss": 1.0828335285186768, "step": 816 }, { "epoch": 1.7257383966244726, "grad_norm": 4.542337894439697, "learning_rate": 6.9492827521944066e-06, "loss": 0.8022172451019287, "step": 818 }, { "epoch": 1.729957805907173, "grad_norm": 8.734732627868652, "learning_rate": 6.943362824966579e-06, "loss": 0.46849238872528076, "step": 820 }, { "epoch": 1.7341772151898733, "grad_norm": 7.7328200340271, "learning_rate": 6.937429134561862e-06, "loss": 0.5579560995101929, "step": 822 }, { "epoch": 1.738396624472574, "grad_norm": 2.0381147861480713, "learning_rate": 6.9314817130898826e-06, "loss": 0.7268804311752319, "step": 824 }, { "epoch": 1.7426160337552743, "grad_norm": 1.5466476678848267, "learning_rate": 6.925520592734571e-06, "loss": 0.9139357805252075, "step": 826 }, { "epoch": 1.7468354430379747, "grad_norm": 9.312906265258789, "learning_rate": 6.919545805753988e-06, "loss": 0.9899505376815796, "step": 828 }, { "epoch": 1.7510548523206753, "grad_norm": 1.7241586446762085, "learning_rate": 6.913557384480151e-06, "loss": 1.066752314567566, "step": 830 }, { "epoch": 1.7552742616033754, "grad_norm": 1.62288236618042, "learning_rate": 6.907555361318859e-06, "loss": 1.0838364362716675, "step": 832 }, { "epoch": 1.759493670886076, "grad_norm": 1.7631701231002808, "learning_rate": 6.901539768749513e-06, "loss": 0.8664329051971436, "step": 834 }, { "epoch": 1.7637130801687764, "grad_norm": 1.4140545129776, "learning_rate": 6.895510639324947e-06, "loss": 1.0501675605773926, "step": 836 }, { "epoch": 1.7679324894514767, "grad_norm": 2.3473498821258545, "learning_rate": 6.889468005671248e-06, "loss": 0.9035965204238892, "step": 838 }, { "epoch": 1.7721518987341773, "grad_norm": 4.273458003997803, "learning_rate": 6.883411900487578e-06, "loss": 0.7962709665298462, "step": 840 }, { "epoch": 1.7763713080168775, "grad_norm": 7.660892963409424, "learning_rate": 6.877342356545999e-06, "loss": 0.9311078190803528, "step": 842 }, { "epoch": 1.780590717299578, "grad_norm": 1.3525784015655518, "learning_rate": 6.871259406691299e-06, "loss": 0.46452265977859497, "step": 844 }, { "epoch": 1.7848101265822784, "grad_norm": 1.2650240659713745, "learning_rate": 6.865163083840808e-06, "loss": 0.653459370136261, "step": 846 }, { "epoch": 1.7890295358649788, "grad_norm": 16.892318725585938, "learning_rate": 6.859053420984222e-06, "loss": 0.587769091129303, "step": 848 }, { "epoch": 1.7932489451476794, "grad_norm": 3.177400588989258, "learning_rate": 6.852930451183426e-06, "loss": 0.8080633878707886, "step": 850 }, { "epoch": 1.7974683544303798, "grad_norm": 2.106072187423706, "learning_rate": 6.846794207572317e-06, "loss": 1.09242582321167, "step": 852 }, { "epoch": 1.8016877637130801, "grad_norm": 3.0559608936309814, "learning_rate": 6.840644723356619e-06, "loss": 1.4061119556427002, "step": 854 }, { "epoch": 1.8059071729957807, "grad_norm": 0.9994240403175354, "learning_rate": 6.834482031813709e-06, "loss": 0.7950407862663269, "step": 856 }, { "epoch": 1.810126582278481, "grad_norm": 17.731142044067383, "learning_rate": 6.82830616629243e-06, "loss": 1.0046894550323486, "step": 858 }, { "epoch": 1.8143459915611815, "grad_norm": 1.822799801826477, "learning_rate": 6.822117160212916e-06, "loss": 0.6357040405273438, "step": 860 }, { "epoch": 1.8185654008438819, "grad_norm": 3.4448676109313965, "learning_rate": 6.815915047066415e-06, "loss": 1.0787222385406494, "step": 862 }, { "epoch": 1.8227848101265822, "grad_norm": 1.7101033926010132, "learning_rate": 6.809699860415097e-06, "loss": 1.0257686376571655, "step": 864 }, { "epoch": 1.8270042194092828, "grad_norm": 6.336645603179932, "learning_rate": 6.80347163389188e-06, "loss": 0.9438542127609253, "step": 866 }, { "epoch": 1.831223628691983, "grad_norm": 4.888845920562744, "learning_rate": 6.797230401200247e-06, "loss": 0.9173398017883301, "step": 868 }, { "epoch": 1.8354430379746836, "grad_norm": 1.9469853639602661, "learning_rate": 6.790976196114059e-06, "loss": 1.036512851715088, "step": 870 }, { "epoch": 1.839662447257384, "grad_norm": 5.8768815994262695, "learning_rate": 6.784709052477382e-06, "loss": 0.8006809949874878, "step": 872 }, { "epoch": 1.8438818565400843, "grad_norm": 2.4700989723205566, "learning_rate": 6.7784290042042924e-06, "loss": 0.9556717276573181, "step": 874 }, { "epoch": 1.8481012658227849, "grad_norm": 3.1491811275482178, "learning_rate": 6.772136085278703e-06, "loss": 1.1224122047424316, "step": 876 }, { "epoch": 1.8523206751054853, "grad_norm": 23.419981002807617, "learning_rate": 6.765830329754171e-06, "loss": 0.7619462013244629, "step": 878 }, { "epoch": 1.8565400843881856, "grad_norm": 4.361552715301514, "learning_rate": 6.7595117717537186e-06, "loss": 0.6938849687576294, "step": 880 }, { "epoch": 1.8607594936708862, "grad_norm": 3.267629623413086, "learning_rate": 6.753180445469651e-06, "loss": 0.8586090803146362, "step": 882 }, { "epoch": 1.8649789029535864, "grad_norm": 1.915306806564331, "learning_rate": 6.746836385163365e-06, "loss": 0.7172484397888184, "step": 884 }, { "epoch": 1.869198312236287, "grad_norm": 10.15339183807373, "learning_rate": 6.740479625165166e-06, "loss": 0.7663919925689697, "step": 886 }, { "epoch": 1.8734177215189873, "grad_norm": 1.4916582107543945, "learning_rate": 6.734110199874082e-06, "loss": 1.0811569690704346, "step": 888 }, { "epoch": 1.8776371308016877, "grad_norm": 3.9946820735931396, "learning_rate": 6.727728143757681e-06, "loss": 0.4816530644893646, "step": 890 }, { "epoch": 1.8818565400843883, "grad_norm": 1.4981932640075684, "learning_rate": 6.7213334913518795e-06, "loss": 0.6716771721839905, "step": 892 }, { "epoch": 1.8860759493670884, "grad_norm": 1.4426230192184448, "learning_rate": 6.714926277260759e-06, "loss": 1.055748462677002, "step": 894 }, { "epoch": 1.890295358649789, "grad_norm": 2.779737949371338, "learning_rate": 6.708506536156375e-06, "loss": 1.2872055768966675, "step": 896 }, { "epoch": 1.8945147679324894, "grad_norm": 18.332468032836914, "learning_rate": 6.702074302778574e-06, "loss": 0.7888720631599426, "step": 898 }, { "epoch": 1.8987341772151898, "grad_norm": 4.269297122955322, "learning_rate": 6.695629611934803e-06, "loss": 0.9360828995704651, "step": 900 }, { "epoch": 1.9029535864978904, "grad_norm": 3.4333150386810303, "learning_rate": 6.689172498499919e-06, "loss": 1.1581498384475708, "step": 902 }, { "epoch": 1.9071729957805907, "grad_norm": 3.325373888015747, "learning_rate": 6.6827029974160085e-06, "loss": 1.0004583597183228, "step": 904 }, { "epoch": 1.9113924050632911, "grad_norm": 1.337743878364563, "learning_rate": 6.676221143692186e-06, "loss": 1.2600127458572388, "step": 906 }, { "epoch": 1.9156118143459917, "grad_norm": 9.333498001098633, "learning_rate": 6.669726972404415e-06, "loss": 0.5244170427322388, "step": 908 }, { "epoch": 1.9198312236286919, "grad_norm": 1.4464713335037231, "learning_rate": 6.663220518695314e-06, "loss": 1.0309032201766968, "step": 910 }, { "epoch": 1.9240506329113924, "grad_norm": 3.8824985027313232, "learning_rate": 6.656701817773966e-06, "loss": 0.7978178262710571, "step": 912 }, { "epoch": 1.9282700421940928, "grad_norm": 2.117260217666626, "learning_rate": 6.650170904915727e-06, "loss": 1.1143381595611572, "step": 914 }, { "epoch": 1.9324894514767932, "grad_norm": 2.3016726970672607, "learning_rate": 6.643627815462041e-06, "loss": 0.7327609062194824, "step": 916 }, { "epoch": 1.9367088607594938, "grad_norm": 1.3250812292099, "learning_rate": 6.637072584820241e-06, "loss": 1.0381274223327637, "step": 918 }, { "epoch": 1.9409282700421941, "grad_norm": 3.2023909091949463, "learning_rate": 6.630505248463364e-06, "loss": 0.5368826985359192, "step": 920 }, { "epoch": 1.9451476793248945, "grad_norm": 1.5674402713775635, "learning_rate": 6.623925841929953e-06, "loss": 1.0610504150390625, "step": 922 }, { "epoch": 1.9493670886075949, "grad_norm": 1.7156344652175903, "learning_rate": 6.617334400823867e-06, "loss": 1.154762625694275, "step": 924 }, { "epoch": 1.9535864978902953, "grad_norm": 1.6762484312057495, "learning_rate": 6.610730960814092e-06, "loss": 0.8508365154266357, "step": 926 }, { "epoch": 1.9578059071729959, "grad_norm": 1.3070154190063477, "learning_rate": 6.604115557634545e-06, "loss": 0.7161068916320801, "step": 928 }, { "epoch": 1.9620253164556962, "grad_norm": 2.4822962284088135, "learning_rate": 6.597488227083879e-06, "loss": 1.1143286228179932, "step": 930 }, { "epoch": 1.9662447257383966, "grad_norm": 2.1459968090057373, "learning_rate": 6.590849005025289e-06, "loss": 0.8785426020622253, "step": 932 }, { "epoch": 1.9704641350210972, "grad_norm": 18.12381935119629, "learning_rate": 6.584197927386326e-06, "loss": 1.200589656829834, "step": 934 }, { "epoch": 1.9746835443037973, "grad_norm": 1.572724461555481, "learning_rate": 6.577535030158689e-06, "loss": 1.1270561218261719, "step": 936 }, { "epoch": 1.978902953586498, "grad_norm": 0.8099290132522583, "learning_rate": 6.570860349398041e-06, "loss": 0.6693128347396851, "step": 938 }, { "epoch": 1.9831223628691983, "grad_norm": 1.4404888153076172, "learning_rate": 6.5641739212238136e-06, "loss": 1.1134912967681885, "step": 940 }, { "epoch": 1.9873417721518987, "grad_norm": 11.569640159606934, "learning_rate": 6.557475781819004e-06, "loss": 0.9092779159545898, "step": 942 }, { "epoch": 1.9915611814345993, "grad_norm": 2.00720477104187, "learning_rate": 6.550765967429984e-06, "loss": 0.7343477010726929, "step": 944 }, { "epoch": 1.9957805907172996, "grad_norm": 8.226292610168457, "learning_rate": 6.544044514366306e-06, "loss": 1.0801680088043213, "step": 946 }, { "epoch": 2.0, "grad_norm": 2.7631890773773193, "learning_rate": 6.537311459000502e-06, "loss": 0.5224167108535767, "step": 948 }, { "epoch": 2.0042194092827006, "grad_norm": 4.7062296867370605, "learning_rate": 6.53056683776789e-06, "loss": 0.8504010438919067, "step": 950 }, { "epoch": 2.0084388185654007, "grad_norm": 8.53116512298584, "learning_rate": 6.5238106871663755e-06, "loss": 0.6483380794525146, "step": 952 }, { "epoch": 2.0126582278481013, "grad_norm": 3.5020530223846436, "learning_rate": 6.517043043756252e-06, "loss": 0.8229789733886719, "step": 954 }, { "epoch": 2.0168776371308015, "grad_norm": 2.5090668201446533, "learning_rate": 6.5102639441600086e-06, "loss": 0.868636965751648, "step": 956 }, { "epoch": 2.021097046413502, "grad_norm": 6.097753047943115, "learning_rate": 6.503473425062126e-06, "loss": 0.6441227197647095, "step": 958 }, { "epoch": 2.0253164556962027, "grad_norm": 3.8150646686553955, "learning_rate": 6.4966715232088835e-06, "loss": 0.7223113179206848, "step": 960 }, { "epoch": 2.029535864978903, "grad_norm": 2.547377109527588, "learning_rate": 6.489858275408152e-06, "loss": 1.046697735786438, "step": 962 }, { "epoch": 2.0337552742616034, "grad_norm": 2.0660111904144287, "learning_rate": 6.483033718529204e-06, "loss": 0.7585334777832031, "step": 964 }, { "epoch": 2.037974683544304, "grad_norm": 1.3951258659362793, "learning_rate": 6.476197889502512e-06, "loss": 0.571182370185852, "step": 966 }, { "epoch": 2.042194092827004, "grad_norm": 5.060699939727783, "learning_rate": 6.46935082531954e-06, "loss": 0.6977952718734741, "step": 968 }, { "epoch": 2.0464135021097047, "grad_norm": 9.534734725952148, "learning_rate": 6.4624925630325555e-06, "loss": 0.924410343170166, "step": 970 }, { "epoch": 2.050632911392405, "grad_norm": 10.224217414855957, "learning_rate": 6.455623139754423e-06, "loss": 0.7734869122505188, "step": 972 }, { "epoch": 2.0548523206751055, "grad_norm": 2.367072343826294, "learning_rate": 6.4487425926584005e-06, "loss": 0.762604832649231, "step": 974 }, { "epoch": 2.059071729957806, "grad_norm": 3.0896172523498535, "learning_rate": 6.441850958977945e-06, "loss": 0.6143279075622559, "step": 976 }, { "epoch": 2.0632911392405062, "grad_norm": 1.7992668151855469, "learning_rate": 6.434948276006505e-06, "loss": 0.6615221500396729, "step": 978 }, { "epoch": 2.067510548523207, "grad_norm": 3.8281936645507812, "learning_rate": 6.4280345810973225e-06, "loss": 0.6476603150367737, "step": 980 }, { "epoch": 2.071729957805907, "grad_norm": 1.7640984058380127, "learning_rate": 6.42110991166323e-06, "loss": 0.8162950277328491, "step": 982 }, { "epoch": 2.0759493670886076, "grad_norm": 4.995830059051514, "learning_rate": 6.414174305176448e-06, "loss": 0.9169092774391174, "step": 984 }, { "epoch": 2.080168776371308, "grad_norm": 1.7362895011901855, "learning_rate": 6.407227799168378e-06, "loss": 0.9022603034973145, "step": 986 }, { "epoch": 2.0843881856540083, "grad_norm": 2.571808338165283, "learning_rate": 6.400270431229409e-06, "loss": 0.9147624969482422, "step": 988 }, { "epoch": 2.088607594936709, "grad_norm": 9.426867485046387, "learning_rate": 6.393302239008705e-06, "loss": 0.46702778339385986, "step": 990 }, { "epoch": 2.0928270042194095, "grad_norm": 1.8141038417816162, "learning_rate": 6.386323260214006e-06, "loss": 0.49038439989089966, "step": 992 }, { "epoch": 2.0970464135021096, "grad_norm": 2.054802894592285, "learning_rate": 6.37933353261142e-06, "loss": 1.0175153017044067, "step": 994 }, { "epoch": 2.1012658227848102, "grad_norm": 5.117776870727539, "learning_rate": 6.372333094025224e-06, "loss": 0.8956054449081421, "step": 996 }, { "epoch": 2.1054852320675104, "grad_norm": 0.727590024471283, "learning_rate": 6.365321982337655e-06, "loss": 0.5565606951713562, "step": 998 }, { "epoch": 2.109704641350211, "grad_norm": 2.0183682441711426, "learning_rate": 6.3583002354887065e-06, "loss": 1.0998228788375854, "step": 1000 }, { "epoch": 2.1139240506329116, "grad_norm": 3.211463689804077, "learning_rate": 6.351267891475925e-06, "loss": 0.8330961465835571, "step": 1002 }, { "epoch": 2.1181434599156117, "grad_norm": 4.385818004608154, "learning_rate": 6.344224988354201e-06, "loss": 0.8911874294281006, "step": 1004 }, { "epoch": 2.1223628691983123, "grad_norm": 1.8507286310195923, "learning_rate": 6.3371715642355665e-06, "loss": 0.5850310325622559, "step": 1006 }, { "epoch": 2.1265822784810124, "grad_norm": 1.5357205867767334, "learning_rate": 6.3301076572889804e-06, "loss": 0.6495864391326904, "step": 1008 }, { "epoch": 2.130801687763713, "grad_norm": 1.9483667612075806, "learning_rate": 6.32303330574014e-06, "loss": 0.6409696340560913, "step": 1010 }, { "epoch": 2.1350210970464136, "grad_norm": 5.264192581176758, "learning_rate": 6.3159485478712504e-06, "loss": 0.8244346976280212, "step": 1012 }, { "epoch": 2.1392405063291138, "grad_norm": 6.72544527053833, "learning_rate": 6.308853422020838e-06, "loss": 1.0458412170410156, "step": 1014 }, { "epoch": 2.1434599156118144, "grad_norm": 4.4975738525390625, "learning_rate": 6.301747966583533e-06, "loss": 0.5240525007247925, "step": 1016 }, { "epoch": 2.147679324894515, "grad_norm": 2.4814205169677734, "learning_rate": 6.294632220009858e-06, "loss": 0.7953197360038757, "step": 1018 }, { "epoch": 2.151898734177215, "grad_norm": 1.7783337831497192, "learning_rate": 6.2875062208060345e-06, "loss": 0.6177500486373901, "step": 1020 }, { "epoch": 2.1561181434599157, "grad_norm": 4.14943790435791, "learning_rate": 6.280370007533755e-06, "loss": 0.7844660878181458, "step": 1022 }, { "epoch": 2.160337552742616, "grad_norm": 17.742002487182617, "learning_rate": 6.2732236188099925e-06, "loss": 0.7165024280548096, "step": 1024 }, { "epoch": 2.1645569620253164, "grad_norm": 1.9838588237762451, "learning_rate": 6.266067093306778e-06, "loss": 0.9177765846252441, "step": 1026 }, { "epoch": 2.168776371308017, "grad_norm": 1.8689168691635132, "learning_rate": 6.258900469751002e-06, "loss": 0.9903367757797241, "step": 1028 }, { "epoch": 2.172995780590717, "grad_norm": 1.8121206760406494, "learning_rate": 6.251723786924195e-06, "loss": 0.9095609188079834, "step": 1030 }, { "epoch": 2.1772151898734178, "grad_norm": 1.719159483909607, "learning_rate": 6.244537083662325e-06, "loss": 0.9629115462303162, "step": 1032 }, { "epoch": 2.181434599156118, "grad_norm": 3.3003413677215576, "learning_rate": 6.237340398855583e-06, "loss": 0.9314064979553223, "step": 1034 }, { "epoch": 2.1856540084388185, "grad_norm": 1.902093768119812, "learning_rate": 6.230133771448174e-06, "loss": 0.8848311305046082, "step": 1036 }, { "epoch": 2.189873417721519, "grad_norm": 4.84321403503418, "learning_rate": 6.222917240438112e-06, "loss": 0.9192149639129639, "step": 1038 }, { "epoch": 2.1940928270042193, "grad_norm": 1.7536414861679077, "learning_rate": 6.215690844876994e-06, "loss": 1.1547870635986328, "step": 1040 }, { "epoch": 2.19831223628692, "grad_norm": 6.852333068847656, "learning_rate": 6.208454623869805e-06, "loss": 0.32395103573799133, "step": 1042 }, { "epoch": 2.2025316455696204, "grad_norm": 1.9538402557373047, "learning_rate": 6.2012086165747e-06, "loss": 0.9581727981567383, "step": 1044 }, { "epoch": 2.2067510548523206, "grad_norm": 4.133998394012451, "learning_rate": 6.193952862202785e-06, "loss": 0.6086496710777283, "step": 1046 }, { "epoch": 2.210970464135021, "grad_norm": 1.9011021852493286, "learning_rate": 6.18668740001792e-06, "loss": 0.7543759346008301, "step": 1048 }, { "epoch": 2.2151898734177213, "grad_norm": 1.46916663646698, "learning_rate": 6.17941226933649e-06, "loss": 0.9485968947410583, "step": 1050 }, { "epoch": 2.219409282700422, "grad_norm": 5.856541156768799, "learning_rate": 6.172127509527205e-06, "loss": 0.8059616088867188, "step": 1052 }, { "epoch": 2.2236286919831225, "grad_norm": 4.198894023895264, "learning_rate": 6.164833160010882e-06, "loss": 0.7487938404083252, "step": 1054 }, { "epoch": 2.2278481012658227, "grad_norm": 23.342222213745117, "learning_rate": 6.157529260260229e-06, "loss": 0.7880909442901611, "step": 1056 }, { "epoch": 2.2320675105485233, "grad_norm": 23.40158462524414, "learning_rate": 6.150215849799637e-06, "loss": 0.5327481031417847, "step": 1058 }, { "epoch": 2.2362869198312234, "grad_norm": 1.634332537651062, "learning_rate": 6.142892968204963e-06, "loss": 0.883295476436615, "step": 1060 }, { "epoch": 2.240506329113924, "grad_norm": 0.7251645922660828, "learning_rate": 6.135560655103316e-06, "loss": 0.5540227890014648, "step": 1062 }, { "epoch": 2.2447257383966246, "grad_norm": 1.5355224609375, "learning_rate": 6.12821895017284e-06, "loss": 0.50773686170578, "step": 1064 }, { "epoch": 2.2489451476793247, "grad_norm": 2.305499792098999, "learning_rate": 6.120867893142506e-06, "loss": 0.8910026550292969, "step": 1066 }, { "epoch": 2.2531645569620253, "grad_norm": 3.746581792831421, "learning_rate": 6.1135075237918905e-06, "loss": 1.0884243249893188, "step": 1068 }, { "epoch": 2.257383966244726, "grad_norm": 3.282155752182007, "learning_rate": 6.106137881950965e-06, "loss": 1.0420414209365845, "step": 1070 }, { "epoch": 2.261603375527426, "grad_norm": 2.951901435852051, "learning_rate": 6.098759007499875e-06, "loss": 0.9006770849227905, "step": 1072 }, { "epoch": 2.2658227848101267, "grad_norm": 2.8723626136779785, "learning_rate": 6.091370940368729e-06, "loss": 1.1099491119384766, "step": 1074 }, { "epoch": 2.270042194092827, "grad_norm": 1.841613531112671, "learning_rate": 6.083973720537386e-06, "loss": 0.9306420087814331, "step": 1076 }, { "epoch": 2.2742616033755274, "grad_norm": 0.8245161771774292, "learning_rate": 6.0765673880352224e-06, "loss": 0.6501108407974243, "step": 1078 }, { "epoch": 2.278481012658228, "grad_norm": 16.89291763305664, "learning_rate": 6.069151982940936e-06, "loss": 0.7018378376960754, "step": 1080 }, { "epoch": 2.282700421940928, "grad_norm": 15.395925521850586, "learning_rate": 6.06172754538232e-06, "loss": 0.3668671250343323, "step": 1082 }, { "epoch": 2.2869198312236287, "grad_norm": 7.03673791885376, "learning_rate": 6.054294115536044e-06, "loss": 0.6594992280006409, "step": 1084 }, { "epoch": 2.291139240506329, "grad_norm": 1.275587797164917, "learning_rate": 6.046851733627436e-06, "loss": 0.48280084133148193, "step": 1086 }, { "epoch": 2.2953586497890295, "grad_norm": 3.333641290664673, "learning_rate": 6.039400439930271e-06, "loss": 0.6253411769866943, "step": 1088 }, { "epoch": 2.29957805907173, "grad_norm": 1.850312352180481, "learning_rate": 6.031940274766546e-06, "loss": 0.49555736780166626, "step": 1090 }, { "epoch": 2.3037974683544302, "grad_norm": 3.2576518058776855, "learning_rate": 6.024471278506269e-06, "loss": 0.7540421485900879, "step": 1092 }, { "epoch": 2.308016877637131, "grad_norm": 2.6489086151123047, "learning_rate": 6.016993491567234e-06, "loss": 0.6014547944068909, "step": 1094 }, { "epoch": 2.3122362869198314, "grad_norm": 5.111599445343018, "learning_rate": 6.0095069544148075e-06, "loss": 0.3525955379009247, "step": 1096 }, { "epoch": 2.3164556962025316, "grad_norm": 13.784951210021973, "learning_rate": 6.002011707561704e-06, "loss": 0.8247784376144409, "step": 1098 }, { "epoch": 2.320675105485232, "grad_norm": 4.951453685760498, "learning_rate": 5.9945077915677695e-06, "loss": 0.8657753467559814, "step": 1100 }, { "epoch": 2.3248945147679323, "grad_norm": 1.6704450845718384, "learning_rate": 5.9869952470397655e-06, "loss": 0.841392993927002, "step": 1102 }, { "epoch": 2.329113924050633, "grad_norm": 1.9911108016967773, "learning_rate": 5.979474114631144e-06, "loss": 1.0287697315216064, "step": 1104 }, { "epoch": 2.3333333333333335, "grad_norm": 1.8969277143478394, "learning_rate": 5.971944435041831e-06, "loss": 0.730893611907959, "step": 1106 }, { "epoch": 2.3375527426160336, "grad_norm": 4.81918478012085, "learning_rate": 5.9644062490180004e-06, "loss": 0.5627094507217407, "step": 1108 }, { "epoch": 2.3417721518987342, "grad_norm": 2.462564468383789, "learning_rate": 5.956859597351862e-06, "loss": 0.8845915198326111, "step": 1110 }, { "epoch": 2.3459915611814344, "grad_norm": 2.167839527130127, "learning_rate": 5.94930452088144e-06, "loss": 0.9817606210708618, "step": 1112 }, { "epoch": 2.350210970464135, "grad_norm": 8.067427635192871, "learning_rate": 5.941741060490339e-06, "loss": 1.1635032892227173, "step": 1114 }, { "epoch": 2.3544303797468356, "grad_norm": 0.5956460237503052, "learning_rate": 5.93416925710754e-06, "loss": 0.4855182468891144, "step": 1116 }, { "epoch": 2.3586497890295357, "grad_norm": 15.0598783493042, "learning_rate": 5.9265891517071695e-06, "loss": 0.9245091676712036, "step": 1118 }, { "epoch": 2.3628691983122363, "grad_norm": 2.6616246700286865, "learning_rate": 5.9190007853082795e-06, "loss": 0.6047594547271729, "step": 1120 }, { "epoch": 2.367088607594937, "grad_norm": 7.563075542449951, "learning_rate": 5.911404198974625e-06, "loss": 0.9117496013641357, "step": 1122 }, { "epoch": 2.371308016877637, "grad_norm": 5.370510101318359, "learning_rate": 5.903799433814442e-06, "loss": 0.5350353717803955, "step": 1124 }, { "epoch": 2.3755274261603376, "grad_norm": 1.819912075996399, "learning_rate": 5.8961865309802285e-06, "loss": 0.667518138885498, "step": 1126 }, { "epoch": 2.379746835443038, "grad_norm": 2.640817165374756, "learning_rate": 5.888565531668514e-06, "loss": 0.8784997463226318, "step": 1128 }, { "epoch": 2.3839662447257384, "grad_norm": 66.16091918945312, "learning_rate": 5.880936477119645e-06, "loss": 0.4616549611091614, "step": 1130 }, { "epoch": 2.388185654008439, "grad_norm": 5.994080066680908, "learning_rate": 5.873299408617559e-06, "loss": 0.3559979200363159, "step": 1132 }, { "epoch": 2.392405063291139, "grad_norm": 23.00125503540039, "learning_rate": 5.865654367489556e-06, "loss": 0.40349674224853516, "step": 1134 }, { "epoch": 2.3966244725738397, "grad_norm": 5.636394500732422, "learning_rate": 5.858001395106082e-06, "loss": 0.5823970437049866, "step": 1136 }, { "epoch": 2.40084388185654, "grad_norm": 2.3996365070343018, "learning_rate": 5.850340532880504e-06, "loss": 0.921074628829956, "step": 1138 }, { "epoch": 2.4050632911392404, "grad_norm": 10.836583137512207, "learning_rate": 5.842671822268878e-06, "loss": 0.7500771880149841, "step": 1140 }, { "epoch": 2.409282700421941, "grad_norm": 7.443431854248047, "learning_rate": 5.83499530476974e-06, "loss": 0.3230987787246704, "step": 1142 }, { "epoch": 2.413502109704641, "grad_norm": 2.597289800643921, "learning_rate": 5.827311021923863e-06, "loss": 0.732123851776123, "step": 1144 }, { "epoch": 2.4177215189873418, "grad_norm": 1.982795238494873, "learning_rate": 5.819619015314047e-06, "loss": 0.9608519077301025, "step": 1146 }, { "epoch": 2.4219409282700424, "grad_norm": 3.819395065307617, "learning_rate": 5.8119193265648865e-06, "loss": 0.6804056167602539, "step": 1148 }, { "epoch": 2.4261603375527425, "grad_norm": 6.851272106170654, "learning_rate": 5.80421199734255e-06, "loss": 1.004921555519104, "step": 1150 }, { "epoch": 2.430379746835443, "grad_norm": 3.0809147357940674, "learning_rate": 5.7964970693545466e-06, "loss": 0.6196656823158264, "step": 1152 }, { "epoch": 2.4345991561181437, "grad_norm": 1.8345930576324463, "learning_rate": 5.788774584349508e-06, "loss": 1.043914556503296, "step": 1154 }, { "epoch": 2.438818565400844, "grad_norm": 3.7520220279693604, "learning_rate": 5.781044584116963e-06, "loss": 0.30900609493255615, "step": 1156 }, { "epoch": 2.4430379746835444, "grad_norm": 1.6004582643508911, "learning_rate": 5.773307110487106e-06, "loss": 0.7037574052810669, "step": 1158 }, { "epoch": 2.4472573839662446, "grad_norm": 1.8472445011138916, "learning_rate": 5.765562205330568e-06, "loss": 0.9773483872413635, "step": 1160 }, { "epoch": 2.451476793248945, "grad_norm": 2.698925018310547, "learning_rate": 5.757809910558205e-06, "loss": 0.6617934703826904, "step": 1162 }, { "epoch": 2.4556962025316453, "grad_norm": 1.6956886053085327, "learning_rate": 5.750050268120851e-06, "loss": 0.851616382598877, "step": 1164 }, { "epoch": 2.459915611814346, "grad_norm": 1.288453221321106, "learning_rate": 5.742283320009111e-06, "loss": 0.8924407958984375, "step": 1166 }, { "epoch": 2.4641350210970465, "grad_norm": 1.4182209968566895, "learning_rate": 5.734509108253117e-06, "loss": 0.48247936367988586, "step": 1168 }, { "epoch": 2.4683544303797467, "grad_norm": 2.1459646224975586, "learning_rate": 5.726727674922309e-06, "loss": 0.8906441926956177, "step": 1170 }, { "epoch": 2.4725738396624473, "grad_norm": 1.393717885017395, "learning_rate": 5.718939062125207e-06, "loss": 0.876624584197998, "step": 1172 }, { "epoch": 2.476793248945148, "grad_norm": 1.8553041219711304, "learning_rate": 5.711143312009183e-06, "loss": 0.9824315309524536, "step": 1174 }, { "epoch": 2.481012658227848, "grad_norm": 2.4953160285949707, "learning_rate": 5.703340466760228e-06, "loss": 0.7499101161956787, "step": 1176 }, { "epoch": 2.4852320675105486, "grad_norm": 5.494137763977051, "learning_rate": 5.695530568602733e-06, "loss": 0.42195141315460205, "step": 1178 }, { "epoch": 2.489451476793249, "grad_norm": 4.595831394195557, "learning_rate": 5.687713659799253e-06, "loss": 0.7049263715744019, "step": 1180 }, { "epoch": 2.4936708860759493, "grad_norm": 5.080184459686279, "learning_rate": 5.679889782650275e-06, "loss": 0.880506157875061, "step": 1182 }, { "epoch": 2.49789029535865, "grad_norm": 27.051029205322266, "learning_rate": 5.672058979494004e-06, "loss": 0.5125079154968262, "step": 1184 }, { "epoch": 2.50210970464135, "grad_norm": 1.5325865745544434, "learning_rate": 5.6642212927061185e-06, "loss": 0.385905385017395, "step": 1186 }, { "epoch": 2.5063291139240507, "grad_norm": 7.488584995269775, "learning_rate": 5.656376764699549e-06, "loss": 0.5802481770515442, "step": 1188 }, { "epoch": 2.510548523206751, "grad_norm": 2.4869072437286377, "learning_rate": 5.648525437924244e-06, "loss": 0.810112476348877, "step": 1190 }, { "epoch": 2.5147679324894514, "grad_norm": 5.196420192718506, "learning_rate": 5.640667354866948e-06, "loss": 0.40649741888046265, "step": 1192 }, { "epoch": 2.518987341772152, "grad_norm": 10.881569862365723, "learning_rate": 5.632802558050964e-06, "loss": 1.1927690505981445, "step": 1194 }, { "epoch": 2.523206751054852, "grad_norm": 2.63506817817688, "learning_rate": 5.6249310900359236e-06, "loss": 0.969944179058075, "step": 1196 }, { "epoch": 2.5274261603375527, "grad_norm": 11.932506561279297, "learning_rate": 5.617052993417562e-06, "loss": 0.9280753135681152, "step": 1198 }, { "epoch": 2.5316455696202533, "grad_norm": 45.778499603271484, "learning_rate": 5.609168310827482e-06, "loss": 0.7399793267250061, "step": 1200 }, { "epoch": 2.5358649789029535, "grad_norm": 1.8853310346603394, "learning_rate": 5.6012770849329275e-06, "loss": 0.7420691251754761, "step": 1202 }, { "epoch": 2.540084388185654, "grad_norm": 3.9101645946502686, "learning_rate": 5.593379358436551e-06, "loss": 0.7088044285774231, "step": 1204 }, { "epoch": 2.5443037974683547, "grad_norm": 3.0373728275299072, "learning_rate": 5.585475174076184e-06, "loss": 0.8735544681549072, "step": 1206 }, { "epoch": 2.548523206751055, "grad_norm": 8.735350608825684, "learning_rate": 5.577564574624599e-06, "loss": 0.6918007135391235, "step": 1208 }, { "epoch": 2.5527426160337554, "grad_norm": 3.198167085647583, "learning_rate": 5.569647602889289e-06, "loss": 1.1403307914733887, "step": 1210 }, { "epoch": 2.5569620253164556, "grad_norm": 2.7564687728881836, "learning_rate": 5.561724301712225e-06, "loss": 0.9847512245178223, "step": 1212 }, { "epoch": 2.561181434599156, "grad_norm": 5.109920501708984, "learning_rate": 5.553794713969632e-06, "loss": 0.30179572105407715, "step": 1214 }, { "epoch": 2.5654008438818563, "grad_norm": 4.672483921051025, "learning_rate": 5.545858882571755e-06, "loss": 0.7192697525024414, "step": 1216 }, { "epoch": 2.569620253164557, "grad_norm": 2.7057371139526367, "learning_rate": 5.5379168504626256e-06, "loss": 0.9119170308113098, "step": 1218 }, { "epoch": 2.5738396624472575, "grad_norm": 5.782102584838867, "learning_rate": 5.5299686606198255e-06, "loss": 0.59529709815979, "step": 1220 }, { "epoch": 2.5780590717299576, "grad_norm": 5.356619834899902, "learning_rate": 5.522014356054264e-06, "loss": 0.888773500919342, "step": 1222 }, { "epoch": 2.5822784810126582, "grad_norm": 34.81894302368164, "learning_rate": 5.51405397980994e-06, "loss": 0.671154260635376, "step": 1224 }, { "epoch": 2.586497890295359, "grad_norm": 4.4581193923950195, "learning_rate": 5.506087574963703e-06, "loss": 0.5387101173400879, "step": 1226 }, { "epoch": 2.590717299578059, "grad_norm": 4.779836177825928, "learning_rate": 5.49811518462503e-06, "loss": 0.9294767379760742, "step": 1228 }, { "epoch": 2.5949367088607596, "grad_norm": 4.019083499908447, "learning_rate": 5.4901368519357886e-06, "loss": 0.9565463066101074, "step": 1230 }, { "epoch": 2.59915611814346, "grad_norm": 2.806299924850464, "learning_rate": 5.482152620070001e-06, "loss": 0.8302749991416931, "step": 1232 }, { "epoch": 2.6033755274261603, "grad_norm": 46.3029899597168, "learning_rate": 5.474162532233609e-06, "loss": 0.28912973403930664, "step": 1234 }, { "epoch": 2.607594936708861, "grad_norm": 4.818080425262451, "learning_rate": 5.4661666316642534e-06, "loss": 1.0101039409637451, "step": 1236 }, { "epoch": 2.611814345991561, "grad_norm": 4.704904556274414, "learning_rate": 5.458164961631019e-06, "loss": 1.141682505607605, "step": 1238 }, { "epoch": 2.6160337552742616, "grad_norm": 2.6413064002990723, "learning_rate": 5.450157565434217e-06, "loss": 0.7691728472709656, "step": 1240 }, { "epoch": 2.620253164556962, "grad_norm": 2.3116259574890137, "learning_rate": 5.442144486405146e-06, "loss": 0.8952039480209351, "step": 1242 }, { "epoch": 2.6244725738396624, "grad_norm": 3.752659797668457, "learning_rate": 5.434125767905855e-06, "loss": 0.41019898653030396, "step": 1244 }, { "epoch": 2.628691983122363, "grad_norm": 2.37690806388855, "learning_rate": 5.426101453328911e-06, "loss": 0.704147219657898, "step": 1246 }, { "epoch": 2.632911392405063, "grad_norm": 3.650939702987671, "learning_rate": 5.418071586097162e-06, "loss": 1.3898766040802002, "step": 1248 }, { "epoch": 2.6371308016877637, "grad_norm": 1.6102105379104614, "learning_rate": 5.410036209663506e-06, "loss": 0.961624026298523, "step": 1250 }, { "epoch": 2.6413502109704643, "grad_norm": 3.446720600128174, "learning_rate": 5.401995367510652e-06, "loss": 0.924649715423584, "step": 1252 }, { "epoch": 2.6455696202531644, "grad_norm": 4.68242073059082, "learning_rate": 5.393949103150889e-06, "loss": 0.4435887932777405, "step": 1254 }, { "epoch": 2.649789029535865, "grad_norm": 51.22077178955078, "learning_rate": 5.385897460125841e-06, "loss": 0.5546849370002747, "step": 1256 }, { "epoch": 2.6540084388185656, "grad_norm": 2.3170604705810547, "learning_rate": 5.377840482006247e-06, "loss": 0.7113304138183594, "step": 1258 }, { "epoch": 2.6582278481012658, "grad_norm": 4.746129512786865, "learning_rate": 5.369778212391713e-06, "loss": 0.8765827417373657, "step": 1260 }, { "epoch": 2.6624472573839664, "grad_norm": 4.732134819030762, "learning_rate": 5.361710694910476e-06, "loss": 0.8504003882408142, "step": 1262 }, { "epoch": 2.6666666666666665, "grad_norm": 1.4788029193878174, "learning_rate": 5.3536379732191735e-06, "loss": 0.5229237079620361, "step": 1264 }, { "epoch": 2.670886075949367, "grad_norm": 9.586766242980957, "learning_rate": 5.3455600910026075e-06, "loss": 0.776203989982605, "step": 1266 }, { "epoch": 2.6751054852320673, "grad_norm": 1.5792274475097656, "learning_rate": 5.337477091973503e-06, "loss": 0.7061780691146851, "step": 1268 }, { "epoch": 2.679324894514768, "grad_norm": 1.9227761030197144, "learning_rate": 5.3293890198722765e-06, "loss": 0.40927794575691223, "step": 1270 }, { "epoch": 2.6835443037974684, "grad_norm": 2.802013635635376, "learning_rate": 5.321295918466793e-06, "loss": 0.9143922924995422, "step": 1272 }, { "epoch": 2.6877637130801686, "grad_norm": 14.795599937438965, "learning_rate": 5.3131978315521355e-06, "loss": 0.6321116089820862, "step": 1274 }, { "epoch": 2.691983122362869, "grad_norm": 3.627547264099121, "learning_rate": 5.305094802950368e-06, "loss": 0.7536362409591675, "step": 1276 }, { "epoch": 2.6962025316455698, "grad_norm": 0.6867983937263489, "learning_rate": 5.296986876510293e-06, "loss": 0.27872833609580994, "step": 1278 }, { "epoch": 2.70042194092827, "grad_norm": 3.1109073162078857, "learning_rate": 5.288874096107218e-06, "loss": 0.8334829807281494, "step": 1280 }, { "epoch": 2.7046413502109705, "grad_norm": 1.4203203916549683, "learning_rate": 5.2807565056427155e-06, "loss": 0.9659562110900879, "step": 1282 }, { "epoch": 2.708860759493671, "grad_norm": 2.113590955734253, "learning_rate": 5.2726341490443915e-06, "loss": 0.3422914743423462, "step": 1284 }, { "epoch": 2.7130801687763713, "grad_norm": 4.1795759201049805, "learning_rate": 5.264507070265639e-06, "loss": 0.44313400983810425, "step": 1286 }, { "epoch": 2.717299578059072, "grad_norm": 0.8358619809150696, "learning_rate": 5.256375313285407e-06, "loss": 0.50257408618927, "step": 1288 }, { "epoch": 2.721518987341772, "grad_norm": 0.6378387808799744, "learning_rate": 5.248238922107958e-06, "loss": 0.5335341095924377, "step": 1290 }, { "epoch": 2.7257383966244726, "grad_norm": 2.0149025917053223, "learning_rate": 5.240097940762638e-06, "loss": 0.9738786220550537, "step": 1292 }, { "epoch": 2.7299578059071727, "grad_norm": 3.022477149963379, "learning_rate": 5.231952413303623e-06, "loss": 0.41252389550209045, "step": 1294 }, { "epoch": 2.7341772151898733, "grad_norm": 2.5130767822265625, "learning_rate": 5.2238023838097e-06, "loss": 0.9761707186698914, "step": 1296 }, { "epoch": 2.738396624472574, "grad_norm": 2.43636155128479, "learning_rate": 5.21564789638401e-06, "loss": 0.9268041253089905, "step": 1298 }, { "epoch": 2.742616033755274, "grad_norm": 1.058410406112671, "learning_rate": 5.207488995153821e-06, "loss": 0.6909565925598145, "step": 1300 }, { "epoch": 2.7468354430379747, "grad_norm": 1.987685203552246, "learning_rate": 5.1993257242702874e-06, "loss": 1.0122733116149902, "step": 1302 }, { "epoch": 2.7510548523206753, "grad_norm": 1.9147788286209106, "learning_rate": 5.191158127908207e-06, "loss": 0.5920695066452026, "step": 1304 }, { "epoch": 2.7552742616033754, "grad_norm": 2.714449405670166, "learning_rate": 5.182986250265786e-06, "loss": 1.0310044288635254, "step": 1306 }, { "epoch": 2.759493670886076, "grad_norm": 1.7678923606872559, "learning_rate": 5.174810135564397e-06, "loss": 0.9253189563751221, "step": 1308 }, { "epoch": 2.7637130801687766, "grad_norm": 5.892001152038574, "learning_rate": 5.1666298280483436e-06, "loss": 0.80256587266922, "step": 1310 }, { "epoch": 2.7679324894514767, "grad_norm": 12.360274314880371, "learning_rate": 5.158445371984614e-06, "loss": 0.9463623762130737, "step": 1312 }, { "epoch": 2.7721518987341773, "grad_norm": 2.3304073810577393, "learning_rate": 5.150256811662653e-06, "loss": 0.9907184839248657, "step": 1314 }, { "epoch": 2.7763713080168775, "grad_norm": 3.796537160873413, "learning_rate": 5.142064191394107e-06, "loss": 0.609095573425293, "step": 1316 }, { "epoch": 2.780590717299578, "grad_norm": 2.1420092582702637, "learning_rate": 5.133867555512599e-06, "loss": 0.5119812488555908, "step": 1318 }, { "epoch": 2.7848101265822782, "grad_norm": 3.1301980018615723, "learning_rate": 5.125666948373477e-06, "loss": 0.9296759366989136, "step": 1320 }, { "epoch": 2.789029535864979, "grad_norm": 2.0127930641174316, "learning_rate": 5.1174624143535845e-06, "loss": 0.4965199828147888, "step": 1322 }, { "epoch": 2.7932489451476794, "grad_norm": 6.2706217765808105, "learning_rate": 5.10925399785101e-06, "loss": 1.0675724744796753, "step": 1324 }, { "epoch": 2.7974683544303796, "grad_norm": 9.385963439941406, "learning_rate": 5.101041743284855e-06, "loss": 0.8606825470924377, "step": 1326 }, { "epoch": 2.80168776371308, "grad_norm": 1.8741198778152466, "learning_rate": 5.0928256950949874e-06, "loss": 0.6247942447662354, "step": 1328 }, { "epoch": 2.8059071729957807, "grad_norm": 1.5202703475952148, "learning_rate": 5.084605897741808e-06, "loss": 0.9863821268081665, "step": 1330 }, { "epoch": 2.810126582278481, "grad_norm": 2.4014227390289307, "learning_rate": 5.076382395706001e-06, "loss": 0.7821711301803589, "step": 1332 }, { "epoch": 2.8143459915611815, "grad_norm": 2.5113935470581055, "learning_rate": 5.0681552334883015e-06, "loss": 0.48877081274986267, "step": 1334 }, { "epoch": 2.818565400843882, "grad_norm": 1.116832971572876, "learning_rate": 5.059924455609252e-06, "loss": 0.639763593673706, "step": 1336 }, { "epoch": 2.8227848101265822, "grad_norm": 2.161829710006714, "learning_rate": 5.051690106608958e-06, "loss": 0.6487863063812256, "step": 1338 }, { "epoch": 2.827004219409283, "grad_norm": 3.6037724018096924, "learning_rate": 5.04345223104685e-06, "loss": 0.8599737882614136, "step": 1340 }, { "epoch": 2.831223628691983, "grad_norm": 25.895301818847656, "learning_rate": 5.035210873501446e-06, "loss": 0.8409707546234131, "step": 1342 }, { "epoch": 2.8354430379746836, "grad_norm": 1.7677528858184814, "learning_rate": 5.026966078570102e-06, "loss": 1.0647809505462646, "step": 1344 }, { "epoch": 2.8396624472573837, "grad_norm": 5.221706390380859, "learning_rate": 5.0187178908687765e-06, "loss": 0.6761691570281982, "step": 1346 }, { "epoch": 2.8438818565400843, "grad_norm": 2.069338798522949, "learning_rate": 5.010466355031788e-06, "loss": 0.5935064554214478, "step": 1348 }, { "epoch": 2.848101265822785, "grad_norm": 1.759072184562683, "learning_rate": 5.002211515711574e-06, "loss": 0.9735701680183411, "step": 1350 }, { "epoch": 2.852320675105485, "grad_norm": 6.272833824157715, "learning_rate": 4.993953417578447e-06, "loss": 0.6328434944152832, "step": 1352 }, { "epoch": 2.8565400843881856, "grad_norm": 8.284124374389648, "learning_rate": 4.985692105320356e-06, "loss": 0.6582671403884888, "step": 1354 }, { "epoch": 2.8607594936708862, "grad_norm": 4.635993003845215, "learning_rate": 4.977427623642641e-06, "loss": 0.56138014793396, "step": 1356 }, { "epoch": 2.8649789029535864, "grad_norm": 1.4133601188659668, "learning_rate": 4.9691600172677945e-06, "loss": 0.9400450587272644, "step": 1358 }, { "epoch": 2.869198312236287, "grad_norm": 3.7701480388641357, "learning_rate": 4.960889330935215e-06, "loss": 0.8297948837280273, "step": 1360 }, { "epoch": 2.8734177215189876, "grad_norm": 3.301804780960083, "learning_rate": 4.952615609400973e-06, "loss": 0.5724865794181824, "step": 1362 }, { "epoch": 2.8776371308016877, "grad_norm": 3.880316972732544, "learning_rate": 4.94433889743756e-06, "loss": 0.9015120267868042, "step": 1364 }, { "epoch": 2.8818565400843883, "grad_norm": 2.4567813873291016, "learning_rate": 4.93605923983365e-06, "loss": 1.0346885919570923, "step": 1366 }, { "epoch": 2.8860759493670884, "grad_norm": 6.220330238342285, "learning_rate": 4.92777668139386e-06, "loss": 0.9936701059341431, "step": 1368 }, { "epoch": 2.890295358649789, "grad_norm": 2.123796224594116, "learning_rate": 4.919491266938501e-06, "loss": 0.9021327495574951, "step": 1370 }, { "epoch": 2.894514767932489, "grad_norm": 5.658734321594238, "learning_rate": 4.911203041303342e-06, "loss": 0.4772055745124817, "step": 1372 }, { "epoch": 2.8987341772151898, "grad_norm": 3.852552890777588, "learning_rate": 4.902912049339362e-06, "loss": 0.7514923214912415, "step": 1374 }, { "epoch": 2.9029535864978904, "grad_norm": 1.6569684743881226, "learning_rate": 4.894618335912511e-06, "loss": 0.9316278696060181, "step": 1376 }, { "epoch": 2.9071729957805905, "grad_norm": 3.75467586517334, "learning_rate": 4.886321945903466e-06, "loss": 0.7876487374305725, "step": 1378 }, { "epoch": 2.911392405063291, "grad_norm": 5.47602653503418, "learning_rate": 4.8780229242073895e-06, "loss": 1.141374111175537, "step": 1380 }, { "epoch": 2.9156118143459917, "grad_norm": 4.051374435424805, "learning_rate": 4.86972131573368e-06, "loss": 0.640509307384491, "step": 1382 }, { "epoch": 2.919831223628692, "grad_norm": 7.645662307739258, "learning_rate": 4.86141716540574e-06, "loss": 0.6477132439613342, "step": 1384 }, { "epoch": 2.9240506329113924, "grad_norm": 2.780121088027954, "learning_rate": 4.853110518160723e-06, "loss": 0.5821589827537537, "step": 1386 }, { "epoch": 2.928270042194093, "grad_norm": 4.329258441925049, "learning_rate": 4.844801418949299e-06, "loss": 0.9183673858642578, "step": 1388 }, { "epoch": 2.932489451476793, "grad_norm": 1.9503802061080933, "learning_rate": 4.836489912735402e-06, "loss": 0.8357143402099609, "step": 1390 }, { "epoch": 2.9367088607594938, "grad_norm": 1.0354031324386597, "learning_rate": 4.8281760444959926e-06, "loss": 0.45355841517448425, "step": 1392 }, { "epoch": 2.9409282700421944, "grad_norm": 5.098978519439697, "learning_rate": 4.8198598592208126e-06, "loss": 0.6029504537582397, "step": 1394 }, { "epoch": 2.9451476793248945, "grad_norm": 1.7021719217300415, "learning_rate": 4.811541401912146e-06, "loss": 0.8993232250213623, "step": 1396 }, { "epoch": 2.9493670886075947, "grad_norm": 0.9127289652824402, "learning_rate": 4.803220717584566e-06, "loss": 0.7546182870864868, "step": 1398 }, { "epoch": 2.9535864978902953, "grad_norm": 4.401092529296875, "learning_rate": 4.7948978512647016e-06, "loss": 0.813082218170166, "step": 1400 }, { "epoch": 2.957805907172996, "grad_norm": 4.511460304260254, "learning_rate": 4.786572847990987e-06, "loss": 0.5571738481521606, "step": 1402 }, { "epoch": 2.962025316455696, "grad_norm": 1.7611995935440063, "learning_rate": 4.778245752813421e-06, "loss": 0.9406437277793884, "step": 1404 }, { "epoch": 2.9662447257383966, "grad_norm": 5.091633319854736, "learning_rate": 4.769916610793324e-06, "loss": 0.7957962155342102, "step": 1406 }, { "epoch": 2.970464135021097, "grad_norm": 4.824461460113525, "learning_rate": 4.76158546700309e-06, "loss": 0.4907096028327942, "step": 1408 }, { "epoch": 2.9746835443037973, "grad_norm": 3.2384073734283447, "learning_rate": 4.75325236652595e-06, "loss": 0.742721676826477, "step": 1410 }, { "epoch": 2.978902953586498, "grad_norm": 3.1998229026794434, "learning_rate": 4.744917354455715e-06, "loss": 0.9864751100540161, "step": 1412 }, { "epoch": 2.9831223628691985, "grad_norm": 8.642891883850098, "learning_rate": 4.73658047589655e-06, "loss": 0.8089879751205444, "step": 1414 }, { "epoch": 2.9873417721518987, "grad_norm": 1.6954267024993896, "learning_rate": 4.7282417759627134e-06, "loss": 0.8185816407203674, "step": 1416 }, { "epoch": 2.9915611814345993, "grad_norm": 1.4983083009719849, "learning_rate": 4.719901299778325e-06, "loss": 0.8309789896011353, "step": 1418 }, { "epoch": 2.9957805907173, "grad_norm": 5.758739948272705, "learning_rate": 4.71155909247711e-06, "loss": 0.9021912813186646, "step": 1420 }, { "epoch": 3.0, "grad_norm": 2.6232898235321045, "learning_rate": 4.703215199202169e-06, "loss": 0.2926831841468811, "step": 1422 }, { "epoch": 3.0042194092827006, "grad_norm": 2.3824779987335205, "learning_rate": 4.6948696651057225e-06, "loss": 0.5607067346572876, "step": 1424 }, { "epoch": 3.0084388185654007, "grad_norm": 10.018150329589844, "learning_rate": 4.6865225353488675e-06, "loss": 0.5354501008987427, "step": 1426 }, { "epoch": 3.0126582278481013, "grad_norm": 16.426225662231445, "learning_rate": 4.678173855101341e-06, "loss": 0.5269479155540466, "step": 1428 }, { "epoch": 3.0168776371308015, "grad_norm": 15.212722778320312, "learning_rate": 4.669823669541266e-06, "loss": 0.39293336868286133, "step": 1430 }, { "epoch": 3.021097046413502, "grad_norm": 2.7751386165618896, "learning_rate": 4.661472023854916e-06, "loss": 0.8252520561218262, "step": 1432 }, { "epoch": 3.0253164556962027, "grad_norm": 2.98408842086792, "learning_rate": 4.653118963236458e-06, "loss": 0.7142210006713867, "step": 1434 }, { "epoch": 3.029535864978903, "grad_norm": 1.7343214750289917, "learning_rate": 4.644764532887726e-06, "loss": 0.8274791240692139, "step": 1436 }, { "epoch": 3.0337552742616034, "grad_norm": 2.1739771366119385, "learning_rate": 4.636408778017957e-06, "loss": 0.3643840551376343, "step": 1438 }, { "epoch": 3.037974683544304, "grad_norm": 1.6796656847000122, "learning_rate": 4.6280517438435616e-06, "loss": 0.7152677178382874, "step": 1440 }, { "epoch": 3.042194092827004, "grad_norm": 1.2829041481018066, "learning_rate": 4.61969347558787e-06, "loss": 0.5194791555404663, "step": 1442 }, { "epoch": 3.0464135021097047, "grad_norm": 1.6921758651733398, "learning_rate": 4.6113340184808925e-06, "loss": 0.6532431840896606, "step": 1444 }, { "epoch": 3.050632911392405, "grad_norm": 1.7007097005844116, "learning_rate": 4.602973417759071e-06, "loss": 0.7926474809646606, "step": 1446 }, { "epoch": 3.0548523206751055, "grad_norm": 2854.81396484375, "learning_rate": 4.594611718665038e-06, "loss": 0.5383695960044861, "step": 1448 }, { "epoch": 3.059071729957806, "grad_norm": 4.033429145812988, "learning_rate": 4.586248966447367e-06, "loss": 0.6349921822547913, "step": 1450 }, { "epoch": 3.0632911392405062, "grad_norm": 1.7910646200180054, "learning_rate": 4.577885206360334e-06, "loss": 0.7665805220603943, "step": 1452 }, { "epoch": 3.067510548523207, "grad_norm": 2.6463685035705566, "learning_rate": 4.5695204836636655e-06, "loss": 0.617534875869751, "step": 1454 }, { "epoch": 3.071729957805907, "grad_norm": 3.8018009662628174, "learning_rate": 4.561154843622299e-06, "loss": 0.4436488151550293, "step": 1456 }, { "epoch": 3.0759493670886076, "grad_norm": 1.6345051527023315, "learning_rate": 4.552788331506134e-06, "loss": 0.45668232440948486, "step": 1458 }, { "epoch": 3.080168776371308, "grad_norm": 10.521839141845703, "learning_rate": 4.544420992589792e-06, "loss": 0.5640779733657837, "step": 1460 }, { "epoch": 3.0843881856540083, "grad_norm": 2.4284989833831787, "learning_rate": 4.53605287215237e-06, "loss": 0.7709170579910278, "step": 1462 }, { "epoch": 3.088607594936709, "grad_norm": 1.7529208660125732, "learning_rate": 4.527684015477188e-06, "loss": 0.7688764333724976, "step": 1464 }, { "epoch": 3.0928270042194095, "grad_norm": 29.2982177734375, "learning_rate": 4.519314467851555e-06, "loss": 0.7450973987579346, "step": 1466 }, { "epoch": 3.0970464135021096, "grad_norm": 6.658877372741699, "learning_rate": 4.510944274566518e-06, "loss": 0.5714298486709595, "step": 1468 }, { "epoch": 3.1012658227848102, "grad_norm": 10.68608283996582, "learning_rate": 4.502573480916617e-06, "loss": 0.17385733127593994, "step": 1470 }, { "epoch": 3.1054852320675104, "grad_norm": 5.784574508666992, "learning_rate": 4.494202132199643e-06, "loss": 0.9861471652984619, "step": 1472 }, { "epoch": 3.109704641350211, "grad_norm": 0.49223047494888306, "learning_rate": 4.485830273716386e-06, "loss": 0.3651547431945801, "step": 1474 }, { "epoch": 3.1139240506329116, "grad_norm": 3.3575212955474854, "learning_rate": 4.4774579507704e-06, "loss": 0.8966869115829468, "step": 1476 }, { "epoch": 3.1181434599156117, "grad_norm": 3.334437370300293, "learning_rate": 4.46908520866775e-06, "loss": 0.9988681077957153, "step": 1478 }, { "epoch": 3.1223628691983123, "grad_norm": 1.8539319038391113, "learning_rate": 4.460712092716768e-06, "loss": 0.7239236831665039, "step": 1480 }, { "epoch": 3.1265822784810124, "grad_norm": 3.4966561794281006, "learning_rate": 4.452338648227813e-06, "loss": 0.8891302347183228, "step": 1482 }, { "epoch": 3.130801687763713, "grad_norm": 2.0679361820220947, "learning_rate": 4.443964920513017e-06, "loss": 0.8403599262237549, "step": 1484 }, { "epoch": 3.1350210970464136, "grad_norm": 6.140291690826416, "learning_rate": 4.435590954886047e-06, "loss": 0.5731205940246582, "step": 1486 }, { "epoch": 3.1392405063291138, "grad_norm": 2.278001070022583, "learning_rate": 4.427216796661857e-06, "loss": 0.5262531638145447, "step": 1488 }, { "epoch": 3.1434599156118144, "grad_norm": 3.5377111434936523, "learning_rate": 4.418842491156445e-06, "loss": 0.8218955993652344, "step": 1490 }, { "epoch": 3.147679324894515, "grad_norm": 1.6017743349075317, "learning_rate": 4.410468083686605e-06, "loss": 0.5413227081298828, "step": 1492 }, { "epoch": 3.151898734177215, "grad_norm": 3.7507541179656982, "learning_rate": 4.402093619569679e-06, "loss": 0.7044564485549927, "step": 1494 }, { "epoch": 3.1561181434599157, "grad_norm": 3.2674598693847656, "learning_rate": 4.393719144123321e-06, "loss": 0.6519253253936768, "step": 1496 }, { "epoch": 3.160337552742616, "grad_norm": 1.4527074098587036, "learning_rate": 4.385344702665246e-06, "loss": 0.3425239622592926, "step": 1498 }, { "epoch": 3.1645569620253164, "grad_norm": 1.9561620950698853, "learning_rate": 4.376970340512979e-06, "loss": 0.4482334852218628, "step": 1500 }, { "epoch": 3.168776371308017, "grad_norm": 2.0695550441741943, "learning_rate": 4.368596102983623e-06, "loss": 0.7770338654518127, "step": 1502 }, { "epoch": 3.172995780590717, "grad_norm": 2.4615397453308105, "learning_rate": 4.360222035393603e-06, "loss": 0.6019558906555176, "step": 1504 }, { "epoch": 3.1772151898734178, "grad_norm": 2.1790032386779785, "learning_rate": 4.351848183058427e-06, "loss": 0.8018068075180054, "step": 1506 }, { "epoch": 3.181434599156118, "grad_norm": 3.2153782844543457, "learning_rate": 4.343474591292432e-06, "loss": 0.8963441848754883, "step": 1508 }, { "epoch": 3.1856540084388185, "grad_norm": 4.124518394470215, "learning_rate": 4.335101305408552e-06, "loss": 0.7522740960121155, "step": 1510 }, { "epoch": 3.189873417721519, "grad_norm": 0.789757251739502, "learning_rate": 4.3267283707180635e-06, "loss": 0.2408222258090973, "step": 1512 }, { "epoch": 3.1940928270042193, "grad_norm": 2.3271267414093018, "learning_rate": 4.31835583253034e-06, "loss": 0.7659440636634827, "step": 1514 }, { "epoch": 3.19831223628692, "grad_norm": 1.0062589645385742, "learning_rate": 4.309983736152612e-06, "loss": 0.5749263763427734, "step": 1516 }, { "epoch": 3.2025316455696204, "grad_norm": 3.043958902359009, "learning_rate": 4.301612126889719e-06, "loss": 0.7307943105697632, "step": 1518 }, { "epoch": 3.2067510548523206, "grad_norm": 4.167404651641846, "learning_rate": 4.293241050043863e-06, "loss": 0.6726250648498535, "step": 1520 }, { "epoch": 3.210970464135021, "grad_norm": 7.460043430328369, "learning_rate": 4.284870550914368e-06, "loss": 0.27290791273117065, "step": 1522 }, { "epoch": 3.2151898734177213, "grad_norm": 7.231512546539307, "learning_rate": 4.276500674797427e-06, "loss": 0.6644264459609985, "step": 1524 }, { "epoch": 3.219409282700422, "grad_norm": 2.330780029296875, "learning_rate": 4.268131466985867e-06, "loss": 0.5520614385604858, "step": 1526 }, { "epoch": 3.2236286919831225, "grad_norm": 0.9592808485031128, "learning_rate": 4.259762972768895e-06, "loss": 0.2992947995662689, "step": 1528 }, { "epoch": 3.2278481012658227, "grad_norm": 4.56012487411499, "learning_rate": 4.2513952374318556e-06, "loss": 0.6852157115936279, "step": 1530 }, { "epoch": 3.2320675105485233, "grad_norm": 1.3004289865493774, "learning_rate": 4.24302830625599e-06, "loss": 0.18629956245422363, "step": 1532 }, { "epoch": 3.2362869198312234, "grad_norm": 2.980001449584961, "learning_rate": 4.2346622245181864e-06, "loss": 0.6743506193161011, "step": 1534 }, { "epoch": 3.240506329113924, "grad_norm": 2.0032145977020264, "learning_rate": 4.226297037490735e-06, "loss": 0.779093861579895, "step": 1536 }, { "epoch": 3.2447257383966246, "grad_norm": 1.4844651222229004, "learning_rate": 4.217932790441087e-06, "loss": 0.7138203382492065, "step": 1538 }, { "epoch": 3.2489451476793247, "grad_norm": 9.559041023254395, "learning_rate": 4.209569528631604e-06, "loss": 0.726833701133728, "step": 1540 }, { "epoch": 3.2531645569620253, "grad_norm": 5.846635341644287, "learning_rate": 4.201207297319318e-06, "loss": 0.577594518661499, "step": 1542 }, { "epoch": 3.257383966244726, "grad_norm": 14.799168586730957, "learning_rate": 4.192846141755686e-06, "loss": 0.6153043508529663, "step": 1544 }, { "epoch": 3.261603375527426, "grad_norm": 4.5415568351745605, "learning_rate": 4.184486107186338e-06, "loss": 0.3514612317085266, "step": 1546 }, { "epoch": 3.2658227848101267, "grad_norm": 1.909765362739563, "learning_rate": 4.176127238850845e-06, "loss": 0.6445936560630798, "step": 1548 }, { "epoch": 3.270042194092827, "grad_norm": 4.638195037841797, "learning_rate": 4.1677695819824615e-06, "loss": 0.32674679160118103, "step": 1550 }, { "epoch": 3.2742616033755274, "grad_norm": 1.878891944885254, "learning_rate": 4.159413181807891e-06, "loss": 0.2638033628463745, "step": 1552 }, { "epoch": 3.278481012658228, "grad_norm": 6.237101078033447, "learning_rate": 4.151058083547031e-06, "loss": 0.46362948417663574, "step": 1554 }, { "epoch": 3.282700421940928, "grad_norm": 2.6107330322265625, "learning_rate": 4.142704332412738e-06, "loss": 0.767645001411438, "step": 1556 }, { "epoch": 3.2869198312236287, "grad_norm": 0.5579416155815125, "learning_rate": 4.1343519736105785e-06, "loss": 0.6301885843276978, "step": 1558 }, { "epoch": 3.291139240506329, "grad_norm": 2.669329881668091, "learning_rate": 4.126001052338581e-06, "loss": 0.4373775124549866, "step": 1560 }, { "epoch": 3.2953586497890295, "grad_norm": 3.803680181503296, "learning_rate": 4.1176516137870004e-06, "loss": 0.5417683720588684, "step": 1562 }, { "epoch": 3.29957805907173, "grad_norm": 7.650591850280762, "learning_rate": 4.109303703138063e-06, "loss": 0.7619826793670654, "step": 1564 }, { "epoch": 3.3037974683544302, "grad_norm": 12.526813507080078, "learning_rate": 4.1009573655657295e-06, "loss": 0.696597695350647, "step": 1566 }, { "epoch": 3.308016877637131, "grad_norm": 1.6425329446792603, "learning_rate": 4.092612646235447e-06, "loss": 0.4307796359062195, "step": 1568 }, { "epoch": 3.3122362869198314, "grad_norm": 3.352663516998291, "learning_rate": 4.084269590303907e-06, "loss": 0.3921862244606018, "step": 1570 }, { "epoch": 3.3164556962025316, "grad_norm": 2.774632215499878, "learning_rate": 4.075928242918798e-06, "loss": 0.4460093677043915, "step": 1572 }, { "epoch": 3.320675105485232, "grad_norm": 2.935922384262085, "learning_rate": 4.067588649218564e-06, "loss": 0.935857892036438, "step": 1574 }, { "epoch": 3.3248945147679323, "grad_norm": 2.058513641357422, "learning_rate": 4.059250854332159e-06, "loss": 0.6347423791885376, "step": 1576 }, { "epoch": 3.329113924050633, "grad_norm": 1.685788869857788, "learning_rate": 4.050914903378802e-06, "loss": 0.7031244039535522, "step": 1578 }, { "epoch": 3.3333333333333335, "grad_norm": 2.3649990558624268, "learning_rate": 4.0425808414677345e-06, "loss": 0.43982017040252686, "step": 1580 }, { "epoch": 3.3375527426160336, "grad_norm": 1.6167001724243164, "learning_rate": 4.034248713697977e-06, "loss": 0.40530964732170105, "step": 1582 }, { "epoch": 3.3417721518987342, "grad_norm": 10.420100212097168, "learning_rate": 4.025918565158079e-06, "loss": 0.6115049123764038, "step": 1584 }, { "epoch": 3.3459915611814344, "grad_norm": 7.325076580047607, "learning_rate": 4.0175904409258844e-06, "loss": 0.5467356443405151, "step": 1586 }, { "epoch": 3.350210970464135, "grad_norm": 2.629723310470581, "learning_rate": 4.009264386068281e-06, "loss": 0.3660237789154053, "step": 1588 }, { "epoch": 3.3544303797468356, "grad_norm": 1.3594039678573608, "learning_rate": 4.000940445640959e-06, "loss": 0.8356277942657471, "step": 1590 }, { "epoch": 3.3586497890295357, "grad_norm": 2.2674577236175537, "learning_rate": 3.992618664688165e-06, "loss": 0.7481639981269836, "step": 1592 }, { "epoch": 3.3628691983122363, "grad_norm": 0.9013781547546387, "learning_rate": 3.98429908824246e-06, "loss": 0.3871142268180847, "step": 1594 }, { "epoch": 3.367088607594937, "grad_norm": 3.642791271209717, "learning_rate": 3.975981761324477e-06, "loss": 0.4039541482925415, "step": 1596 }, { "epoch": 3.371308016877637, "grad_norm": 1.674315333366394, "learning_rate": 3.967666728942675e-06, "loss": 0.3363262712955475, "step": 1598 }, { "epoch": 3.3755274261603376, "grad_norm": 4.324201583862305, "learning_rate": 3.959354036093097e-06, "loss": 0.45887890458106995, "step": 1600 }, { "epoch": 3.379746835443038, "grad_norm": 1.464799404144287, "learning_rate": 3.951043727759125e-06, "loss": 0.47278836369514465, "step": 1602 }, { "epoch": 3.3839662447257384, "grad_norm": 0.5969643592834473, "learning_rate": 3.942735848911236e-06, "loss": 0.4599458575248718, "step": 1604 }, { "epoch": 3.388185654008439, "grad_norm": 34.39630126953125, "learning_rate": 3.9344304445067644e-06, "loss": 0.8346083164215088, "step": 1606 }, { "epoch": 3.392405063291139, "grad_norm": 2.1359801292419434, "learning_rate": 3.9261275594896495e-06, "loss": 0.532837450504303, "step": 1608 }, { "epoch": 3.3966244725738397, "grad_norm": 1.2699977159500122, "learning_rate": 3.9178272387902e-06, "loss": 0.2630946636199951, "step": 1610 }, { "epoch": 3.40084388185654, "grad_norm": 2.12693452835083, "learning_rate": 3.909529527324849e-06, "loss": 0.7574643492698669, "step": 1612 }, { "epoch": 3.4050632911392404, "grad_norm": 2.4170689582824707, "learning_rate": 3.9012344699959045e-06, "loss": 0.2519644498825073, "step": 1614 }, { "epoch": 3.409282700421941, "grad_norm": 3.567059278488159, "learning_rate": 3.892942111691319e-06, "loss": 0.6072185039520264, "step": 1616 }, { "epoch": 3.413502109704641, "grad_norm": 2.984300136566162, "learning_rate": 3.884652497284436e-06, "loss": 0.9044985771179199, "step": 1618 }, { "epoch": 3.4177215189873418, "grad_norm": 1.356608271598816, "learning_rate": 3.8763656716337496e-06, "loss": 0.8276529908180237, "step": 1620 }, { "epoch": 3.4219409282700424, "grad_norm": 2.127027750015259, "learning_rate": 3.868081679582664e-06, "loss": 0.45381200313568115, "step": 1622 }, { "epoch": 3.4261603375527425, "grad_norm": 9.49200439453125, "learning_rate": 3.8598005659592505e-06, "loss": 0.35857370495796204, "step": 1624 }, { "epoch": 3.430379746835443, "grad_norm": 7.919655799865723, "learning_rate": 3.851522375576004e-06, "loss": 0.2886282801628113, "step": 1626 }, { "epoch": 3.4345991561181437, "grad_norm": 2.5264101028442383, "learning_rate": 3.843247153229598e-06, "loss": 0.7439049482345581, "step": 1628 }, { "epoch": 3.438818565400844, "grad_norm": 3.5521364212036133, "learning_rate": 3.834974943700646e-06, "loss": 0.1677057147026062, "step": 1630 }, { "epoch": 3.4430379746835444, "grad_norm": 3.5299649238586426, "learning_rate": 3.82670579175346e-06, "loss": 0.867900013923645, "step": 1632 }, { "epoch": 3.4472573839662446, "grad_norm": 1.0358866453170776, "learning_rate": 3.818439742135804e-06, "loss": 0.4616679549217224, "step": 1634 }, { "epoch": 3.451476793248945, "grad_norm": 17.166608810424805, "learning_rate": 3.8101768395786555e-06, "loss": 0.8641064167022705, "step": 1636 }, { "epoch": 3.4556962025316453, "grad_norm": 3.9892635345458984, "learning_rate": 3.80191712879596e-06, "loss": 0.7791248559951782, "step": 1638 }, { "epoch": 3.459915611814346, "grad_norm": 3.8166918754577637, "learning_rate": 3.7936606544843936e-06, "loss": 0.8491038084030151, "step": 1640 }, { "epoch": 3.4641350210970465, "grad_norm": 12.44215202331543, "learning_rate": 3.7854074613231156e-06, "loss": 0.8689329624176025, "step": 1642 }, { "epoch": 3.4683544303797467, "grad_norm": 9.996358871459961, "learning_rate": 3.777157593973531e-06, "loss": 0.1393229067325592, "step": 1644 }, { "epoch": 3.4725738396624473, "grad_norm": 2.609384298324585, "learning_rate": 3.768911097079048e-06, "loss": 0.5422978401184082, "step": 1646 }, { "epoch": 3.476793248945148, "grad_norm": 1.4818031787872314, "learning_rate": 3.7606680152648363e-06, "loss": 0.6728254556655884, "step": 1648 }, { "epoch": 3.481012658227848, "grad_norm": 2.1601715087890625, "learning_rate": 3.752428393137582e-06, "loss": 0.35271987318992615, "step": 1650 }, { "epoch": 3.4852320675105486, "grad_norm": 2.9078116416931152, "learning_rate": 3.744192275285254e-06, "loss": 0.6402429938316345, "step": 1652 }, { "epoch": 3.489451476793249, "grad_norm": 1.4320260286331177, "learning_rate": 3.735959706276855e-06, "loss": 0.4159366488456726, "step": 1654 }, { "epoch": 3.4936708860759493, "grad_norm": 2.64323091506958, "learning_rate": 3.727730730662185e-06, "loss": 0.45933040976524353, "step": 1656 }, { "epoch": 3.49789029535865, "grad_norm": 5.375485897064209, "learning_rate": 3.719505392971597e-06, "loss": 0.7267172336578369, "step": 1658 }, { "epoch": 3.50210970464135, "grad_norm": 1.8793143033981323, "learning_rate": 3.7112837377157595e-06, "loss": 0.750633955001831, "step": 1660 }, { "epoch": 3.5063291139240507, "grad_norm": 3.7220072746276855, "learning_rate": 3.7030658093854116e-06, "loss": 0.7886282205581665, "step": 1662 }, { "epoch": 3.510548523206751, "grad_norm": 4.647188186645508, "learning_rate": 3.6948516524511284e-06, "loss": 0.4952489733695984, "step": 1664 }, { "epoch": 3.5147679324894514, "grad_norm": 4.3419575691223145, "learning_rate": 3.686641311363072e-06, "loss": 0.7061523199081421, "step": 1666 }, { "epoch": 3.518987341772152, "grad_norm": 2.792799949645996, "learning_rate": 3.678434830550758e-06, "loss": 0.4294711947441101, "step": 1668 }, { "epoch": 3.523206751054852, "grad_norm": 3.9279825687408447, "learning_rate": 3.670232254422812e-06, "loss": 0.6987364888191223, "step": 1670 }, { "epoch": 3.5274261603375527, "grad_norm": 4.345192909240723, "learning_rate": 3.6620336273667292e-06, "loss": 0.2978661060333252, "step": 1672 }, { "epoch": 3.5316455696202533, "grad_norm": 2.069209575653076, "learning_rate": 3.6538389937486356e-06, "loss": 0.4812040627002716, "step": 1674 }, { "epoch": 3.5358649789029535, "grad_norm": 14.877073287963867, "learning_rate": 3.6456483979130477e-06, "loss": 0.5612766146659851, "step": 1676 }, { "epoch": 3.540084388185654, "grad_norm": 6.497949600219727, "learning_rate": 3.6374618841826285e-06, "loss": 0.6456748843193054, "step": 1678 }, { "epoch": 3.5443037974683547, "grad_norm": 6.732306003570557, "learning_rate": 3.629279496857955e-06, "loss": 0.713530421257019, "step": 1680 }, { "epoch": 3.548523206751055, "grad_norm": 2.6278674602508545, "learning_rate": 3.621101280217272e-06, "loss": 0.6881183385848999, "step": 1682 }, { "epoch": 3.5527426160337554, "grad_norm": 5.155986785888672, "learning_rate": 3.612927278516257e-06, "loss": 0.5856807827949524, "step": 1684 }, { "epoch": 3.5569620253164556, "grad_norm": 6.799246788024902, "learning_rate": 3.6047575359877768e-06, "loss": 0.36446380615234375, "step": 1686 }, { "epoch": 3.561181434599156, "grad_norm": 1.061477541923523, "learning_rate": 3.596592096841651e-06, "loss": 0.4035094976425171, "step": 1688 }, { "epoch": 3.5654008438818563, "grad_norm": 4.340595245361328, "learning_rate": 3.5884310052644127e-06, "loss": 0.7940167188644409, "step": 1690 }, { "epoch": 3.569620253164557, "grad_norm": 8.34933853149414, "learning_rate": 3.580274305419067e-06, "loss": 0.25536781549453735, "step": 1692 }, { "epoch": 3.5738396624472575, "grad_norm": 0.517219603061676, "learning_rate": 3.572122041444853e-06, "loss": 0.3392212688922882, "step": 1694 }, { "epoch": 3.5780590717299576, "grad_norm": 7.081967830657959, "learning_rate": 3.5639742574570084e-06, "loss": 0.24323059618473053, "step": 1696 }, { "epoch": 3.5822784810126582, "grad_norm": 1.9360688924789429, "learning_rate": 3.5558309975465256e-06, "loss": 0.600135326385498, "step": 1698 }, { "epoch": 3.586497890295359, "grad_norm": 2.5145275592803955, "learning_rate": 3.5476923057799165e-06, "loss": 0.4567859172821045, "step": 1700 }, { "epoch": 3.590717299578059, "grad_norm": 3.178347110748291, "learning_rate": 3.53955822619897e-06, "loss": 0.4825342893600464, "step": 1702 }, { "epoch": 3.5949367088607596, "grad_norm": 2.0541470050811768, "learning_rate": 3.531428802820521e-06, "loss": 1.0025891065597534, "step": 1704 }, { "epoch": 3.59915611814346, "grad_norm": 1.977526307106018, "learning_rate": 3.5233040796362038e-06, "loss": 0.5798022747039795, "step": 1706 }, { "epoch": 3.6033755274261603, "grad_norm": 4.426157474517822, "learning_rate": 3.515184100612222e-06, "loss": 0.5708905458450317, "step": 1708 }, { "epoch": 3.607594936708861, "grad_norm": 0.8931450843811035, "learning_rate": 3.5070689096891045e-06, "loss": 0.3289738893508911, "step": 1710 }, { "epoch": 3.611814345991561, "grad_norm": 2.223947048187256, "learning_rate": 3.4989585507814684e-06, "loss": 0.6438009142875671, "step": 1712 }, { "epoch": 3.6160337552742616, "grad_norm": 4.223023414611816, "learning_rate": 3.4908530677777846e-06, "loss": 0.8552393913269043, "step": 1714 }, { "epoch": 3.620253164556962, "grad_norm": 1.7854139804840088, "learning_rate": 3.482752504540138e-06, "loss": 0.4675080180168152, "step": 1716 }, { "epoch": 3.6244725738396624, "grad_norm": 2.395404577255249, "learning_rate": 3.474656904903991e-06, "loss": 0.35858801007270813, "step": 1718 }, { "epoch": 3.628691983122363, "grad_norm": 4.765064239501953, "learning_rate": 3.466566312677946e-06, "loss": 0.3300427198410034, "step": 1720 }, { "epoch": 3.632911392405063, "grad_norm": 2.5676372051239014, "learning_rate": 3.458480771643507e-06, "loss": 0.7667765617370605, "step": 1722 }, { "epoch": 3.6371308016877637, "grad_norm": 5.16605281829834, "learning_rate": 3.4504003255548454e-06, "loss": 0.3946114182472229, "step": 1724 }, { "epoch": 3.6413502109704643, "grad_norm": 15.37302303314209, "learning_rate": 3.44232501813856e-06, "loss": 0.31146499514579773, "step": 1726 }, { "epoch": 3.6455696202531644, "grad_norm": 11.36103343963623, "learning_rate": 3.4342548930934447e-06, "loss": 0.7634888887405396, "step": 1728 }, { "epoch": 3.649789029535865, "grad_norm": 8.875736236572266, "learning_rate": 3.426189994090249e-06, "loss": 0.20420894026756287, "step": 1730 }, { "epoch": 3.6540084388185656, "grad_norm": 6.140727996826172, "learning_rate": 3.418130364771438e-06, "loss": 0.7999590635299683, "step": 1732 }, { "epoch": 3.6582278481012658, "grad_norm": 3.5605967044830322, "learning_rate": 3.4100760487509677e-06, "loss": 0.22376415133476257, "step": 1734 }, { "epoch": 3.6624472573839664, "grad_norm": 2.0715627670288086, "learning_rate": 3.4020270896140338e-06, "loss": 0.30320820212364197, "step": 1736 }, { "epoch": 3.6666666666666665, "grad_norm": 1.8760136365890503, "learning_rate": 3.3939835309168494e-06, "loss": 0.5345732569694519, "step": 1738 }, { "epoch": 3.670886075949367, "grad_norm": 5.121237277984619, "learning_rate": 3.385945416186402e-06, "loss": 0.25805044174194336, "step": 1740 }, { "epoch": 3.6751054852320673, "grad_norm": 1.5474026203155518, "learning_rate": 3.377912788920218e-06, "loss": 0.811784029006958, "step": 1742 }, { "epoch": 3.679324894514768, "grad_norm": 1.5448044538497925, "learning_rate": 3.3698856925861306e-06, "loss": 0.4863538146018982, "step": 1744 }, { "epoch": 3.6835443037974684, "grad_norm": 4.263956069946289, "learning_rate": 3.361864170622043e-06, "loss": 0.38036102056503296, "step": 1746 }, { "epoch": 3.6877637130801686, "grad_norm": 2.2748067378997803, "learning_rate": 3.3538482664356938e-06, "loss": 0.8080613613128662, "step": 1748 }, { "epoch": 3.691983122362869, "grad_norm": 2.969224214553833, "learning_rate": 3.345838023404419e-06, "loss": 0.7013299465179443, "step": 1750 }, { "epoch": 3.6962025316455698, "grad_norm": 2.08278751373291, "learning_rate": 3.3378334848749193e-06, "loss": 0.6944292187690735, "step": 1752 }, { "epoch": 3.70042194092827, "grad_norm": 2.476149797439575, "learning_rate": 3.329834694163032e-06, "loss": 0.8725452423095703, "step": 1754 }, { "epoch": 3.7046413502109705, "grad_norm": 18.001956939697266, "learning_rate": 3.321841694553482e-06, "loss": 0.6215965747833252, "step": 1756 }, { "epoch": 3.708860759493671, "grad_norm": 5.473003387451172, "learning_rate": 3.3138545292996636e-06, "loss": 0.7003090977668762, "step": 1758 }, { "epoch": 3.7130801687763713, "grad_norm": 24.688859939575195, "learning_rate": 3.305873241623395e-06, "loss": 0.6492451429367065, "step": 1760 }, { "epoch": 3.717299578059072, "grad_norm": 5.999505996704102, "learning_rate": 3.2978978747146886e-06, "loss": 0.27890729904174805, "step": 1762 }, { "epoch": 3.721518987341772, "grad_norm": 6.559441566467285, "learning_rate": 3.28992847173152e-06, "loss": 0.382098525762558, "step": 1764 }, { "epoch": 3.7257383966244726, "grad_norm": 2.456238269805908, "learning_rate": 3.2819650757995882e-06, "loss": 0.7096537947654724, "step": 1766 }, { "epoch": 3.7299578059071727, "grad_norm": 8.340387344360352, "learning_rate": 3.2740077300120874e-06, "loss": 0.5058803558349609, "step": 1768 }, { "epoch": 3.7341772151898733, "grad_norm": 2.959620952606201, "learning_rate": 3.2660564774294698e-06, "loss": 0.5690555572509766, "step": 1770 }, { "epoch": 3.738396624472574, "grad_norm": 1.9937338829040527, "learning_rate": 3.2581113610792186e-06, "loss": 0.6931591033935547, "step": 1772 }, { "epoch": 3.742616033755274, "grad_norm": 1.217617392539978, "learning_rate": 3.2501724239556093e-06, "loss": 0.20921635627746582, "step": 1774 }, { "epoch": 3.7468354430379747, "grad_norm": 0.2667827904224396, "learning_rate": 3.2422397090194763e-06, "loss": 0.3903126120567322, "step": 1776 }, { "epoch": 3.7510548523206753, "grad_norm": 3.232510566711426, "learning_rate": 3.2343132591979893e-06, "loss": 0.6602214574813843, "step": 1778 }, { "epoch": 3.7552742616033754, "grad_norm": 1.6198503971099854, "learning_rate": 3.2263931173844077e-06, "loss": 0.7261852025985718, "step": 1780 }, { "epoch": 3.759493670886076, "grad_norm": 2.057166814804077, "learning_rate": 3.2184793264378635e-06, "loss": 0.6649327278137207, "step": 1782 }, { "epoch": 3.7637130801687766, "grad_norm": 2.829087495803833, "learning_rate": 3.210571929183115e-06, "loss": 0.6382551789283752, "step": 1784 }, { "epoch": 3.7679324894514767, "grad_norm": 2.4798736572265625, "learning_rate": 3.2026709684103248e-06, "loss": 0.6738499402999878, "step": 1786 }, { "epoch": 3.7721518987341773, "grad_norm": 10.70611572265625, "learning_rate": 3.194776486874825e-06, "loss": 0.19844934344291687, "step": 1788 }, { "epoch": 3.7763713080168775, "grad_norm": 4.095230579376221, "learning_rate": 3.186888527296885e-06, "loss": 0.5124695301055908, "step": 1790 }, { "epoch": 3.780590717299578, "grad_norm": 2.3026554584503174, "learning_rate": 3.1790071323614794e-06, "loss": 0.6329219937324524, "step": 1792 }, { "epoch": 3.7848101265822782, "grad_norm": 5.607376575469971, "learning_rate": 3.1711323447180637e-06, "loss": 0.5636836290359497, "step": 1794 }, { "epoch": 3.789029535864979, "grad_norm": 2.444586992263794, "learning_rate": 3.163264206980336e-06, "loss": 0.6737933158874512, "step": 1796 }, { "epoch": 3.7932489451476794, "grad_norm": 4.4093451499938965, "learning_rate": 3.155402761726006e-06, "loss": 0.8205442428588867, "step": 1798 }, { "epoch": 3.7974683544303796, "grad_norm": 2.5362284183502197, "learning_rate": 3.1475480514965733e-06, "loss": 0.7304701209068298, "step": 1800 }, { "epoch": 3.80168776371308, "grad_norm": 1.82133150100708, "learning_rate": 3.139700118797088e-06, "loss": 0.7703126072883606, "step": 1802 }, { "epoch": 3.8059071729957807, "grad_norm": 1.8650217056274414, "learning_rate": 3.131859006095926e-06, "loss": 0.45118463039398193, "step": 1804 }, { "epoch": 3.810126582278481, "grad_norm": 17.568998336791992, "learning_rate": 3.124024755824554e-06, "loss": 0.2017352283000946, "step": 1806 }, { "epoch": 3.8143459915611815, "grad_norm": 3.5482592582702637, "learning_rate": 3.1161974103773066e-06, "loss": 0.728500485420227, "step": 1808 }, { "epoch": 3.818565400843882, "grad_norm": 2.8701515197753906, "learning_rate": 3.108377012111154e-06, "loss": 0.7613662481307983, "step": 1810 }, { "epoch": 3.8227848101265822, "grad_norm": 3.2422940731048584, "learning_rate": 3.10056360334547e-06, "loss": 0.37432968616485596, "step": 1812 }, { "epoch": 3.827004219409283, "grad_norm": 1.7439910173416138, "learning_rate": 3.0927572263618062e-06, "loss": 0.7083200216293335, "step": 1814 }, { "epoch": 3.831223628691983, "grad_norm": 3.794440746307373, "learning_rate": 3.084957923403662e-06, "loss": 0.7253645658493042, "step": 1816 }, { "epoch": 3.8354430379746836, "grad_norm": 8.467775344848633, "learning_rate": 3.0771657366762586e-06, "loss": 0.6260569095611572, "step": 1818 }, { "epoch": 3.8396624472573837, "grad_norm": 6.704847812652588, "learning_rate": 3.069380708346305e-06, "loss": 0.5025795698165894, "step": 1820 }, { "epoch": 3.8438818565400843, "grad_norm": 1.6902318000793457, "learning_rate": 3.061602880541776e-06, "loss": 0.6335855722427368, "step": 1822 }, { "epoch": 3.848101265822785, "grad_norm": 3.424485206604004, "learning_rate": 3.0538322953516807e-06, "loss": 0.5025821328163147, "step": 1824 }, { "epoch": 3.852320675105485, "grad_norm": 3.550658941268921, "learning_rate": 3.046068994825832e-06, "loss": 0.7374518513679504, "step": 1826 }, { "epoch": 3.8565400843881856, "grad_norm": 4.101608753204346, "learning_rate": 3.0383130209746287e-06, "loss": 0.7142576575279236, "step": 1828 }, { "epoch": 3.8607594936708862, "grad_norm": 1.8561471700668335, "learning_rate": 3.0305644157688175e-06, "loss": 0.6271055936813354, "step": 1830 }, { "epoch": 3.8649789029535864, "grad_norm": 19.705900192260742, "learning_rate": 3.022823221139272e-06, "loss": 0.3404349088668823, "step": 1832 }, { "epoch": 3.869198312236287, "grad_norm": 9.467658042907715, "learning_rate": 3.0150894789767627e-06, "loss": 0.5793641805648804, "step": 1834 }, { "epoch": 3.8734177215189876, "grad_norm": 6.555062294006348, "learning_rate": 3.007363231131733e-06, "loss": 0.5979642868041992, "step": 1836 }, { "epoch": 3.8776371308016877, "grad_norm": 12.590143203735352, "learning_rate": 2.9996445194140723e-06, "loss": 0.49834197759628296, "step": 1838 }, { "epoch": 3.8818565400843883, "grad_norm": 11.55475902557373, "learning_rate": 2.9919333855928875e-06, "loss": 0.7811706066131592, "step": 1840 }, { "epoch": 3.8860759493670884, "grad_norm": 1.6321529150009155, "learning_rate": 2.9842298713962795e-06, "loss": 0.4640495777130127, "step": 1842 }, { "epoch": 3.890295358649789, "grad_norm": 1.108053207397461, "learning_rate": 2.9765340185111134e-06, "loss": 0.5240273475646973, "step": 1844 }, { "epoch": 3.894514767932489, "grad_norm": 1.4660061597824097, "learning_rate": 2.968845868582799e-06, "loss": 0.6336109042167664, "step": 1846 }, { "epoch": 3.8987341772151898, "grad_norm": 7.276936054229736, "learning_rate": 2.961165463215062e-06, "loss": 0.48461082577705383, "step": 1848 }, { "epoch": 3.9029535864978904, "grad_norm": 1.9613572359085083, "learning_rate": 2.9534928439697186e-06, "loss": 0.6677671670913696, "step": 1850 }, { "epoch": 3.9071729957805905, "grad_norm": 1.520216464996338, "learning_rate": 2.9458280523664493e-06, "loss": 0.8395076990127563, "step": 1852 }, { "epoch": 3.911392405063291, "grad_norm": 3.0033154487609863, "learning_rate": 2.938171129882579e-06, "loss": 0.6944848299026489, "step": 1854 }, { "epoch": 3.9156118143459917, "grad_norm": 1.6401822566986084, "learning_rate": 2.930522117952847e-06, "loss": 0.7018183469772339, "step": 1856 }, { "epoch": 3.919831223628692, "grad_norm": 2.8167307376861572, "learning_rate": 2.922881057969188e-06, "loss": 0.7709340453147888, "step": 1858 }, { "epoch": 3.9240506329113924, "grad_norm": 2.7081515789031982, "learning_rate": 2.9152479912805028e-06, "loss": 0.7548224925994873, "step": 1860 }, { "epoch": 3.928270042194093, "grad_norm": 3.791499137878418, "learning_rate": 2.907622959192439e-06, "loss": 0.5371965169906616, "step": 1862 }, { "epoch": 3.932489451476793, "grad_norm": 2.503772497177124, "learning_rate": 2.9000060029671644e-06, "loss": 0.5366585850715637, "step": 1864 }, { "epoch": 3.9367088607594938, "grad_norm": 6.065065383911133, "learning_rate": 2.8923971638231466e-06, "loss": 0.9665102958679199, "step": 1866 }, { "epoch": 3.9409282700421944, "grad_norm": 2.7202789783477783, "learning_rate": 2.884796482934927e-06, "loss": 0.7356393337249756, "step": 1868 }, { "epoch": 3.9451476793248945, "grad_norm": 3.0500247478485107, "learning_rate": 2.877204001432899e-06, "loss": 0.5012904405593872, "step": 1870 }, { "epoch": 3.9493670886075947, "grad_norm": 0.8024043440818787, "learning_rate": 2.869619760403089e-06, "loss": 0.3538365662097931, "step": 1872 }, { "epoch": 3.9535864978902953, "grad_norm": 191.6532745361328, "learning_rate": 2.8620438008869264e-06, "loss": 0.434034138917923, "step": 1874 }, { "epoch": 3.957805907172996, "grad_norm": 2.8688251972198486, "learning_rate": 2.8544761638810277e-06, "loss": 0.6301808953285217, "step": 1876 }, { "epoch": 3.962025316455696, "grad_norm": 4.15130090713501, "learning_rate": 2.8469168903369733e-06, "loss": 0.596470832824707, "step": 1878 }, { "epoch": 3.9662447257383966, "grad_norm": 2.7118093967437744, "learning_rate": 2.8393660211610864e-06, "loss": 0.4589231610298157, "step": 1880 }, { "epoch": 3.970464135021097, "grad_norm": 2.9497010707855225, "learning_rate": 2.8318235972142075e-06, "loss": 0.7778608798980713, "step": 1882 }, { "epoch": 3.9746835443037973, "grad_norm": 10.08464241027832, "learning_rate": 2.824289659311481e-06, "loss": 0.3298872113227844, "step": 1884 }, { "epoch": 3.978902953586498, "grad_norm": 2.5433638095855713, "learning_rate": 2.8167642482221274e-06, "loss": 0.6300213932991028, "step": 1886 }, { "epoch": 3.9831223628691985, "grad_norm": 11.90830135345459, "learning_rate": 2.8092474046692227e-06, "loss": 0.4418677091598511, "step": 1888 }, { "epoch": 3.9873417721518987, "grad_norm": 4.765434741973877, "learning_rate": 2.801739169329486e-06, "loss": 0.6927688121795654, "step": 1890 }, { "epoch": 3.9915611814345993, "grad_norm": 6.100020408630371, "learning_rate": 2.7942395828330477e-06, "loss": 0.5399014949798584, "step": 1892 }, { "epoch": 3.9957805907173, "grad_norm": 1.7746268510818481, "learning_rate": 2.7867486857632417e-06, "loss": 0.7801375389099121, "step": 1894 }, { "epoch": 4.0, "grad_norm": 1.9672950506210327, "learning_rate": 2.7792665186563753e-06, "loss": 0.6976273059844971, "step": 1896 }, { "epoch": 4.0042194092827, "grad_norm": 2.0892131328582764, "learning_rate": 2.771793122001518e-06, "loss": 0.5950413942337036, "step": 1898 }, { "epoch": 4.008438818565401, "grad_norm": 3.418523073196411, "learning_rate": 2.764328536240274e-06, "loss": 0.48346221446990967, "step": 1900 }, { "epoch": 4.012658227848101, "grad_norm": 2.4079160690307617, "learning_rate": 2.7568728017665734e-06, "loss": 0.5231744647026062, "step": 1902 }, { "epoch": 4.0168776371308015, "grad_norm": 10.42201042175293, "learning_rate": 2.749425958926447e-06, "loss": 0.36587753891944885, "step": 1904 }, { "epoch": 4.0210970464135025, "grad_norm": 0.36827781796455383, "learning_rate": 2.7419880480178055e-06, "loss": 0.18869513273239136, "step": 1906 }, { "epoch": 4.025316455696203, "grad_norm": 2.4577043056488037, "learning_rate": 2.734559109290229e-06, "loss": 0.5424115061759949, "step": 1908 }, { "epoch": 4.029535864978903, "grad_norm": 7.176480293273926, "learning_rate": 2.7271391829447447e-06, "loss": 0.09614966064691544, "step": 1910 }, { "epoch": 4.033755274261603, "grad_norm": 2.078049898147583, "learning_rate": 2.71972830913361e-06, "loss": 0.5041449069976807, "step": 1912 }, { "epoch": 4.037974683544304, "grad_norm": 3.0364325046539307, "learning_rate": 2.712326527960096e-06, "loss": 0.6174269914627075, "step": 1914 }, { "epoch": 4.042194092827004, "grad_norm": 0.6836444139480591, "learning_rate": 2.704933879478268e-06, "loss": 0.3205277919769287, "step": 1916 }, { "epoch": 4.046413502109704, "grad_norm": 6.195359230041504, "learning_rate": 2.697550403692773e-06, "loss": 0.14734962582588196, "step": 1918 }, { "epoch": 4.050632911392405, "grad_norm": 2.888777732849121, "learning_rate": 2.69017614055862e-06, "loss": 0.5565149784088135, "step": 1920 }, { "epoch": 4.0548523206751055, "grad_norm": 12.064739227294922, "learning_rate": 2.682811129980962e-06, "loss": 0.47878050804138184, "step": 1922 }, { "epoch": 4.059071729957806, "grad_norm": 1.9031803607940674, "learning_rate": 2.6754554118148857e-06, "loss": 0.3945463299751282, "step": 1924 }, { "epoch": 4.063291139240507, "grad_norm": 6.993194103240967, "learning_rate": 2.668109025865191e-06, "loss": 0.2721104919910431, "step": 1926 }, { "epoch": 4.067510548523207, "grad_norm": 7.187300205230713, "learning_rate": 2.660772011886178e-06, "loss": 0.572750449180603, "step": 1928 }, { "epoch": 4.071729957805907, "grad_norm": 9.433985710144043, "learning_rate": 2.6534444095814334e-06, "loss": 0.14224952459335327, "step": 1930 }, { "epoch": 4.075949367088608, "grad_norm": 6.624326705932617, "learning_rate": 2.646126258603612e-06, "loss": 0.429046630859375, "step": 1932 }, { "epoch": 4.080168776371308, "grad_norm": 5.319462776184082, "learning_rate": 2.6388175985542193e-06, "loss": 0.4175564646720886, "step": 1934 }, { "epoch": 4.084388185654008, "grad_norm": 7.918082237243652, "learning_rate": 2.631518468983407e-06, "loss": 0.5208654403686523, "step": 1936 }, { "epoch": 4.0886075949367084, "grad_norm": 2.524588108062744, "learning_rate": 2.6242289093897533e-06, "loss": 0.30576610565185547, "step": 1938 }, { "epoch": 4.0928270042194095, "grad_norm": 17.760915756225586, "learning_rate": 2.6169489592200457e-06, "loss": 0.3638699948787689, "step": 1940 }, { "epoch": 4.09704641350211, "grad_norm": 3.6685545444488525, "learning_rate": 2.6096786578690738e-06, "loss": 0.2502339482307434, "step": 1942 }, { "epoch": 4.10126582278481, "grad_norm": 1.735503077507019, "learning_rate": 2.6024180446794133e-06, "loss": 0.2844234108924866, "step": 1944 }, { "epoch": 4.105485232067511, "grad_norm": 2.3414032459259033, "learning_rate": 2.5951671589412127e-06, "loss": 0.5370857119560242, "step": 1946 }, { "epoch": 4.109704641350211, "grad_norm": 9.196849822998047, "learning_rate": 2.587926039891983e-06, "loss": 0.45078617334365845, "step": 1948 }, { "epoch": 4.113924050632911, "grad_norm": 0.8639876842498779, "learning_rate": 2.580694726716379e-06, "loss": 0.3761923313140869, "step": 1950 }, { "epoch": 4.118143459915612, "grad_norm": 1.6307646036148071, "learning_rate": 2.573473258545997e-06, "loss": 0.44236212968826294, "step": 1952 }, { "epoch": 4.122362869198312, "grad_norm": 4.56338357925415, "learning_rate": 2.566261674459156e-06, "loss": 0.707075834274292, "step": 1954 }, { "epoch": 4.1265822784810124, "grad_norm": 3.0363290309906006, "learning_rate": 2.5590600134806873e-06, "loss": 0.12159548699855804, "step": 1956 }, { "epoch": 4.1308016877637135, "grad_norm": 2.8413619995117188, "learning_rate": 2.551868314581726e-06, "loss": 0.6649860739707947, "step": 1958 }, { "epoch": 4.135021097046414, "grad_norm": 4.109986782073975, "learning_rate": 2.544686616679497e-06, "loss": 0.6205018758773804, "step": 1960 }, { "epoch": 4.139240506329114, "grad_norm": 6.5747504234313965, "learning_rate": 2.537514958637107e-06, "loss": 0.37222298979759216, "step": 1962 }, { "epoch": 4.143459915611814, "grad_norm": 2.581211566925049, "learning_rate": 2.5303533792633306e-06, "loss": 0.4583626687526703, "step": 1964 }, { "epoch": 4.147679324894515, "grad_norm": 0.3391718864440918, "learning_rate": 2.5232019173124043e-06, "loss": 0.24545279145240784, "step": 1966 }, { "epoch": 4.151898734177215, "grad_norm": 3.354196071624756, "learning_rate": 2.5160606114838158e-06, "loss": 0.6107680797576904, "step": 1968 }, { "epoch": 4.156118143459915, "grad_norm": 1.9463728666305542, "learning_rate": 2.5089295004220927e-06, "loss": 0.41494786739349365, "step": 1970 }, { "epoch": 4.160337552742616, "grad_norm": 3.8241024017333984, "learning_rate": 2.5018086227165937e-06, "loss": 0.5631481409072876, "step": 1972 }, { "epoch": 4.1645569620253164, "grad_norm": 3.7971303462982178, "learning_rate": 2.494698016901302e-06, "loss": 0.13252116739749908, "step": 1974 }, { "epoch": 4.168776371308017, "grad_norm": 5.456217288970947, "learning_rate": 2.487597721454616e-06, "loss": 0.4099525213241577, "step": 1976 }, { "epoch": 4.172995780590718, "grad_norm": 18.906333923339844, "learning_rate": 2.4805077747991403e-06, "loss": 0.33811259269714355, "step": 1978 }, { "epoch": 4.177215189873418, "grad_norm": 11.150616645812988, "learning_rate": 2.473428215301474e-06, "loss": 0.2853623032569885, "step": 1980 }, { "epoch": 4.181434599156118, "grad_norm": 23.042560577392578, "learning_rate": 2.466359081272012e-06, "loss": 0.3581426441669464, "step": 1982 }, { "epoch": 4.185654008438819, "grad_norm": 4.002007007598877, "learning_rate": 2.459300410964731e-06, "loss": 0.3014911413192749, "step": 1984 }, { "epoch": 4.189873417721519, "grad_norm": 6.566624164581299, "learning_rate": 2.452252242576984e-06, "loss": 0.11508725583553314, "step": 1986 }, { "epoch": 4.194092827004219, "grad_norm": 18.410457611083984, "learning_rate": 2.445214614249294e-06, "loss": 0.3810286521911621, "step": 1988 }, { "epoch": 4.198312236286919, "grad_norm": 6.431080341339111, "learning_rate": 2.4381875640651466e-06, "loss": 0.20014682412147522, "step": 1990 }, { "epoch": 4.2025316455696204, "grad_norm": 3.2412610054016113, "learning_rate": 2.431171130050788e-06, "loss": 0.6001700162887573, "step": 1992 }, { "epoch": 4.206751054852321, "grad_norm": 3.1228854656219482, "learning_rate": 2.4241653501750117e-06, "loss": 0.29799264669418335, "step": 1994 }, { "epoch": 4.210970464135021, "grad_norm": 2.178508996963501, "learning_rate": 2.4171702623489588e-06, "loss": 0.5007591247558594, "step": 1996 }, { "epoch": 4.215189873417722, "grad_norm": 7.447211265563965, "learning_rate": 2.410185904425912e-06, "loss": 0.7163572907447815, "step": 1998 }, { "epoch": 4.219409282700422, "grad_norm": 2.8777246475219727, "learning_rate": 2.403212314201088e-06, "loss": 0.5820721387863159, "step": 2000 }, { "epoch": 4.223628691983122, "grad_norm": 4.454619884490967, "learning_rate": 2.3962495294114403e-06, "loss": 0.41988158226013184, "step": 2002 }, { "epoch": 4.227848101265823, "grad_norm": 4.4292426109313965, "learning_rate": 2.3892975877354452e-06, "loss": 0.14902547001838684, "step": 2004 }, { "epoch": 4.232067510548523, "grad_norm": 2.666948080062866, "learning_rate": 2.3823565267929036e-06, "loss": 0.6181389093399048, "step": 2006 }, { "epoch": 4.236286919831223, "grad_norm": 3.547452688217163, "learning_rate": 2.375426384144735e-06, "loss": 0.33217155933380127, "step": 2008 }, { "epoch": 4.2405063291139244, "grad_norm": 2.134594440460205, "learning_rate": 2.368507197292777e-06, "loss": 0.4793064594268799, "step": 2010 }, { "epoch": 4.244725738396625, "grad_norm": 25.654151916503906, "learning_rate": 2.361599003679582e-06, "loss": 0.13546811044216156, "step": 2012 }, { "epoch": 4.248945147679325, "grad_norm": 4.229970455169678, "learning_rate": 2.3547018406882104e-06, "loss": 0.3434482216835022, "step": 2014 }, { "epoch": 4.253164556962025, "grad_norm": 4.361436367034912, "learning_rate": 2.347815745642035e-06, "loss": 0.6057535409927368, "step": 2016 }, { "epoch": 4.257383966244726, "grad_norm": 17.874441146850586, "learning_rate": 2.340940755804532e-06, "loss": 0.5280637741088867, "step": 2018 }, { "epoch": 4.261603375527426, "grad_norm": 8.038070678710938, "learning_rate": 2.334076908379086e-06, "loss": 0.07331550121307373, "step": 2020 }, { "epoch": 4.265822784810126, "grad_norm": 0.6873889565467834, "learning_rate": 2.327224240508784e-06, "loss": 0.15723557770252228, "step": 2022 }, { "epoch": 4.270042194092827, "grad_norm": 4.693041801452637, "learning_rate": 2.3203827892762136e-06, "loss": 0.45733606815338135, "step": 2024 }, { "epoch": 4.274261603375527, "grad_norm": 21.652511596679688, "learning_rate": 2.313552591703267e-06, "loss": 0.20987409353256226, "step": 2026 }, { "epoch": 4.2784810126582276, "grad_norm": 1.620386004447937, "learning_rate": 2.3067336847509405e-06, "loss": 0.18322864174842834, "step": 2028 }, { "epoch": 4.282700421940929, "grad_norm": 0.5709916949272156, "learning_rate": 2.2999261053191264e-06, "loss": 0.264180064201355, "step": 2030 }, { "epoch": 4.286919831223629, "grad_norm": 6.232452392578125, "learning_rate": 2.2931298902464242e-06, "loss": 0.581986129283905, "step": 2032 }, { "epoch": 4.291139240506329, "grad_norm": 2.427851438522339, "learning_rate": 2.286345076309935e-06, "loss": 0.08267831802368164, "step": 2034 }, { "epoch": 4.29535864978903, "grad_norm": 7.5021843910217285, "learning_rate": 2.279571700225061e-06, "loss": 0.3914198875427246, "step": 2036 }, { "epoch": 4.29957805907173, "grad_norm": 17.116886138916016, "learning_rate": 2.272809798645313e-06, "loss": 0.4527243375778198, "step": 2038 }, { "epoch": 4.30379746835443, "grad_norm": 9.568516731262207, "learning_rate": 2.2660594081621068e-06, "loss": 0.5110298991203308, "step": 2040 }, { "epoch": 4.308016877637131, "grad_norm": 2.8458101749420166, "learning_rate": 2.259320565304568e-06, "loss": 0.3989183306694031, "step": 2042 }, { "epoch": 4.312236286919831, "grad_norm": 3.3316569328308105, "learning_rate": 2.2525933065393316e-06, "loss": 0.4240986406803131, "step": 2044 }, { "epoch": 4.3164556962025316, "grad_norm": 3.5117201805114746, "learning_rate": 2.2458776682703478e-06, "loss": 0.5510097146034241, "step": 2046 }, { "epoch": 4.320675105485232, "grad_norm": 2.211899757385254, "learning_rate": 2.2391736868386826e-06, "loss": 0.47137928009033203, "step": 2048 }, { "epoch": 4.324894514767933, "grad_norm": 2.8007261753082275, "learning_rate": 2.2324813985223236e-06, "loss": 0.13788414001464844, "step": 2050 }, { "epoch": 4.329113924050633, "grad_norm": 5.883923530578613, "learning_rate": 2.2258008395359814e-06, "loss": 0.21625080704689026, "step": 2052 }, { "epoch": 4.333333333333333, "grad_norm": 2.6445043087005615, "learning_rate": 2.2191320460308913e-06, "loss": 0.43525630235671997, "step": 2054 }, { "epoch": 4.337552742616034, "grad_norm": 4.206122875213623, "learning_rate": 2.2124750540946258e-06, "loss": 0.22658753395080566, "step": 2056 }, { "epoch": 4.341772151898734, "grad_norm": 7.528255462646484, "learning_rate": 2.2058298997508916e-06, "loss": 0.19083625078201294, "step": 2058 }, { "epoch": 4.345991561181434, "grad_norm": 2.3334925174713135, "learning_rate": 2.1991966189593375e-06, "loss": 0.5279438495635986, "step": 2060 }, { "epoch": 4.350210970464135, "grad_norm": 3.07808780670166, "learning_rate": 2.1925752476153598e-06, "loss": 0.5324735641479492, "step": 2062 }, { "epoch": 4.3544303797468356, "grad_norm": 7.293347358703613, "learning_rate": 2.1859658215499094e-06, "loss": 0.4442484378814697, "step": 2064 }, { "epoch": 4.358649789029536, "grad_norm": 3.767479419708252, "learning_rate": 2.1793683765292943e-06, "loss": 0.6478234529495239, "step": 2066 }, { "epoch": 4.362869198312236, "grad_norm": 1.7366708517074585, "learning_rate": 2.172782948254989e-06, "loss": 0.22714099287986755, "step": 2068 }, { "epoch": 4.367088607594937, "grad_norm": 2.4501614570617676, "learning_rate": 2.1662095723634387e-06, "loss": 0.7067612409591675, "step": 2070 }, { "epoch": 4.371308016877637, "grad_norm": 2.0209014415740967, "learning_rate": 2.159648284425872e-06, "loss": 0.6720374226570129, "step": 2072 }, { "epoch": 4.375527426160337, "grad_norm": 2.6613192558288574, "learning_rate": 2.1530991199481e-06, "loss": 0.46383750438690186, "step": 2074 }, { "epoch": 4.379746835443038, "grad_norm": 10.552399635314941, "learning_rate": 2.1465621143703354e-06, "loss": 0.4360678195953369, "step": 2076 }, { "epoch": 4.383966244725738, "grad_norm": 2.3267464637756348, "learning_rate": 2.1400373030669878e-06, "loss": 0.32150259613990784, "step": 2078 }, { "epoch": 4.3881856540084385, "grad_norm": 11.424999237060547, "learning_rate": 2.1335247213464816e-06, "loss": 0.6122124195098877, "step": 2080 }, { "epoch": 4.3924050632911396, "grad_norm": 1.8929657936096191, "learning_rate": 2.1270244044510596e-06, "loss": 0.29143026471138, "step": 2082 }, { "epoch": 4.39662447257384, "grad_norm": 5.961505889892578, "learning_rate": 2.120536387556597e-06, "loss": 0.44119709730148315, "step": 2084 }, { "epoch": 4.40084388185654, "grad_norm": 4.30864953994751, "learning_rate": 2.114060705772409e-06, "loss": 0.7014176845550537, "step": 2086 }, { "epoch": 4.405063291139241, "grad_norm": 2.612563371658325, "learning_rate": 2.107597394141057e-06, "loss": 0.5459550023078918, "step": 2088 }, { "epoch": 4.409282700421941, "grad_norm": 2.4660723209381104, "learning_rate": 2.1011464876381663e-06, "loss": 0.46325892210006714, "step": 2090 }, { "epoch": 4.413502109704641, "grad_norm": 4.131664276123047, "learning_rate": 2.0947080211722317e-06, "loss": 0.4953617453575134, "step": 2092 }, { "epoch": 4.417721518987342, "grad_norm": 1.9574029445648193, "learning_rate": 2.0882820295844285e-06, "loss": 0.5186775922775269, "step": 2094 }, { "epoch": 4.421940928270042, "grad_norm": 3.840588092803955, "learning_rate": 2.081868547648429e-06, "loss": 0.31746193766593933, "step": 2096 }, { "epoch": 4.4261603375527425, "grad_norm": 2.727635383605957, "learning_rate": 2.0754676100702045e-06, "loss": 0.7108813524246216, "step": 2098 }, { "epoch": 4.430379746835443, "grad_norm": 4.424046039581299, "learning_rate": 2.0690792514878495e-06, "loss": 0.48461851477622986, "step": 2100 }, { "epoch": 4.434599156118144, "grad_norm": 2.04559326171875, "learning_rate": 2.0627035064713857e-06, "loss": 0.4159836769104004, "step": 2102 }, { "epoch": 4.438818565400844, "grad_norm": 1.8618910312652588, "learning_rate": 2.056340409522577e-06, "loss": 0.36201441287994385, "step": 2104 }, { "epoch": 4.443037974683544, "grad_norm": 2.5027105808258057, "learning_rate": 2.049989995074746e-06, "loss": 0.5959118008613586, "step": 2106 }, { "epoch": 4.447257383966245, "grad_norm": 11.552289009094238, "learning_rate": 2.043652297492583e-06, "loss": 0.3659658432006836, "step": 2108 }, { "epoch": 4.451476793248945, "grad_norm": 4.931119441986084, "learning_rate": 2.037327351071963e-06, "loss": 0.48589879274368286, "step": 2110 }, { "epoch": 4.455696202531645, "grad_norm": 4.232883930206299, "learning_rate": 2.031015190039759e-06, "loss": 0.5243382453918457, "step": 2112 }, { "epoch": 4.459915611814346, "grad_norm": 0.3998461961746216, "learning_rate": 2.0247158485536565e-06, "loss": 0.5077897310256958, "step": 2114 }, { "epoch": 4.4641350210970465, "grad_norm": 1.7971662282943726, "learning_rate": 2.0184293607019707e-06, "loss": 0.2606506943702698, "step": 2116 }, { "epoch": 4.468354430379747, "grad_norm": 2.3619842529296875, "learning_rate": 2.012155760503458e-06, "loss": 0.543289065361023, "step": 2118 }, { "epoch": 4.472573839662447, "grad_norm": 1.1135996580123901, "learning_rate": 2.0058950819071384e-06, "loss": 0.08294013142585754, "step": 2120 }, { "epoch": 4.476793248945148, "grad_norm": 6.450394630432129, "learning_rate": 1.999647358792103e-06, "loss": 0.27434927225112915, "step": 2122 }, { "epoch": 4.481012658227848, "grad_norm": 9.028851509094238, "learning_rate": 1.993412624967339e-06, "loss": 0.18550115823745728, "step": 2124 }, { "epoch": 4.485232067510548, "grad_norm": 3.7954587936401367, "learning_rate": 1.9871909141715433e-06, "loss": 0.25095483660697937, "step": 2126 }, { "epoch": 4.489451476793249, "grad_norm": 2.933171033859253, "learning_rate": 1.980982260072936e-06, "loss": 0.29782503843307495, "step": 2128 }, { "epoch": 4.493670886075949, "grad_norm": 5.5410475730896, "learning_rate": 1.9747866962690864e-06, "loss": 0.37597131729125977, "step": 2130 }, { "epoch": 4.4978902953586495, "grad_norm": 7.844871997833252, "learning_rate": 1.9686042562867247e-06, "loss": 0.591028094291687, "step": 2132 }, { "epoch": 4.5021097046413505, "grad_norm": 5.038850784301758, "learning_rate": 1.962434973581564e-06, "loss": 0.45768237113952637, "step": 2134 }, { "epoch": 4.506329113924051, "grad_norm": 6.212744235992432, "learning_rate": 1.9562788815381164e-06, "loss": 0.11174334585666656, "step": 2136 }, { "epoch": 4.510548523206751, "grad_norm": 1.0894521474838257, "learning_rate": 1.950136013469515e-06, "loss": 0.1324283480644226, "step": 2138 }, { "epoch": 4.514767932489452, "grad_norm": 5.448882579803467, "learning_rate": 1.944006402617333e-06, "loss": 0.13975661993026733, "step": 2140 }, { "epoch": 4.518987341772152, "grad_norm": 0.6249382495880127, "learning_rate": 1.937890082151403e-06, "loss": 0.32427144050598145, "step": 2142 }, { "epoch": 4.523206751054852, "grad_norm": 11.115077018737793, "learning_rate": 1.9317870851696356e-06, "loss": 0.10621624439954758, "step": 2144 }, { "epoch": 4.527426160337553, "grad_norm": 3.9892995357513428, "learning_rate": 1.9256974446978464e-06, "loss": 0.38272783160209656, "step": 2146 }, { "epoch": 4.531645569620253, "grad_norm": 2.471816301345825, "learning_rate": 1.919621193689569e-06, "loss": 0.3882204294204712, "step": 2148 }, { "epoch": 4.5358649789029535, "grad_norm": 0.016054954379796982, "learning_rate": 1.9135583650258873e-06, "loss": 0.2680031657218933, "step": 2150 }, { "epoch": 4.540084388185654, "grad_norm": 2.9162821769714355, "learning_rate": 1.9075089915152464e-06, "loss": 0.3421184718608856, "step": 2152 }, { "epoch": 4.544303797468355, "grad_norm": 3.484391212463379, "learning_rate": 1.9014731058932827e-06, "loss": 0.5047986507415771, "step": 2154 }, { "epoch": 4.548523206751055, "grad_norm": 1.9593169689178467, "learning_rate": 1.8954507408226409e-06, "loss": 0.46260231733322144, "step": 2156 }, { "epoch": 4.552742616033755, "grad_norm": 2.716538667678833, "learning_rate": 1.8894419288928027e-06, "loss": 0.5966385006904602, "step": 2158 }, { "epoch": 4.556962025316456, "grad_norm": 3.1801514625549316, "learning_rate": 1.883446702619909e-06, "loss": 0.37797853350639343, "step": 2160 }, { "epoch": 4.561181434599156, "grad_norm": 2.5519282817840576, "learning_rate": 1.8774650944465816e-06, "loss": 0.4353446960449219, "step": 2162 }, { "epoch": 4.565400843881856, "grad_norm": 3.090348243713379, "learning_rate": 1.8714971367417503e-06, "loss": 0.36761924624443054, "step": 2164 }, { "epoch": 4.569620253164557, "grad_norm": 2.526357889175415, "learning_rate": 1.8655428618004757e-06, "loss": 0.5436191558837891, "step": 2166 }, { "epoch": 4.5738396624472575, "grad_norm": 6.702995777130127, "learning_rate": 1.8596023018437756e-06, "loss": 0.5698112845420837, "step": 2168 }, { "epoch": 4.578059071729958, "grad_norm": 21.6278133392334, "learning_rate": 1.8536754890184514e-06, "loss": 0.12127143144607544, "step": 2170 }, { "epoch": 4.582278481012658, "grad_norm": 2.4644062519073486, "learning_rate": 1.8477624553969126e-06, "loss": 0.3572949767112732, "step": 2172 }, { "epoch": 4.586497890295359, "grad_norm": 4.449887275695801, "learning_rate": 1.8418632329770014e-06, "loss": 0.4991232752799988, "step": 2174 }, { "epoch": 4.590717299578059, "grad_norm": 2.306753396987915, "learning_rate": 1.8359778536818252e-06, "loss": 0.6089332103729248, "step": 2176 }, { "epoch": 4.594936708860759, "grad_norm": 9.263266563415527, "learning_rate": 1.8301063493595794e-06, "loss": 0.44372105598449707, "step": 2178 }, { "epoch": 4.59915611814346, "grad_norm": 1.82095205783844, "learning_rate": 1.824248751783377e-06, "loss": 0.3401510715484619, "step": 2180 }, { "epoch": 4.60337552742616, "grad_norm": 2.3795061111450195, "learning_rate": 1.8184050926510743e-06, "loss": 0.5080521106719971, "step": 2182 }, { "epoch": 4.6075949367088604, "grad_norm": 29.6896915435791, "learning_rate": 1.8125754035851018e-06, "loss": 0.0813543051481247, "step": 2184 }, { "epoch": 4.6118143459915615, "grad_norm": 3.2905502319335938, "learning_rate": 1.806759716132293e-06, "loss": 0.5500208139419556, "step": 2186 }, { "epoch": 4.616033755274262, "grad_norm": 2.1505532264709473, "learning_rate": 1.800958061763712e-06, "loss": 0.26043060421943665, "step": 2188 }, { "epoch": 4.620253164556962, "grad_norm": 2.0198612213134766, "learning_rate": 1.7951704718744841e-06, "loss": 0.6140601634979248, "step": 2190 }, { "epoch": 4.624472573839663, "grad_norm": 2.324085235595703, "learning_rate": 1.7893969777836265e-06, "loss": 0.20785805583000183, "step": 2192 }, { "epoch": 4.628691983122363, "grad_norm": 2.0707149505615234, "learning_rate": 1.7836376107338783e-06, "loss": 0.5573110580444336, "step": 2194 }, { "epoch": 4.632911392405063, "grad_norm": 3.6579232215881348, "learning_rate": 1.7778924018915302e-06, "loss": 0.2335490882396698, "step": 2196 }, { "epoch": 4.637130801687764, "grad_norm": 2.841978073120117, "learning_rate": 1.772161382346259e-06, "loss": 0.3419453501701355, "step": 2198 }, { "epoch": 4.641350210970464, "grad_norm": 2.595341682434082, "learning_rate": 1.7664445831109566e-06, "loss": 0.535962700843811, "step": 2200 }, { "epoch": 4.6455696202531644, "grad_norm": 2.8027384281158447, "learning_rate": 1.7607420351215616e-06, "loss": 0.4780561923980713, "step": 2202 }, { "epoch": 4.649789029535865, "grad_norm": 0.4611937701702118, "learning_rate": 1.7550537692368942e-06, "loss": 0.3059866428375244, "step": 2204 }, { "epoch": 4.654008438818566, "grad_norm": 1.5873767137527466, "learning_rate": 1.74937981623849e-06, "loss": 0.46250712871551514, "step": 2206 }, { "epoch": 4.658227848101266, "grad_norm": 1.6936619281768799, "learning_rate": 1.7437202068304287e-06, "loss": 0.452869713306427, "step": 2208 }, { "epoch": 4.662447257383966, "grad_norm": 2.697862386703491, "learning_rate": 1.7380749716391737e-06, "loss": 0.5035865306854248, "step": 2210 }, { "epoch": 4.666666666666667, "grad_norm": 3.739734649658203, "learning_rate": 1.7324441412134013e-06, "loss": 0.3993757367134094, "step": 2212 }, { "epoch": 4.670886075949367, "grad_norm": 7.6267852783203125, "learning_rate": 1.7268277460238397e-06, "loss": 0.3390964865684509, "step": 2214 }, { "epoch": 4.675105485232067, "grad_norm": 1.8734283447265625, "learning_rate": 1.7212258164631027e-06, "loss": 0.5280478000640869, "step": 2216 }, { "epoch": 4.679324894514768, "grad_norm": 6.668360710144043, "learning_rate": 1.7156383828455204e-06, "loss": 0.4059964418411255, "step": 2218 }, { "epoch": 4.6835443037974684, "grad_norm": 2.475369930267334, "learning_rate": 1.710065475406983e-06, "loss": 0.4801621735095978, "step": 2220 }, { "epoch": 4.687763713080169, "grad_norm": 2.3857297897338867, "learning_rate": 1.7045071243047728e-06, "loss": 0.0963069349527359, "step": 2222 }, { "epoch": 4.691983122362869, "grad_norm": 2.433400869369507, "learning_rate": 1.6989633596174029e-06, "loss": 0.47518980503082275, "step": 2224 }, { "epoch": 4.69620253164557, "grad_norm": 2.3119516372680664, "learning_rate": 1.6934342113444524e-06, "loss": 0.2933182120323181, "step": 2226 }, { "epoch": 4.70042194092827, "grad_norm": 12.671791076660156, "learning_rate": 1.6879197094064043e-06, "loss": 0.08877721428871155, "step": 2228 }, { "epoch": 4.70464135021097, "grad_norm": 2.207108497619629, "learning_rate": 1.6824198836444858e-06, "loss": 0.622957706451416, "step": 2230 }, { "epoch": 4.708860759493671, "grad_norm": 6.14840030670166, "learning_rate": 1.676934763820503e-06, "loss": 0.5102095603942871, "step": 2232 }, { "epoch": 4.713080168776371, "grad_norm": 4.767087936401367, "learning_rate": 1.6714643796166835e-06, "loss": 0.5292322635650635, "step": 2234 }, { "epoch": 4.717299578059071, "grad_norm": 0.11136994510889053, "learning_rate": 1.6660087606355153e-06, "loss": 0.31627708673477173, "step": 2236 }, { "epoch": 4.7215189873417724, "grad_norm": 4.295990467071533, "learning_rate": 1.6605679363995848e-06, "loss": 0.33531737327575684, "step": 2238 }, { "epoch": 4.725738396624473, "grad_norm": 6.078010559082031, "learning_rate": 1.6551419363514182e-06, "loss": 0.43265092372894287, "step": 2240 }, { "epoch": 4.729957805907173, "grad_norm": 3.5457258224487305, "learning_rate": 1.6497307898533218e-06, "loss": 0.6657654047012329, "step": 2242 }, { "epoch": 4.734177215189874, "grad_norm": 0.8174402713775635, "learning_rate": 1.6443345261872228e-06, "loss": 0.05635060369968414, "step": 2244 }, { "epoch": 4.738396624472574, "grad_norm": 2.277449369430542, "learning_rate": 1.6389531745545138e-06, "loss": 0.40952473878860474, "step": 2246 }, { "epoch": 4.742616033755274, "grad_norm": 2.1519405841827393, "learning_rate": 1.6335867640758876e-06, "loss": 0.6268118023872375, "step": 2248 }, { "epoch": 4.746835443037975, "grad_norm": 10.723348617553711, "learning_rate": 1.6282353237911881e-06, "loss": 0.08097459375858307, "step": 2250 }, { "epoch": 4.751054852320675, "grad_norm": 6.452489852905273, "learning_rate": 1.6228988826592484e-06, "loss": 0.5121550559997559, "step": 2252 }, { "epoch": 4.755274261603375, "grad_norm": 3.1199183464050293, "learning_rate": 1.617577469557735e-06, "loss": 0.417529433965683, "step": 2254 }, { "epoch": 4.759493670886076, "grad_norm": 1.670754075050354, "learning_rate": 1.6122711132829917e-06, "loss": 0.23685501515865326, "step": 2256 }, { "epoch": 4.763713080168777, "grad_norm": 3.9786603450775146, "learning_rate": 1.606979842549883e-06, "loss": 0.08441432565450668, "step": 2258 }, { "epoch": 4.767932489451477, "grad_norm": 3.4749438762664795, "learning_rate": 1.60170368599164e-06, "loss": 0.1604347825050354, "step": 2260 }, { "epoch": 4.772151898734177, "grad_norm": 2.590517044067383, "learning_rate": 1.5964426721597048e-06, "loss": 0.3043164014816284, "step": 2262 }, { "epoch": 4.776371308016878, "grad_norm": 3.320221185684204, "learning_rate": 1.5911968295235756e-06, "loss": 0.5432933568954468, "step": 2264 }, { "epoch": 4.780590717299578, "grad_norm": 6.788969993591309, "learning_rate": 1.5859661864706533e-06, "loss": 0.4840553402900696, "step": 2266 }, { "epoch": 4.784810126582278, "grad_norm": 5.413600444793701, "learning_rate": 1.5807507713060879e-06, "loss": 0.6614431142807007, "step": 2268 }, { "epoch": 4.789029535864979, "grad_norm": 3.4263274669647217, "learning_rate": 1.5755506122526248e-06, "loss": 0.4286192059516907, "step": 2270 }, { "epoch": 4.793248945147679, "grad_norm": 3.0580050945281982, "learning_rate": 1.5703657374504516e-06, "loss": 0.7800706624984741, "step": 2272 }, { "epoch": 4.7974683544303796, "grad_norm": 6.522762775421143, "learning_rate": 1.565196174957049e-06, "loss": 0.2070183902978897, "step": 2274 }, { "epoch": 4.80168776371308, "grad_norm": 1.439235806465149, "learning_rate": 1.5600419527470331e-06, "loss": 0.10173705220222473, "step": 2276 }, { "epoch": 4.805907172995781, "grad_norm": 3.4041426181793213, "learning_rate": 1.5549030987120095e-06, "loss": 0.3341836929321289, "step": 2278 }, { "epoch": 4.810126582278481, "grad_norm": 3.3857309818267822, "learning_rate": 1.5497796406604202e-06, "loss": 0.20992735028266907, "step": 2280 }, { "epoch": 4.814345991561181, "grad_norm": 2.2873666286468506, "learning_rate": 1.5446716063173935e-06, "loss": 0.424138605594635, "step": 2282 }, { "epoch": 4.818565400843882, "grad_norm": 8.471506118774414, "learning_rate": 1.5395790233245924e-06, "loss": 0.5139745473861694, "step": 2284 }, { "epoch": 4.822784810126582, "grad_norm": 2.079385757446289, "learning_rate": 1.5345019192400677e-06, "loss": 0.494828999042511, "step": 2286 }, { "epoch": 4.827004219409282, "grad_norm": 2.116379737854004, "learning_rate": 1.529440321538107e-06, "loss": 0.12557630240917206, "step": 2288 }, { "epoch": 4.831223628691983, "grad_norm": 5.065046787261963, "learning_rate": 1.5243942576090872e-06, "loss": 0.6678446531295776, "step": 2290 }, { "epoch": 4.8354430379746836, "grad_norm": 2.4034457206726074, "learning_rate": 1.5193637547593231e-06, "loss": 0.4627326428890228, "step": 2292 }, { "epoch": 4.839662447257384, "grad_norm": 2.859379291534424, "learning_rate": 1.5143488402109239e-06, "loss": 0.44882258772850037, "step": 2294 }, { "epoch": 4.843881856540085, "grad_norm": 1.7705358266830444, "learning_rate": 1.509349541101646e-06, "loss": 0.4356788694858551, "step": 2296 }, { "epoch": 4.848101265822785, "grad_norm": 2.958854913711548, "learning_rate": 1.5043658844847414e-06, "loss": 0.7101269960403442, "step": 2298 }, { "epoch": 4.852320675105485, "grad_norm": 5.127024173736572, "learning_rate": 1.499397897328815e-06, "loss": 0.6652213931083679, "step": 2300 }, { "epoch": 4.856540084388186, "grad_norm": 0.7210382223129272, "learning_rate": 1.4944456065176785e-06, "loss": 0.23934832215309143, "step": 2302 }, { "epoch": 4.860759493670886, "grad_norm": 2.468538284301758, "learning_rate": 1.4895090388502043e-06, "loss": 0.26671305298805237, "step": 2304 }, { "epoch": 4.864978902953586, "grad_norm": 2.5370748043060303, "learning_rate": 1.4845882210401776e-06, "loss": 0.4928842782974243, "step": 2306 }, { "epoch": 4.869198312236287, "grad_norm": 2.779625654220581, "learning_rate": 1.479683179716159e-06, "loss": 0.2867523729801178, "step": 2308 }, { "epoch": 4.8734177215189876, "grad_norm": 4.043989658355713, "learning_rate": 1.4747939414213334e-06, "loss": 0.4452981948852539, "step": 2310 }, { "epoch": 4.877637130801688, "grad_norm": 2.612654209136963, "learning_rate": 1.4699205326133696e-06, "loss": 0.47218436002731323, "step": 2312 }, { "epoch": 4.881856540084388, "grad_norm": 4.8351593017578125, "learning_rate": 1.4650629796642774e-06, "loss": 0.5447877049446106, "step": 2314 }, { "epoch": 4.886075949367089, "grad_norm": 2.5699872970581055, "learning_rate": 1.460221308860262e-06, "loss": 0.5671508312225342, "step": 2316 }, { "epoch": 4.890295358649789, "grad_norm": 3.086909055709839, "learning_rate": 1.4553955464015868e-06, "loss": 0.39557531476020813, "step": 2318 }, { "epoch": 4.894514767932489, "grad_norm": 5.661040782928467, "learning_rate": 1.4505857184024262e-06, "loss": 0.44218748807907104, "step": 2320 }, { "epoch": 4.89873417721519, "grad_norm": 3.8085811138153076, "learning_rate": 1.4457918508907268e-06, "loss": 0.3575529456138611, "step": 2322 }, { "epoch": 4.90295358649789, "grad_norm": 2.3151283264160156, "learning_rate": 1.441013969808068e-06, "loss": 0.5917726755142212, "step": 2324 }, { "epoch": 4.9071729957805905, "grad_norm": 3.560556650161743, "learning_rate": 1.4362521010095186e-06, "loss": 0.33830514550209045, "step": 2326 }, { "epoch": 4.911392405063291, "grad_norm": 2.334346294403076, "learning_rate": 1.4315062702634997e-06, "loss": 0.4876287281513214, "step": 2328 }, { "epoch": 4.915611814345992, "grad_norm": 2.1452908515930176, "learning_rate": 1.426776503251643e-06, "loss": 0.6366673111915588, "step": 2330 }, { "epoch": 4.919831223628692, "grad_norm": 0.41547343134880066, "learning_rate": 1.4220628255686533e-06, "loss": 0.25237974524497986, "step": 2332 }, { "epoch": 4.924050632911392, "grad_norm": 54.84702682495117, "learning_rate": 1.4173652627221686e-06, "loss": 0.43499624729156494, "step": 2334 }, { "epoch": 4.928270042194093, "grad_norm": 2.2124152183532715, "learning_rate": 1.4126838401326243e-06, "loss": 0.5627238750457764, "step": 2336 }, { "epoch": 4.932489451476793, "grad_norm": 7.185441017150879, "learning_rate": 1.4080185831331126e-06, "loss": 0.25834035873413086, "step": 2338 }, { "epoch": 4.936708860759493, "grad_norm": 5.250967502593994, "learning_rate": 1.4033695169692485e-06, "loss": 0.2957782447338104, "step": 2340 }, { "epoch": 4.940928270042194, "grad_norm": 6.259135723114014, "learning_rate": 1.398736666799031e-06, "loss": 0.6378402709960938, "step": 2342 }, { "epoch": 4.9451476793248945, "grad_norm": 16.73641586303711, "learning_rate": 1.3941200576927088e-06, "loss": 0.35595816373825073, "step": 2344 }, { "epoch": 4.949367088607595, "grad_norm": 3.1287739276885986, "learning_rate": 1.3895197146326414e-06, "loss": 0.7204777002334595, "step": 2346 }, { "epoch": 4.953586497890296, "grad_norm": 2.49485445022583, "learning_rate": 1.3849356625131692e-06, "loss": 0.3135877847671509, "step": 2348 }, { "epoch": 4.957805907172996, "grad_norm": 3.383075714111328, "learning_rate": 1.3803679261404716e-06, "loss": 0.49237698316574097, "step": 2350 }, { "epoch": 4.962025316455696, "grad_norm": 2.2556025981903076, "learning_rate": 1.3758165302324397e-06, "loss": 0.16111743450164795, "step": 2352 }, { "epoch": 4.966244725738397, "grad_norm": 4.6603546142578125, "learning_rate": 1.3712814994185395e-06, "loss": 0.6392441987991333, "step": 2354 }, { "epoch": 4.970464135021097, "grad_norm": 12.963358879089355, "learning_rate": 1.366762858239679e-06, "loss": 0.35483598709106445, "step": 2356 }, { "epoch": 4.974683544303797, "grad_norm": 3.961883068084717, "learning_rate": 1.3622606311480729e-06, "loss": 0.5934839248657227, "step": 2358 }, { "epoch": 4.978902953586498, "grad_norm": 1.0137317180633545, "learning_rate": 1.3577748425071152e-06, "loss": 0.28861305117607117, "step": 2360 }, { "epoch": 4.9831223628691985, "grad_norm": 3.8444621562957764, "learning_rate": 1.3533055165912433e-06, "loss": 0.5528509616851807, "step": 2362 }, { "epoch": 4.987341772151899, "grad_norm": 12.442330360412598, "learning_rate": 1.3488526775858087e-06, "loss": 0.6871875524520874, "step": 2364 }, { "epoch": 4.991561181434599, "grad_norm": 2.018998622894287, "learning_rate": 1.3444163495869444e-06, "loss": 0.6601129770278931, "step": 2366 }, { "epoch": 4.9957805907173, "grad_norm": 4.588871002197266, "learning_rate": 1.3399965566014363e-06, "loss": 0.3472335934638977, "step": 2368 }, { "epoch": 5.0, "grad_norm": 20.997011184692383, "learning_rate": 1.3355933225465938e-06, "loss": 0.1488598883152008, "step": 2370 }, { "epoch": 5.0042194092827, "grad_norm": 1.974225401878357, "learning_rate": 1.3312066712501176e-06, "loss": 0.4649539589881897, "step": 2372 }, { "epoch": 5.008438818565401, "grad_norm": 2.101741075515747, "learning_rate": 1.3268366264499723e-06, "loss": 0.40653684735298157, "step": 2374 }, { "epoch": 5.012658227848101, "grad_norm": 3.2043211460113525, "learning_rate": 1.322483211794259e-06, "loss": 0.20105722546577454, "step": 2376 }, { "epoch": 5.0168776371308015, "grad_norm": 3.830211877822876, "learning_rate": 1.3181464508410858e-06, "loss": 0.4869913160800934, "step": 2378 }, { "epoch": 5.0210970464135025, "grad_norm": 3.2576708793640137, "learning_rate": 1.3138263670584392e-06, "loss": 0.3144640028476715, "step": 2380 }, { "epoch": 5.025316455696203, "grad_norm": 2.3268511295318604, "learning_rate": 1.309522983824061e-06, "loss": 0.4795665144920349, "step": 2382 }, { "epoch": 5.029535864978903, "grad_norm": 4.2797112464904785, "learning_rate": 1.3052363244253188e-06, "loss": 0.303976833820343, "step": 2384 }, { "epoch": 5.033755274261603, "grad_norm": 2.950766086578369, "learning_rate": 1.3009664120590806e-06, "loss": 0.2566067576408386, "step": 2386 }, { "epoch": 5.037974683544304, "grad_norm": 4.176736354827881, "learning_rate": 1.296713269831589e-06, "loss": 0.33072197437286377, "step": 2388 }, { "epoch": 5.042194092827004, "grad_norm": 0.061758268624544144, "learning_rate": 1.2924769207583368e-06, "loss": 0.17066842317581177, "step": 2390 }, { "epoch": 5.046413502109704, "grad_norm": 4.471809387207031, "learning_rate": 1.2882573877639427e-06, "loss": 0.24980589747428894, "step": 2392 }, { "epoch": 5.050632911392405, "grad_norm": 4.955497741699219, "learning_rate": 1.2840546936820263e-06, "loss": 0.2576749622821808, "step": 2394 }, { "epoch": 5.0548523206751055, "grad_norm": 0.7454743981361389, "learning_rate": 1.2798688612550838e-06, "loss": 0.040055617690086365, "step": 2396 }, { "epoch": 5.059071729957806, "grad_norm": 2.270764112472534, "learning_rate": 1.2756999131343677e-06, "loss": 0.4545499086380005, "step": 2398 }, { "epoch": 5.063291139240507, "grad_norm": 8.608718872070312, "learning_rate": 1.271547871879762e-06, "loss": 0.46691781282424927, "step": 2400 }, { "epoch": 5.067510548523207, "grad_norm": 4.590333938598633, "learning_rate": 1.267412759959661e-06, "loss": 0.3534661829471588, "step": 2402 }, { "epoch": 5.071729957805907, "grad_norm": 6.176363468170166, "learning_rate": 1.2632945997508469e-06, "loss": 0.03008463606238365, "step": 2404 }, { "epoch": 5.075949367088608, "grad_norm": 4.096558570861816, "learning_rate": 1.25919341353837e-06, "loss": 0.4609118103981018, "step": 2406 }, { "epoch": 5.080168776371308, "grad_norm": 1.3339556455612183, "learning_rate": 1.2551092235154265e-06, "loss": 0.25634127855300903, "step": 2408 }, { "epoch": 5.084388185654008, "grad_norm": 3.1009860038757324, "learning_rate": 1.2510420517832399e-06, "loss": 0.3237183690071106, "step": 2410 }, { "epoch": 5.0886075949367084, "grad_norm": 6.112014293670654, "learning_rate": 1.2469919203509406e-06, "loss": 0.45163053274154663, "step": 2412 }, { "epoch": 5.0928270042194095, "grad_norm": 1.8072830438613892, "learning_rate": 1.2429588511354468e-06, "loss": 0.3245161175727844, "step": 2414 }, { "epoch": 5.09704641350211, "grad_norm": 1.5899354219436646, "learning_rate": 1.2389428659613465e-06, "loss": 0.09791871905326843, "step": 2416 }, { "epoch": 5.10126582278481, "grad_norm": 2.595155954360962, "learning_rate": 1.2349439865607783e-06, "loss": 0.20728906989097595, "step": 2418 }, { "epoch": 5.105485232067511, "grad_norm": 2.6492655277252197, "learning_rate": 1.2309622345733153e-06, "loss": 0.52880859375, "step": 2420 }, { "epoch": 5.109704641350211, "grad_norm": 3.113187789916992, "learning_rate": 1.226997631545846e-06, "loss": 0.34188008308410645, "step": 2422 }, { "epoch": 5.113924050632911, "grad_norm": 0.3923889100551605, "learning_rate": 1.2230501989324606e-06, "loss": 0.39657163619995117, "step": 2424 }, { "epoch": 5.118143459915612, "grad_norm": 2.9111595153808594, "learning_rate": 1.219119958094331e-06, "loss": 0.37215137481689453, "step": 2426 }, { "epoch": 5.122362869198312, "grad_norm": 5.908500671386719, "learning_rate": 1.215206930299599e-06, "loss": 0.28079548478126526, "step": 2428 }, { "epoch": 5.1265822784810124, "grad_norm": 0.15474487841129303, "learning_rate": 1.2113111367232582e-06, "loss": 0.16562075912952423, "step": 2430 }, { "epoch": 5.1308016877637135, "grad_norm": 2.4118106365203857, "learning_rate": 1.2074325984470428e-06, "loss": 0.3783321678638458, "step": 2432 }, { "epoch": 5.135021097046414, "grad_norm": 2.499565601348877, "learning_rate": 1.2035713364593102e-06, "loss": 0.4123075604438782, "step": 2434 }, { "epoch": 5.139240506329114, "grad_norm": 3.3785624504089355, "learning_rate": 1.1997273716549284e-06, "loss": 0.25959959626197815, "step": 2436 }, { "epoch": 5.143459915611814, "grad_norm": 3.4707491397857666, "learning_rate": 1.195900724835164e-06, "loss": 0.03673313558101654, "step": 2438 }, { "epoch": 5.147679324894515, "grad_norm": 2.67907977104187, "learning_rate": 1.1920914167075696e-06, "loss": 0.2947133779525757, "step": 2440 }, { "epoch": 5.151898734177215, "grad_norm": 3.3348686695098877, "learning_rate": 1.1882994678858675e-06, "loss": 0.3776189684867859, "step": 2442 }, { "epoch": 5.156118143459915, "grad_norm": 3.6222927570343018, "learning_rate": 1.1845248988898464e-06, "loss": 0.2443552017211914, "step": 2444 }, { "epoch": 5.160337552742616, "grad_norm": 3.27504301071167, "learning_rate": 1.1807677301452437e-06, "loss": 0.5414304733276367, "step": 2446 }, { "epoch": 5.1645569620253164, "grad_norm": 3.2731869220733643, "learning_rate": 1.1770279819836355e-06, "loss": 0.18883806467056274, "step": 2448 }, { "epoch": 5.168776371308017, "grad_norm": 3.275451421737671, "learning_rate": 1.1733056746423304e-06, "loss": 0.37931862473487854, "step": 2450 }, { "epoch": 5.172995780590718, "grad_norm": 0.034749243408441544, "learning_rate": 1.1696008282642559e-06, "loss": 0.20449881255626678, "step": 2452 }, { "epoch": 5.177215189873418, "grad_norm": 4.930509567260742, "learning_rate": 1.165913462897852e-06, "loss": 0.035537637770175934, "step": 2454 }, { "epoch": 5.181434599156118, "grad_norm": 1.7042666673660278, "learning_rate": 1.1622435984969602e-06, "loss": 0.20552217960357666, "step": 2456 }, { "epoch": 5.185654008438819, "grad_norm": 14.41930103302002, "learning_rate": 1.1585912549207196e-06, "loss": 0.3709006607532501, "step": 2458 }, { "epoch": 5.189873417721519, "grad_norm": 6.651240348815918, "learning_rate": 1.1549564519334556e-06, "loss": 0.18409161269664764, "step": 2460 }, { "epoch": 5.194092827004219, "grad_norm": 2.1087753772735596, "learning_rate": 1.1513392092045736e-06, "loss": 0.39856773614883423, "step": 2462 }, { "epoch": 5.198312236286919, "grad_norm": 2.5043351650238037, "learning_rate": 1.147739546308455e-06, "loss": 0.3595339059829712, "step": 2464 }, { "epoch": 5.2025316455696204, "grad_norm": 3.3729639053344727, "learning_rate": 1.1441574827243478e-06, "loss": 0.17214104533195496, "step": 2466 }, { "epoch": 5.206751054852321, "grad_norm": 2.227221965789795, "learning_rate": 1.1405930378362648e-06, "loss": 0.3033697009086609, "step": 2468 }, { "epoch": 5.210970464135021, "grad_norm": 5.146759510040283, "learning_rate": 1.1370462309328743e-06, "loss": 0.36619800329208374, "step": 2470 }, { "epoch": 5.215189873417722, "grad_norm": 3.0241358280181885, "learning_rate": 1.1335170812073999e-06, "loss": 0.30589285492897034, "step": 2472 }, { "epoch": 5.219409282700422, "grad_norm": 2.229212522506714, "learning_rate": 1.1300056077575154e-06, "loss": 0.3369923233985901, "step": 2474 }, { "epoch": 5.223628691983122, "grad_norm": 2.9154298305511475, "learning_rate": 1.1265118295852404e-06, "loss": 0.19644200801849365, "step": 2476 }, { "epoch": 5.227848101265823, "grad_norm": 5.483319282531738, "learning_rate": 1.1230357655968371e-06, "loss": 0.06274639070034027, "step": 2478 }, { "epoch": 5.232067510548523, "grad_norm": 0.5124228596687317, "learning_rate": 1.119577434602711e-06, "loss": 0.20770075917243958, "step": 2480 }, { "epoch": 5.236286919831223, "grad_norm": 2.4156696796417236, "learning_rate": 1.116136855317307e-06, "loss": 0.29468050599098206, "step": 2482 }, { "epoch": 5.2405063291139244, "grad_norm": 1.4421368837356567, "learning_rate": 1.1127140463590055e-06, "loss": 0.23361340165138245, "step": 2484 }, { "epoch": 5.244725738396625, "grad_norm": 6.403635025024414, "learning_rate": 1.1093090262500266e-06, "loss": 0.4423346519470215, "step": 2486 }, { "epoch": 5.248945147679325, "grad_norm": 4.7648606300354, "learning_rate": 1.105921813416328e-06, "loss": 0.5721250772476196, "step": 2488 }, { "epoch": 5.253164556962025, "grad_norm": 4.072231292724609, "learning_rate": 1.1025524261875041e-06, "loss": 0.5335391163825989, "step": 2490 }, { "epoch": 5.257383966244726, "grad_norm": 2.6612138748168945, "learning_rate": 1.0992008827966874e-06, "loss": 0.5658106803894043, "step": 2492 }, { "epoch": 5.261603375527426, "grad_norm": 2.578683614730835, "learning_rate": 1.095867201380451e-06, "loss": 0.41335171461105347, "step": 2494 }, { "epoch": 5.265822784810126, "grad_norm": 5.410678863525391, "learning_rate": 1.0925513999787086e-06, "loss": 0.15254725515842438, "step": 2496 }, { "epoch": 5.270042194092827, "grad_norm": 3.9477524757385254, "learning_rate": 1.0892534965346192e-06, "loss": 0.44648611545562744, "step": 2498 }, { "epoch": 5.274261603375527, "grad_norm": 7.086328506469727, "learning_rate": 1.0859735088944868e-06, "loss": 0.16064085066318512, "step": 2500 }, { "epoch": 5.2784810126582276, "grad_norm": 2.0906810760498047, "learning_rate": 1.0827114548076663e-06, "loss": 0.2642805874347687, "step": 2502 }, { "epoch": 5.282700421940929, "grad_norm": 2.104074716567993, "learning_rate": 1.0794673519264675e-06, "loss": 0.24389728903770447, "step": 2504 }, { "epoch": 5.286919831223629, "grad_norm": 6.062882423400879, "learning_rate": 1.0762412178060587e-06, "loss": 0.31626439094543457, "step": 2506 }, { "epoch": 5.291139240506329, "grad_norm": 2.444314956665039, "learning_rate": 1.0730330699043717e-06, "loss": 0.4520007371902466, "step": 2508 }, { "epoch": 5.29535864978903, "grad_norm": 5.062936305999756, "learning_rate": 1.0698429255820068e-06, "loss": 0.09191440790891647, "step": 2510 }, { "epoch": 5.29957805907173, "grad_norm": 2.516993761062622, "learning_rate": 1.0666708021021406e-06, "loss": 0.21818026900291443, "step": 2512 }, { "epoch": 5.30379746835443, "grad_norm": 3.7304656505584717, "learning_rate": 1.063516716630432e-06, "loss": 0.33304083347320557, "step": 2514 }, { "epoch": 5.308016877637131, "grad_norm": 2.554382562637329, "learning_rate": 1.0603806862349255e-06, "loss": 0.3670189380645752, "step": 2516 }, { "epoch": 5.312236286919831, "grad_norm": 2.6083078384399414, "learning_rate": 1.0572627278859675e-06, "loss": 0.4783245027065277, "step": 2518 }, { "epoch": 5.3164556962025316, "grad_norm": 4.347960472106934, "learning_rate": 1.0541628584561052e-06, "loss": 0.4460408687591553, "step": 2520 }, { "epoch": 5.320675105485232, "grad_norm": 4.647004127502441, "learning_rate": 1.0510810947200003e-06, "loss": 0.3045784533023834, "step": 2522 }, { "epoch": 5.324894514767933, "grad_norm": 3.011627197265625, "learning_rate": 1.0480174533543372e-06, "loss": 0.33729833364486694, "step": 2524 }, { "epoch": 5.329113924050633, "grad_norm": 2.8627593517303467, "learning_rate": 1.044971950937734e-06, "loss": 0.5005810260772705, "step": 2526 }, { "epoch": 5.333333333333333, "grad_norm": 4.140803337097168, "learning_rate": 1.041944603950649e-06, "loss": 0.44916412234306335, "step": 2528 }, { "epoch": 5.337552742616034, "grad_norm": 3.677198886871338, "learning_rate": 1.038935428775296e-06, "loss": 0.5101116299629211, "step": 2530 }, { "epoch": 5.341772151898734, "grad_norm": 3.9427692890167236, "learning_rate": 1.0359444416955528e-06, "loss": 0.3052045702934265, "step": 2532 }, { "epoch": 5.345991561181434, "grad_norm": 3.024719715118408, "learning_rate": 1.0329716588968745e-06, "loss": 0.2897722125053406, "step": 2534 }, { "epoch": 5.350210970464135, "grad_norm": 2.671980619430542, "learning_rate": 1.030017096466205e-06, "loss": 0.3393900692462921, "step": 2536 }, { "epoch": 5.3544303797468356, "grad_norm": 2.4032411575317383, "learning_rate": 1.027080770391891e-06, "loss": 0.40280789136886597, "step": 2538 }, { "epoch": 5.358649789029536, "grad_norm": 1.9715185165405273, "learning_rate": 1.0241626965635942e-06, "loss": 0.2567780017852783, "step": 2540 }, { "epoch": 5.362869198312236, "grad_norm": 14.593756675720215, "learning_rate": 1.0212628907722062e-06, "loss": 0.04668917506933212, "step": 2542 }, { "epoch": 5.367088607594937, "grad_norm": 0.3638130724430084, "learning_rate": 1.0183813687097618e-06, "loss": 0.16572636365890503, "step": 2544 }, { "epoch": 5.371308016877637, "grad_norm": 4.365072250366211, "learning_rate": 1.0155181459693565e-06, "loss": 0.3552468717098236, "step": 2546 }, { "epoch": 5.375527426160337, "grad_norm": 2.508363723754883, "learning_rate": 1.0126732380450596e-06, "loss": 0.38389939069747925, "step": 2548 }, { "epoch": 5.379746835443038, "grad_norm": 3.3055357933044434, "learning_rate": 1.0098466603318323e-06, "loss": 0.31817764043807983, "step": 2550 }, { "epoch": 5.383966244725738, "grad_norm": 3.0569851398468018, "learning_rate": 1.0070384281254425e-06, "loss": 0.12491938471794128, "step": 2552 }, { "epoch": 5.3881856540084385, "grad_norm": 2.883695363998413, "learning_rate": 1.0042485566223848e-06, "loss": 0.4344925284385681, "step": 2554 }, { "epoch": 5.3924050632911396, "grad_norm": 11.280695915222168, "learning_rate": 1.0014770609197957e-06, "loss": 0.3988388180732727, "step": 2556 }, { "epoch": 5.39662447257384, "grad_norm": 8.63589096069336, "learning_rate": 9.98723956015371e-07, "loss": 0.17392376065254211, "step": 2558 }, { "epoch": 5.40084388185654, "grad_norm": 10.322222709655762, "learning_rate": 9.959892568072881e-07, "loss": 0.08735622465610504, "step": 2560 }, { "epoch": 5.405063291139241, "grad_norm": 3.0921213626861572, "learning_rate": 9.932729780941237e-07, "loss": 0.20220640301704407, "step": 2562 }, { "epoch": 5.409282700421941, "grad_norm": 2.6708106994628906, "learning_rate": 9.905751345747734e-07, "loss": 0.5822624564170837, "step": 2564 }, { "epoch": 5.413502109704641, "grad_norm": 2.136763572692871, "learning_rate": 9.878957408483718e-07, "loss": 0.16230207681655884, "step": 2566 }, { "epoch": 5.417721518987342, "grad_norm": 2.1476376056671143, "learning_rate": 9.852348114142155e-07, "loss": 0.3189689517021179, "step": 2568 }, { "epoch": 5.421940928270042, "grad_norm": 27.836748123168945, "learning_rate": 9.825923606716818e-07, "loss": 0.05949246510863304, "step": 2570 }, { "epoch": 5.4261603375527425, "grad_norm": 1.7536920309066772, "learning_rate": 9.799684029201536e-07, "loss": 0.23696368932724, "step": 2572 }, { "epoch": 5.430379746835443, "grad_norm": 0.2607567608356476, "learning_rate": 9.773629523589387e-07, "loss": 0.014276674017310143, "step": 2574 }, { "epoch": 5.434599156118144, "grad_norm": 2.661586046218872, "learning_rate": 9.747760230871965e-07, "loss": 0.27866894006729126, "step": 2576 }, { "epoch": 5.438818565400844, "grad_norm": 6.1830010414123535, "learning_rate": 9.722076291038605e-07, "loss": 0.5345185399055481, "step": 2578 }, { "epoch": 5.443037974683544, "grad_norm": 2.385190010070801, "learning_rate": 9.696577843075608e-07, "loss": 0.4049319624900818, "step": 2580 }, { "epoch": 5.447257383966245, "grad_norm": 3.472625732421875, "learning_rate": 9.671265024965509e-07, "loss": 0.35417062044143677, "step": 2582 }, { "epoch": 5.451476793248945, "grad_norm": 2.2873806953430176, "learning_rate": 9.646137973686324e-07, "loss": 0.22758211195468903, "step": 2584 }, { "epoch": 5.455696202531645, "grad_norm": 3.527923345565796, "learning_rate": 9.621196825210814e-07, "loss": 0.332139790058136, "step": 2586 }, { "epoch": 5.459915611814346, "grad_norm": 2.947986364364624, "learning_rate": 9.596441714505732e-07, "loss": 0.07135351002216339, "step": 2588 }, { "epoch": 5.4641350210970465, "grad_norm": 4.305266857147217, "learning_rate": 9.57187277553111e-07, "loss": 0.5696512460708618, "step": 2590 }, { "epoch": 5.468354430379747, "grad_norm": 3.028513193130493, "learning_rate": 9.547490141239534e-07, "loss": 0.19437383115291595, "step": 2592 }, { "epoch": 5.472573839662447, "grad_norm": 2.7946903705596924, "learning_rate": 9.523293943575414e-07, "loss": 0.15654590725898743, "step": 2594 }, { "epoch": 5.476793248945148, "grad_norm": 6.811203956604004, "learning_rate": 9.499284313474276e-07, "loss": 0.11999380588531494, "step": 2596 }, { "epoch": 5.481012658227848, "grad_norm": 2.8398826122283936, "learning_rate": 9.475461380862047e-07, "loss": 0.04623116925358772, "step": 2598 }, { "epoch": 5.485232067510548, "grad_norm": 0.62392657995224, "learning_rate": 9.451825274654373e-07, "loss": 0.1718018651008606, "step": 2600 }, { "epoch": 5.489451476793249, "grad_norm": 1.799946904182434, "learning_rate": 9.428376122755884e-07, "loss": 0.2459963858127594, "step": 2602 }, { "epoch": 5.493670886075949, "grad_norm": 3.90554141998291, "learning_rate": 9.405114052059541e-07, "loss": 0.23852768540382385, "step": 2604 }, { "epoch": 5.4978902953586495, "grad_norm": 6.165667533874512, "learning_rate": 9.382039188445925e-07, "loss": 0.05722271651029587, "step": 2606 }, { "epoch": 5.5021097046413505, "grad_norm": 2.0570311546325684, "learning_rate": 9.359151656782567e-07, "loss": 0.19151735305786133, "step": 2608 }, { "epoch": 5.506329113924051, "grad_norm": 1.9145231246948242, "learning_rate": 9.336451580923262e-07, "loss": 0.03127627447247505, "step": 2610 }, { "epoch": 5.510548523206751, "grad_norm": 3.4458625316619873, "learning_rate": 9.313939083707413e-07, "loss": 0.1748735010623932, "step": 2612 }, { "epoch": 5.514767932489452, "grad_norm": 3.4101099967956543, "learning_rate": 9.291614286959349e-07, "loss": 0.382763147354126, "step": 2614 }, { "epoch": 5.518987341772152, "grad_norm": 0.40910443663597107, "learning_rate": 9.269477311487686e-07, "loss": 0.1556778848171234, "step": 2616 }, { "epoch": 5.523206751054852, "grad_norm": 3.5669357776641846, "learning_rate": 9.247528277084645e-07, "loss": 0.1594393253326416, "step": 2618 }, { "epoch": 5.527426160337553, "grad_norm": 3.370866537094116, "learning_rate": 9.225767302525441e-07, "loss": 0.4137956500053406, "step": 2620 }, { "epoch": 5.531645569620253, "grad_norm": 0.21743591129779816, "learning_rate": 9.20419450556761e-07, "loss": 0.4230045676231384, "step": 2622 }, { "epoch": 5.5358649789029535, "grad_norm": 2.6428186893463135, "learning_rate": 9.182810002950378e-07, "loss": 0.42899954319000244, "step": 2624 }, { "epoch": 5.540084388185654, "grad_norm": 6.077566623687744, "learning_rate": 9.16161391039404e-07, "loss": 0.20327959954738617, "step": 2626 }, { "epoch": 5.544303797468355, "grad_norm": 5.41641902923584, "learning_rate": 9.140606342599332e-07, "loss": 0.44856715202331543, "step": 2628 }, { "epoch": 5.548523206751055, "grad_norm": 4.853996753692627, "learning_rate": 9.119787413246795e-07, "loss": 0.4271373748779297, "step": 2630 }, { "epoch": 5.552742616033755, "grad_norm": 4.13206672668457, "learning_rate": 9.099157234996173e-07, "loss": 0.23560848832130432, "step": 2632 }, { "epoch": 5.556962025316456, "grad_norm": 2.2997729778289795, "learning_rate": 9.078715919485798e-07, "loss": 0.23265743255615234, "step": 2634 }, { "epoch": 5.561181434599156, "grad_norm": 1.88694167137146, "learning_rate": 9.058463577331999e-07, "loss": 0.15787991881370544, "step": 2636 }, { "epoch": 5.565400843881856, "grad_norm": 2.646512746810913, "learning_rate": 9.03840031812848e-07, "loss": 0.1242508590221405, "step": 2638 }, { "epoch": 5.569620253164557, "grad_norm": 0.22765684127807617, "learning_rate": 9.018526250445747e-07, "loss": 0.07518874108791351, "step": 2640 }, { "epoch": 5.5738396624472575, "grad_norm": 2.321202278137207, "learning_rate": 8.998841481830515e-07, "loss": 0.30490678548812866, "step": 2642 }, { "epoch": 5.578059071729958, "grad_norm": 4.0073418617248535, "learning_rate": 8.97934611880512e-07, "loss": 0.611823558807373, "step": 2644 }, { "epoch": 5.582278481012658, "grad_norm": 18.506275177001953, "learning_rate": 8.960040266866948e-07, "loss": 0.3300861120223999, "step": 2646 }, { "epoch": 5.586497890295359, "grad_norm": 2.6837949752807617, "learning_rate": 8.94092403048786e-07, "loss": 0.018273882567882538, "step": 2648 }, { "epoch": 5.590717299578059, "grad_norm": 2.3034257888793945, "learning_rate": 8.921997513113637e-07, "loss": 0.25158876180648804, "step": 2650 }, { "epoch": 5.594936708860759, "grad_norm": 2.5194528102874756, "learning_rate": 8.903260817163402e-07, "loss": 0.18762826919555664, "step": 2652 }, { "epoch": 5.59915611814346, "grad_norm": 4.4369096755981445, "learning_rate": 8.884714044029092e-07, "loss": 0.06930024921894073, "step": 2654 }, { "epoch": 5.60337552742616, "grad_norm": 1.652228832244873, "learning_rate": 8.86635729407488e-07, "loss": 0.2389906346797943, "step": 2656 }, { "epoch": 5.6075949367088604, "grad_norm": 5.807703971862793, "learning_rate": 8.848190666636651e-07, "loss": 0.10554240643978119, "step": 2658 }, { "epoch": 5.6118143459915615, "grad_norm": 3.1773393154144287, "learning_rate": 8.830214260021459e-07, "loss": 0.16849491000175476, "step": 2660 }, { "epoch": 5.616033755274262, "grad_norm": 23.979202270507812, "learning_rate": 8.812428171506998e-07, "loss": 0.04333914816379547, "step": 2662 }, { "epoch": 5.620253164556962, "grad_norm": 0.8357203602790833, "learning_rate": 8.794832497341065e-07, "loss": 0.30027642846107483, "step": 2664 }, { "epoch": 5.624472573839663, "grad_norm": 2.281597375869751, "learning_rate": 8.77742733274106e-07, "loss": 0.027927353978157043, "step": 2666 }, { "epoch": 5.628691983122363, "grad_norm": 2.4539661407470703, "learning_rate": 8.760212771893442e-07, "loss": 0.10624615103006363, "step": 2668 }, { "epoch": 5.632911392405063, "grad_norm": 0.6876718401908875, "learning_rate": 8.743188907953251e-07, "loss": 0.3938605487346649, "step": 2670 }, { "epoch": 5.637130801687764, "grad_norm": 3.4516823291778564, "learning_rate": 8.726355833043575e-07, "loss": 0.3330395519733429, "step": 2672 }, { "epoch": 5.641350210970464, "grad_norm": 3.539989471435547, "learning_rate": 8.709713638255074e-07, "loss": 0.6006532907485962, "step": 2674 }, { "epoch": 5.6455696202531644, "grad_norm": 3.0337464809417725, "learning_rate": 8.693262413645464e-07, "loss": 0.3575003445148468, "step": 2676 }, { "epoch": 5.649789029535865, "grad_norm": 3.694211721420288, "learning_rate": 8.677002248239066e-07, "loss": 0.2969297766685486, "step": 2678 }, { "epoch": 5.654008438818566, "grad_norm": 1.0974704027175903, "learning_rate": 8.660933230026276e-07, "loss": 0.05868370085954666, "step": 2680 }, { "epoch": 5.658227848101266, "grad_norm": 2.3777191638946533, "learning_rate": 8.645055445963135e-07, "loss": 0.31508857011795044, "step": 2682 }, { "epoch": 5.662447257383966, "grad_norm": 0.7402134537696838, "learning_rate": 8.629368981970822e-07, "loss": 0.04464399069547653, "step": 2684 }, { "epoch": 5.666666666666667, "grad_norm": 1.678879737854004, "learning_rate": 8.613873922935217e-07, "loss": 0.4207780957221985, "step": 2686 }, { "epoch": 5.670886075949367, "grad_norm": 0.16458748281002045, "learning_rate": 8.598570352706425e-07, "loss": 0.33038753271102905, "step": 2688 }, { "epoch": 5.675105485232067, "grad_norm": 2.9141294956207275, "learning_rate": 8.583458354098318e-07, "loss": 0.471355140209198, "step": 2690 }, { "epoch": 5.679324894514768, "grad_norm": 2.882704973220825, "learning_rate": 8.56853800888812e-07, "loss": 0.01554950326681137, "step": 2692 }, { "epoch": 5.6835443037974684, "grad_norm": 2.264005422592163, "learning_rate": 8.553809397815909e-07, "loss": 0.5948341488838196, "step": 2694 }, { "epoch": 5.687763713080169, "grad_norm": 5.996740341186523, "learning_rate": 8.539272600584227e-07, "loss": 0.2293516844511032, "step": 2696 }, { "epoch": 5.691983122362869, "grad_norm": 3.936495542526245, "learning_rate": 8.524927695857636e-07, "loss": 0.4448416829109192, "step": 2698 }, { "epoch": 5.69620253164557, "grad_norm": 2.383849859237671, "learning_rate": 8.510774761262285e-07, "loss": 0.3430967926979065, "step": 2700 }, { "epoch": 5.70042194092827, "grad_norm": 3.2722134590148926, "learning_rate": 8.496813873385494e-07, "loss": 0.38816744089126587, "step": 2702 }, { "epoch": 5.70464135021097, "grad_norm": 0.12035968899726868, "learning_rate": 8.483045107775337e-07, "loss": 0.2644461393356323, "step": 2704 }, { "epoch": 5.708860759493671, "grad_norm": 0.15902626514434814, "learning_rate": 8.469468538940241e-07, "loss": 0.15841832756996155, "step": 2706 }, { "epoch": 5.713080168776371, "grad_norm": 2.9765915870666504, "learning_rate": 8.456084240348575e-07, "loss": 0.03421106934547424, "step": 2708 }, { "epoch": 5.717299578059071, "grad_norm": 0.5361355543136597, "learning_rate": 8.44289228442825e-07, "loss": 0.21108713746070862, "step": 2710 }, { "epoch": 5.7215189873417724, "grad_norm": 3.1111233234405518, "learning_rate": 8.429892742566344e-07, "loss": 0.38604629039764404, "step": 2712 }, { "epoch": 5.725738396624473, "grad_norm": 2.709472417831421, "learning_rate": 8.417085685108695e-07, "loss": 0.4284287095069885, "step": 2714 }, { "epoch": 5.729957805907173, "grad_norm": 2.9085452556610107, "learning_rate": 8.404471181359526e-07, "loss": 0.2729555666446686, "step": 2716 }, { "epoch": 5.734177215189874, "grad_norm": 13.333749771118164, "learning_rate": 8.392049299581083e-07, "loss": 0.49384695291519165, "step": 2718 }, { "epoch": 5.738396624472574, "grad_norm": 2.3657379150390625, "learning_rate": 8.379820106993253e-07, "loss": 0.42707446217536926, "step": 2720 }, { "epoch": 5.742616033755274, "grad_norm": 4.9322943687438965, "learning_rate": 8.367783669773196e-07, "loss": 0.4772263467311859, "step": 2722 }, { "epoch": 5.746835443037975, "grad_norm": 2.417219638824463, "learning_rate": 8.355940053054999e-07, "loss": 0.11725395172834396, "step": 2724 }, { "epoch": 5.751054852320675, "grad_norm": 3.8851850032806396, "learning_rate": 8.344289320929321e-07, "loss": 0.3932940363883972, "step": 2726 }, { "epoch": 5.755274261603375, "grad_norm": 2.6408393383026123, "learning_rate": 8.332831536443035e-07, "loss": 0.3797783851623535, "step": 2728 }, { "epoch": 5.759493670886076, "grad_norm": 4.485612392425537, "learning_rate": 8.321566761598909e-07, "loss": 0.28436335921287537, "step": 2730 }, { "epoch": 5.763713080168777, "grad_norm": 2.002480983734131, "learning_rate": 8.310495057355242e-07, "loss": 0.5089020729064941, "step": 2732 }, { "epoch": 5.767932489451477, "grad_norm": 2.7712652683258057, "learning_rate": 8.299616483625561e-07, "loss": 0.09954804182052612, "step": 2734 }, { "epoch": 5.772151898734177, "grad_norm": 8.521866798400879, "learning_rate": 8.288931099278275e-07, "loss": 0.28571265935897827, "step": 2736 }, { "epoch": 5.776371308016878, "grad_norm": 3.3578455448150635, "learning_rate": 8.27843896213637e-07, "loss": 0.4965103268623352, "step": 2738 }, { "epoch": 5.780590717299578, "grad_norm": 2.0499563217163086, "learning_rate": 8.2681401289771e-07, "loss": 0.2735576629638672, "step": 2740 }, { "epoch": 5.784810126582278, "grad_norm": 3.7373149394989014, "learning_rate": 8.258034655531661e-07, "loss": 0.4888134002685547, "step": 2742 }, { "epoch": 5.789029535864979, "grad_norm": 2.028367757797241, "learning_rate": 8.248122596484903e-07, "loss": 0.16572898626327515, "step": 2744 }, { "epoch": 5.793248945147679, "grad_norm": 2.7090587615966797, "learning_rate": 8.23840400547503e-07, "loss": 0.2500470280647278, "step": 2746 }, { "epoch": 5.7974683544303796, "grad_norm": 5.770694255828857, "learning_rate": 8.228878935093327e-07, "loss": 0.6361812949180603, "step": 2748 }, { "epoch": 5.80168776371308, "grad_norm": 5.137115478515625, "learning_rate": 8.219547436883832e-07, "loss": 0.25070175528526306, "step": 2750 }, { "epoch": 5.805907172995781, "grad_norm": 7.0401716232299805, "learning_rate": 8.210409561343112e-07, "loss": 0.4003854990005493, "step": 2752 }, { "epoch": 5.810126582278481, "grad_norm": 8.947293281555176, "learning_rate": 8.201465357919941e-07, "loss": 0.5776923894882202, "step": 2754 }, { "epoch": 5.814345991561181, "grad_norm": 2.4191689491271973, "learning_rate": 8.192714875015071e-07, "loss": 0.21931633353233337, "step": 2756 }, { "epoch": 5.818565400843882, "grad_norm": 3.477417469024658, "learning_rate": 8.184158159980942e-07, "loss": 0.034300077706575394, "step": 2758 }, { "epoch": 5.822784810126582, "grad_norm": 2.9756200313568115, "learning_rate": 8.175795259121438e-07, "loss": 0.38680142164230347, "step": 2760 }, { "epoch": 5.827004219409282, "grad_norm": 13.028040885925293, "learning_rate": 8.167626217691641e-07, "loss": 0.3836379647254944, "step": 2762 }, { "epoch": 5.831223628691983, "grad_norm": 8.563456535339355, "learning_rate": 8.15965107989757e-07, "loss": 0.17900025844573975, "step": 2764 }, { "epoch": 5.8354430379746836, "grad_norm": 3.2769968509674072, "learning_rate": 8.151869888895971e-07, "loss": 0.41699984669685364, "step": 2766 }, { "epoch": 5.839662447257384, "grad_norm": 4.447607040405273, "learning_rate": 8.144282686794042e-07, "loss": 0.3173035979270935, "step": 2768 }, { "epoch": 5.843881856540085, "grad_norm": 3.0378546714782715, "learning_rate": 8.136889514649242e-07, "loss": 0.40285831689834595, "step": 2770 }, { "epoch": 5.848101265822785, "grad_norm": 2.4900171756744385, "learning_rate": 8.129690412469046e-07, "loss": 0.2346557378768921, "step": 2772 }, { "epoch": 5.852320675105485, "grad_norm": 4.628073215484619, "learning_rate": 8.122685419210748e-07, "loss": 0.34134355187416077, "step": 2774 }, { "epoch": 5.856540084388186, "grad_norm": 2.439875602722168, "learning_rate": 8.11587457278123e-07, "loss": 0.32176950573921204, "step": 2776 }, { "epoch": 5.860759493670886, "grad_norm": 2.549513816833496, "learning_rate": 8.109257910036767e-07, "loss": 0.4297516345977783, "step": 2778 }, { "epoch": 5.864978902953586, "grad_norm": 2.4646005630493164, "learning_rate": 8.102835466782829e-07, "loss": 0.3611939251422882, "step": 2780 }, { "epoch": 5.869198312236287, "grad_norm": 2.1484479904174805, "learning_rate": 8.096607277773885e-07, "loss": 0.3919060528278351, "step": 2782 }, { "epoch": 5.8734177215189876, "grad_norm": 4.247211456298828, "learning_rate": 8.090573376713214e-07, "loss": 0.4738028943538666, "step": 2784 }, { "epoch": 5.877637130801688, "grad_norm": 12.467236518859863, "learning_rate": 8.084733796252727e-07, "loss": 0.14323553442955017, "step": 2786 }, { "epoch": 5.881856540084388, "grad_norm": 2.3769068717956543, "learning_rate": 8.079088567992778e-07, "loss": 0.3547300100326538, "step": 2788 }, { "epoch": 5.886075949367089, "grad_norm": 1.2352712154388428, "learning_rate": 8.073637722482008e-07, "loss": 0.028360096737742424, "step": 2790 }, { "epoch": 5.890295358649789, "grad_norm": 2.2478432655334473, "learning_rate": 8.068381289217173e-07, "loss": 0.13877378404140472, "step": 2792 }, { "epoch": 5.894514767932489, "grad_norm": 2.8125381469726562, "learning_rate": 8.063319296642983e-07, "loss": 0.40060657262802124, "step": 2794 }, { "epoch": 5.89873417721519, "grad_norm": 5.122012615203857, "learning_rate": 8.058451772151953e-07, "loss": 0.40660685300827026, "step": 2796 }, { "epoch": 5.90295358649789, "grad_norm": 3.058626890182495, "learning_rate": 8.05377874208425e-07, "loss": 0.2716779410839081, "step": 2798 }, { "epoch": 5.9071729957805905, "grad_norm": 3.8683018684387207, "learning_rate": 8.049300231727548e-07, "loss": 0.3559970259666443, "step": 2800 }, { "epoch": 5.911392405063291, "grad_norm": 3.477356433868408, "learning_rate": 8.045016265316904e-07, "loss": 0.29196757078170776, "step": 2802 }, { "epoch": 5.915611814345992, "grad_norm": 3.347072124481201, "learning_rate": 8.04092686603461e-07, "loss": 0.24977904558181763, "step": 2804 }, { "epoch": 5.919831223628692, "grad_norm": 2.251091957092285, "learning_rate": 8.037032056010077e-07, "loss": 0.03224069997668266, "step": 2806 }, { "epoch": 5.924050632911392, "grad_norm": 7.518223762512207, "learning_rate": 8.03333185631972e-07, "loss": 0.3484126329421997, "step": 2808 }, { "epoch": 5.928270042194093, "grad_norm": 3.4275622367858887, "learning_rate": 8.02982628698683e-07, "loss": 0.059894442558288574, "step": 2810 }, { "epoch": 5.932489451476793, "grad_norm": 2.064310073852539, "learning_rate": 8.026515366981481e-07, "loss": 0.12616072595119476, "step": 2812 }, { "epoch": 5.936708860759493, "grad_norm": 0.0712597668170929, "learning_rate": 8.023399114220411e-07, "loss": 0.2311958521604538, "step": 2814 }, { "epoch": 5.940928270042194, "grad_norm": 2.0846590995788574, "learning_rate": 8.020477545566941e-07, "loss": 0.27708864212036133, "step": 2816 }, { "epoch": 5.9451476793248945, "grad_norm": 2.385714530944824, "learning_rate": 8.017750676830876e-07, "loss": 0.19477054476737976, "step": 2818 }, { "epoch": 5.949367088607595, "grad_norm": 0.2615872025489807, "learning_rate": 8.015218522768414e-07, "loss": 0.12333428859710693, "step": 2820 }, { "epoch": 5.953586497890296, "grad_norm": 2.914166212081909, "learning_rate": 8.012881097082082e-07, "loss": 0.3903350234031677, "step": 2822 }, { "epoch": 5.957805907172996, "grad_norm": 2.832524538040161, "learning_rate": 8.010738412420643e-07, "loss": 0.25948387384414673, "step": 2824 }, { "epoch": 5.962025316455696, "grad_norm": 2.6522233486175537, "learning_rate": 8.008790480379041e-07, "loss": 0.42445188760757446, "step": 2826 }, { "epoch": 5.966244725738397, "grad_norm": 0.424772173166275, "learning_rate": 8.007037311498337e-07, "loss": 0.18149511516094208, "step": 2828 }, { "epoch": 5.970464135021097, "grad_norm": 2.7508718967437744, "learning_rate": 8.005478915265643e-07, "loss": 0.08192159235477448, "step": 2830 }, { "epoch": 5.974683544303797, "grad_norm": 7.296056270599365, "learning_rate": 8.004115300114071e-07, "loss": 0.3574886918067932, "step": 2832 }, { "epoch": 5.978902953586498, "grad_norm": 0.17789922654628754, "learning_rate": 8.002946473422713e-07, "loss": 0.2169741988182068, "step": 2834 }, { "epoch": 5.9831223628691985, "grad_norm": 5.66543436050415, "learning_rate": 8.001972441516558e-07, "loss": 0.06217677891254425, "step": 2836 }, { "epoch": 5.987341772151899, "grad_norm": 2.6673474311828613, "learning_rate": 8.001193209666501e-07, "loss": 0.3183894753456116, "step": 2838 }, { "epoch": 5.991561181434599, "grad_norm": 0.4517356753349304, "learning_rate": 8.000608782089275e-07, "loss": 0.26500433683395386, "step": 2840 }, { "epoch": 5.9957805907173, "grad_norm": 13.812853813171387, "learning_rate": 8.000219161947466e-07, "loss": 0.4387038052082062, "step": 2842 }, { "epoch": 6.0, "grad_norm": 2.1090548038482666, "learning_rate": 8.000024351349457e-07, "loss": 0.4745343327522278, "step": 2844 }, { "epoch": 6.0, "step": 2844, "total_flos": 5.392281114922451e+18, "train_loss": 0.6896465788091076, "train_runtime": 6887.9733, "train_samples_per_second": 12.387, "train_steps_per_second": 0.413 } ], "logging_steps": 2, "max_steps": 2844, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.392281114922451e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }