{ "best_metric": 1.3083828687667847, "best_model_checkpoint": "storage/models/again-mt5-base-model-full-dataset-es-an-translation/checkpoint-2120", "epoch": 4.994110718492344, "eval_steps": 212, "global_step": 2120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002355712603062426, "grad_norm": 3.341477394104004, "learning_rate": 2.3584905660377358e-07, "loss": 3.3531, "step": 1 }, { "epoch": 0.004711425206124852, "grad_norm": 3.0634207725524902, "learning_rate": 4.7169811320754717e-07, "loss": 3.2763, "step": 2 }, { "epoch": 0.007067137809187279, "grad_norm": 3.245035409927368, "learning_rate": 7.075471698113208e-07, "loss": 3.5209, "step": 3 }, { "epoch": 0.009422850412249705, "grad_norm": 3.016467809677124, "learning_rate": 9.433962264150943e-07, "loss": 3.3334, "step": 4 }, { "epoch": 0.011778563015312132, "grad_norm": 3.4311678409576416, "learning_rate": 1.179245283018868e-06, "loss": 3.3505, "step": 5 }, { "epoch": 0.014134275618374558, "grad_norm": 6.1449503898620605, "learning_rate": 1.4150943396226415e-06, "loss": 3.0468, "step": 6 }, { "epoch": 0.016489988221436984, "grad_norm": 4.340108871459961, "learning_rate": 1.650943396226415e-06, "loss": 3.4982, "step": 7 }, { "epoch": 0.01884570082449941, "grad_norm": 3.2377517223358154, "learning_rate": 1.8867924528301887e-06, "loss": 3.3484, "step": 8 }, { "epoch": 0.02120141342756184, "grad_norm": 4.3267621994018555, "learning_rate": 2.1226415094339624e-06, "loss": 3.183, "step": 9 }, { "epoch": 0.023557126030624265, "grad_norm": 3.0833380222320557, "learning_rate": 2.358490566037736e-06, "loss": 3.5341, "step": 10 }, { "epoch": 0.02591283863368669, "grad_norm": 3.221043825149536, "learning_rate": 2.5943396226415095e-06, "loss": 3.2853, "step": 11 }, { "epoch": 0.028268551236749116, "grad_norm": 3.126291036605835, "learning_rate": 2.830188679245283e-06, "loss": 3.774, "step": 12 }, { "epoch": 0.030624263839811542, "grad_norm": 3.172091484069824, "learning_rate": 3.0660377358490567e-06, "loss": 3.2745, "step": 13 }, { "epoch": 0.03297997644287397, "grad_norm": 3.2447891235351562, "learning_rate": 3.30188679245283e-06, "loss": 3.2284, "step": 14 }, { "epoch": 0.0353356890459364, "grad_norm": 3.253422975540161, "learning_rate": 3.5377358490566038e-06, "loss": 3.5398, "step": 15 }, { "epoch": 0.03769140164899882, "grad_norm": 3.088097095489502, "learning_rate": 3.7735849056603773e-06, "loss": 3.472, "step": 16 }, { "epoch": 0.04004711425206125, "grad_norm": 3.5624895095825195, "learning_rate": 4.009433962264151e-06, "loss": 3.204, "step": 17 }, { "epoch": 0.04240282685512368, "grad_norm": 5.093189239501953, "learning_rate": 4.245283018867925e-06, "loss": 3.1947, "step": 18 }, { "epoch": 0.0447585394581861, "grad_norm": 3.4906768798828125, "learning_rate": 4.481132075471698e-06, "loss": 3.2269, "step": 19 }, { "epoch": 0.04711425206124853, "grad_norm": 3.4522705078125, "learning_rate": 4.716981132075472e-06, "loss": 3.3822, "step": 20 }, { "epoch": 0.04946996466431095, "grad_norm": 3.59317684173584, "learning_rate": 4.952830188679246e-06, "loss": 3.5275, "step": 21 }, { "epoch": 0.05182567726737338, "grad_norm": 3.4576406478881836, "learning_rate": 5.188679245283019e-06, "loss": 3.3897, "step": 22 }, { "epoch": 0.05418138987043581, "grad_norm": 3.3660781383514404, "learning_rate": 5.424528301886793e-06, "loss": 3.5011, "step": 23 }, { "epoch": 0.05653710247349823, "grad_norm": 3.679382562637329, "learning_rate": 5.660377358490566e-06, "loss": 3.184, "step": 24 }, { "epoch": 0.05889281507656066, "grad_norm": 4.357712745666504, "learning_rate": 5.89622641509434e-06, "loss": 3.5666, "step": 25 }, { "epoch": 0.061248527679623084, "grad_norm": 3.3471999168395996, "learning_rate": 6.132075471698113e-06, "loss": 3.3235, "step": 26 }, { "epoch": 0.0636042402826855, "grad_norm": 2.94535231590271, "learning_rate": 6.367924528301887e-06, "loss": 3.61, "step": 27 }, { "epoch": 0.06595995288574794, "grad_norm": 2.990603446960449, "learning_rate": 6.60377358490566e-06, "loss": 3.1524, "step": 28 }, { "epoch": 0.06831566548881036, "grad_norm": 7.756312847137451, "learning_rate": 6.839622641509434e-06, "loss": 3.3712, "step": 29 }, { "epoch": 0.0706713780918728, "grad_norm": 3.0072011947631836, "learning_rate": 7.0754716981132075e-06, "loss": 3.3166, "step": 30 }, { "epoch": 0.07302709069493522, "grad_norm": 3.1572799682617188, "learning_rate": 7.3113207547169815e-06, "loss": 3.1375, "step": 31 }, { "epoch": 0.07538280329799764, "grad_norm": 8.323050498962402, "learning_rate": 7.547169811320755e-06, "loss": 3.2841, "step": 32 }, { "epoch": 0.07773851590106007, "grad_norm": 3.688499927520752, "learning_rate": 7.783018867924528e-06, "loss": 3.6712, "step": 33 }, { "epoch": 0.0800942285041225, "grad_norm": 2.9477272033691406, "learning_rate": 8.018867924528302e-06, "loss": 3.5507, "step": 34 }, { "epoch": 0.08244994110718493, "grad_norm": 3.1139721870422363, "learning_rate": 8.254716981132076e-06, "loss": 3.4986, "step": 35 }, { "epoch": 0.08480565371024736, "grad_norm": 2.9492828845977783, "learning_rate": 8.49056603773585e-06, "loss": 3.487, "step": 36 }, { "epoch": 0.08716136631330977, "grad_norm": 2.9183638095855713, "learning_rate": 8.726415094339622e-06, "loss": 3.1739, "step": 37 }, { "epoch": 0.0895170789163722, "grad_norm": 16.113895416259766, "learning_rate": 8.962264150943396e-06, "loss": 3.2777, "step": 38 }, { "epoch": 0.09187279151943463, "grad_norm": 2.690722942352295, "learning_rate": 9.19811320754717e-06, "loss": 3.272, "step": 39 }, { "epoch": 0.09422850412249706, "grad_norm": 2.6189067363739014, "learning_rate": 9.433962264150944e-06, "loss": 3.1855, "step": 40 }, { "epoch": 0.09658421672555949, "grad_norm": 4.234733581542969, "learning_rate": 9.669811320754718e-06, "loss": 3.1536, "step": 41 }, { "epoch": 0.0989399293286219, "grad_norm": 5.904226779937744, "learning_rate": 9.905660377358492e-06, "loss": 3.1006, "step": 42 }, { "epoch": 0.10129564193168433, "grad_norm": 4.29901647567749, "learning_rate": 1.0141509433962266e-05, "loss": 3.2114, "step": 43 }, { "epoch": 0.10365135453474676, "grad_norm": 3.3230977058410645, "learning_rate": 1.0377358490566038e-05, "loss": 3.2024, "step": 44 }, { "epoch": 0.10600706713780919, "grad_norm": 2.831529378890991, "learning_rate": 1.0613207547169812e-05, "loss": 3.0835, "step": 45 }, { "epoch": 0.10836277974087162, "grad_norm": 5.644282341003418, "learning_rate": 1.0849056603773586e-05, "loss": 2.9087, "step": 46 }, { "epoch": 0.11071849234393404, "grad_norm": 2.4353883266448975, "learning_rate": 1.108490566037736e-05, "loss": 3.029, "step": 47 }, { "epoch": 0.11307420494699646, "grad_norm": 2.123765468597412, "learning_rate": 1.1320754716981132e-05, "loss": 2.8757, "step": 48 }, { "epoch": 0.1154299175500589, "grad_norm": 2.2514028549194336, "learning_rate": 1.1556603773584906e-05, "loss": 3.1244, "step": 49 }, { "epoch": 0.11778563015312132, "grad_norm": 3.127729892730713, "learning_rate": 1.179245283018868e-05, "loss": 2.9182, "step": 50 }, { "epoch": 0.12014134275618374, "grad_norm": 1.8605588674545288, "learning_rate": 1.2028301886792454e-05, "loss": 2.8411, "step": 51 }, { "epoch": 0.12249705535924617, "grad_norm": 2.101807117462158, "learning_rate": 1.2264150943396227e-05, "loss": 2.9928, "step": 52 }, { "epoch": 0.1248527679623086, "grad_norm": 2.0361831188201904, "learning_rate": 1.25e-05, "loss": 2.8863, "step": 53 }, { "epoch": 0.127208480565371, "grad_norm": 2.5669167041778564, "learning_rate": 1.2735849056603775e-05, "loss": 2.8285, "step": 54 }, { "epoch": 0.12956419316843346, "grad_norm": 1.8827002048492432, "learning_rate": 1.2971698113207547e-05, "loss": 2.7373, "step": 55 }, { "epoch": 0.13191990577149587, "grad_norm": 2.704479217529297, "learning_rate": 1.320754716981132e-05, "loss": 2.822, "step": 56 }, { "epoch": 0.13427561837455831, "grad_norm": 1.918635368347168, "learning_rate": 1.3443396226415095e-05, "loss": 3.053, "step": 57 }, { "epoch": 0.13663133097762073, "grad_norm": 10.703645706176758, "learning_rate": 1.3679245283018869e-05, "loss": 2.813, "step": 58 }, { "epoch": 0.13898704358068315, "grad_norm": 1.7153394222259521, "learning_rate": 1.3915094339622641e-05, "loss": 2.8688, "step": 59 }, { "epoch": 0.1413427561837456, "grad_norm": 1.6881450414657593, "learning_rate": 1.4150943396226415e-05, "loss": 2.8716, "step": 60 }, { "epoch": 0.143698468786808, "grad_norm": 1.785086750984192, "learning_rate": 1.4386792452830189e-05, "loss": 3.0122, "step": 61 }, { "epoch": 0.14605418138987045, "grad_norm": 3.440720319747925, "learning_rate": 1.4622641509433963e-05, "loss": 2.9201, "step": 62 }, { "epoch": 0.14840989399293286, "grad_norm": 1.9360870122909546, "learning_rate": 1.4858490566037735e-05, "loss": 2.8179, "step": 63 }, { "epoch": 0.15076560659599528, "grad_norm": 2.6613950729370117, "learning_rate": 1.509433962264151e-05, "loss": 2.9275, "step": 64 }, { "epoch": 0.15312131919905772, "grad_norm": 1.7211722135543823, "learning_rate": 1.5330188679245283e-05, "loss": 2.7984, "step": 65 }, { "epoch": 0.15547703180212014, "grad_norm": 2.9398767948150635, "learning_rate": 1.5566037735849056e-05, "loss": 2.9059, "step": 66 }, { "epoch": 0.15783274440518258, "grad_norm": 1.7080775499343872, "learning_rate": 1.580188679245283e-05, "loss": 2.9329, "step": 67 }, { "epoch": 0.160188457008245, "grad_norm": 1.611788272857666, "learning_rate": 1.6037735849056604e-05, "loss": 2.8353, "step": 68 }, { "epoch": 0.1625441696113074, "grad_norm": 2.0845274925231934, "learning_rate": 1.6273584905660376e-05, "loss": 2.7496, "step": 69 }, { "epoch": 0.16489988221436985, "grad_norm": 1.367681622505188, "learning_rate": 1.650943396226415e-05, "loss": 2.5957, "step": 70 }, { "epoch": 0.16725559481743227, "grad_norm": 1.4696868658065796, "learning_rate": 1.6745283018867924e-05, "loss": 2.809, "step": 71 }, { "epoch": 0.1696113074204947, "grad_norm": 1.784449577331543, "learning_rate": 1.69811320754717e-05, "loss": 2.6699, "step": 72 }, { "epoch": 0.17196702002355713, "grad_norm": 1.3151042461395264, "learning_rate": 1.7216981132075472e-05, "loss": 2.4744, "step": 73 }, { "epoch": 0.17432273262661954, "grad_norm": 1.291449785232544, "learning_rate": 1.7452830188679244e-05, "loss": 2.5651, "step": 74 }, { "epoch": 0.17667844522968199, "grad_norm": 1.1756322383880615, "learning_rate": 1.768867924528302e-05, "loss": 2.5967, "step": 75 }, { "epoch": 0.1790341578327444, "grad_norm": 1.2732700109481812, "learning_rate": 1.7924528301886792e-05, "loss": 2.5175, "step": 76 }, { "epoch": 0.18138987043580684, "grad_norm": 1.7737306356430054, "learning_rate": 1.8160377358490564e-05, "loss": 2.6831, "step": 77 }, { "epoch": 0.18374558303886926, "grad_norm": 3.1076860427856445, "learning_rate": 1.839622641509434e-05, "loss": 2.8181, "step": 78 }, { "epoch": 0.18610129564193167, "grad_norm": 1.7151079177856445, "learning_rate": 1.8632075471698112e-05, "loss": 2.5676, "step": 79 }, { "epoch": 0.18845700824499412, "grad_norm": 1.2124723196029663, "learning_rate": 1.8867924528301888e-05, "loss": 2.5403, "step": 80 }, { "epoch": 0.19081272084805653, "grad_norm": 1.1690768003463745, "learning_rate": 1.9103773584905664e-05, "loss": 2.471, "step": 81 }, { "epoch": 0.19316843345111898, "grad_norm": 1.2793933153152466, "learning_rate": 1.9339622641509436e-05, "loss": 2.5209, "step": 82 }, { "epoch": 0.1955241460541814, "grad_norm": 1.627171516418457, "learning_rate": 1.9575471698113208e-05, "loss": 2.7075, "step": 83 }, { "epoch": 0.1978798586572438, "grad_norm": 1.6611031293869019, "learning_rate": 1.9811320754716984e-05, "loss": 2.5097, "step": 84 }, { "epoch": 0.20023557126030625, "grad_norm": 0.9960787892341614, "learning_rate": 2.0047169811320756e-05, "loss": 2.3212, "step": 85 }, { "epoch": 0.20259128386336867, "grad_norm": 0.9730187058448792, "learning_rate": 2.0283018867924532e-05, "loss": 2.58, "step": 86 }, { "epoch": 0.2049469964664311, "grad_norm": 1.2024030685424805, "learning_rate": 2.0518867924528304e-05, "loss": 2.5249, "step": 87 }, { "epoch": 0.20730270906949352, "grad_norm": 1.0087497234344482, "learning_rate": 2.0754716981132076e-05, "loss": 2.473, "step": 88 }, { "epoch": 0.20965842167255594, "grad_norm": 0.9917868375778198, "learning_rate": 2.0990566037735852e-05, "loss": 2.5045, "step": 89 }, { "epoch": 0.21201413427561838, "grad_norm": 0.8549998998641968, "learning_rate": 2.1226415094339624e-05, "loss": 2.2968, "step": 90 }, { "epoch": 0.2143698468786808, "grad_norm": 1.1582450866699219, "learning_rate": 2.1462264150943397e-05, "loss": 2.5671, "step": 91 }, { "epoch": 0.21672555948174324, "grad_norm": 0.844853401184082, "learning_rate": 2.1698113207547172e-05, "loss": 2.4652, "step": 92 }, { "epoch": 0.21908127208480566, "grad_norm": 1.0340232849121094, "learning_rate": 2.1933962264150945e-05, "loss": 2.3319, "step": 93 }, { "epoch": 0.22143698468786807, "grad_norm": 0.8976126313209534, "learning_rate": 2.216981132075472e-05, "loss": 2.4692, "step": 94 }, { "epoch": 0.22379269729093051, "grad_norm": 0.9033194184303284, "learning_rate": 2.2405660377358493e-05, "loss": 2.4716, "step": 95 }, { "epoch": 0.22614840989399293, "grad_norm": 1.0866897106170654, "learning_rate": 2.2641509433962265e-05, "loss": 2.2575, "step": 96 }, { "epoch": 0.22850412249705537, "grad_norm": 0.8037015795707703, "learning_rate": 2.287735849056604e-05, "loss": 2.4325, "step": 97 }, { "epoch": 0.2308598351001178, "grad_norm": 3.296945810317993, "learning_rate": 2.3113207547169813e-05, "loss": 2.2718, "step": 98 }, { "epoch": 0.2332155477031802, "grad_norm": 0.9205549955368042, "learning_rate": 2.3349056603773585e-05, "loss": 2.4624, "step": 99 }, { "epoch": 0.23557126030624265, "grad_norm": 0.8715971112251282, "learning_rate": 2.358490566037736e-05, "loss": 2.3047, "step": 100 }, { "epoch": 0.23792697290930506, "grad_norm": 0.7769468426704407, "learning_rate": 2.3820754716981133e-05, "loss": 2.5125, "step": 101 }, { "epoch": 0.24028268551236748, "grad_norm": 0.8480455875396729, "learning_rate": 2.405660377358491e-05, "loss": 2.4121, "step": 102 }, { "epoch": 0.24263839811542992, "grad_norm": 0.8049631714820862, "learning_rate": 2.429245283018868e-05, "loss": 2.3881, "step": 103 }, { "epoch": 0.24499411071849234, "grad_norm": 1.1711924076080322, "learning_rate": 2.4528301886792453e-05, "loss": 2.2158, "step": 104 }, { "epoch": 0.24734982332155478, "grad_norm": 0.8052265048027039, "learning_rate": 2.476415094339623e-05, "loss": 2.1536, "step": 105 }, { "epoch": 0.2497055359246172, "grad_norm": 0.8493515849113464, "learning_rate": 2.5e-05, "loss": 2.2545, "step": 106 }, { "epoch": 0.25206124852767964, "grad_norm": 0.8474149107933044, "learning_rate": 2.5235849056603777e-05, "loss": 2.2463, "step": 107 }, { "epoch": 0.254416961130742, "grad_norm": 0.678053617477417, "learning_rate": 2.547169811320755e-05, "loss": 2.1565, "step": 108 }, { "epoch": 0.25677267373380447, "grad_norm": 0.856350302696228, "learning_rate": 2.5707547169811325e-05, "loss": 2.3259, "step": 109 }, { "epoch": 0.2591283863368669, "grad_norm": 0.729192316532135, "learning_rate": 2.5943396226415094e-05, "loss": 2.1752, "step": 110 }, { "epoch": 0.26148409893992935, "grad_norm": 0.8643910884857178, "learning_rate": 2.6179245283018873e-05, "loss": 2.388, "step": 111 }, { "epoch": 0.26383981154299174, "grad_norm": 0.6870043873786926, "learning_rate": 2.641509433962264e-05, "loss": 2.135, "step": 112 }, { "epoch": 0.2661955241460542, "grad_norm": 0.9979189038276672, "learning_rate": 2.6650943396226417e-05, "loss": 2.1527, "step": 113 }, { "epoch": 0.26855123674911663, "grad_norm": 0.6986424326896667, "learning_rate": 2.688679245283019e-05, "loss": 2.1461, "step": 114 }, { "epoch": 0.270906949352179, "grad_norm": 2.0524418354034424, "learning_rate": 2.7122641509433965e-05, "loss": 2.1283, "step": 115 }, { "epoch": 0.27326266195524146, "grad_norm": 0.7831687331199646, "learning_rate": 2.7358490566037738e-05, "loss": 2.2618, "step": 116 }, { "epoch": 0.2756183745583039, "grad_norm": 0.6844900250434875, "learning_rate": 2.7594339622641513e-05, "loss": 2.2345, "step": 117 }, { "epoch": 0.2779740871613663, "grad_norm": 0.891901433467865, "learning_rate": 2.7830188679245282e-05, "loss": 2.3846, "step": 118 }, { "epoch": 0.28032979976442873, "grad_norm": 1.2015186548233032, "learning_rate": 2.806603773584906e-05, "loss": 2.145, "step": 119 }, { "epoch": 0.2826855123674912, "grad_norm": 0.7242400050163269, "learning_rate": 2.830188679245283e-05, "loss": 2.0468, "step": 120 }, { "epoch": 0.2850412249705536, "grad_norm": 0.8652706742286682, "learning_rate": 2.8537735849056606e-05, "loss": 2.1654, "step": 121 }, { "epoch": 0.287396937573616, "grad_norm": 0.6857221126556396, "learning_rate": 2.8773584905660378e-05, "loss": 2.1091, "step": 122 }, { "epoch": 0.28975265017667845, "grad_norm": 0.8854426741600037, "learning_rate": 2.9009433962264154e-05, "loss": 2.1775, "step": 123 }, { "epoch": 0.2921083627797409, "grad_norm": 0.6908788681030273, "learning_rate": 2.9245283018867926e-05, "loss": 2.2952, "step": 124 }, { "epoch": 0.2944640753828033, "grad_norm": 1.0300172567367554, "learning_rate": 2.9481132075471702e-05, "loss": 2.2652, "step": 125 }, { "epoch": 0.2968197879858657, "grad_norm": 1.1899813413619995, "learning_rate": 2.971698113207547e-05, "loss": 2.0186, "step": 126 }, { "epoch": 0.29917550058892817, "grad_norm": 0.674911379814148, "learning_rate": 2.995283018867925e-05, "loss": 2.104, "step": 127 }, { "epoch": 0.30153121319199055, "grad_norm": 0.7416008710861206, "learning_rate": 3.018867924528302e-05, "loss": 2.0111, "step": 128 }, { "epoch": 0.303886925795053, "grad_norm": 0.7811684608459473, "learning_rate": 3.0424528301886794e-05, "loss": 2.0991, "step": 129 }, { "epoch": 0.30624263839811544, "grad_norm": 0.6986581087112427, "learning_rate": 3.0660377358490567e-05, "loss": 2.2471, "step": 130 }, { "epoch": 0.30859835100117783, "grad_norm": 0.6835696697235107, "learning_rate": 3.0896226415094346e-05, "loss": 2.1081, "step": 131 }, { "epoch": 0.31095406360424027, "grad_norm": 0.7385230660438538, "learning_rate": 3.113207547169811e-05, "loss": 2.0885, "step": 132 }, { "epoch": 0.3133097762073027, "grad_norm": 0.7642261385917664, "learning_rate": 3.136792452830189e-05, "loss": 2.1168, "step": 133 }, { "epoch": 0.31566548881036516, "grad_norm": 0.6923811435699463, "learning_rate": 3.160377358490566e-05, "loss": 2.0884, "step": 134 }, { "epoch": 0.31802120141342755, "grad_norm": 0.7741503119468689, "learning_rate": 3.1839622641509435e-05, "loss": 2.125, "step": 135 }, { "epoch": 0.32037691401649, "grad_norm": 0.7629136443138123, "learning_rate": 3.207547169811321e-05, "loss": 2.0872, "step": 136 }, { "epoch": 0.32273262661955243, "grad_norm": 0.8064791560173035, "learning_rate": 3.2311320754716986e-05, "loss": 2.1831, "step": 137 }, { "epoch": 0.3250883392226148, "grad_norm": 0.763308048248291, "learning_rate": 3.254716981132075e-05, "loss": 2.2567, "step": 138 }, { "epoch": 0.32744405182567726, "grad_norm": 0.7312076687812805, "learning_rate": 3.278301886792453e-05, "loss": 2.2065, "step": 139 }, { "epoch": 0.3297997644287397, "grad_norm": 15.171049118041992, "learning_rate": 3.30188679245283e-05, "loss": 1.9685, "step": 140 }, { "epoch": 0.3321554770318021, "grad_norm": 0.7163378596305847, "learning_rate": 3.3254716981132075e-05, "loss": 2.1674, "step": 141 }, { "epoch": 0.33451118963486454, "grad_norm": 0.8286659121513367, "learning_rate": 3.349056603773585e-05, "loss": 2.123, "step": 142 }, { "epoch": 0.336866902237927, "grad_norm": 0.7719376683235168, "learning_rate": 3.3726415094339627e-05, "loss": 2.2571, "step": 143 }, { "epoch": 0.3392226148409894, "grad_norm": 0.7743678092956543, "learning_rate": 3.39622641509434e-05, "loss": 2.2521, "step": 144 }, { "epoch": 0.3415783274440518, "grad_norm": 0.7513464689254761, "learning_rate": 3.419811320754717e-05, "loss": 1.8513, "step": 145 }, { "epoch": 0.34393404004711425, "grad_norm": 0.6726008057594299, "learning_rate": 3.4433962264150943e-05, "loss": 2.1507, "step": 146 }, { "epoch": 0.3462897526501767, "grad_norm": 0.7488909959793091, "learning_rate": 3.466981132075472e-05, "loss": 2.1535, "step": 147 }, { "epoch": 0.3486454652532391, "grad_norm": 0.6362333297729492, "learning_rate": 3.490566037735849e-05, "loss": 2.1789, "step": 148 }, { "epoch": 0.3510011778563015, "grad_norm": 0.7335079312324524, "learning_rate": 3.514150943396227e-05, "loss": 1.9772, "step": 149 }, { "epoch": 0.35335689045936397, "grad_norm": 0.9391010403633118, "learning_rate": 3.537735849056604e-05, "loss": 1.9796, "step": 150 }, { "epoch": 0.35571260306242636, "grad_norm": 0.5802990198135376, "learning_rate": 3.561320754716981e-05, "loss": 2.0076, "step": 151 }, { "epoch": 0.3580683156654888, "grad_norm": 4.624221324920654, "learning_rate": 3.5849056603773584e-05, "loss": 2.0856, "step": 152 }, { "epoch": 0.36042402826855124, "grad_norm": 0.7226186394691467, "learning_rate": 3.608490566037736e-05, "loss": 2.067, "step": 153 }, { "epoch": 0.3627797408716137, "grad_norm": 0.712346613407135, "learning_rate": 3.632075471698113e-05, "loss": 2.3233, "step": 154 }, { "epoch": 0.3651354534746761, "grad_norm": 0.688882052898407, "learning_rate": 3.655660377358491e-05, "loss": 1.9979, "step": 155 }, { "epoch": 0.3674911660777385, "grad_norm": 0.6710765957832336, "learning_rate": 3.679245283018868e-05, "loss": 2.2225, "step": 156 }, { "epoch": 0.36984687868080096, "grad_norm": 1.414029836654663, "learning_rate": 3.702830188679245e-05, "loss": 2.1974, "step": 157 }, { "epoch": 0.37220259128386335, "grad_norm": 0.6274915933609009, "learning_rate": 3.7264150943396224e-05, "loss": 2.0646, "step": 158 }, { "epoch": 0.3745583038869258, "grad_norm": 0.6071403622627258, "learning_rate": 3.7500000000000003e-05, "loss": 2.1038, "step": 159 }, { "epoch": 0.37691401648998824, "grad_norm": 0.6084411144256592, "learning_rate": 3.7735849056603776e-05, "loss": 2.0261, "step": 160 }, { "epoch": 0.3792697290930506, "grad_norm": 0.6371297836303711, "learning_rate": 3.797169811320755e-05, "loss": 2.0238, "step": 161 }, { "epoch": 0.38162544169611307, "grad_norm": 0.6999629139900208, "learning_rate": 3.820754716981133e-05, "loss": 2.002, "step": 162 }, { "epoch": 0.3839811542991755, "grad_norm": 0.6545054912567139, "learning_rate": 3.844339622641509e-05, "loss": 2.0976, "step": 163 }, { "epoch": 0.38633686690223795, "grad_norm": 0.8287113904953003, "learning_rate": 3.867924528301887e-05, "loss": 2.202, "step": 164 }, { "epoch": 0.38869257950530034, "grad_norm": 2.145648717880249, "learning_rate": 3.8915094339622644e-05, "loss": 2.0901, "step": 165 }, { "epoch": 0.3910482921083628, "grad_norm": 0.6257143020629883, "learning_rate": 3.9150943396226416e-05, "loss": 2.0555, "step": 166 }, { "epoch": 0.3934040047114252, "grad_norm": 0.594342052936554, "learning_rate": 3.938679245283019e-05, "loss": 1.9789, "step": 167 }, { "epoch": 0.3957597173144876, "grad_norm": 0.6157491207122803, "learning_rate": 3.962264150943397e-05, "loss": 1.9361, "step": 168 }, { "epoch": 0.39811542991755006, "grad_norm": 0.7027248740196228, "learning_rate": 3.985849056603774e-05, "loss": 2.1406, "step": 169 }, { "epoch": 0.4004711425206125, "grad_norm": 0.5973758697509766, "learning_rate": 4.009433962264151e-05, "loss": 1.9585, "step": 170 }, { "epoch": 0.4028268551236749, "grad_norm": 0.7109379172325134, "learning_rate": 4.0330188679245284e-05, "loss": 2.0038, "step": 171 }, { "epoch": 0.40518256772673733, "grad_norm": 0.8359628915786743, "learning_rate": 4.0566037735849064e-05, "loss": 2.0603, "step": 172 }, { "epoch": 0.4075382803297998, "grad_norm": 0.6391820311546326, "learning_rate": 4.080188679245283e-05, "loss": 2.0505, "step": 173 }, { "epoch": 0.4098939929328622, "grad_norm": 0.6712766885757446, "learning_rate": 4.103773584905661e-05, "loss": 1.9763, "step": 174 }, { "epoch": 0.4122497055359246, "grad_norm": 0.6558383703231812, "learning_rate": 4.127358490566038e-05, "loss": 2.0707, "step": 175 }, { "epoch": 0.41460541813898705, "grad_norm": 0.7275684475898743, "learning_rate": 4.150943396226415e-05, "loss": 1.9601, "step": 176 }, { "epoch": 0.4169611307420495, "grad_norm": 0.6585489511489868, "learning_rate": 4.1745283018867925e-05, "loss": 1.9929, "step": 177 }, { "epoch": 0.4193168433451119, "grad_norm": 0.6494356989860535, "learning_rate": 4.1981132075471704e-05, "loss": 2.1627, "step": 178 }, { "epoch": 0.4216725559481743, "grad_norm": 0.644859254360199, "learning_rate": 4.221698113207547e-05, "loss": 1.9972, "step": 179 }, { "epoch": 0.42402826855123676, "grad_norm": 0.8550021052360535, "learning_rate": 4.245283018867925e-05, "loss": 2.1693, "step": 180 }, { "epoch": 0.42638398115429915, "grad_norm": 0.8633426427841187, "learning_rate": 4.268867924528302e-05, "loss": 1.8494, "step": 181 }, { "epoch": 0.4287396937573616, "grad_norm": 0.6398471593856812, "learning_rate": 4.292452830188679e-05, "loss": 1.9604, "step": 182 }, { "epoch": 0.43109540636042404, "grad_norm": 0.6882141828536987, "learning_rate": 4.3160377358490565e-05, "loss": 2.1199, "step": 183 }, { "epoch": 0.4334511189634865, "grad_norm": 4.500829219818115, "learning_rate": 4.3396226415094345e-05, "loss": 2.0153, "step": 184 }, { "epoch": 0.43580683156654887, "grad_norm": 0.6347240209579468, "learning_rate": 4.363207547169812e-05, "loss": 1.98, "step": 185 }, { "epoch": 0.4381625441696113, "grad_norm": 1.3279805183410645, "learning_rate": 4.386792452830189e-05, "loss": 1.9597, "step": 186 }, { "epoch": 0.44051825677267376, "grad_norm": 0.6573253870010376, "learning_rate": 4.410377358490566e-05, "loss": 2.13, "step": 187 }, { "epoch": 0.44287396937573614, "grad_norm": 0.6021633148193359, "learning_rate": 4.433962264150944e-05, "loss": 1.9242, "step": 188 }, { "epoch": 0.4452296819787986, "grad_norm": 0.7392600178718567, "learning_rate": 4.4575471698113206e-05, "loss": 2.0446, "step": 189 }, { "epoch": 0.44758539458186103, "grad_norm": 0.7796009182929993, "learning_rate": 4.4811320754716985e-05, "loss": 2.1131, "step": 190 }, { "epoch": 0.4499411071849234, "grad_norm": 1.3506840467453003, "learning_rate": 4.504716981132076e-05, "loss": 1.8387, "step": 191 }, { "epoch": 0.45229681978798586, "grad_norm": 0.6292462348937988, "learning_rate": 4.528301886792453e-05, "loss": 2.0908, "step": 192 }, { "epoch": 0.4546525323910483, "grad_norm": 0.6413406729698181, "learning_rate": 4.55188679245283e-05, "loss": 1.9302, "step": 193 }, { "epoch": 0.45700824499411075, "grad_norm": 0.6294993758201599, "learning_rate": 4.575471698113208e-05, "loss": 1.8739, "step": 194 }, { "epoch": 0.45936395759717313, "grad_norm": 0.6342466473579407, "learning_rate": 4.5990566037735846e-05, "loss": 1.9609, "step": 195 }, { "epoch": 0.4617196702002356, "grad_norm": 0.6886049509048462, "learning_rate": 4.6226415094339625e-05, "loss": 1.8775, "step": 196 }, { "epoch": 0.464075382803298, "grad_norm": 0.6618643403053284, "learning_rate": 4.64622641509434e-05, "loss": 1.8866, "step": 197 }, { "epoch": 0.4664310954063604, "grad_norm": 3.9357001781463623, "learning_rate": 4.669811320754717e-05, "loss": 1.9623, "step": 198 }, { "epoch": 0.46878680800942285, "grad_norm": 1.9928069114685059, "learning_rate": 4.693396226415094e-05, "loss": 2.0696, "step": 199 }, { "epoch": 0.4711425206124853, "grad_norm": 0.7599535584449768, "learning_rate": 4.716981132075472e-05, "loss": 1.8438, "step": 200 }, { "epoch": 0.4734982332155477, "grad_norm": 0.6601504683494568, "learning_rate": 4.7405660377358494e-05, "loss": 1.9739, "step": 201 }, { "epoch": 0.4758539458186101, "grad_norm": 0.8352360725402832, "learning_rate": 4.7641509433962266e-05, "loss": 1.9231, "step": 202 }, { "epoch": 0.47820965842167257, "grad_norm": 0.6142988801002502, "learning_rate": 4.787735849056604e-05, "loss": 1.7375, "step": 203 }, { "epoch": 0.48056537102473496, "grad_norm": 0.5656473636627197, "learning_rate": 4.811320754716982e-05, "loss": 1.9006, "step": 204 }, { "epoch": 0.4829210836277974, "grad_norm": 0.7227504253387451, "learning_rate": 4.834905660377358e-05, "loss": 1.7713, "step": 205 }, { "epoch": 0.48527679623085984, "grad_norm": 1.139112114906311, "learning_rate": 4.858490566037736e-05, "loss": 1.8441, "step": 206 }, { "epoch": 0.4876325088339223, "grad_norm": 0.6989550590515137, "learning_rate": 4.8820754716981134e-05, "loss": 2.0427, "step": 207 }, { "epoch": 0.48998822143698467, "grad_norm": 0.6388002038002014, "learning_rate": 4.9056603773584906e-05, "loss": 1.6794, "step": 208 }, { "epoch": 0.4923439340400471, "grad_norm": 0.7161305546760559, "learning_rate": 4.929245283018868e-05, "loss": 1.8625, "step": 209 }, { "epoch": 0.49469964664310956, "grad_norm": 0.5549436211585999, "learning_rate": 4.952830188679246e-05, "loss": 1.7291, "step": 210 }, { "epoch": 0.49705535924617195, "grad_norm": 2.5703818798065186, "learning_rate": 4.976415094339622e-05, "loss": 1.9632, "step": 211 }, { "epoch": 0.4994110718492344, "grad_norm": 0.6657212376594543, "learning_rate": 5e-05, "loss": 1.9461, "step": 212 }, { "epoch": 0.4994110718492344, "eval_loss": 1.8006898164749146, "eval_runtime": 5.7852, "eval_samples_per_second": 432.134, "eval_steps_per_second": 6.914, "step": 212 }, { "epoch": 0.5017667844522968, "grad_norm": 0.6769471168518066, "learning_rate": 4.999996611145527e-05, "loss": 2.0216, "step": 213 }, { "epoch": 0.5041224970553593, "grad_norm": 0.6282266974449158, "learning_rate": 4.9999864445912916e-05, "loss": 1.8744, "step": 214 }, { "epoch": 0.5064782096584217, "grad_norm": 0.6426438689231873, "learning_rate": 4.9999695003648586e-05, "loss": 1.9476, "step": 215 }, { "epoch": 0.508833922261484, "grad_norm": 0.6457599401473999, "learning_rate": 4.999945778512164e-05, "loss": 2.0774, "step": 216 }, { "epoch": 0.5111896348645465, "grad_norm": 0.7095407247543335, "learning_rate": 4.9999152790975205e-05, "loss": 1.8355, "step": 217 }, { "epoch": 0.5135453474676089, "grad_norm": 0.6032217144966125, "learning_rate": 4.999878002203614e-05, "loss": 1.9117, "step": 218 }, { "epoch": 0.5159010600706714, "grad_norm": 0.6550146341323853, "learning_rate": 4.9998339479315066e-05, "loss": 1.8745, "step": 219 }, { "epoch": 0.5182567726737338, "grad_norm": 0.6016785502433777, "learning_rate": 4.9997831164006316e-05, "loss": 1.8694, "step": 220 }, { "epoch": 0.5206124852767963, "grad_norm": 0.6491071581840515, "learning_rate": 4.999725507748798e-05, "loss": 1.6804, "step": 221 }, { "epoch": 0.5229681978798587, "grad_norm": 0.6343960165977478, "learning_rate": 4.999661122132187e-05, "loss": 1.8642, "step": 222 }, { "epoch": 0.525323910482921, "grad_norm": 0.65626060962677, "learning_rate": 4.999589959725355e-05, "loss": 1.9938, "step": 223 }, { "epoch": 0.5276796230859835, "grad_norm": 0.7074131965637207, "learning_rate": 4.999512020721228e-05, "loss": 1.8343, "step": 224 }, { "epoch": 0.5300353356890459, "grad_norm": 0.6016308069229126, "learning_rate": 4.9994273053311044e-05, "loss": 1.7553, "step": 225 }, { "epoch": 0.5323910482921084, "grad_norm": 1.5065337419509888, "learning_rate": 4.999335813784657e-05, "loss": 1.7601, "step": 226 }, { "epoch": 0.5347467608951708, "grad_norm": 0.6413789391517639, "learning_rate": 4.999237546329925e-05, "loss": 1.8783, "step": 227 }, { "epoch": 0.5371024734982333, "grad_norm": 0.6494936943054199, "learning_rate": 4.999132503233321e-05, "loss": 2.0282, "step": 228 }, { "epoch": 0.5394581861012956, "grad_norm": 0.5891048312187195, "learning_rate": 4.999020684779625e-05, "loss": 1.8264, "step": 229 }, { "epoch": 0.541813898704358, "grad_norm": 0.6679076552391052, "learning_rate": 4.9989020912719864e-05, "loss": 1.8928, "step": 230 }, { "epoch": 0.5441696113074205, "grad_norm": 0.6594038605690002, "learning_rate": 4.9987767230319215e-05, "loss": 1.7692, "step": 231 }, { "epoch": 0.5465253239104829, "grad_norm": 0.697609543800354, "learning_rate": 4.9986445803993146e-05, "loss": 1.9078, "step": 232 }, { "epoch": 0.5488810365135454, "grad_norm": 0.6231794357299805, "learning_rate": 4.9985056637324154e-05, "loss": 2.039, "step": 233 }, { "epoch": 0.5512367491166078, "grad_norm": 0.5495326519012451, "learning_rate": 4.998359973407839e-05, "loss": 1.791, "step": 234 }, { "epoch": 0.5535924617196702, "grad_norm": 0.5953809022903442, "learning_rate": 4.9982075098205625e-05, "loss": 1.8047, "step": 235 }, { "epoch": 0.5559481743227326, "grad_norm": 0.5445505976676941, "learning_rate": 4.99804827338393e-05, "loss": 1.7876, "step": 236 }, { "epoch": 0.558303886925795, "grad_norm": 0.5820225477218628, "learning_rate": 4.997882264529642e-05, "loss": 1.8992, "step": 237 }, { "epoch": 0.5606595995288575, "grad_norm": 0.6324096918106079, "learning_rate": 4.997709483707765e-05, "loss": 1.9252, "step": 238 }, { "epoch": 0.5630153121319199, "grad_norm": 0.6112015843391418, "learning_rate": 4.99752993138672e-05, "loss": 1.92, "step": 239 }, { "epoch": 0.5653710247349824, "grad_norm": 0.5869399309158325, "learning_rate": 4.99734360805329e-05, "loss": 2.0446, "step": 240 }, { "epoch": 0.5677267373380448, "grad_norm": 0.6269591450691223, "learning_rate": 4.997150514212611e-05, "loss": 1.8065, "step": 241 }, { "epoch": 0.5700824499411072, "grad_norm": 0.6631391048431396, "learning_rate": 4.9969506503881794e-05, "loss": 1.9359, "step": 242 }, { "epoch": 0.5724381625441696, "grad_norm": 0.560210645198822, "learning_rate": 4.996744017121841e-05, "loss": 1.8612, "step": 243 }, { "epoch": 0.574793875147232, "grad_norm": 0.8256493806838989, "learning_rate": 4.996530614973795e-05, "loss": 2.1189, "step": 244 }, { "epoch": 0.5771495877502945, "grad_norm": 0.6898834109306335, "learning_rate": 4.996310444522595e-05, "loss": 2.0112, "step": 245 }, { "epoch": 0.5795053003533569, "grad_norm": 0.6693411469459534, "learning_rate": 4.99608350636514e-05, "loss": 2.0998, "step": 246 }, { "epoch": 0.5818610129564193, "grad_norm": 1.1690595149993896, "learning_rate": 4.9958498011166775e-05, "loss": 1.91, "step": 247 }, { "epoch": 0.5842167255594818, "grad_norm": 0.669636607170105, "learning_rate": 4.995609329410804e-05, "loss": 1.7085, "step": 248 }, { "epoch": 0.5865724381625441, "grad_norm": 0.6437779068946838, "learning_rate": 4.9953620918994566e-05, "loss": 2.0304, "step": 249 }, { "epoch": 0.5889281507656066, "grad_norm": 0.6279498934745789, "learning_rate": 4.9951080892529176e-05, "loss": 1.8378, "step": 250 }, { "epoch": 0.591283863368669, "grad_norm": 0.8346914649009705, "learning_rate": 4.9948473221598094e-05, "loss": 1.772, "step": 251 }, { "epoch": 0.5936395759717314, "grad_norm": 0.6687989234924316, "learning_rate": 4.994579791327093e-05, "loss": 1.7891, "step": 252 }, { "epoch": 0.5959952885747939, "grad_norm": 0.7373409271240234, "learning_rate": 4.994305497480067e-05, "loss": 1.9203, "step": 253 }, { "epoch": 0.5983510011778563, "grad_norm": 0.6117722988128662, "learning_rate": 4.994024441362366e-05, "loss": 2.0566, "step": 254 }, { "epoch": 0.6007067137809188, "grad_norm": 1.1306655406951904, "learning_rate": 4.993736623735955e-05, "loss": 1.8772, "step": 255 }, { "epoch": 0.6030624263839811, "grad_norm": 0.6340383887290955, "learning_rate": 4.9934420453811334e-05, "loss": 1.7935, "step": 256 }, { "epoch": 0.6054181389870436, "grad_norm": 0.5813724398612976, "learning_rate": 4.9931407070965254e-05, "loss": 1.869, "step": 257 }, { "epoch": 0.607773851590106, "grad_norm": 0.5423834919929504, "learning_rate": 4.992832609699086e-05, "loss": 1.7559, "step": 258 }, { "epoch": 0.6101295641931684, "grad_norm": 0.614717423915863, "learning_rate": 4.992517754024093e-05, "loss": 1.8779, "step": 259 }, { "epoch": 0.6124852767962309, "grad_norm": 0.5490809679031372, "learning_rate": 4.9921961409251464e-05, "loss": 1.7619, "step": 260 }, { "epoch": 0.6148409893992933, "grad_norm": 0.6208000779151917, "learning_rate": 4.9918677712741644e-05, "loss": 1.7843, "step": 261 }, { "epoch": 0.6171967020023557, "grad_norm": 0.5541819930076599, "learning_rate": 4.991532645961387e-05, "loss": 1.6837, "step": 262 }, { "epoch": 0.6195524146054181, "grad_norm": 3.8291265964508057, "learning_rate": 4.9911907658953664e-05, "loss": 1.6909, "step": 263 }, { "epoch": 0.6219081272084805, "grad_norm": 0.6292436718940735, "learning_rate": 4.990842132002966e-05, "loss": 1.7922, "step": 264 }, { "epoch": 0.624263839811543, "grad_norm": 0.5951045155525208, "learning_rate": 4.990486745229364e-05, "loss": 1.8001, "step": 265 }, { "epoch": 0.6266195524146054, "grad_norm": 0.6285128593444824, "learning_rate": 4.990124606538042e-05, "loss": 1.9755, "step": 266 }, { "epoch": 0.6289752650176679, "grad_norm": 0.6398470997810364, "learning_rate": 4.98975571691079e-05, "loss": 1.8482, "step": 267 }, { "epoch": 0.6313309776207303, "grad_norm": 0.8720383048057556, "learning_rate": 4.9893800773476965e-05, "loss": 1.7867, "step": 268 }, { "epoch": 0.6336866902237926, "grad_norm": 0.5867232084274292, "learning_rate": 4.988997688867153e-05, "loss": 1.9016, "step": 269 }, { "epoch": 0.6360424028268551, "grad_norm": 0.5350059866905212, "learning_rate": 4.988608552505847e-05, "loss": 1.6426, "step": 270 }, { "epoch": 0.6383981154299175, "grad_norm": 0.5895572304725647, "learning_rate": 4.988212669318758e-05, "loss": 1.9962, "step": 271 }, { "epoch": 0.64075382803298, "grad_norm": 0.6291583776473999, "learning_rate": 4.98781004037916e-05, "loss": 1.8901, "step": 272 }, { "epoch": 0.6431095406360424, "grad_norm": 0.5860071182250977, "learning_rate": 4.987400666778614e-05, "loss": 1.7443, "step": 273 }, { "epoch": 0.6454652532391049, "grad_norm": 0.5617952942848206, "learning_rate": 4.986984549626964e-05, "loss": 1.8183, "step": 274 }, { "epoch": 0.6478209658421673, "grad_norm": 0.6653210520744324, "learning_rate": 4.9865616900523406e-05, "loss": 1.7622, "step": 275 }, { "epoch": 0.6501766784452296, "grad_norm": 0.6908093094825745, "learning_rate": 4.98613208920115e-05, "loss": 1.9267, "step": 276 }, { "epoch": 0.6525323910482921, "grad_norm": 0.6031364798545837, "learning_rate": 4.985695748238076e-05, "loss": 1.6087, "step": 277 }, { "epoch": 0.6548881036513545, "grad_norm": 0.7004727125167847, "learning_rate": 4.985252668346076e-05, "loss": 1.8771, "step": 278 }, { "epoch": 0.657243816254417, "grad_norm": 0.574828028678894, "learning_rate": 4.984802850726378e-05, "loss": 1.76, "step": 279 }, { "epoch": 0.6595995288574794, "grad_norm": 0.5871966481208801, "learning_rate": 4.984346296598472e-05, "loss": 1.7852, "step": 280 }, { "epoch": 0.6619552414605419, "grad_norm": 0.6312404274940491, "learning_rate": 4.9838830072001165e-05, "loss": 1.8548, "step": 281 }, { "epoch": 0.6643109540636042, "grad_norm": 0.6077771186828613, "learning_rate": 4.983412983787328e-05, "loss": 1.7155, "step": 282 }, { "epoch": 0.6666666666666666, "grad_norm": 0.5615354776382446, "learning_rate": 4.982936227634378e-05, "loss": 1.8257, "step": 283 }, { "epoch": 0.6690223792697291, "grad_norm": 0.57930588722229, "learning_rate": 4.982452740033793e-05, "loss": 1.5975, "step": 284 }, { "epoch": 0.6713780918727915, "grad_norm": 0.6212072968482971, "learning_rate": 4.981962522296348e-05, "loss": 1.6618, "step": 285 }, { "epoch": 0.673733804475854, "grad_norm": 0.5742200016975403, "learning_rate": 4.9814655757510644e-05, "loss": 1.62, "step": 286 }, { "epoch": 0.6760895170789164, "grad_norm": 1.123051643371582, "learning_rate": 4.980961901745206e-05, "loss": 1.7496, "step": 287 }, { "epoch": 0.6784452296819788, "grad_norm": 0.5811015963554382, "learning_rate": 4.9804515016442754e-05, "loss": 1.7005, "step": 288 }, { "epoch": 0.6808009422850412, "grad_norm": 1.0648763179779053, "learning_rate": 4.979934376832009e-05, "loss": 1.9501, "step": 289 }, { "epoch": 0.6831566548881036, "grad_norm": 0.6187362670898438, "learning_rate": 4.979410528710377e-05, "loss": 1.754, "step": 290 }, { "epoch": 0.6855123674911661, "grad_norm": 0.589699387550354, "learning_rate": 4.978879958699573e-05, "loss": 1.7977, "step": 291 }, { "epoch": 0.6878680800942285, "grad_norm": 0.7317181825637817, "learning_rate": 4.978342668238018e-05, "loss": 1.8501, "step": 292 }, { "epoch": 0.690223792697291, "grad_norm": 0.5470342040061951, "learning_rate": 4.9777986587823523e-05, "loss": 1.8302, "step": 293 }, { "epoch": 0.6925795053003534, "grad_norm": 0.8782589435577393, "learning_rate": 4.9772479318074296e-05, "loss": 1.7005, "step": 294 }, { "epoch": 0.6949352179034158, "grad_norm": 0.6678377985954285, "learning_rate": 4.976690488806317e-05, "loss": 1.7317, "step": 295 }, { "epoch": 0.6972909305064782, "grad_norm": 0.6853668093681335, "learning_rate": 4.9761263312902895e-05, "loss": 1.9207, "step": 296 }, { "epoch": 0.6996466431095406, "grad_norm": 0.6446820497512817, "learning_rate": 4.975555460788825e-05, "loss": 1.8787, "step": 297 }, { "epoch": 0.702002355712603, "grad_norm": 0.6354290246963501, "learning_rate": 4.974977878849601e-05, "loss": 1.7606, "step": 298 }, { "epoch": 0.7043580683156655, "grad_norm": 0.5959171652793884, "learning_rate": 4.9743935870384906e-05, "loss": 1.8176, "step": 299 }, { "epoch": 0.7067137809187279, "grad_norm": 0.618241012096405, "learning_rate": 4.973802586939558e-05, "loss": 1.8867, "step": 300 }, { "epoch": 0.7090694935217904, "grad_norm": 1.0189661979675293, "learning_rate": 4.973204880155053e-05, "loss": 1.795, "step": 301 }, { "epoch": 0.7114252061248527, "grad_norm": 0.6313762664794922, "learning_rate": 4.97260046830541e-05, "loss": 1.7504, "step": 302 }, { "epoch": 0.7137809187279152, "grad_norm": 0.6779804825782776, "learning_rate": 4.9719893530292396e-05, "loss": 1.8747, "step": 303 }, { "epoch": 0.7161366313309776, "grad_norm": 0.9850471615791321, "learning_rate": 4.971371535983325e-05, "loss": 1.8268, "step": 304 }, { "epoch": 0.71849234393404, "grad_norm": 0.6137601137161255, "learning_rate": 4.970747018842622e-05, "loss": 1.8958, "step": 305 }, { "epoch": 0.7208480565371025, "grad_norm": 0.6188938617706299, "learning_rate": 4.970115803300247e-05, "loss": 1.6883, "step": 306 }, { "epoch": 0.7232037691401649, "grad_norm": 0.6189510822296143, "learning_rate": 4.9694778910674785e-05, "loss": 1.8922, "step": 307 }, { "epoch": 0.7255594817432274, "grad_norm": 0.5877013206481934, "learning_rate": 4.9688332838737504e-05, "loss": 1.9098, "step": 308 }, { "epoch": 0.7279151943462897, "grad_norm": 0.6208062171936035, "learning_rate": 4.968181983466647e-05, "loss": 2.007, "step": 309 }, { "epoch": 0.7302709069493521, "grad_norm": 0.7047111988067627, "learning_rate": 4.9675239916118975e-05, "loss": 1.7296, "step": 310 }, { "epoch": 0.7326266195524146, "grad_norm": 0.9179831147193909, "learning_rate": 4.966859310093372e-05, "loss": 1.8694, "step": 311 }, { "epoch": 0.734982332155477, "grad_norm": 0.581491231918335, "learning_rate": 4.966187940713079e-05, "loss": 1.8475, "step": 312 }, { "epoch": 0.7373380447585395, "grad_norm": 0.5414103269577026, "learning_rate": 4.965509885291157e-05, "loss": 1.821, "step": 313 }, { "epoch": 0.7396937573616019, "grad_norm": 0.5915893316268921, "learning_rate": 4.96482514566587e-05, "loss": 1.6652, "step": 314 }, { "epoch": 0.7420494699646644, "grad_norm": 0.5945522785186768, "learning_rate": 4.964133723693606e-05, "loss": 1.591, "step": 315 }, { "epoch": 0.7444051825677267, "grad_norm": 0.6191164255142212, "learning_rate": 4.963435621248865e-05, "loss": 1.8451, "step": 316 }, { "epoch": 0.7467608951707891, "grad_norm": 0.7283751368522644, "learning_rate": 4.962730840224263e-05, "loss": 1.8069, "step": 317 }, { "epoch": 0.7491166077738516, "grad_norm": 0.699566662311554, "learning_rate": 4.962019382530521e-05, "loss": 1.8694, "step": 318 }, { "epoch": 0.751472320376914, "grad_norm": 0.5441856980323792, "learning_rate": 4.961301250096457e-05, "loss": 1.5864, "step": 319 }, { "epoch": 0.7538280329799765, "grad_norm": 0.6140335202217102, "learning_rate": 4.960576444868992e-05, "loss": 1.8208, "step": 320 }, { "epoch": 0.7561837455830389, "grad_norm": 0.5850114226341248, "learning_rate": 4.959844968813132e-05, "loss": 1.7265, "step": 321 }, { "epoch": 0.7585394581861012, "grad_norm": 0.5820872187614441, "learning_rate": 4.9591068239119684e-05, "loss": 1.7387, "step": 322 }, { "epoch": 0.7608951707891637, "grad_norm": 0.6233322620391846, "learning_rate": 4.9583620121666754e-05, "loss": 1.7013, "step": 323 }, { "epoch": 0.7632508833922261, "grad_norm": 0.6401897072792053, "learning_rate": 4.9576105355964995e-05, "loss": 1.9157, "step": 324 }, { "epoch": 0.7656065959952886, "grad_norm": 3.3652503490448, "learning_rate": 4.956852396238756e-05, "loss": 1.6955, "step": 325 }, { "epoch": 0.767962308598351, "grad_norm": 0.6145344972610474, "learning_rate": 4.956087596148824e-05, "loss": 1.8384, "step": 326 }, { "epoch": 0.7703180212014135, "grad_norm": 0.6970428824424744, "learning_rate": 4.955316137400141e-05, "loss": 1.7948, "step": 327 }, { "epoch": 0.7726737338044759, "grad_norm": 0.5910301208496094, "learning_rate": 4.954538022084196e-05, "loss": 1.556, "step": 328 }, { "epoch": 0.7750294464075382, "grad_norm": 0.5500912070274353, "learning_rate": 4.953753252310526e-05, "loss": 1.7743, "step": 329 }, { "epoch": 0.7773851590106007, "grad_norm": 0.6361621618270874, "learning_rate": 4.952961830206704e-05, "loss": 1.8463, "step": 330 }, { "epoch": 0.7797408716136631, "grad_norm": 0.5666980743408203, "learning_rate": 4.952163757918344e-05, "loss": 1.8397, "step": 331 }, { "epoch": 0.7820965842167256, "grad_norm": 0.6597093343734741, "learning_rate": 4.951359037609088e-05, "loss": 1.9733, "step": 332 }, { "epoch": 0.784452296819788, "grad_norm": 0.5398449897766113, "learning_rate": 4.950547671460596e-05, "loss": 1.7557, "step": 333 }, { "epoch": 0.7868080094228505, "grad_norm": 0.6826884150505066, "learning_rate": 4.949729661672553e-05, "loss": 1.6537, "step": 334 }, { "epoch": 0.7891637220259128, "grad_norm": 0.653356671333313, "learning_rate": 4.94890501046265e-05, "loss": 1.5766, "step": 335 }, { "epoch": 0.7915194346289752, "grad_norm": 0.5746085047721863, "learning_rate": 4.948073720066587e-05, "loss": 1.714, "step": 336 }, { "epoch": 0.7938751472320377, "grad_norm": 0.6020729541778564, "learning_rate": 4.9472357927380595e-05, "loss": 1.6903, "step": 337 }, { "epoch": 0.7962308598351001, "grad_norm": 0.6362228989601135, "learning_rate": 4.94639123074876e-05, "loss": 1.8787, "step": 338 }, { "epoch": 0.7985865724381626, "grad_norm": 0.5767459869384766, "learning_rate": 4.945540036388366e-05, "loss": 1.5849, "step": 339 }, { "epoch": 0.800942285041225, "grad_norm": 0.5863094329833984, "learning_rate": 4.944682211964538e-05, "loss": 1.7365, "step": 340 }, { "epoch": 0.8032979976442874, "grad_norm": 0.5866963267326355, "learning_rate": 4.943817759802908e-05, "loss": 1.7624, "step": 341 }, { "epoch": 0.8056537102473498, "grad_norm": 0.5772634148597717, "learning_rate": 4.9429466822470786e-05, "loss": 1.7361, "step": 342 }, { "epoch": 0.8080094228504122, "grad_norm": 0.6611394286155701, "learning_rate": 4.942068981658614e-05, "loss": 1.8466, "step": 343 }, { "epoch": 0.8103651354534747, "grad_norm": 0.7224000096321106, "learning_rate": 4.9411846604170345e-05, "loss": 1.6435, "step": 344 }, { "epoch": 0.8127208480565371, "grad_norm": 0.6376648545265198, "learning_rate": 4.940293720919807e-05, "loss": 1.7142, "step": 345 }, { "epoch": 0.8150765606595995, "grad_norm": 0.581255316734314, "learning_rate": 4.9393961655823454e-05, "loss": 1.4524, "step": 346 }, { "epoch": 0.817432273262662, "grad_norm": 0.5886533260345459, "learning_rate": 4.9384919968379945e-05, "loss": 1.8383, "step": 347 }, { "epoch": 0.8197879858657244, "grad_norm": 0.7854883074760437, "learning_rate": 4.9375812171380334e-05, "loss": 1.8286, "step": 348 }, { "epoch": 0.8221436984687868, "grad_norm": 0.6230300068855286, "learning_rate": 4.9366638289516613e-05, "loss": 1.7542, "step": 349 }, { "epoch": 0.8244994110718492, "grad_norm": 0.5796071887016296, "learning_rate": 4.935739834765995e-05, "loss": 1.6971, "step": 350 }, { "epoch": 0.8268551236749117, "grad_norm": 1.5019482374191284, "learning_rate": 4.934809237086059e-05, "loss": 1.843, "step": 351 }, { "epoch": 0.8292108362779741, "grad_norm": 0.5636314749717712, "learning_rate": 4.933872038434782e-05, "loss": 1.8525, "step": 352 }, { "epoch": 0.8315665488810365, "grad_norm": 2.198636770248413, "learning_rate": 4.932928241352987e-05, "loss": 1.7347, "step": 353 }, { "epoch": 0.833922261484099, "grad_norm": 0.6105632185935974, "learning_rate": 4.931977848399386e-05, "loss": 1.7863, "step": 354 }, { "epoch": 0.8362779740871613, "grad_norm": 0.6488332152366638, "learning_rate": 4.931020862150577e-05, "loss": 1.8749, "step": 355 }, { "epoch": 0.8386336866902238, "grad_norm": 0.5949331521987915, "learning_rate": 4.930057285201027e-05, "loss": 1.7784, "step": 356 }, { "epoch": 0.8409893992932862, "grad_norm": 0.5943877100944519, "learning_rate": 4.929087120163075e-05, "loss": 1.7366, "step": 357 }, { "epoch": 0.8433451118963486, "grad_norm": 0.6062324643135071, "learning_rate": 4.928110369666918e-05, "loss": 1.8079, "step": 358 }, { "epoch": 0.8457008244994111, "grad_norm": 0.5146484375, "learning_rate": 4.9271270363606106e-05, "loss": 1.6455, "step": 359 }, { "epoch": 0.8480565371024735, "grad_norm": 0.5779191255569458, "learning_rate": 4.926137122910048e-05, "loss": 1.8195, "step": 360 }, { "epoch": 0.850412249705536, "grad_norm": 0.6685857176780701, "learning_rate": 4.9251406319989725e-05, "loss": 1.6158, "step": 361 }, { "epoch": 0.8527679623085983, "grad_norm": 0.7092341780662537, "learning_rate": 4.9241375663289516e-05, "loss": 1.5658, "step": 362 }, { "epoch": 0.8551236749116607, "grad_norm": 1.1926854848861694, "learning_rate": 4.92312792861938e-05, "loss": 1.6644, "step": 363 }, { "epoch": 0.8574793875147232, "grad_norm": 0.6292633414268494, "learning_rate": 4.922111721607471e-05, "loss": 1.4956, "step": 364 }, { "epoch": 0.8598351001177856, "grad_norm": 0.5968769192695618, "learning_rate": 4.921088948048247e-05, "loss": 1.73, "step": 365 }, { "epoch": 0.8621908127208481, "grad_norm": 0.6557877659797668, "learning_rate": 4.920059610714531e-05, "loss": 1.8831, "step": 366 }, { "epoch": 0.8645465253239105, "grad_norm": 0.5954815745353699, "learning_rate": 4.919023712396944e-05, "loss": 1.5396, "step": 367 }, { "epoch": 0.866902237926973, "grad_norm": 0.6220850944519043, "learning_rate": 4.917981255903893e-05, "loss": 1.7124, "step": 368 }, { "epoch": 0.8692579505300353, "grad_norm": 0.672264039516449, "learning_rate": 4.916932244061564e-05, "loss": 1.77, "step": 369 }, { "epoch": 0.8716136631330977, "grad_norm": 0.9075942635536194, "learning_rate": 4.915876679713916e-05, "loss": 1.6479, "step": 370 }, { "epoch": 0.8739693757361602, "grad_norm": 0.6195477843284607, "learning_rate": 4.914814565722671e-05, "loss": 1.6519, "step": 371 }, { "epoch": 0.8763250883392226, "grad_norm": 0.9346473813056946, "learning_rate": 4.9137459049673105e-05, "loss": 1.734, "step": 372 }, { "epoch": 0.8786808009422851, "grad_norm": 0.9948516488075256, "learning_rate": 4.912670700345063e-05, "loss": 1.668, "step": 373 }, { "epoch": 0.8810365135453475, "grad_norm": 0.6109210252761841, "learning_rate": 4.911588954770897e-05, "loss": 1.8623, "step": 374 }, { "epoch": 0.8833922261484098, "grad_norm": 3.9632914066314697, "learning_rate": 4.9105006711775157e-05, "loss": 1.6687, "step": 375 }, { "epoch": 0.8857479387514723, "grad_norm": 0.6873841285705566, "learning_rate": 4.9094058525153475e-05, "loss": 1.724, "step": 376 }, { "epoch": 0.8881036513545347, "grad_norm": 0.575564444065094, "learning_rate": 4.908304501752536e-05, "loss": 1.5528, "step": 377 }, { "epoch": 0.8904593639575972, "grad_norm": 0.5377947092056274, "learning_rate": 4.907196621874937e-05, "loss": 1.5681, "step": 378 }, { "epoch": 0.8928150765606596, "grad_norm": 0.5980172753334045, "learning_rate": 4.906082215886104e-05, "loss": 1.7711, "step": 379 }, { "epoch": 0.8951707891637221, "grad_norm": 0.6780989766120911, "learning_rate": 4.9049612868072844e-05, "loss": 1.726, "step": 380 }, { "epoch": 0.8975265017667845, "grad_norm": 0.6411519646644592, "learning_rate": 4.9038338376774124e-05, "loss": 1.7187, "step": 381 }, { "epoch": 0.8998822143698468, "grad_norm": 0.5894859433174133, "learning_rate": 4.902699871553095e-05, "loss": 1.8432, "step": 382 }, { "epoch": 0.9022379269729093, "grad_norm": 0.6304222941398621, "learning_rate": 4.901559391508611e-05, "loss": 1.7324, "step": 383 }, { "epoch": 0.9045936395759717, "grad_norm": 0.895294189453125, "learning_rate": 4.900412400635895e-05, "loss": 1.834, "step": 384 }, { "epoch": 0.9069493521790342, "grad_norm": 0.5941535830497742, "learning_rate": 4.899258902044536e-05, "loss": 1.8499, "step": 385 }, { "epoch": 0.9093050647820966, "grad_norm": 0.6085304021835327, "learning_rate": 4.898098898861766e-05, "loss": 1.632, "step": 386 }, { "epoch": 0.911660777385159, "grad_norm": 0.6239786148071289, "learning_rate": 4.8969323942324494e-05, "loss": 1.6422, "step": 387 }, { "epoch": 0.9140164899882215, "grad_norm": 0.5546228289604187, "learning_rate": 4.895759391319079e-05, "loss": 1.7297, "step": 388 }, { "epoch": 0.9163722025912838, "grad_norm": 0.6163750886917114, "learning_rate": 4.8945798933017616e-05, "loss": 1.6739, "step": 389 }, { "epoch": 0.9187279151943463, "grad_norm": 0.653086245059967, "learning_rate": 4.893393903378217e-05, "loss": 1.6182, "step": 390 }, { "epoch": 0.9210836277974087, "grad_norm": 1.7431739568710327, "learning_rate": 4.892201424763762e-05, "loss": 1.953, "step": 391 }, { "epoch": 0.9234393404004712, "grad_norm": 2.9103164672851562, "learning_rate": 4.891002460691306e-05, "loss": 1.6654, "step": 392 }, { "epoch": 0.9257950530035336, "grad_norm": 0.5413578748703003, "learning_rate": 4.8897970144113416e-05, "loss": 1.7318, "step": 393 }, { "epoch": 0.928150765606596, "grad_norm": 0.735379695892334, "learning_rate": 4.8885850891919326e-05, "loss": 1.5396, "step": 394 }, { "epoch": 0.9305064782096584, "grad_norm": 2.7076292037963867, "learning_rate": 4.8873666883187105e-05, "loss": 1.7183, "step": 395 }, { "epoch": 0.9328621908127208, "grad_norm": 0.5648126006126404, "learning_rate": 4.886141815094863e-05, "loss": 1.5761, "step": 396 }, { "epoch": 0.9352179034157833, "grad_norm": 0.6309358477592468, "learning_rate": 4.8849104728411225e-05, "loss": 1.726, "step": 397 }, { "epoch": 0.9375736160188457, "grad_norm": 0.554670512676239, "learning_rate": 4.883672664895761e-05, "loss": 1.7465, "step": 398 }, { "epoch": 0.9399293286219081, "grad_norm": 0.6125677227973938, "learning_rate": 4.882428394614579e-05, "loss": 1.7745, "step": 399 }, { "epoch": 0.9422850412249706, "grad_norm": 4.0775604248046875, "learning_rate": 4.881177665370898e-05, "loss": 1.7072, "step": 400 }, { "epoch": 0.944640753828033, "grad_norm": 0.6792272925376892, "learning_rate": 4.879920480555549e-05, "loss": 1.8636, "step": 401 }, { "epoch": 0.9469964664310954, "grad_norm": 0.5895318984985352, "learning_rate": 4.878656843576865e-05, "loss": 1.612, "step": 402 }, { "epoch": 0.9493521790341578, "grad_norm": 0.5848386287689209, "learning_rate": 4.8773867578606716e-05, "loss": 1.7455, "step": 403 }, { "epoch": 0.9517078916372202, "grad_norm": 0.5538029670715332, "learning_rate": 4.876110226850278e-05, "loss": 1.6585, "step": 404 }, { "epoch": 0.9540636042402827, "grad_norm": 0.585517168045044, "learning_rate": 4.8748272540064655e-05, "loss": 1.7647, "step": 405 }, { "epoch": 0.9564193168433451, "grad_norm": 0.6457448601722717, "learning_rate": 4.8735378428074806e-05, "loss": 1.568, "step": 406 }, { "epoch": 0.9587750294464076, "grad_norm": 0.6435514688491821, "learning_rate": 4.872241996749026e-05, "loss": 1.811, "step": 407 }, { "epoch": 0.9611307420494699, "grad_norm": 0.5694456696510315, "learning_rate": 4.8709397193442474e-05, "loss": 1.6466, "step": 408 }, { "epoch": 0.9634864546525324, "grad_norm": 0.5727643370628357, "learning_rate": 4.8696310141237286e-05, "loss": 1.7072, "step": 409 }, { "epoch": 0.9658421672555948, "grad_norm": 0.5494128465652466, "learning_rate": 4.8683158846354786e-05, "loss": 1.6212, "step": 410 }, { "epoch": 0.9681978798586572, "grad_norm": 0.5743308067321777, "learning_rate": 4.866994334444923e-05, "loss": 1.6375, "step": 411 }, { "epoch": 0.9705535924617197, "grad_norm": 0.6023212671279907, "learning_rate": 4.865666367134896e-05, "loss": 1.7818, "step": 412 }, { "epoch": 0.9729093050647821, "grad_norm": 0.6677923202514648, "learning_rate": 4.864331986305626e-05, "loss": 1.5487, "step": 413 }, { "epoch": 0.9752650176678446, "grad_norm": 0.584697425365448, "learning_rate": 4.862991195574732e-05, "loss": 1.7279, "step": 414 }, { "epoch": 0.9776207302709069, "grad_norm": 0.7143948674201965, "learning_rate": 4.8616439985772097e-05, "loss": 1.6314, "step": 415 }, { "epoch": 0.9799764428739693, "grad_norm": 0.6194884777069092, "learning_rate": 4.8602903989654224e-05, "loss": 1.6968, "step": 416 }, { "epoch": 0.9823321554770318, "grad_norm": 0.5930228233337402, "learning_rate": 4.858930400409093e-05, "loss": 1.6787, "step": 417 }, { "epoch": 0.9846878680800942, "grad_norm": 0.5381578207015991, "learning_rate": 4.8575640065952905e-05, "loss": 1.4405, "step": 418 }, { "epoch": 0.9870435806831567, "grad_norm": 0.6392894983291626, "learning_rate": 4.8561912212284223e-05, "loss": 1.6775, "step": 419 }, { "epoch": 0.9893992932862191, "grad_norm": 0.6020627617835999, "learning_rate": 4.8548120480302245e-05, "loss": 1.6117, "step": 420 }, { "epoch": 0.9917550058892816, "grad_norm": 0.6004360318183899, "learning_rate": 4.853426490739751e-05, "loss": 1.7249, "step": 421 }, { "epoch": 0.9941107184923439, "grad_norm": 0.668624997138977, "learning_rate": 4.8520345531133636e-05, "loss": 1.8198, "step": 422 }, { "epoch": 0.9964664310954063, "grad_norm": 0.5948902368545532, "learning_rate": 4.850636238924722e-05, "loss": 1.749, "step": 423 }, { "epoch": 0.9988221436984688, "grad_norm": 0.5641354918479919, "learning_rate": 4.849231551964771e-05, "loss": 1.6198, "step": 424 }, { "epoch": 0.9988221436984688, "eval_loss": 1.544342041015625, "eval_runtime": 5.7627, "eval_samples_per_second": 433.826, "eval_steps_per_second": 6.941, "step": 424 }, { "epoch": 1.0011778563015312, "grad_norm": 0.6064740419387817, "learning_rate": 4.8478204960417364e-05, "loss": 1.463, "step": 425 }, { "epoch": 1.0035335689045937, "grad_norm": 2.208122491836548, "learning_rate": 4.846403074981107e-05, "loss": 1.5992, "step": 426 }, { "epoch": 1.005889281507656, "grad_norm": 0.7967551350593567, "learning_rate": 4.844979292625632e-05, "loss": 1.7462, "step": 427 }, { "epoch": 1.0082449941107186, "grad_norm": 0.5634446740150452, "learning_rate": 4.8435491528353026e-05, "loss": 1.6955, "step": 428 }, { "epoch": 1.010600706713781, "grad_norm": 8.470071792602539, "learning_rate": 4.8421126594873476e-05, "loss": 1.5876, "step": 429 }, { "epoch": 1.0129564193168434, "grad_norm": 0.5738750696182251, "learning_rate": 4.8406698164762206e-05, "loss": 1.5861, "step": 430 }, { "epoch": 1.0153121319199059, "grad_norm": 0.5818897485733032, "learning_rate": 4.8392206277135896e-05, "loss": 1.6656, "step": 431 }, { "epoch": 1.017667844522968, "grad_norm": 0.6000969409942627, "learning_rate": 4.837765097128326e-05, "loss": 1.5597, "step": 432 }, { "epoch": 1.0200235571260305, "grad_norm": 0.5801917910575867, "learning_rate": 4.8363032286664964e-05, "loss": 1.7276, "step": 433 }, { "epoch": 1.022379269729093, "grad_norm": 0.519159734249115, "learning_rate": 4.834835026291347e-05, "loss": 1.5654, "step": 434 }, { "epoch": 1.0247349823321554, "grad_norm": 0.6257019639015198, "learning_rate": 4.8333604939832974e-05, "loss": 1.6431, "step": 435 }, { "epoch": 1.0270906949352179, "grad_norm": 0.7148321270942688, "learning_rate": 4.831879635739929e-05, "loss": 1.6346, "step": 436 }, { "epoch": 1.0294464075382803, "grad_norm": 0.6827388405799866, "learning_rate": 4.83039245557597e-05, "loss": 1.8716, "step": 437 }, { "epoch": 1.0318021201413428, "grad_norm": 0.5663527846336365, "learning_rate": 4.828898957523292e-05, "loss": 1.6633, "step": 438 }, { "epoch": 1.0341578327444052, "grad_norm": 0.613182008266449, "learning_rate": 4.827399145630892e-05, "loss": 1.6806, "step": 439 }, { "epoch": 1.0365135453474676, "grad_norm": 0.5635647177696228, "learning_rate": 4.8258930239648865e-05, "loss": 1.635, "step": 440 }, { "epoch": 1.03886925795053, "grad_norm": 0.7704114317893982, "learning_rate": 4.824380596608497e-05, "loss": 1.75, "step": 441 }, { "epoch": 1.0412249705535925, "grad_norm": 0.6245626211166382, "learning_rate": 4.8228618676620384e-05, "loss": 1.7712, "step": 442 }, { "epoch": 1.043580683156655, "grad_norm": 0.5988133549690247, "learning_rate": 4.821336841242915e-05, "loss": 1.7974, "step": 443 }, { "epoch": 1.0459363957597174, "grad_norm": 0.6470246315002441, "learning_rate": 4.8198055214855984e-05, "loss": 1.7524, "step": 444 }, { "epoch": 1.0482921083627796, "grad_norm": 0.5523703694343567, "learning_rate": 4.818267912541626e-05, "loss": 1.687, "step": 445 }, { "epoch": 1.050647820965842, "grad_norm": 0.6131590604782104, "learning_rate": 4.8167240185795835e-05, "loss": 1.6322, "step": 446 }, { "epoch": 1.0530035335689045, "grad_norm": 0.5566257238388062, "learning_rate": 4.815173843785097e-05, "loss": 1.6383, "step": 447 }, { "epoch": 1.055359246171967, "grad_norm": 0.6725663542747498, "learning_rate": 4.813617392360818e-05, "loss": 1.7443, "step": 448 }, { "epoch": 1.0577149587750294, "grad_norm": 0.6082566380500793, "learning_rate": 4.8120546685264197e-05, "loss": 1.8492, "step": 449 }, { "epoch": 1.0600706713780919, "grad_norm": 0.5824471712112427, "learning_rate": 4.8104856765185746e-05, "loss": 1.7073, "step": 450 }, { "epoch": 1.0624263839811543, "grad_norm": 0.6267476081848145, "learning_rate": 4.8089104205909506e-05, "loss": 1.6399, "step": 451 }, { "epoch": 1.0647820965842167, "grad_norm": 2.478548049926758, "learning_rate": 4.807328905014201e-05, "loss": 1.6308, "step": 452 }, { "epoch": 1.0671378091872792, "grad_norm": 0.564146101474762, "learning_rate": 4.805741134075944e-05, "loss": 1.6282, "step": 453 }, { "epoch": 1.0694935217903416, "grad_norm": 1.9383280277252197, "learning_rate": 4.8041471120807605e-05, "loss": 1.6734, "step": 454 }, { "epoch": 1.071849234393404, "grad_norm": 0.5730167627334595, "learning_rate": 4.8025468433501774e-05, "loss": 1.6655, "step": 455 }, { "epoch": 1.0742049469964665, "grad_norm": 0.614245593547821, "learning_rate": 4.800940332222656e-05, "loss": 1.6435, "step": 456 }, { "epoch": 1.076560659599529, "grad_norm": 0.8287820219993591, "learning_rate": 4.799327583053585e-05, "loss": 1.709, "step": 457 }, { "epoch": 1.0789163722025914, "grad_norm": 0.5220635533332825, "learning_rate": 4.797708600215258e-05, "loss": 1.4952, "step": 458 }, { "epoch": 1.0812720848056536, "grad_norm": 0.5129628777503967, "learning_rate": 4.796083388096877e-05, "loss": 1.5674, "step": 459 }, { "epoch": 1.083627797408716, "grad_norm": 0.7245091199874878, "learning_rate": 4.794451951104525e-05, "loss": 1.699, "step": 460 }, { "epoch": 1.0859835100117785, "grad_norm": 0.617669939994812, "learning_rate": 4.792814293661164e-05, "loss": 1.7529, "step": 461 }, { "epoch": 1.088339222614841, "grad_norm": 0.6354297399520874, "learning_rate": 4.791170420206622e-05, "loss": 1.7308, "step": 462 }, { "epoch": 1.0906949352179034, "grad_norm": 0.6580603718757629, "learning_rate": 4.789520335197577e-05, "loss": 1.5546, "step": 463 }, { "epoch": 1.0930506478209658, "grad_norm": 0.5997500419616699, "learning_rate": 4.787864043107546e-05, "loss": 1.6025, "step": 464 }, { "epoch": 1.0954063604240283, "grad_norm": 0.6430968046188354, "learning_rate": 4.786201548426877e-05, "loss": 1.6082, "step": 465 }, { "epoch": 1.0977620730270907, "grad_norm": 0.604248583316803, "learning_rate": 4.7845328556627306e-05, "loss": 1.6119, "step": 466 }, { "epoch": 1.1001177856301532, "grad_norm": 0.6496431827545166, "learning_rate": 4.782857969339073e-05, "loss": 1.6845, "step": 467 }, { "epoch": 1.1024734982332156, "grad_norm": 0.5222844481468201, "learning_rate": 4.7811768939966605e-05, "loss": 1.516, "step": 468 }, { "epoch": 1.104829210836278, "grad_norm": 0.5884813070297241, "learning_rate": 4.77948963419303e-05, "loss": 1.6519, "step": 469 }, { "epoch": 1.1071849234393405, "grad_norm": 0.6031155586242676, "learning_rate": 4.777796194502483e-05, "loss": 1.6111, "step": 470 }, { "epoch": 1.1095406360424027, "grad_norm": 0.7031946778297424, "learning_rate": 4.776096579516076e-05, "loss": 1.6297, "step": 471 }, { "epoch": 1.1118963486454652, "grad_norm": 0.6292266249656677, "learning_rate": 4.7743907938416074e-05, "loss": 1.7494, "step": 472 }, { "epoch": 1.1142520612485276, "grad_norm": 0.6390065550804138, "learning_rate": 4.772678842103605e-05, "loss": 1.5896, "step": 473 }, { "epoch": 1.11660777385159, "grad_norm": 0.6709311604499817, "learning_rate": 4.7709607289433124e-05, "loss": 1.6025, "step": 474 }, { "epoch": 1.1189634864546525, "grad_norm": 0.6233572363853455, "learning_rate": 4.769236459018679e-05, "loss": 1.503, "step": 475 }, { "epoch": 1.121319199057715, "grad_norm": 0.5501691699028015, "learning_rate": 4.767506037004344e-05, "loss": 1.5657, "step": 476 }, { "epoch": 1.1236749116607774, "grad_norm": 0.671038031578064, "learning_rate": 4.765769467591625e-05, "loss": 1.7805, "step": 477 }, { "epoch": 1.1260306242638398, "grad_norm": 0.6200709342956543, "learning_rate": 4.7640267554885085e-05, "loss": 1.5141, "step": 478 }, { "epoch": 1.1283863368669023, "grad_norm": 0.7369521260261536, "learning_rate": 4.762277905419633e-05, "loss": 1.8404, "step": 479 }, { "epoch": 1.1307420494699647, "grad_norm": 0.5494653582572937, "learning_rate": 4.760522922126276e-05, "loss": 1.605, "step": 480 }, { "epoch": 1.1330977620730271, "grad_norm": 0.5984039306640625, "learning_rate": 4.7587618103663444e-05, "loss": 1.5733, "step": 481 }, { "epoch": 1.1354534746760896, "grad_norm": 0.5719306468963623, "learning_rate": 4.756994574914359e-05, "loss": 1.5968, "step": 482 }, { "epoch": 1.137809187279152, "grad_norm": 0.6263169050216675, "learning_rate": 4.755221220561443e-05, "loss": 1.6337, "step": 483 }, { "epoch": 1.1401648998822145, "grad_norm": 0.717789351940155, "learning_rate": 4.7534417521153084e-05, "loss": 1.5826, "step": 484 }, { "epoch": 1.142520612485277, "grad_norm": 0.9273025393486023, "learning_rate": 4.751656174400243e-05, "loss": 1.5297, "step": 485 }, { "epoch": 1.1448763250883391, "grad_norm": 0.5356437563896179, "learning_rate": 4.7498644922570966e-05, "loss": 1.5151, "step": 486 }, { "epoch": 1.1472320376914016, "grad_norm": 0.5246230363845825, "learning_rate": 4.74806671054327e-05, "loss": 1.5776, "step": 487 }, { "epoch": 1.149587750294464, "grad_norm": 0.5235253572463989, "learning_rate": 4.7462628341326995e-05, "loss": 1.5879, "step": 488 }, { "epoch": 1.1519434628975265, "grad_norm": 7.307822227478027, "learning_rate": 4.744452867915844e-05, "loss": 1.4838, "step": 489 }, { "epoch": 1.154299175500589, "grad_norm": 0.5713460445404053, "learning_rate": 4.742636816799675e-05, "loss": 1.4949, "step": 490 }, { "epoch": 1.1566548881036514, "grad_norm": 0.58955317735672, "learning_rate": 4.7408146857076566e-05, "loss": 1.5045, "step": 491 }, { "epoch": 1.1590106007067138, "grad_norm": 0.6120131015777588, "learning_rate": 4.73898647957974e-05, "loss": 1.8877, "step": 492 }, { "epoch": 1.1613663133097762, "grad_norm": 0.5710542798042297, "learning_rate": 4.737152203372345e-05, "loss": 1.4437, "step": 493 }, { "epoch": 1.1637220259128387, "grad_norm": 0.5266141891479492, "learning_rate": 4.735311862058346e-05, "loss": 1.4414, "step": 494 }, { "epoch": 1.1660777385159011, "grad_norm": 0.6219840049743652, "learning_rate": 4.733465460627064e-05, "loss": 1.6103, "step": 495 }, { "epoch": 1.1684334511189636, "grad_norm": 0.9414778351783752, "learning_rate": 4.7316130040842466e-05, "loss": 1.5969, "step": 496 }, { "epoch": 1.1707891637220258, "grad_norm": 0.6036843061447144, "learning_rate": 4.7297544974520594e-05, "loss": 1.5457, "step": 497 }, { "epoch": 1.1731448763250882, "grad_norm": 0.573947548866272, "learning_rate": 4.727889945769067e-05, "loss": 1.6854, "step": 498 }, { "epoch": 1.1755005889281507, "grad_norm": 0.6203517317771912, "learning_rate": 4.726019354090226e-05, "loss": 1.8086, "step": 499 }, { "epoch": 1.1778563015312131, "grad_norm": 0.5064033269882202, "learning_rate": 4.724142727486869e-05, "loss": 1.5492, "step": 500 }, { "epoch": 1.1802120141342756, "grad_norm": 0.5605738162994385, "learning_rate": 4.722260071046683e-05, "loss": 1.5832, "step": 501 }, { "epoch": 1.182567726737338, "grad_norm": 0.7284584045410156, "learning_rate": 4.720371389873711e-05, "loss": 1.6627, "step": 502 }, { "epoch": 1.1849234393404005, "grad_norm": 1.267369270324707, "learning_rate": 4.718476689088325e-05, "loss": 1.6019, "step": 503 }, { "epoch": 1.187279151943463, "grad_norm": 0.5752831697463989, "learning_rate": 4.7165759738272144e-05, "loss": 1.6655, "step": 504 }, { "epoch": 1.1896348645465253, "grad_norm": 2.5802974700927734, "learning_rate": 4.71466924924338e-05, "loss": 1.647, "step": 505 }, { "epoch": 1.1919905771495878, "grad_norm": 0.714806079864502, "learning_rate": 4.7127565205061096e-05, "loss": 1.4605, "step": 506 }, { "epoch": 1.1943462897526502, "grad_norm": 0.5411986708641052, "learning_rate": 4.7108377928009725e-05, "loss": 1.6032, "step": 507 }, { "epoch": 1.1967020023557127, "grad_norm": 0.5166033506393433, "learning_rate": 4.708913071329798e-05, "loss": 1.4109, "step": 508 }, { "epoch": 1.1990577149587751, "grad_norm": 0.6639083623886108, "learning_rate": 4.706982361310669e-05, "loss": 1.7107, "step": 509 }, { "epoch": 1.2014134275618376, "grad_norm": 0.5835407376289368, "learning_rate": 4.7050456679779e-05, "loss": 1.7149, "step": 510 }, { "epoch": 1.2037691401649, "grad_norm": 0.5714329481124878, "learning_rate": 4.703102996582028e-05, "loss": 1.6091, "step": 511 }, { "epoch": 1.2061248527679622, "grad_norm": 0.5299686789512634, "learning_rate": 4.7011543523897996e-05, "loss": 1.6577, "step": 512 }, { "epoch": 1.2084805653710247, "grad_norm": 0.5636796951293945, "learning_rate": 4.6991997406841504e-05, "loss": 1.6393, "step": 513 }, { "epoch": 1.210836277974087, "grad_norm": 0.5699754953384399, "learning_rate": 4.6972391667641974e-05, "loss": 1.6494, "step": 514 }, { "epoch": 1.2131919905771495, "grad_norm": 0.6111688017845154, "learning_rate": 4.6952726359452193e-05, "loss": 1.49, "step": 515 }, { "epoch": 1.215547703180212, "grad_norm": 0.6137304306030273, "learning_rate": 4.693300153558646e-05, "loss": 1.7358, "step": 516 }, { "epoch": 1.2179034157832744, "grad_norm": 2.0160441398620605, "learning_rate": 4.691321724952041e-05, "loss": 1.4985, "step": 517 }, { "epoch": 1.2202591283863369, "grad_norm": 0.5996425747871399, "learning_rate": 4.6893373554890925e-05, "loss": 1.5552, "step": 518 }, { "epoch": 1.2226148409893993, "grad_norm": 1.5899049043655396, "learning_rate": 4.687347050549589e-05, "loss": 1.5698, "step": 519 }, { "epoch": 1.2249705535924618, "grad_norm": 0.6388573050498962, "learning_rate": 4.685350815529414e-05, "loss": 1.4764, "step": 520 }, { "epoch": 1.2273262661955242, "grad_norm": 0.6261616349220276, "learning_rate": 4.683348655840529e-05, "loss": 1.7065, "step": 521 }, { "epoch": 1.2296819787985867, "grad_norm": 0.5283573269844055, "learning_rate": 4.681340576910955e-05, "loss": 1.5214, "step": 522 }, { "epoch": 1.232037691401649, "grad_norm": 0.6212832927703857, "learning_rate": 4.6793265841847624e-05, "loss": 1.7627, "step": 523 }, { "epoch": 1.2343934040047113, "grad_norm": 1.5392024517059326, "learning_rate": 4.677306683122054e-05, "loss": 1.5665, "step": 524 }, { "epoch": 1.2367491166077738, "grad_norm": 0.6339794993400574, "learning_rate": 4.6752808791989486e-05, "loss": 1.5165, "step": 525 }, { "epoch": 1.2391048292108362, "grad_norm": 0.5163040161132812, "learning_rate": 4.673249177907571e-05, "loss": 1.4214, "step": 526 }, { "epoch": 1.2414605418138986, "grad_norm": 3.1503663063049316, "learning_rate": 4.6712115847560355e-05, "loss": 1.6607, "step": 527 }, { "epoch": 1.243816254416961, "grad_norm": 0.5808469653129578, "learning_rate": 4.669168105268424e-05, "loss": 1.6704, "step": 528 }, { "epoch": 1.2461719670200235, "grad_norm": 0.5449988842010498, "learning_rate": 4.667118744984783e-05, "loss": 1.5734, "step": 529 }, { "epoch": 1.248527679623086, "grad_norm": 0.7108696699142456, "learning_rate": 4.665063509461097e-05, "loss": 1.6862, "step": 530 }, { "epoch": 1.2508833922261484, "grad_norm": 0.8704025149345398, "learning_rate": 4.663002404269283e-05, "loss": 1.4091, "step": 531 }, { "epoch": 1.2532391048292109, "grad_norm": 0.7941853404045105, "learning_rate": 4.660935434997168e-05, "loss": 1.4742, "step": 532 }, { "epoch": 1.2555948174322733, "grad_norm": 0.8258230686187744, "learning_rate": 4.65886260724848e-05, "loss": 1.4205, "step": 533 }, { "epoch": 1.2579505300353357, "grad_norm": 2.276745557785034, "learning_rate": 4.6567839266428276e-05, "loss": 1.4826, "step": 534 }, { "epoch": 1.2603062426383982, "grad_norm": 0.6036691665649414, "learning_rate": 4.654699398815687e-05, "loss": 1.482, "step": 535 }, { "epoch": 1.2626619552414606, "grad_norm": 0.6713625192642212, "learning_rate": 4.652609029418389e-05, "loss": 1.6389, "step": 536 }, { "epoch": 1.265017667844523, "grad_norm": 0.5230117440223694, "learning_rate": 4.650512824118098e-05, "loss": 1.5817, "step": 537 }, { "epoch": 1.2673733804475855, "grad_norm": 0.5565871000289917, "learning_rate": 4.6484107885978035e-05, "loss": 1.4823, "step": 538 }, { "epoch": 1.2697290930506477, "grad_norm": 0.6699042320251465, "learning_rate": 4.6463029285562974e-05, "loss": 1.5508, "step": 539 }, { "epoch": 1.2720848056537102, "grad_norm": 0.7892242074012756, "learning_rate": 4.644189249708166e-05, "loss": 1.597, "step": 540 }, { "epoch": 1.2744405182567726, "grad_norm": 0.8091634511947632, "learning_rate": 4.642069757783769e-05, "loss": 1.5464, "step": 541 }, { "epoch": 1.276796230859835, "grad_norm": 0.6346912384033203, "learning_rate": 4.639944458529226e-05, "loss": 1.6847, "step": 542 }, { "epoch": 1.2791519434628975, "grad_norm": 0.7518364191055298, "learning_rate": 4.637813357706401e-05, "loss": 1.7844, "step": 543 }, { "epoch": 1.28150765606596, "grad_norm": 0.5622678399085999, "learning_rate": 4.635676461092886e-05, "loss": 1.511, "step": 544 }, { "epoch": 1.2838633686690224, "grad_norm": 0.5590486526489258, "learning_rate": 4.6335337744819873e-05, "loss": 1.7096, "step": 545 }, { "epoch": 1.2862190812720848, "grad_norm": 0.6024837493896484, "learning_rate": 4.6313853036827057e-05, "loss": 1.5987, "step": 546 }, { "epoch": 1.2885747938751473, "grad_norm": 0.854170560836792, "learning_rate": 4.629231054519727e-05, "loss": 1.5444, "step": 547 }, { "epoch": 1.2909305064782097, "grad_norm": 0.7178241610527039, "learning_rate": 4.6270710328334004e-05, "loss": 1.6703, "step": 548 }, { "epoch": 1.293286219081272, "grad_norm": 0.5701743960380554, "learning_rate": 4.624905244479724e-05, "loss": 1.5584, "step": 549 }, { "epoch": 1.2956419316843344, "grad_norm": 4.711905479431152, "learning_rate": 4.622733695330332e-05, "loss": 1.418, "step": 550 }, { "epoch": 1.2979976442873968, "grad_norm": 0.6278677582740784, "learning_rate": 4.620556391272476e-05, "loss": 1.7839, "step": 551 }, { "epoch": 1.3003533568904593, "grad_norm": 0.6536201238632202, "learning_rate": 4.618373338209008e-05, "loss": 1.7141, "step": 552 }, { "epoch": 1.3027090694935217, "grad_norm": 0.6233187317848206, "learning_rate": 4.6161845420583685e-05, "loss": 1.6441, "step": 553 }, { "epoch": 1.3050647820965842, "grad_norm": 0.5657503604888916, "learning_rate": 4.613990008754565e-05, "loss": 1.37, "step": 554 }, { "epoch": 1.3074204946996466, "grad_norm": 0.6036571264266968, "learning_rate": 4.6117897442471634e-05, "loss": 1.7032, "step": 555 }, { "epoch": 1.309776207302709, "grad_norm": 0.5349748730659485, "learning_rate": 4.609583754501263e-05, "loss": 1.5181, "step": 556 }, { "epoch": 1.3121319199057715, "grad_norm": 0.6233786940574646, "learning_rate": 4.6073720454974864e-05, "loss": 1.6957, "step": 557 }, { "epoch": 1.314487632508834, "grad_norm": 0.5817857980728149, "learning_rate": 4.605154623231962e-05, "loss": 1.5303, "step": 558 }, { "epoch": 1.3168433451118964, "grad_norm": 1.8294174671173096, "learning_rate": 4.602931493716307e-05, "loss": 1.5857, "step": 559 }, { "epoch": 1.3191990577149588, "grad_norm": 0.6367030739784241, "learning_rate": 4.6007026629776104e-05, "loss": 1.6078, "step": 560 }, { "epoch": 1.3215547703180213, "grad_norm": 0.6138246059417725, "learning_rate": 4.59846813705842e-05, "loss": 1.4985, "step": 561 }, { "epoch": 1.3239104829210837, "grad_norm": 0.9630900025367737, "learning_rate": 4.5962279220167215e-05, "loss": 1.4876, "step": 562 }, { "epoch": 1.3262661955241462, "grad_norm": 0.578799843788147, "learning_rate": 4.593982023925926e-05, "loss": 1.7004, "step": 563 }, { "epoch": 1.3286219081272086, "grad_norm": 2.0147721767425537, "learning_rate": 4.5917304488748486e-05, "loss": 1.5643, "step": 564 }, { "epoch": 1.330977620730271, "grad_norm": 0.604483425617218, "learning_rate": 4.589473202967699e-05, "loss": 1.4088, "step": 565 }, { "epoch": 1.3333333333333333, "grad_norm": 0.5512965321540833, "learning_rate": 4.587210292324061e-05, "loss": 1.704, "step": 566 }, { "epoch": 1.3356890459363957, "grad_norm": 0.6815775632858276, "learning_rate": 4.584941723078872e-05, "loss": 1.756, "step": 567 }, { "epoch": 1.3380447585394581, "grad_norm": 0.602510929107666, "learning_rate": 4.582667501382414e-05, "loss": 1.5928, "step": 568 }, { "epoch": 1.3404004711425206, "grad_norm": 1.1094412803649902, "learning_rate": 4.580387633400292e-05, "loss": 1.691, "step": 569 }, { "epoch": 1.342756183745583, "grad_norm": 0.89102703332901, "learning_rate": 4.578102125313418e-05, "loss": 1.5018, "step": 570 }, { "epoch": 1.3451118963486455, "grad_norm": 0.5697596669197083, "learning_rate": 4.5758109833179963e-05, "loss": 1.5595, "step": 571 }, { "epoch": 1.347467608951708, "grad_norm": 0.6775079965591431, "learning_rate": 4.573514213625505e-05, "loss": 1.5713, "step": 572 }, { "epoch": 1.3498233215547704, "grad_norm": 1.1441680192947388, "learning_rate": 4.571211822462676e-05, "loss": 1.5885, "step": 573 }, { "epoch": 1.3521790341578328, "grad_norm": 0.7743953466415405, "learning_rate": 4.568903816071488e-05, "loss": 1.512, "step": 574 }, { "epoch": 1.3545347467608952, "grad_norm": 1.2860732078552246, "learning_rate": 4.566590200709136e-05, "loss": 1.7519, "step": 575 }, { "epoch": 1.3568904593639575, "grad_norm": 0.6444522738456726, "learning_rate": 4.5642709826480256e-05, "loss": 1.5832, "step": 576 }, { "epoch": 1.35924617196702, "grad_norm": 0.9091301560401917, "learning_rate": 4.561946168175751e-05, "loss": 1.7499, "step": 577 }, { "epoch": 1.3616018845700824, "grad_norm": 0.5483787655830383, "learning_rate": 4.559615763595079e-05, "loss": 1.5626, "step": 578 }, { "epoch": 1.3639575971731448, "grad_norm": 0.6238767504692078, "learning_rate": 4.557279775223931e-05, "loss": 1.5303, "step": 579 }, { "epoch": 1.3663133097762072, "grad_norm": 0.64593106508255, "learning_rate": 4.5549382093953666e-05, "loss": 1.7177, "step": 580 }, { "epoch": 1.3686690223792697, "grad_norm": 0.8642447590827942, "learning_rate": 4.552591072457565e-05, "loss": 1.4648, "step": 581 }, { "epoch": 1.3710247349823321, "grad_norm": 0.5802530646324158, "learning_rate": 4.550238370773813e-05, "loss": 1.6493, "step": 582 }, { "epoch": 1.3733804475853946, "grad_norm": 0.5861689448356628, "learning_rate": 4.54788011072248e-05, "loss": 1.5572, "step": 583 }, { "epoch": 1.375736160188457, "grad_norm": 0.5160636305809021, "learning_rate": 4.545516298697006e-05, "loss": 1.4776, "step": 584 }, { "epoch": 1.3780918727915195, "grad_norm": 0.604594886302948, "learning_rate": 4.5431469411058844e-05, "loss": 1.6782, "step": 585 }, { "epoch": 1.380447585394582, "grad_norm": 0.6128488779067993, "learning_rate": 4.54077204437264e-05, "loss": 1.7475, "step": 586 }, { "epoch": 1.3828032979976443, "grad_norm": 0.6195985078811646, "learning_rate": 4.5383916149358175e-05, "loss": 1.5037, "step": 587 }, { "epoch": 1.3851590106007068, "grad_norm": 0.5518015623092651, "learning_rate": 4.53600565924896e-05, "loss": 1.6232, "step": 588 }, { "epoch": 1.3875147232037692, "grad_norm": 1.1977450847625732, "learning_rate": 4.533614183780591e-05, "loss": 1.5502, "step": 589 }, { "epoch": 1.3898704358068317, "grad_norm": 0.9441741704940796, "learning_rate": 4.5312171950142034e-05, "loss": 1.7557, "step": 590 }, { "epoch": 1.3922261484098941, "grad_norm": 0.5428605675697327, "learning_rate": 4.528814699448232e-05, "loss": 1.4736, "step": 591 }, { "epoch": 1.3945818610129563, "grad_norm": 0.9040888547897339, "learning_rate": 4.5264067035960434e-05, "loss": 1.3837, "step": 592 }, { "epoch": 1.3969375736160188, "grad_norm": 0.6128194332122803, "learning_rate": 4.5239932139859154e-05, "loss": 1.5633, "step": 593 }, { "epoch": 1.3992932862190812, "grad_norm": 0.5833112001419067, "learning_rate": 4.521574237161021e-05, "loss": 1.566, "step": 594 }, { "epoch": 1.4016489988221437, "grad_norm": 0.6433863639831543, "learning_rate": 4.519149779679408e-05, "loss": 1.7261, "step": 595 }, { "epoch": 1.404004711425206, "grad_norm": 0.585862398147583, "learning_rate": 4.5167198481139825e-05, "loss": 1.5548, "step": 596 }, { "epoch": 1.4063604240282686, "grad_norm": 1.220085859298706, "learning_rate": 4.5142844490524936e-05, "loss": 1.4378, "step": 597 }, { "epoch": 1.408716136631331, "grad_norm": 0.6128588914871216, "learning_rate": 4.5118435890975106e-05, "loss": 1.5792, "step": 598 }, { "epoch": 1.4110718492343934, "grad_norm": 0.6133366227149963, "learning_rate": 4.509397274866409e-05, "loss": 1.5349, "step": 599 }, { "epoch": 1.4134275618374559, "grad_norm": 0.6057227253913879, "learning_rate": 4.506945512991352e-05, "loss": 1.6515, "step": 600 }, { "epoch": 1.4157832744405183, "grad_norm": 1.5638344287872314, "learning_rate": 4.5044883101192695e-05, "loss": 1.5664, "step": 601 }, { "epoch": 1.4181389870435805, "grad_norm": 0.6143893003463745, "learning_rate": 4.5020256729118446e-05, "loss": 1.6203, "step": 602 }, { "epoch": 1.420494699646643, "grad_norm": 0.6892445087432861, "learning_rate": 4.4995576080454924e-05, "loss": 1.6468, "step": 603 }, { "epoch": 1.4228504122497054, "grad_norm": 0.5601422190666199, "learning_rate": 4.497084122211344e-05, "loss": 1.4772, "step": 604 }, { "epoch": 1.4252061248527679, "grad_norm": 0.5502814650535583, "learning_rate": 4.494605222115225e-05, "loss": 1.378, "step": 605 }, { "epoch": 1.4275618374558303, "grad_norm": 2.3836569786071777, "learning_rate": 4.4921209144776414e-05, "loss": 1.6316, "step": 606 }, { "epoch": 1.4299175500588928, "grad_norm": 0.6598193645477295, "learning_rate": 4.489631206033758e-05, "loss": 1.5854, "step": 607 }, { "epoch": 1.4322732626619552, "grad_norm": 0.6107590198516846, "learning_rate": 4.4871361035333836e-05, "loss": 1.5336, "step": 608 }, { "epoch": 1.4346289752650176, "grad_norm": 0.5310948491096497, "learning_rate": 4.484635613740949e-05, "loss": 1.3908, "step": 609 }, { "epoch": 1.43698468786808, "grad_norm": 0.5438498258590698, "learning_rate": 4.482129743435491e-05, "loss": 1.6603, "step": 610 }, { "epoch": 1.4393404004711425, "grad_norm": 0.5578397512435913, "learning_rate": 4.479618499410634e-05, "loss": 1.5692, "step": 611 }, { "epoch": 1.441696113074205, "grad_norm": 0.560607373714447, "learning_rate": 4.4771018884745705e-05, "loss": 1.4998, "step": 612 }, { "epoch": 1.4440518256772674, "grad_norm": 0.5718947649002075, "learning_rate": 4.4745799174500414e-05, "loss": 1.5148, "step": 613 }, { "epoch": 1.4464075382803299, "grad_norm": 0.6231041550636292, "learning_rate": 4.472052593174323e-05, "loss": 1.7142, "step": 614 }, { "epoch": 1.4487632508833923, "grad_norm": 0.6304630041122437, "learning_rate": 4.469519922499202e-05, "loss": 1.6695, "step": 615 }, { "epoch": 1.4511189634864547, "grad_norm": 0.7161644697189331, "learning_rate": 4.466981912290959e-05, "loss": 1.561, "step": 616 }, { "epoch": 1.4534746760895172, "grad_norm": 0.5755468606948853, "learning_rate": 4.464438569430354e-05, "loss": 1.6033, "step": 617 }, { "epoch": 1.4558303886925796, "grad_norm": 0.5369083285331726, "learning_rate": 4.4618899008126005e-05, "loss": 1.6197, "step": 618 }, { "epoch": 1.4581861012956419, "grad_norm": 0.8576413989067078, "learning_rate": 4.459335913347352e-05, "loss": 1.5726, "step": 619 }, { "epoch": 1.4605418138987043, "grad_norm": 0.6040985584259033, "learning_rate": 4.456776613958683e-05, "loss": 1.5603, "step": 620 }, { "epoch": 1.4628975265017667, "grad_norm": 0.5894367694854736, "learning_rate": 4.454212009585068e-05, "loss": 1.5901, "step": 621 }, { "epoch": 1.4652532391048292, "grad_norm": 0.5494040250778198, "learning_rate": 4.4516421071793635e-05, "loss": 1.3336, "step": 622 }, { "epoch": 1.4676089517078916, "grad_norm": 1.308799147605896, "learning_rate": 4.449066913708789e-05, "loss": 1.4495, "step": 623 }, { "epoch": 1.469964664310954, "grad_norm": 0.6530017852783203, "learning_rate": 4.4464864361549105e-05, "loss": 1.6116, "step": 624 }, { "epoch": 1.4723203769140165, "grad_norm": 1.4411321878433228, "learning_rate": 4.443900681513617e-05, "loss": 1.4905, "step": 625 }, { "epoch": 1.474676089517079, "grad_norm": 0.6518925428390503, "learning_rate": 4.441309656795106e-05, "loss": 1.7084, "step": 626 }, { "epoch": 1.4770318021201414, "grad_norm": 0.6084282994270325, "learning_rate": 4.438713369023863e-05, "loss": 1.543, "step": 627 }, { "epoch": 1.4793875147232038, "grad_norm": 0.5194880366325378, "learning_rate": 4.43611182523864e-05, "loss": 1.3939, "step": 628 }, { "epoch": 1.481743227326266, "grad_norm": 0.588249921798706, "learning_rate": 4.43350503249244e-05, "loss": 1.5596, "step": 629 }, { "epoch": 1.4840989399293285, "grad_norm": 0.5514758229255676, "learning_rate": 4.430892997852496e-05, "loss": 1.5111, "step": 630 }, { "epoch": 1.486454652532391, "grad_norm": 0.9755181670188904, "learning_rate": 4.4282757284002515e-05, "loss": 1.4632, "step": 631 }, { "epoch": 1.4888103651354534, "grad_norm": 0.5885037779808044, "learning_rate": 4.425653231231344e-05, "loss": 1.464, "step": 632 }, { "epoch": 1.4911660777385158, "grad_norm": 0.5441231727600098, "learning_rate": 4.423025513455582e-05, "loss": 1.4128, "step": 633 }, { "epoch": 1.4935217903415783, "grad_norm": 1.0909383296966553, "learning_rate": 4.4203925821969283e-05, "loss": 1.4348, "step": 634 }, { "epoch": 1.4958775029446407, "grad_norm": 0.5905348658561707, "learning_rate": 4.417754444593478e-05, "loss": 1.4694, "step": 635 }, { "epoch": 1.4982332155477032, "grad_norm": 0.5977327227592468, "learning_rate": 4.415111107797445e-05, "loss": 1.5373, "step": 636 }, { "epoch": 1.4982332155477032, "eval_loss": 1.4488883018493652, "eval_runtime": 5.7669, "eval_samples_per_second": 433.512, "eval_steps_per_second": 6.936, "step": 636 }, { "epoch": 1.5005889281507656, "grad_norm": 0.9330520033836365, "learning_rate": 4.412462578975135e-05, "loss": 1.6137, "step": 637 }, { "epoch": 1.502944640753828, "grad_norm": 0.6448716521263123, "learning_rate": 4.409808865306931e-05, "loss": 1.5051, "step": 638 }, { "epoch": 1.5053003533568905, "grad_norm": 0.8892787098884583, "learning_rate": 4.407149973987273e-05, "loss": 1.4498, "step": 639 }, { "epoch": 1.507656065959953, "grad_norm": 0.5800485610961914, "learning_rate": 4.4044859122246374e-05, "loss": 1.6368, "step": 640 }, { "epoch": 1.5100117785630154, "grad_norm": 0.635712206363678, "learning_rate": 4.4018166872415176e-05, "loss": 1.4683, "step": 641 }, { "epoch": 1.5123674911660778, "grad_norm": 0.6780152916908264, "learning_rate": 4.399142306274408e-05, "loss": 1.6479, "step": 642 }, { "epoch": 1.5147232037691403, "grad_norm": 0.5039349794387817, "learning_rate": 4.396462776573775e-05, "loss": 1.5217, "step": 643 }, { "epoch": 1.5170789163722027, "grad_norm": 1.1504089832305908, "learning_rate": 4.3937781054040505e-05, "loss": 1.5491, "step": 644 }, { "epoch": 1.5194346289752652, "grad_norm": 0.6189066767692566, "learning_rate": 4.391088300043602e-05, "loss": 1.3452, "step": 645 }, { "epoch": 1.5217903415783276, "grad_norm": 0.7755336761474609, "learning_rate": 4.3883933677847154e-05, "loss": 1.582, "step": 646 }, { "epoch": 1.5241460541813898, "grad_norm": 0.6041241884231567, "learning_rate": 4.385693315933579e-05, "loss": 1.5321, "step": 647 }, { "epoch": 1.5265017667844523, "grad_norm": 0.9410498738288879, "learning_rate": 4.3829881518102576e-05, "loss": 1.6669, "step": 648 }, { "epoch": 1.5288574793875147, "grad_norm": 0.5859458446502686, "learning_rate": 4.380277882748679e-05, "loss": 1.4114, "step": 649 }, { "epoch": 1.5312131919905771, "grad_norm": 1.025149941444397, "learning_rate": 4.377562516096607e-05, "loss": 1.439, "step": 650 }, { "epoch": 1.5335689045936396, "grad_norm": 0.5894811153411865, "learning_rate": 4.374842059215629e-05, "loss": 1.5314, "step": 651 }, { "epoch": 1.535924617196702, "grad_norm": 0.5160365700721741, "learning_rate": 4.372116519481131e-05, "loss": 1.4619, "step": 652 }, { "epoch": 1.5382803297997645, "grad_norm": 0.6037544012069702, "learning_rate": 4.3693859042822774e-05, "loss": 1.5557, "step": 653 }, { "epoch": 1.5406360424028267, "grad_norm": 1.2457871437072754, "learning_rate": 4.366650221021996e-05, "loss": 1.5509, "step": 654 }, { "epoch": 1.5429917550058891, "grad_norm": 0.5875025391578674, "learning_rate": 4.363909477116952e-05, "loss": 1.5672, "step": 655 }, { "epoch": 1.5453474676089516, "grad_norm": 1.091813087463379, "learning_rate": 4.361163679997532e-05, "loss": 1.4952, "step": 656 }, { "epoch": 1.547703180212014, "grad_norm": 1.0470727682113647, "learning_rate": 4.35841283710782e-05, "loss": 1.5503, "step": 657 }, { "epoch": 1.5500588928150765, "grad_norm": 0.5391929149627686, "learning_rate": 4.3556569559055824e-05, "loss": 1.4124, "step": 658 }, { "epoch": 1.552414605418139, "grad_norm": 0.5521981120109558, "learning_rate": 4.3528960438622425e-05, "loss": 1.41, "step": 659 }, { "epoch": 1.5547703180212014, "grad_norm": 0.5705544948577881, "learning_rate": 4.350130108462864e-05, "loss": 1.5562, "step": 660 }, { "epoch": 1.5571260306242638, "grad_norm": 1.2580606937408447, "learning_rate": 4.34735915720613e-05, "loss": 1.5, "step": 661 }, { "epoch": 1.5594817432273262, "grad_norm": 0.6015611290931702, "learning_rate": 4.344583197604318e-05, "loss": 1.621, "step": 662 }, { "epoch": 1.5618374558303887, "grad_norm": 0.6082919239997864, "learning_rate": 4.34180223718329e-05, "loss": 1.6502, "step": 663 }, { "epoch": 1.5641931684334511, "grad_norm": 1.0463343858718872, "learning_rate": 4.33901628348246e-05, "loss": 1.5281, "step": 664 }, { "epoch": 1.5665488810365136, "grad_norm": 2.4668612480163574, "learning_rate": 4.336225344054782e-05, "loss": 1.5621, "step": 665 }, { "epoch": 1.568904593639576, "grad_norm": 0.5991259217262268, "learning_rate": 4.3334294264667255e-05, "loss": 1.527, "step": 666 }, { "epoch": 1.5712603062426385, "grad_norm": 0.6542274951934814, "learning_rate": 4.330628538298257e-05, "loss": 1.6438, "step": 667 }, { "epoch": 1.573616018845701, "grad_norm": 16.451032638549805, "learning_rate": 4.327822687142819e-05, "loss": 1.6133, "step": 668 }, { "epoch": 1.5759717314487633, "grad_norm": 0.5953258275985718, "learning_rate": 4.3250118806073074e-05, "loss": 1.6806, "step": 669 }, { "epoch": 1.5783274440518258, "grad_norm": 0.5378175973892212, "learning_rate": 4.322196126312055e-05, "loss": 1.497, "step": 670 }, { "epoch": 1.5806831566548882, "grad_norm": 0.9835163354873657, "learning_rate": 4.319375431890806e-05, "loss": 1.7271, "step": 671 }, { "epoch": 1.5830388692579507, "grad_norm": 0.574870765209198, "learning_rate": 4.316549804990699e-05, "loss": 1.6969, "step": 672 }, { "epoch": 1.5853945818610131, "grad_norm": 0.6998090147972107, "learning_rate": 4.313719253272246e-05, "loss": 1.4155, "step": 673 }, { "epoch": 1.5877502944640753, "grad_norm": 0.5895383358001709, "learning_rate": 4.310883784409306e-05, "loss": 1.3682, "step": 674 }, { "epoch": 1.5901060070671378, "grad_norm": 0.5245603919029236, "learning_rate": 4.308043406089076e-05, "loss": 1.5637, "step": 675 }, { "epoch": 1.5924617196702002, "grad_norm": 0.5971135497093201, "learning_rate": 4.305198126012057e-05, "loss": 1.5197, "step": 676 }, { "epoch": 1.5948174322732627, "grad_norm": 0.6199404001235962, "learning_rate": 4.30234795189204e-05, "loss": 1.5721, "step": 677 }, { "epoch": 1.5971731448763251, "grad_norm": 0.5818317532539368, "learning_rate": 4.2994928914560874e-05, "loss": 1.4372, "step": 678 }, { "epoch": 1.5995288574793876, "grad_norm": 0.5436182022094727, "learning_rate": 4.296632952444505e-05, "loss": 1.4901, "step": 679 }, { "epoch": 1.6018845700824498, "grad_norm": 0.5043879747390747, "learning_rate": 4.293768142610828e-05, "loss": 1.3142, "step": 680 }, { "epoch": 1.6042402826855122, "grad_norm": 0.5604328513145447, "learning_rate": 4.290898469721795e-05, "loss": 1.4726, "step": 681 }, { "epoch": 1.6065959952885747, "grad_norm": 0.5853621959686279, "learning_rate": 4.288023941557327e-05, "loss": 1.5058, "step": 682 }, { "epoch": 1.608951707891637, "grad_norm": 0.5832139849662781, "learning_rate": 4.2851445659105126e-05, "loss": 1.5399, "step": 683 }, { "epoch": 1.6113074204946995, "grad_norm": 0.6122182607650757, "learning_rate": 4.282260350587579e-05, "loss": 1.617, "step": 684 }, { "epoch": 1.613663133097762, "grad_norm": 0.5452379584312439, "learning_rate": 4.279371303407875e-05, "loss": 1.4331, "step": 685 }, { "epoch": 1.6160188457008244, "grad_norm": 1.5208348035812378, "learning_rate": 4.2764774322038494e-05, "loss": 1.5408, "step": 686 }, { "epoch": 1.6183745583038869, "grad_norm": 0.5433605909347534, "learning_rate": 4.273578744821027e-05, "loss": 1.5024, "step": 687 }, { "epoch": 1.6207302709069493, "grad_norm": 0.6126108765602112, "learning_rate": 4.270675249117994e-05, "loss": 1.7176, "step": 688 }, { "epoch": 1.6230859835100118, "grad_norm": 0.5748864412307739, "learning_rate": 4.267766952966369e-05, "loss": 1.5598, "step": 689 }, { "epoch": 1.6254416961130742, "grad_norm": 4.64862060546875, "learning_rate": 4.264853864250786e-05, "loss": 1.4871, "step": 690 }, { "epoch": 1.6277974087161367, "grad_norm": 0.530872642993927, "learning_rate": 4.261935990868871e-05, "loss": 1.5121, "step": 691 }, { "epoch": 1.630153121319199, "grad_norm": 0.6162238121032715, "learning_rate": 4.259013340731224e-05, "loss": 1.6521, "step": 692 }, { "epoch": 1.6325088339222615, "grad_norm": 8.413911819458008, "learning_rate": 4.256085921761393e-05, "loss": 1.6814, "step": 693 }, { "epoch": 1.634864546525324, "grad_norm": 0.5340136289596558, "learning_rate": 4.2531537418958554e-05, "loss": 1.5279, "step": 694 }, { "epoch": 1.6372202591283864, "grad_norm": 1.4834232330322266, "learning_rate": 4.250216809083997e-05, "loss": 1.6013, "step": 695 }, { "epoch": 1.6395759717314489, "grad_norm": 0.5039743185043335, "learning_rate": 4.247275131288086e-05, "loss": 1.598, "step": 696 }, { "epoch": 1.6419316843345113, "grad_norm": 0.5958728194236755, "learning_rate": 4.24432871648326e-05, "loss": 1.6061, "step": 697 }, { "epoch": 1.6442873969375738, "grad_norm": 0.5715299248695374, "learning_rate": 4.241377572657493e-05, "loss": 1.4717, "step": 698 }, { "epoch": 1.6466431095406362, "grad_norm": 0.5306159853935242, "learning_rate": 4.238421707811583e-05, "loss": 1.6001, "step": 699 }, { "epoch": 1.6489988221436984, "grad_norm": 0.5035179853439331, "learning_rate": 4.235461129959127e-05, "loss": 1.4918, "step": 700 }, { "epoch": 1.6513545347467609, "grad_norm": 0.5966494679450989, "learning_rate": 4.2324958471265006e-05, "loss": 1.5774, "step": 701 }, { "epoch": 1.6537102473498233, "grad_norm": 0.5359866619110107, "learning_rate": 4.229525867352831e-05, "loss": 1.4379, "step": 702 }, { "epoch": 1.6560659599528857, "grad_norm": 0.634860098361969, "learning_rate": 4.226551198689982e-05, "loss": 1.5032, "step": 703 }, { "epoch": 1.6584216725559482, "grad_norm": 0.6145931482315063, "learning_rate": 4.22357184920253e-05, "loss": 1.4757, "step": 704 }, { "epoch": 1.6607773851590106, "grad_norm": 1.8401107788085938, "learning_rate": 4.22058782696774e-05, "loss": 1.4088, "step": 705 }, { "epoch": 1.663133097762073, "grad_norm": 3.347764015197754, "learning_rate": 4.217599140075546e-05, "loss": 1.5208, "step": 706 }, { "epoch": 1.6654888103651353, "grad_norm": 0.5742576718330383, "learning_rate": 4.214605796628527e-05, "loss": 1.5482, "step": 707 }, { "epoch": 1.6678445229681977, "grad_norm": 0.7064507603645325, "learning_rate": 4.211607804741887e-05, "loss": 1.4287, "step": 708 }, { "epoch": 1.6702002355712602, "grad_norm": 0.5619456171989441, "learning_rate": 4.2086051725434343e-05, "loss": 1.4293, "step": 709 }, { "epoch": 1.6725559481743226, "grad_norm": 0.5406466126441956, "learning_rate": 4.205597908173555e-05, "loss": 1.4289, "step": 710 }, { "epoch": 1.674911660777385, "grad_norm": 0.8446951508522034, "learning_rate": 4.202586019785194e-05, "loss": 1.4783, "step": 711 }, { "epoch": 1.6772673733804475, "grad_norm": 0.6178669929504395, "learning_rate": 4.1995695155438326e-05, "loss": 1.4901, "step": 712 }, { "epoch": 1.67962308598351, "grad_norm": 0.5987133383750916, "learning_rate": 4.196548403627465e-05, "loss": 1.6084, "step": 713 }, { "epoch": 1.6819787985865724, "grad_norm": 0.6318785548210144, "learning_rate": 4.19352269222658e-05, "loss": 1.6084, "step": 714 }, { "epoch": 1.6843345111896348, "grad_norm": 0.5897310376167297, "learning_rate": 4.190492389544132e-05, "loss": 1.4349, "step": 715 }, { "epoch": 1.6866902237926973, "grad_norm": 0.548944890499115, "learning_rate": 4.187457503795527e-05, "loss": 1.3795, "step": 716 }, { "epoch": 1.6890459363957597, "grad_norm": 0.5220634341239929, "learning_rate": 4.184418043208592e-05, "loss": 1.4882, "step": 717 }, { "epoch": 1.6914016489988222, "grad_norm": 0.6271385550498962, "learning_rate": 4.181374016023559e-05, "loss": 1.6125, "step": 718 }, { "epoch": 1.6937573616018846, "grad_norm": 0.785491943359375, "learning_rate": 4.178325430493041e-05, "loss": 1.4687, "step": 719 }, { "epoch": 1.696113074204947, "grad_norm": 0.5517330765724182, "learning_rate": 4.175272294882008e-05, "loss": 1.4441, "step": 720 }, { "epoch": 1.6984687868080095, "grad_norm": 0.7612289190292358, "learning_rate": 4.172214617467765e-05, "loss": 1.503, "step": 721 }, { "epoch": 1.700824499411072, "grad_norm": 0.6236852407455444, "learning_rate": 4.1691524065399324e-05, "loss": 1.5212, "step": 722 }, { "epoch": 1.7031802120141344, "grad_norm": 0.6527695059776306, "learning_rate": 4.166085670400418e-05, "loss": 1.5687, "step": 723 }, { "epoch": 1.7055359246171968, "grad_norm": 0.5139789581298828, "learning_rate": 4.163014417363401e-05, "loss": 1.2797, "step": 724 }, { "epoch": 1.7078916372202593, "grad_norm": 0.5866367220878601, "learning_rate": 4.159938655755306e-05, "loss": 1.5208, "step": 725 }, { "epoch": 1.7102473498233217, "grad_norm": 0.6322000026702881, "learning_rate": 4.156858393914779e-05, "loss": 1.6337, "step": 726 }, { "epoch": 1.712603062426384, "grad_norm": 0.5505334138870239, "learning_rate": 4.153773640192666e-05, "loss": 1.4034, "step": 727 }, { "epoch": 1.7149587750294464, "grad_norm": 0.5639030337333679, "learning_rate": 4.150684402951994e-05, "loss": 1.4841, "step": 728 }, { "epoch": 1.7173144876325088, "grad_norm": 0.6906667351722717, "learning_rate": 4.147590690567942e-05, "loss": 1.6114, "step": 729 }, { "epoch": 1.7196702002355713, "grad_norm": 0.618108332157135, "learning_rate": 4.144492511427823e-05, "loss": 1.4439, "step": 730 }, { "epoch": 1.7220259128386337, "grad_norm": 0.5913695096969604, "learning_rate": 4.1413898739310605e-05, "loss": 1.5651, "step": 731 }, { "epoch": 1.7243816254416962, "grad_norm": 0.7327756285667419, "learning_rate": 4.138282786489165e-05, "loss": 1.3561, "step": 732 }, { "epoch": 1.7267373380447584, "grad_norm": 0.6229339241981506, "learning_rate": 4.135171257525707e-05, "loss": 1.5371, "step": 733 }, { "epoch": 1.7290930506478208, "grad_norm": 0.7689246535301208, "learning_rate": 4.1320552954763044e-05, "loss": 1.6552, "step": 734 }, { "epoch": 1.7314487632508833, "grad_norm": 3.602565288543701, "learning_rate": 4.12893490878859e-05, "loss": 1.5533, "step": 735 }, { "epoch": 1.7338044758539457, "grad_norm": 0.5904092192649841, "learning_rate": 4.1258101059221914e-05, "loss": 1.5358, "step": 736 }, { "epoch": 1.7361601884570081, "grad_norm": 1.209659218788147, "learning_rate": 4.122680895348713e-05, "loss": 1.581, "step": 737 }, { "epoch": 1.7385159010600706, "grad_norm": 4.423306465148926, "learning_rate": 4.1195472855517045e-05, "loss": 1.4921, "step": 738 }, { "epoch": 1.740871613663133, "grad_norm": 0.5627479553222656, "learning_rate": 4.116409285026643e-05, "loss": 1.4694, "step": 739 }, { "epoch": 1.7432273262661955, "grad_norm": 0.547874391078949, "learning_rate": 4.1132669022809136e-05, "loss": 1.4124, "step": 740 }, { "epoch": 1.745583038869258, "grad_norm": 0.6694099307060242, "learning_rate": 4.110120145833775e-05, "loss": 1.4676, "step": 741 }, { "epoch": 1.7479387514723204, "grad_norm": 1.116496205329895, "learning_rate": 4.1069690242163484e-05, "loss": 1.573, "step": 742 }, { "epoch": 1.7502944640753828, "grad_norm": 0.5915555953979492, "learning_rate": 4.103813545971589e-05, "loss": 1.3715, "step": 743 }, { "epoch": 1.7526501766784452, "grad_norm": 0.5799093842506409, "learning_rate": 4.100653719654259e-05, "loss": 1.5195, "step": 744 }, { "epoch": 1.7550058892815077, "grad_norm": 0.6760711669921875, "learning_rate": 4.0974895538309156e-05, "loss": 1.4231, "step": 745 }, { "epoch": 1.7573616018845701, "grad_norm": 0.6966275572776794, "learning_rate": 4.094321057079874e-05, "loss": 1.4838, "step": 746 }, { "epoch": 1.7597173144876326, "grad_norm": 0.6444734334945679, "learning_rate": 4.0911482379911936e-05, "loss": 1.4843, "step": 747 }, { "epoch": 1.762073027090695, "grad_norm": 0.8908310532569885, "learning_rate": 4.0879711051666534e-05, "loss": 1.4717, "step": 748 }, { "epoch": 1.7644287396937575, "grad_norm": 0.6018040776252747, "learning_rate": 4.0847896672197264e-05, "loss": 1.4777, "step": 749 }, { "epoch": 1.76678445229682, "grad_norm": 0.821266233921051, "learning_rate": 4.081603932775556e-05, "loss": 1.6304, "step": 750 }, { "epoch": 1.7691401648998824, "grad_norm": 0.5755026340484619, "learning_rate": 4.078413910470934e-05, "loss": 1.5318, "step": 751 }, { "epoch": 1.7714958775029448, "grad_norm": 0.5675933361053467, "learning_rate": 4.075219608954278e-05, "loss": 1.4998, "step": 752 }, { "epoch": 1.773851590106007, "grad_norm": 0.6010406613349915, "learning_rate": 4.072021036885607e-05, "loss": 1.4696, "step": 753 }, { "epoch": 1.7762073027090695, "grad_norm": 0.5393832921981812, "learning_rate": 4.068818202936516e-05, "loss": 1.325, "step": 754 }, { "epoch": 1.778563015312132, "grad_norm": 0.5492481589317322, "learning_rate": 4.0656111157901567e-05, "loss": 1.566, "step": 755 }, { "epoch": 1.7809187279151943, "grad_norm": 0.5975539088249207, "learning_rate": 4.062399784141209e-05, "loss": 1.4788, "step": 756 }, { "epoch": 1.7832744405182568, "grad_norm": 0.5394216179847717, "learning_rate": 4.0591842166958625e-05, "loss": 1.5235, "step": 757 }, { "epoch": 1.7856301531213192, "grad_norm": 0.6430364847183228, "learning_rate": 4.05596442217179e-05, "loss": 1.6786, "step": 758 }, { "epoch": 1.7879858657243817, "grad_norm": 7.00200891494751, "learning_rate": 4.052740409298121e-05, "loss": 1.3485, "step": 759 }, { "epoch": 1.790341578327444, "grad_norm": 0.5530775785446167, "learning_rate": 4.049512186815427e-05, "loss": 1.5769, "step": 760 }, { "epoch": 1.7926972909305063, "grad_norm": 0.5714100003242493, "learning_rate": 4.046279763475687e-05, "loss": 1.681, "step": 761 }, { "epoch": 1.7950530035335688, "grad_norm": 0.6198472380638123, "learning_rate": 4.043043148042271e-05, "loss": 1.4271, "step": 762 }, { "epoch": 1.7974087161366312, "grad_norm": 0.5776901245117188, "learning_rate": 4.039802349289914e-05, "loss": 1.5036, "step": 763 }, { "epoch": 1.7997644287396937, "grad_norm": 1.593724250793457, "learning_rate": 4.036557376004694e-05, "loss": 1.5586, "step": 764 }, { "epoch": 1.802120141342756, "grad_norm": 3.518050193786621, "learning_rate": 4.033308236984002e-05, "loss": 1.3076, "step": 765 }, { "epoch": 1.8044758539458186, "grad_norm": 0.509986400604248, "learning_rate": 4.0300549410365276e-05, "loss": 1.4158, "step": 766 }, { "epoch": 1.806831566548881, "grad_norm": 0.5594547390937805, "learning_rate": 4.026797496982226e-05, "loss": 1.3821, "step": 767 }, { "epoch": 1.8091872791519434, "grad_norm": 0.5610556602478027, "learning_rate": 4.023535913652302e-05, "loss": 1.3929, "step": 768 }, { "epoch": 1.8115429917550059, "grad_norm": 0.61439448595047, "learning_rate": 4.02027019988918e-05, "loss": 1.6638, "step": 769 }, { "epoch": 1.8138987043580683, "grad_norm": 1.1248878240585327, "learning_rate": 4.017000364546484e-05, "loss": 1.5115, "step": 770 }, { "epoch": 1.8162544169611308, "grad_norm": 0.5938189029693604, "learning_rate": 4.013726416489009e-05, "loss": 1.6082, "step": 771 }, { "epoch": 1.8186101295641932, "grad_norm": 0.6432561874389648, "learning_rate": 4.0104483645927026e-05, "loss": 1.7007, "step": 772 }, { "epoch": 1.8209658421672557, "grad_norm": 6.758146286010742, "learning_rate": 4.0071662177446376e-05, "loss": 1.4661, "step": 773 }, { "epoch": 1.823321554770318, "grad_norm": 0.5475225448608398, "learning_rate": 4.003879984842989e-05, "loss": 1.4753, "step": 774 }, { "epoch": 1.8256772673733805, "grad_norm": 0.5922585725784302, "learning_rate": 4.0005896747970084e-05, "loss": 1.4358, "step": 775 }, { "epoch": 1.828032979976443, "grad_norm": 0.5608125925064087, "learning_rate": 3.9972952965270006e-05, "loss": 1.5565, "step": 776 }, { "epoch": 1.8303886925795054, "grad_norm": 0.7956491708755493, "learning_rate": 3.993996858964302e-05, "loss": 1.6069, "step": 777 }, { "epoch": 1.8327444051825679, "grad_norm": 0.5631522536277771, "learning_rate": 3.99069437105125e-05, "loss": 1.5515, "step": 778 }, { "epoch": 1.8351001177856303, "grad_norm": 1.6201059818267822, "learning_rate": 3.987387841741169e-05, "loss": 1.5568, "step": 779 }, { "epoch": 1.8374558303886925, "grad_norm": 0.8333932757377625, "learning_rate": 3.9840772799983326e-05, "loss": 1.576, "step": 780 }, { "epoch": 1.839811542991755, "grad_norm": 0.618141770362854, "learning_rate": 3.980762694797953e-05, "loss": 1.5273, "step": 781 }, { "epoch": 1.8421672555948174, "grad_norm": 0.6008347868919373, "learning_rate": 3.977444095126146e-05, "loss": 1.4754, "step": 782 }, { "epoch": 1.8445229681978799, "grad_norm": 3.44985032081604, "learning_rate": 3.974121489979914e-05, "loss": 1.3656, "step": 783 }, { "epoch": 1.8468786808009423, "grad_norm": 0.5248243808746338, "learning_rate": 3.9707948883671165e-05, "loss": 1.405, "step": 784 }, { "epoch": 1.8492343934040048, "grad_norm": 0.6414283514022827, "learning_rate": 3.967464299306448e-05, "loss": 1.7501, "step": 785 }, { "epoch": 1.851590106007067, "grad_norm": 0.6283320784568787, "learning_rate": 3.964129731827415e-05, "loss": 1.543, "step": 786 }, { "epoch": 1.8539458186101294, "grad_norm": 0.5882829427719116, "learning_rate": 3.9607911949703086e-05, "loss": 1.4518, "step": 787 }, { "epoch": 1.8563015312131919, "grad_norm": 0.614022433757782, "learning_rate": 3.95744869778618e-05, "loss": 1.4777, "step": 788 }, { "epoch": 1.8586572438162543, "grad_norm": 0.5827060341835022, "learning_rate": 3.95410224933682e-05, "loss": 1.5435, "step": 789 }, { "epoch": 1.8610129564193167, "grad_norm": 0.568570077419281, "learning_rate": 3.950751858694729e-05, "loss": 1.4826, "step": 790 }, { "epoch": 1.8633686690223792, "grad_norm": 0.5920986533164978, "learning_rate": 3.947397534943096e-05, "loss": 1.6193, "step": 791 }, { "epoch": 1.8657243816254416, "grad_norm": 0.6174743175506592, "learning_rate": 3.944039287175774e-05, "loss": 1.4973, "step": 792 }, { "epoch": 1.868080094228504, "grad_norm": 0.5543470978736877, "learning_rate": 3.940677124497252e-05, "loss": 1.3766, "step": 793 }, { "epoch": 1.8704358068315665, "grad_norm": 0.5162727236747742, "learning_rate": 3.937311056022634e-05, "loss": 1.498, "step": 794 }, { "epoch": 1.872791519434629, "grad_norm": 0.6495651602745056, "learning_rate": 3.933941090877615e-05, "loss": 1.4183, "step": 795 }, { "epoch": 1.8751472320376914, "grad_norm": 0.6888061165809631, "learning_rate": 3.930567238198451e-05, "loss": 1.5282, "step": 796 }, { "epoch": 1.8775029446407538, "grad_norm": 0.5843103528022766, "learning_rate": 3.927189507131938e-05, "loss": 1.3775, "step": 797 }, { "epoch": 1.8798586572438163, "grad_norm": 0.5971599221229553, "learning_rate": 3.923807906835388e-05, "loss": 1.5271, "step": 798 }, { "epoch": 1.8822143698468787, "grad_norm": 0.6848633885383606, "learning_rate": 3.9204224464766015e-05, "loss": 1.8208, "step": 799 }, { "epoch": 1.8845700824499412, "grad_norm": 0.8001655340194702, "learning_rate": 3.917033135233845e-05, "loss": 1.2937, "step": 800 }, { "epoch": 1.8869257950530036, "grad_norm": 0.6040273308753967, "learning_rate": 3.9136399822958235e-05, "loss": 1.4478, "step": 801 }, { "epoch": 1.889281507656066, "grad_norm": 0.6502052545547485, "learning_rate": 3.910242996861659e-05, "loss": 1.5013, "step": 802 }, { "epoch": 1.8916372202591285, "grad_norm": 1.2293061017990112, "learning_rate": 3.9068421881408645e-05, "loss": 1.5567, "step": 803 }, { "epoch": 1.893992932862191, "grad_norm": 0.6549687385559082, "learning_rate": 3.903437565353314e-05, "loss": 1.5537, "step": 804 }, { "epoch": 1.8963486454652534, "grad_norm": 0.6101454496383667, "learning_rate": 3.9000291377292255e-05, "loss": 1.4906, "step": 805 }, { "epoch": 1.8987043580683156, "grad_norm": 0.5808925628662109, "learning_rate": 3.896616914509131e-05, "loss": 1.3673, "step": 806 }, { "epoch": 1.901060070671378, "grad_norm": 0.5687472224235535, "learning_rate": 3.893200904943853e-05, "loss": 1.3645, "step": 807 }, { "epoch": 1.9034157832744405, "grad_norm": 0.5268921256065369, "learning_rate": 3.88978111829448e-05, "loss": 1.4469, "step": 808 }, { "epoch": 1.905771495877503, "grad_norm": 0.5087159276008606, "learning_rate": 3.886357563832338e-05, "loss": 1.3529, "step": 809 }, { "epoch": 1.9081272084805654, "grad_norm": 0.7952041029930115, "learning_rate": 3.88293025083897e-05, "loss": 1.7272, "step": 810 }, { "epoch": 1.9104829210836278, "grad_norm": 0.5292350649833679, "learning_rate": 3.879499188606107e-05, "loss": 1.3972, "step": 811 }, { "epoch": 1.9128386336866903, "grad_norm": 0.5845118165016174, "learning_rate": 3.876064386435646e-05, "loss": 1.5094, "step": 812 }, { "epoch": 1.9151943462897525, "grad_norm": 0.5322152972221375, "learning_rate": 3.872625853639623e-05, "loss": 1.4796, "step": 813 }, { "epoch": 1.917550058892815, "grad_norm": 0.5214865803718567, "learning_rate": 3.8691835995401885e-05, "loss": 1.4921, "step": 814 }, { "epoch": 1.9199057714958774, "grad_norm": 0.579974353313446, "learning_rate": 3.8657376334695794e-05, "loss": 1.4, "step": 815 }, { "epoch": 1.9222614840989398, "grad_norm": 0.5283419489860535, "learning_rate": 3.862287964770099e-05, "loss": 1.4438, "step": 816 }, { "epoch": 1.9246171967020023, "grad_norm": 0.544545590877533, "learning_rate": 3.858834602794087e-05, "loss": 1.3723, "step": 817 }, { "epoch": 1.9269729093050647, "grad_norm": 0.5365920066833496, "learning_rate": 3.855377556903897e-05, "loss": 1.361, "step": 818 }, { "epoch": 1.9293286219081272, "grad_norm": 0.5245637893676758, "learning_rate": 3.851916836471868e-05, "loss": 1.4994, "step": 819 }, { "epoch": 1.9316843345111896, "grad_norm": 0.522786557674408, "learning_rate": 3.8484524508803035e-05, "loss": 1.3705, "step": 820 }, { "epoch": 1.934040047114252, "grad_norm": 0.6126843094825745, "learning_rate": 3.844984409521442e-05, "loss": 1.5588, "step": 821 }, { "epoch": 1.9363957597173145, "grad_norm": 2.6206839084625244, "learning_rate": 3.8415127217974325e-05, "loss": 1.4435, "step": 822 }, { "epoch": 1.938751472320377, "grad_norm": 0.531060516834259, "learning_rate": 3.8380373971203134e-05, "loss": 1.5093, "step": 823 }, { "epoch": 1.9411071849234394, "grad_norm": 0.5611820816993713, "learning_rate": 3.8345584449119776e-05, "loss": 1.4461, "step": 824 }, { "epoch": 1.9434628975265018, "grad_norm": 0.5812779068946838, "learning_rate": 3.831075874604155e-05, "loss": 1.5719, "step": 825 }, { "epoch": 1.9458186101295643, "grad_norm": 1.1113293170928955, "learning_rate": 3.827589695638388e-05, "loss": 1.4649, "step": 826 }, { "epoch": 1.9481743227326267, "grad_norm": 0.7303212285041809, "learning_rate": 3.824099917465996e-05, "loss": 1.4151, "step": 827 }, { "epoch": 1.9505300353356891, "grad_norm": 2.497227191925049, "learning_rate": 3.82060654954806e-05, "loss": 1.3088, "step": 828 }, { "epoch": 1.9528857479387516, "grad_norm": 0.5883631706237793, "learning_rate": 3.8171096013553944e-05, "loss": 1.4289, "step": 829 }, { "epoch": 1.955241460541814, "grad_norm": 2.7159810066223145, "learning_rate": 3.8136090823685157e-05, "loss": 1.3658, "step": 830 }, { "epoch": 1.9575971731448765, "grad_norm": 0.7951368689537048, "learning_rate": 3.8101050020776244e-05, "loss": 1.5926, "step": 831 }, { "epoch": 1.959952885747939, "grad_norm": 0.5866095423698425, "learning_rate": 3.806597369982574e-05, "loss": 1.471, "step": 832 }, { "epoch": 1.9623085983510011, "grad_norm": 0.6091902256011963, "learning_rate": 3.80308619559285e-05, "loss": 1.5527, "step": 833 }, { "epoch": 1.9646643109540636, "grad_norm": 0.6152707934379578, "learning_rate": 3.7995714884275384e-05, "loss": 1.465, "step": 834 }, { "epoch": 1.967020023557126, "grad_norm": 0.5903482437133789, "learning_rate": 3.796053258015304e-05, "loss": 1.3445, "step": 835 }, { "epoch": 1.9693757361601885, "grad_norm": 0.596836268901825, "learning_rate": 3.7925315138943655e-05, "loss": 1.4133, "step": 836 }, { "epoch": 1.971731448763251, "grad_norm": 0.5727148056030273, "learning_rate": 3.7890062656124624e-05, "loss": 1.4656, "step": 837 }, { "epoch": 1.9740871613663133, "grad_norm": 0.5360350608825684, "learning_rate": 3.785477522726839e-05, "loss": 1.5172, "step": 838 }, { "epoch": 1.9764428739693758, "grad_norm": 0.5439525246620178, "learning_rate": 3.7819452948042136e-05, "loss": 1.4332, "step": 839 }, { "epoch": 1.978798586572438, "grad_norm": 0.5403106808662415, "learning_rate": 3.778409591420749e-05, "loss": 1.4359, "step": 840 }, { "epoch": 1.9811542991755005, "grad_norm": 0.6118065714836121, "learning_rate": 3.774870422162034e-05, "loss": 1.4633, "step": 841 }, { "epoch": 1.983510011778563, "grad_norm": 0.5567687749862671, "learning_rate": 3.7713277966230514e-05, "loss": 1.4933, "step": 842 }, { "epoch": 1.9858657243816253, "grad_norm": 0.6087580919265747, "learning_rate": 3.7677817244081556e-05, "loss": 1.4785, "step": 843 }, { "epoch": 1.9882214369846878, "grad_norm": 1.0794850587844849, "learning_rate": 3.764232215131045e-05, "loss": 1.4477, "step": 844 }, { "epoch": 1.9905771495877502, "grad_norm": 0.7632965445518494, "learning_rate": 3.760679278414735e-05, "loss": 1.5376, "step": 845 }, { "epoch": 1.9929328621908127, "grad_norm": 0.6250706315040588, "learning_rate": 3.757122923891534e-05, "loss": 1.6285, "step": 846 }, { "epoch": 1.9952885747938751, "grad_norm": 0.5320702195167542, "learning_rate": 3.753563161203019e-05, "loss": 1.5627, "step": 847 }, { "epoch": 1.9976442873969376, "grad_norm": 0.5344881415367126, "learning_rate": 3.7500000000000003e-05, "loss": 1.4283, "step": 848 }, { "epoch": 1.9976442873969376, "eval_loss": 1.384067177772522, "eval_runtime": 5.8171, "eval_samples_per_second": 429.766, "eval_steps_per_second": 6.876, "step": 848 }, { "epoch": 2.0, "grad_norm": 0.6480754017829895, "learning_rate": 3.746433449942508e-05, "loss": 1.5762, "step": 849 }, { "epoch": 2.0023557126030624, "grad_norm": 0.5470189452171326, "learning_rate": 3.742863520699757e-05, "loss": 1.4849, "step": 850 }, { "epoch": 2.004711425206125, "grad_norm": 0.5942526459693909, "learning_rate": 3.7392902219501234e-05, "loss": 1.4304, "step": 851 }, { "epoch": 2.0070671378091873, "grad_norm": 9.930573463439941, "learning_rate": 3.73571356338112e-05, "loss": 1.3408, "step": 852 }, { "epoch": 2.0094228504122498, "grad_norm": 0.5976892113685608, "learning_rate": 3.732133554689365e-05, "loss": 1.614, "step": 853 }, { "epoch": 2.011778563015312, "grad_norm": 0.5336528420448303, "learning_rate": 3.728550205580564e-05, "loss": 1.4864, "step": 854 }, { "epoch": 2.0141342756183747, "grad_norm": 0.5210591554641724, "learning_rate": 3.724963525769472e-05, "loss": 1.404, "step": 855 }, { "epoch": 2.016489988221437, "grad_norm": 0.5330710411071777, "learning_rate": 3.721373524979883e-05, "loss": 1.5568, "step": 856 }, { "epoch": 2.0188457008244995, "grad_norm": 0.5555222630500793, "learning_rate": 3.7177802129445846e-05, "loss": 1.4809, "step": 857 }, { "epoch": 2.021201413427562, "grad_norm": 0.7463992834091187, "learning_rate": 3.714183599405347e-05, "loss": 1.3538, "step": 858 }, { "epoch": 2.0235571260306244, "grad_norm": 0.5476599335670471, "learning_rate": 3.710583694112893e-05, "loss": 1.3208, "step": 859 }, { "epoch": 2.025912838633687, "grad_norm": 0.588772177696228, "learning_rate": 3.706980506826863e-05, "loss": 1.4961, "step": 860 }, { "epoch": 2.0282685512367493, "grad_norm": 0.567319929599762, "learning_rate": 3.7033740473158e-05, "loss": 1.426, "step": 861 }, { "epoch": 2.0306242638398118, "grad_norm": 0.5447376370429993, "learning_rate": 3.699764325357119e-05, "loss": 1.6393, "step": 862 }, { "epoch": 2.0329799764428738, "grad_norm": 0.5246829390525818, "learning_rate": 3.6961513507370754e-05, "loss": 1.4681, "step": 863 }, { "epoch": 2.035335689045936, "grad_norm": 0.5779036283493042, "learning_rate": 3.6925351332507476e-05, "loss": 1.5013, "step": 864 }, { "epoch": 2.0376914016489986, "grad_norm": 1.1356041431427002, "learning_rate": 3.688915682702001e-05, "loss": 1.4538, "step": 865 }, { "epoch": 2.040047114252061, "grad_norm": 0.5246538519859314, "learning_rate": 3.685293008903471e-05, "loss": 1.4358, "step": 866 }, { "epoch": 2.0424028268551235, "grad_norm": 0.598699152469635, "learning_rate": 3.681667121676527e-05, "loss": 1.5024, "step": 867 }, { "epoch": 2.044758539458186, "grad_norm": 0.6000251770019531, "learning_rate": 3.6780380308512543e-05, "loss": 1.4448, "step": 868 }, { "epoch": 2.0471142520612484, "grad_norm": 1.324358582496643, "learning_rate": 3.67440574626642e-05, "loss": 1.504, "step": 869 }, { "epoch": 2.049469964664311, "grad_norm": 0.5584520697593689, "learning_rate": 3.670770277769451e-05, "loss": 1.5637, "step": 870 }, { "epoch": 2.0518256772673733, "grad_norm": 0.7929472327232361, "learning_rate": 3.667131635216408e-05, "loss": 1.2769, "step": 871 }, { "epoch": 2.0541813898704357, "grad_norm": 0.503578782081604, "learning_rate": 3.663489828471953e-05, "loss": 1.316, "step": 872 }, { "epoch": 2.056537102473498, "grad_norm": 0.5151907801628113, "learning_rate": 3.65984486740933e-05, "loss": 1.4271, "step": 873 }, { "epoch": 2.0588928150765606, "grad_norm": 1.494321584701538, "learning_rate": 3.656196761910332e-05, "loss": 1.4742, "step": 874 }, { "epoch": 2.061248527679623, "grad_norm": 0.6023456454277039, "learning_rate": 3.6525455218652785e-05, "loss": 1.4356, "step": 875 }, { "epoch": 2.0636042402826855, "grad_norm": 0.646373987197876, "learning_rate": 3.6488911571729864e-05, "loss": 1.4465, "step": 876 }, { "epoch": 2.065959952885748, "grad_norm": 0.5941593647003174, "learning_rate": 3.645233677740744e-05, "loss": 1.4302, "step": 877 }, { "epoch": 2.0683156654888104, "grad_norm": 0.540580689907074, "learning_rate": 3.6415730934842827e-05, "loss": 1.4142, "step": 878 }, { "epoch": 2.070671378091873, "grad_norm": 0.6551271080970764, "learning_rate": 3.6379094143277536e-05, "loss": 1.4174, "step": 879 }, { "epoch": 2.0730270906949353, "grad_norm": 0.5365005731582642, "learning_rate": 3.634242650203697e-05, "loss": 1.3456, "step": 880 }, { "epoch": 2.0753828032979977, "grad_norm": 3.452230215072632, "learning_rate": 3.630572811053016e-05, "loss": 1.531, "step": 881 }, { "epoch": 2.07773851590106, "grad_norm": 0.949171245098114, "learning_rate": 3.626899906824952e-05, "loss": 1.3734, "step": 882 }, { "epoch": 2.0800942285041226, "grad_norm": 0.6716760396957397, "learning_rate": 3.623223947477055e-05, "loss": 1.5684, "step": 883 }, { "epoch": 2.082449941107185, "grad_norm": 0.5458926558494568, "learning_rate": 3.619544942975158e-05, "loss": 1.3869, "step": 884 }, { "epoch": 2.0848056537102475, "grad_norm": 0.6367994546890259, "learning_rate": 3.61586290329335e-05, "loss": 1.4715, "step": 885 }, { "epoch": 2.08716136631331, "grad_norm": 1.8097198009490967, "learning_rate": 3.612177838413948e-05, "loss": 1.6526, "step": 886 }, { "epoch": 2.0895170789163724, "grad_norm": 0.6440650820732117, "learning_rate": 3.608489758327472e-05, "loss": 1.448, "step": 887 }, { "epoch": 2.091872791519435, "grad_norm": 2.37239670753479, "learning_rate": 3.604798673032613e-05, "loss": 1.5303, "step": 888 }, { "epoch": 2.094228504122497, "grad_norm": 0.6082547307014465, "learning_rate": 3.6011045925362144e-05, "loss": 1.5572, "step": 889 }, { "epoch": 2.0965842167255593, "grad_norm": 0.7018382549285889, "learning_rate": 3.597407526853235e-05, "loss": 1.4072, "step": 890 }, { "epoch": 2.0989399293286217, "grad_norm": 0.8221641182899475, "learning_rate": 3.59370748600673e-05, "loss": 1.4936, "step": 891 }, { "epoch": 2.101295641931684, "grad_norm": 0.6163040399551392, "learning_rate": 3.59000448002782e-05, "loss": 1.5663, "step": 892 }, { "epoch": 2.1036513545347466, "grad_norm": 0.5650806427001953, "learning_rate": 3.586298518955661e-05, "loss": 1.4581, "step": 893 }, { "epoch": 2.106007067137809, "grad_norm": 0.6848757863044739, "learning_rate": 3.582589612837427e-05, "loss": 1.7242, "step": 894 }, { "epoch": 2.1083627797408715, "grad_norm": 1.7443324327468872, "learning_rate": 3.5788777717282695e-05, "loss": 1.5485, "step": 895 }, { "epoch": 2.110718492343934, "grad_norm": 0.557542622089386, "learning_rate": 3.575163005691302e-05, "loss": 1.4973, "step": 896 }, { "epoch": 2.1130742049469964, "grad_norm": 0.6598139405250549, "learning_rate": 3.571445324797564e-05, "loss": 1.4195, "step": 897 }, { "epoch": 2.115429917550059, "grad_norm": 0.5981035232543945, "learning_rate": 3.5677247391259995e-05, "loss": 1.524, "step": 898 }, { "epoch": 2.1177856301531213, "grad_norm": 0.67792809009552, "learning_rate": 3.564001258763428e-05, "loss": 1.4156, "step": 899 }, { "epoch": 2.1201413427561837, "grad_norm": 0.641559362411499, "learning_rate": 3.560274893804515e-05, "loss": 1.6355, "step": 900 }, { "epoch": 2.122497055359246, "grad_norm": 1.4955319166183472, "learning_rate": 3.556545654351749e-05, "loss": 1.5618, "step": 901 }, { "epoch": 2.1248527679623086, "grad_norm": 0.6272910833358765, "learning_rate": 3.5528135505154084e-05, "loss": 1.4737, "step": 902 }, { "epoch": 2.127208480565371, "grad_norm": 2.4355177879333496, "learning_rate": 3.549078592413538e-05, "loss": 1.4709, "step": 903 }, { "epoch": 2.1295641931684335, "grad_norm": 0.5623099207878113, "learning_rate": 3.545340790171923e-05, "loss": 1.458, "step": 904 }, { "epoch": 2.131919905771496, "grad_norm": 0.538407027721405, "learning_rate": 3.541600153924058e-05, "loss": 1.4852, "step": 905 }, { "epoch": 2.1342756183745584, "grad_norm": 0.6085216403007507, "learning_rate": 3.537856693811118e-05, "loss": 1.475, "step": 906 }, { "epoch": 2.136631330977621, "grad_norm": 0.5858283638954163, "learning_rate": 3.5341104199819386e-05, "loss": 1.4276, "step": 907 }, { "epoch": 2.1389870435806833, "grad_norm": 0.6496621966362, "learning_rate": 3.530361342592981e-05, "loss": 1.614, "step": 908 }, { "epoch": 2.1413427561837457, "grad_norm": 0.665341317653656, "learning_rate": 3.526609471808305e-05, "loss": 1.5331, "step": 909 }, { "epoch": 2.143698468786808, "grad_norm": 0.4818594753742218, "learning_rate": 3.522854817799549e-05, "loss": 1.4221, "step": 910 }, { "epoch": 2.1460541813898706, "grad_norm": 0.5666949152946472, "learning_rate": 3.5190973907458924e-05, "loss": 1.6022, "step": 911 }, { "epoch": 2.148409893992933, "grad_norm": 0.5977969169616699, "learning_rate": 3.515337200834034e-05, "loss": 1.4998, "step": 912 }, { "epoch": 2.1507656065959955, "grad_norm": 0.5502481460571289, "learning_rate": 3.5115742582581626e-05, "loss": 1.4427, "step": 913 }, { "epoch": 2.153121319199058, "grad_norm": 0.5696197152137756, "learning_rate": 3.507808573219931e-05, "loss": 1.4779, "step": 914 }, { "epoch": 2.1554770318021204, "grad_norm": 0.5336370468139648, "learning_rate": 3.5040401559284254e-05, "loss": 1.453, "step": 915 }, { "epoch": 2.157832744405183, "grad_norm": 0.9479364156723022, "learning_rate": 3.50026901660014e-05, "loss": 1.3732, "step": 916 }, { "epoch": 2.160188457008245, "grad_norm": 0.5567924976348877, "learning_rate": 3.496495165458948e-05, "loss": 1.4988, "step": 917 }, { "epoch": 2.1625441696113072, "grad_norm": 0.5631065964698792, "learning_rate": 3.492718612736077e-05, "loss": 1.4313, "step": 918 }, { "epoch": 2.1648998822143697, "grad_norm": 0.5235875248908997, "learning_rate": 3.488939368670076e-05, "loss": 1.4499, "step": 919 }, { "epoch": 2.167255594817432, "grad_norm": 1.5622987747192383, "learning_rate": 3.485157443506792e-05, "loss": 1.3586, "step": 920 }, { "epoch": 2.1696113074204946, "grad_norm": 0.5903689861297607, "learning_rate": 3.48137284749934e-05, "loss": 1.4451, "step": 921 }, { "epoch": 2.171967020023557, "grad_norm": 1.4659440517425537, "learning_rate": 3.477585590908076e-05, "loss": 1.3503, "step": 922 }, { "epoch": 2.1743227326266195, "grad_norm": 0.5574550032615662, "learning_rate": 3.473795684000569e-05, "loss": 1.3807, "step": 923 }, { "epoch": 2.176678445229682, "grad_norm": 0.5407124161720276, "learning_rate": 3.4700031370515726e-05, "loss": 1.4349, "step": 924 }, { "epoch": 2.1790341578327443, "grad_norm": 0.5624421834945679, "learning_rate": 3.466207960343001e-05, "loss": 1.318, "step": 925 }, { "epoch": 2.181389870435807, "grad_norm": 0.9549903273582458, "learning_rate": 3.462410164163893e-05, "loss": 1.2852, "step": 926 }, { "epoch": 2.1837455830388692, "grad_norm": 1.735607624053955, "learning_rate": 3.458609758810393e-05, "loss": 1.4978, "step": 927 }, { "epoch": 2.1861012956419317, "grad_norm": 0.5475738644599915, "learning_rate": 3.454806754585716e-05, "loss": 1.4241, "step": 928 }, { "epoch": 2.188457008244994, "grad_norm": 0.5734015703201294, "learning_rate": 3.451001161800126e-05, "loss": 1.4934, "step": 929 }, { "epoch": 2.1908127208480566, "grad_norm": 0.8227376937866211, "learning_rate": 3.4471929907709025e-05, "loss": 1.4643, "step": 930 }, { "epoch": 2.193168433451119, "grad_norm": 0.6358530521392822, "learning_rate": 3.443382251822315e-05, "loss": 1.4171, "step": 931 }, { "epoch": 2.1955241460541814, "grad_norm": 0.5190877318382263, "learning_rate": 3.4395689552855955e-05, "loss": 1.331, "step": 932 }, { "epoch": 2.197879858657244, "grad_norm": 0.6085793375968933, "learning_rate": 3.43575311149891e-05, "loss": 1.3836, "step": 933 }, { "epoch": 2.2002355712603063, "grad_norm": 0.59223473072052, "learning_rate": 3.431934730807329e-05, "loss": 1.5398, "step": 934 }, { "epoch": 2.2025912838633688, "grad_norm": 0.5723126530647278, "learning_rate": 3.4281138235628035e-05, "loss": 1.5452, "step": 935 }, { "epoch": 2.204946996466431, "grad_norm": 0.8894895911216736, "learning_rate": 3.424290400124131e-05, "loss": 1.4448, "step": 936 }, { "epoch": 2.2073027090694937, "grad_norm": 0.5517821311950684, "learning_rate": 3.420464470856932e-05, "loss": 1.5099, "step": 937 }, { "epoch": 2.209658421672556, "grad_norm": 0.5943918228149414, "learning_rate": 3.4166360461336206e-05, "loss": 1.4942, "step": 938 }, { "epoch": 2.2120141342756185, "grad_norm": 0.587861180305481, "learning_rate": 3.412805136333377e-05, "loss": 1.5079, "step": 939 }, { "epoch": 2.214369846878681, "grad_norm": 0.5284847617149353, "learning_rate": 3.408971751842117e-05, "loss": 1.4958, "step": 940 }, { "epoch": 2.2167255594817434, "grad_norm": 0.9103102087974548, "learning_rate": 3.4051359030524654e-05, "loss": 1.4825, "step": 941 }, { "epoch": 2.2190812720848054, "grad_norm": 0.5814008712768555, "learning_rate": 3.401297600363731e-05, "loss": 1.643, "step": 942 }, { "epoch": 2.221436984687868, "grad_norm": 0.5727627277374268, "learning_rate": 3.3974568541818723e-05, "loss": 1.5383, "step": 943 }, { "epoch": 2.2237926972909303, "grad_norm": 0.5561125874519348, "learning_rate": 3.393613674919473e-05, "loss": 1.3396, "step": 944 }, { "epoch": 2.2261484098939928, "grad_norm": 0.5280982851982117, "learning_rate": 3.3897680729957135e-05, "loss": 1.3288, "step": 945 }, { "epoch": 2.228504122497055, "grad_norm": 0.6005162000656128, "learning_rate": 3.385920058836342e-05, "loss": 1.366, "step": 946 }, { "epoch": 2.2308598351001176, "grad_norm": 0.5712383389472961, "learning_rate": 3.382069642873646e-05, "loss": 1.3557, "step": 947 }, { "epoch": 2.23321554770318, "grad_norm": 0.6024034023284912, "learning_rate": 3.3782168355464263e-05, "loss": 1.4459, "step": 948 }, { "epoch": 2.2355712603062425, "grad_norm": 0.5565690398216248, "learning_rate": 3.374361647299964e-05, "loss": 1.3999, "step": 949 }, { "epoch": 2.237926972909305, "grad_norm": 0.8791359663009644, "learning_rate": 3.3705040885859975e-05, "loss": 1.3816, "step": 950 }, { "epoch": 2.2402826855123674, "grad_norm": 0.5484998822212219, "learning_rate": 3.3666441698626906e-05, "loss": 1.4249, "step": 951 }, { "epoch": 2.24263839811543, "grad_norm": 0.5457003712654114, "learning_rate": 3.362781901594606e-05, "loss": 1.4287, "step": 952 }, { "epoch": 2.2449941107184923, "grad_norm": 0.5610632300376892, "learning_rate": 3.358917294252675e-05, "loss": 1.3932, "step": 953 }, { "epoch": 2.2473498233215548, "grad_norm": 0.6172133088111877, "learning_rate": 3.355050358314172e-05, "loss": 1.4743, "step": 954 }, { "epoch": 2.249705535924617, "grad_norm": 0.5592623353004456, "learning_rate": 3.3511811042626835e-05, "loss": 1.5303, "step": 955 }, { "epoch": 2.2520612485276796, "grad_norm": 0.5257055759429932, "learning_rate": 3.3473095425880796e-05, "loss": 1.4494, "step": 956 }, { "epoch": 2.254416961130742, "grad_norm": 0.49868646264076233, "learning_rate": 3.3434356837864885e-05, "loss": 1.2587, "step": 957 }, { "epoch": 2.2567726737338045, "grad_norm": 0.5208065509796143, "learning_rate": 3.3395595383602644e-05, "loss": 1.4128, "step": 958 }, { "epoch": 2.259128386336867, "grad_norm": 0.5435162782669067, "learning_rate": 3.335681116817963e-05, "loss": 1.4078, "step": 959 }, { "epoch": 2.2614840989399294, "grad_norm": 0.7052735686302185, "learning_rate": 3.331800429674308e-05, "loss": 1.6014, "step": 960 }, { "epoch": 2.263839811542992, "grad_norm": 0.7194801568984985, "learning_rate": 3.3279174874501664e-05, "loss": 1.4253, "step": 961 }, { "epoch": 2.2661955241460543, "grad_norm": 0.562911868095398, "learning_rate": 3.32403230067252e-05, "loss": 1.5964, "step": 962 }, { "epoch": 2.2685512367491167, "grad_norm": 1.4617066383361816, "learning_rate": 3.320144879874434e-05, "loss": 1.4545, "step": 963 }, { "epoch": 2.270906949352179, "grad_norm": 0.5037046670913696, "learning_rate": 3.3162552355950324e-05, "loss": 1.3705, "step": 964 }, { "epoch": 2.2732626619552416, "grad_norm": 10.428269386291504, "learning_rate": 3.312363378379464e-05, "loss": 1.4742, "step": 965 }, { "epoch": 2.275618374558304, "grad_norm": 0.5605586767196655, "learning_rate": 3.308469318778881e-05, "loss": 1.3567, "step": 966 }, { "epoch": 2.2779740871613665, "grad_norm": 0.567072868347168, "learning_rate": 3.3045730673504035e-05, "loss": 1.5387, "step": 967 }, { "epoch": 2.280329799764429, "grad_norm": 0.5783290863037109, "learning_rate": 3.300674634657094e-05, "loss": 1.5422, "step": 968 }, { "epoch": 2.2826855123674914, "grad_norm": 0.533700704574585, "learning_rate": 3.296774031267931e-05, "loss": 1.3829, "step": 969 }, { "epoch": 2.285041224970554, "grad_norm": 0.555717408657074, "learning_rate": 3.292871267757775e-05, "loss": 1.4801, "step": 970 }, { "epoch": 2.287396937573616, "grad_norm": 0.586785614490509, "learning_rate": 3.2889663547073444e-05, "loss": 1.3356, "step": 971 }, { "epoch": 2.2897526501766783, "grad_norm": 0.5809354782104492, "learning_rate": 3.285059302703185e-05, "loss": 1.4894, "step": 972 }, { "epoch": 2.2921083627797407, "grad_norm": 0.6657307744026184, "learning_rate": 3.281150122337642e-05, "loss": 1.7592, "step": 973 }, { "epoch": 2.294464075382803, "grad_norm": 0.5232982635498047, "learning_rate": 3.277238824208828e-05, "loss": 1.3738, "step": 974 }, { "epoch": 2.2968197879858656, "grad_norm": 0.5680440068244934, "learning_rate": 3.273325418920602e-05, "loss": 1.5481, "step": 975 }, { "epoch": 2.299175500588928, "grad_norm": 0.5669149160385132, "learning_rate": 3.269409917082531e-05, "loss": 1.5101, "step": 976 }, { "epoch": 2.3015312131919905, "grad_norm": 0.5329069495201111, "learning_rate": 3.265492329309867e-05, "loss": 1.4421, "step": 977 }, { "epoch": 2.303886925795053, "grad_norm": 0.6059450507164001, "learning_rate": 3.2615726662235205e-05, "loss": 1.3948, "step": 978 }, { "epoch": 2.3062426383981154, "grad_norm": 3.6542723178863525, "learning_rate": 3.257650938450023e-05, "loss": 1.5665, "step": 979 }, { "epoch": 2.308598351001178, "grad_norm": 0.5612216591835022, "learning_rate": 3.2537271566215076e-05, "loss": 1.3896, "step": 980 }, { "epoch": 2.3109540636042403, "grad_norm": 0.5344660878181458, "learning_rate": 3.249801331375675e-05, "loss": 1.3426, "step": 981 }, { "epoch": 2.3133097762073027, "grad_norm": 0.6354736685752869, "learning_rate": 3.245873473355765e-05, "loss": 1.523, "step": 982 }, { "epoch": 2.315665488810365, "grad_norm": 1.179805874824524, "learning_rate": 3.241943593210529e-05, "loss": 1.34, "step": 983 }, { "epoch": 2.3180212014134276, "grad_norm": 0.5996628403663635, "learning_rate": 3.238011701594201e-05, "loss": 1.4791, "step": 984 }, { "epoch": 2.32037691401649, "grad_norm": 0.5832757353782654, "learning_rate": 3.234077809166468e-05, "loss": 1.433, "step": 985 }, { "epoch": 2.3227326266195525, "grad_norm": 5.0352067947387695, "learning_rate": 3.2301419265924395e-05, "loss": 1.5021, "step": 986 }, { "epoch": 2.325088339222615, "grad_norm": 0.9326024651527405, "learning_rate": 3.2262040645426244e-05, "loss": 1.5497, "step": 987 }, { "epoch": 2.3274440518256774, "grad_norm": 0.5389519333839417, "learning_rate": 3.2222642336928944e-05, "loss": 1.3617, "step": 988 }, { "epoch": 2.32979976442874, "grad_norm": 0.6079146862030029, "learning_rate": 3.21832244472446e-05, "loss": 1.4434, "step": 989 }, { "epoch": 2.3321554770318023, "grad_norm": 0.6156734824180603, "learning_rate": 3.214378708323842e-05, "loss": 1.4649, "step": 990 }, { "epoch": 2.3345111896348647, "grad_norm": 0.5352438688278198, "learning_rate": 3.2104330351828374e-05, "loss": 1.3762, "step": 991 }, { "epoch": 2.336866902237927, "grad_norm": 0.6618947982788086, "learning_rate": 3.206485435998498e-05, "loss": 1.4377, "step": 992 }, { "epoch": 2.3392226148409896, "grad_norm": 0.5275004506111145, "learning_rate": 3.2025359214730924e-05, "loss": 1.3502, "step": 993 }, { "epoch": 2.3415783274440516, "grad_norm": 0.8737163543701172, "learning_rate": 3.1985845023140884e-05, "loss": 1.5228, "step": 994 }, { "epoch": 2.343934040047114, "grad_norm": 0.5623593926429749, "learning_rate": 3.19463118923411e-05, "loss": 1.3719, "step": 995 }, { "epoch": 2.3462897526501765, "grad_norm": 0.594558835029602, "learning_rate": 3.190675992950921e-05, "loss": 1.394, "step": 996 }, { "epoch": 2.348645465253239, "grad_norm": 0.602398157119751, "learning_rate": 3.18671892418739e-05, "loss": 1.5065, "step": 997 }, { "epoch": 2.3510011778563014, "grad_norm": 0.5682621002197266, "learning_rate": 3.1827599936714584e-05, "loss": 1.3932, "step": 998 }, { "epoch": 2.353356890459364, "grad_norm": 0.684179425239563, "learning_rate": 3.1787992121361206e-05, "loss": 1.5375, "step": 999 }, { "epoch": 2.3557126030624262, "grad_norm": 0.5429834127426147, "learning_rate": 3.174836590319385e-05, "loss": 1.476, "step": 1000 }, { "epoch": 2.3580683156654887, "grad_norm": 0.5406569242477417, "learning_rate": 3.1708721389642495e-05, "loss": 1.3926, "step": 1001 }, { "epoch": 2.360424028268551, "grad_norm": 0.5105139017105103, "learning_rate": 3.166905868818676e-05, "loss": 1.276, "step": 1002 }, { "epoch": 2.3627797408716136, "grad_norm": 1.0757942199707031, "learning_rate": 3.162937790635552e-05, "loss": 1.3747, "step": 1003 }, { "epoch": 2.365135453474676, "grad_norm": 3.0357017517089844, "learning_rate": 3.158967915172669e-05, "loss": 1.28, "step": 1004 }, { "epoch": 2.3674911660777385, "grad_norm": 0.6653481721878052, "learning_rate": 3.154996253192693e-05, "loss": 1.3921, "step": 1005 }, { "epoch": 2.369846878680801, "grad_norm": 0.5390580892562866, "learning_rate": 3.15102281546313e-05, "loss": 1.452, "step": 1006 }, { "epoch": 2.3722025912838633, "grad_norm": 0.8843737244606018, "learning_rate": 3.147047612756302e-05, "loss": 1.5194, "step": 1007 }, { "epoch": 2.374558303886926, "grad_norm": 0.553547203540802, "learning_rate": 3.1430706558493166e-05, "loss": 1.6837, "step": 1008 }, { "epoch": 2.3769140164899882, "grad_norm": 0.5135992169380188, "learning_rate": 3.139091955524035e-05, "loss": 1.4989, "step": 1009 }, { "epoch": 2.3792697290930507, "grad_norm": 0.6012310981750488, "learning_rate": 3.135111522567048e-05, "loss": 1.4257, "step": 1010 }, { "epoch": 2.381625441696113, "grad_norm": 1.5577235221862793, "learning_rate": 3.1311293677696404e-05, "loss": 1.3812, "step": 1011 }, { "epoch": 2.3839811542991756, "grad_norm": 0.5108080506324768, "learning_rate": 3.127145501927769e-05, "loss": 1.3715, "step": 1012 }, { "epoch": 2.386336866902238, "grad_norm": 0.5440352559089661, "learning_rate": 3.123159935842024e-05, "loss": 1.423, "step": 1013 }, { "epoch": 2.3886925795053005, "grad_norm": 0.5810328125953674, "learning_rate": 3.11917268031761e-05, "loss": 1.5208, "step": 1014 }, { "epoch": 2.391048292108363, "grad_norm": 2.0191216468811035, "learning_rate": 3.1151837461643104e-05, "loss": 1.3876, "step": 1015 }, { "epoch": 2.3934040047114253, "grad_norm": 0.5925819873809814, "learning_rate": 3.111193144196457e-05, "loss": 1.4202, "step": 1016 }, { "epoch": 2.395759717314488, "grad_norm": 0.6074321866035461, "learning_rate": 3.107200885232908e-05, "loss": 1.3932, "step": 1017 }, { "epoch": 2.3981154299175502, "grad_norm": 0.5561642646789551, "learning_rate": 3.103206980097009e-05, "loss": 1.4891, "step": 1018 }, { "epoch": 2.4004711425206127, "grad_norm": 0.6189919710159302, "learning_rate": 3.099211439616571e-05, "loss": 1.3347, "step": 1019 }, { "epoch": 2.402826855123675, "grad_norm": 0.5392036437988281, "learning_rate": 3.095214274623839e-05, "loss": 1.3094, "step": 1020 }, { "epoch": 2.4051825677267376, "grad_norm": 0.5391421318054199, "learning_rate": 3.0912154959554606e-05, "loss": 1.4619, "step": 1021 }, { "epoch": 2.4075382803298, "grad_norm": 0.5534204840660095, "learning_rate": 3.0872151144524595e-05, "loss": 1.2631, "step": 1022 }, { "epoch": 2.4098939929328624, "grad_norm": 0.8570829629898071, "learning_rate": 3.083213140960204e-05, "loss": 1.5273, "step": 1023 }, { "epoch": 2.4122497055359244, "grad_norm": 0.599202036857605, "learning_rate": 3.0792095863283795e-05, "loss": 1.3123, "step": 1024 }, { "epoch": 2.414605418138987, "grad_norm": 2.0147159099578857, "learning_rate": 3.075204461410955e-05, "loss": 1.6301, "step": 1025 }, { "epoch": 2.4169611307420493, "grad_norm": 0.5680127143859863, "learning_rate": 3.071197777066162e-05, "loss": 1.4851, "step": 1026 }, { "epoch": 2.4193168433451118, "grad_norm": 0.6192094087600708, "learning_rate": 3.067189544156454e-05, "loss": 1.4141, "step": 1027 }, { "epoch": 2.421672555948174, "grad_norm": 0.5234526991844177, "learning_rate": 3.063179773548487e-05, "loss": 1.4691, "step": 1028 }, { "epoch": 2.4240282685512367, "grad_norm": 0.7341822385787964, "learning_rate": 3.059168476113085e-05, "loss": 1.3382, "step": 1029 }, { "epoch": 2.426383981154299, "grad_norm": 5.815999507904053, "learning_rate": 3.055155662725208e-05, "loss": 1.5216, "step": 1030 }, { "epoch": 2.4287396937573615, "grad_norm": 0.628262996673584, "learning_rate": 3.0511413442639296e-05, "loss": 1.613, "step": 1031 }, { "epoch": 2.431095406360424, "grad_norm": 0.5363373160362244, "learning_rate": 3.0471255316124037e-05, "loss": 1.3218, "step": 1032 }, { "epoch": 2.4334511189634864, "grad_norm": 0.5217775702476501, "learning_rate": 3.0431082356578334e-05, "loss": 1.3826, "step": 1033 }, { "epoch": 2.435806831566549, "grad_norm": 0.5610544681549072, "learning_rate": 3.0390894672914426e-05, "loss": 1.5726, "step": 1034 }, { "epoch": 2.4381625441696113, "grad_norm": 0.5543189644813538, "learning_rate": 3.03506923740845e-05, "loss": 1.3818, "step": 1035 }, { "epoch": 2.4405182567726738, "grad_norm": 0.5204430222511292, "learning_rate": 3.0310475569080345e-05, "loss": 1.142, "step": 1036 }, { "epoch": 2.442873969375736, "grad_norm": 0.5442864894866943, "learning_rate": 3.0270244366933064e-05, "loss": 1.4967, "step": 1037 }, { "epoch": 2.4452296819787986, "grad_norm": 0.5024524927139282, "learning_rate": 3.0229998876712827e-05, "loss": 1.3206, "step": 1038 }, { "epoch": 2.447585394581861, "grad_norm": 0.5672470927238464, "learning_rate": 3.0189739207528524e-05, "loss": 1.419, "step": 1039 }, { "epoch": 2.4499411071849235, "grad_norm": 0.598885715007782, "learning_rate": 3.014946546852746e-05, "loss": 1.4134, "step": 1040 }, { "epoch": 2.452296819787986, "grad_norm": 0.5258106589317322, "learning_rate": 3.010917776889513e-05, "loss": 1.2424, "step": 1041 }, { "epoch": 2.4546525323910484, "grad_norm": 0.685940146446228, "learning_rate": 3.0068876217854852e-05, "loss": 1.4888, "step": 1042 }, { "epoch": 2.457008244994111, "grad_norm": 0.5623028874397278, "learning_rate": 3.0028560924667487e-05, "loss": 1.4284, "step": 1043 }, { "epoch": 2.4593639575971733, "grad_norm": 1.709544062614441, "learning_rate": 2.9988231998631177e-05, "loss": 1.2788, "step": 1044 }, { "epoch": 2.4617196702002357, "grad_norm": 0.606414258480072, "learning_rate": 2.9947889549081005e-05, "loss": 1.4345, "step": 1045 }, { "epoch": 2.464075382803298, "grad_norm": 0.6186027526855469, "learning_rate": 2.990753368538872e-05, "loss": 1.4353, "step": 1046 }, { "epoch": 2.46643109540636, "grad_norm": 0.5278372764587402, "learning_rate": 2.986716451696245e-05, "loss": 1.331, "step": 1047 }, { "epoch": 2.4687868080094226, "grad_norm": 0.6476253867149353, "learning_rate": 2.982678215324638e-05, "loss": 1.4642, "step": 1048 }, { "epoch": 2.471142520612485, "grad_norm": 0.6853030323982239, "learning_rate": 2.978638670372047e-05, "loss": 1.501, "step": 1049 }, { "epoch": 2.4734982332155475, "grad_norm": 0.5592173933982849, "learning_rate": 2.9745978277900166e-05, "loss": 1.4892, "step": 1050 }, { "epoch": 2.47585394581861, "grad_norm": 0.5399948954582214, "learning_rate": 2.9705556985336086e-05, "loss": 1.4009, "step": 1051 }, { "epoch": 2.4782096584216724, "grad_norm": 2.6447010040283203, "learning_rate": 2.9665122935613727e-05, "loss": 1.3624, "step": 1052 }, { "epoch": 2.480565371024735, "grad_norm": 0.7537943124771118, "learning_rate": 2.962467623835319e-05, "loss": 1.438, "step": 1053 }, { "epoch": 2.4829210836277973, "grad_norm": 0.6421496272087097, "learning_rate": 2.9584217003208836e-05, "loss": 1.5782, "step": 1054 }, { "epoch": 2.4852767962308597, "grad_norm": 0.6954240798950195, "learning_rate": 2.9543745339869046e-05, "loss": 1.5193, "step": 1055 }, { "epoch": 2.487632508833922, "grad_norm": 0.5711853504180908, "learning_rate": 2.9503261358055873e-05, "loss": 1.3684, "step": 1056 }, { "epoch": 2.4899882214369846, "grad_norm": 1.7256232500076294, "learning_rate": 2.946276516752478e-05, "loss": 1.5114, "step": 1057 }, { "epoch": 2.492343934040047, "grad_norm": 0.5790532231330872, "learning_rate": 2.9422256878064325e-05, "loss": 1.4663, "step": 1058 }, { "epoch": 2.4946996466431095, "grad_norm": 0.6514726877212524, "learning_rate": 2.938173659949586e-05, "loss": 1.5583, "step": 1059 }, { "epoch": 2.497055359246172, "grad_norm": 0.8555186986923218, "learning_rate": 2.9341204441673266e-05, "loss": 1.5244, "step": 1060 }, { "epoch": 2.497055359246172, "eval_loss": 1.3518424034118652, "eval_runtime": 5.7896, "eval_samples_per_second": 431.807, "eval_steps_per_second": 6.909, "step": 1060 }, { "epoch": 2.4994110718492344, "grad_norm": 0.5877079963684082, "learning_rate": 2.930066051448258e-05, "loss": 1.4494, "step": 1061 }, { "epoch": 2.501766784452297, "grad_norm": 0.5171237587928772, "learning_rate": 2.9260104927841797e-05, "loss": 1.3316, "step": 1062 }, { "epoch": 2.5041224970553593, "grad_norm": 0.598540186882019, "learning_rate": 2.92195377917005e-05, "loss": 1.3462, "step": 1063 }, { "epoch": 2.5064782096584217, "grad_norm": 0.6499308943748474, "learning_rate": 2.917895921603958e-05, "loss": 1.3824, "step": 1064 }, { "epoch": 2.508833922261484, "grad_norm": 0.6240696310997009, "learning_rate": 2.9138369310870962e-05, "loss": 1.3661, "step": 1065 }, { "epoch": 2.5111896348645466, "grad_norm": 0.9136962890625, "learning_rate": 2.909776818623725e-05, "loss": 1.413, "step": 1066 }, { "epoch": 2.513545347467609, "grad_norm": 0.5324629545211792, "learning_rate": 2.9057155952211502e-05, "loss": 1.2529, "step": 1067 }, { "epoch": 2.5159010600706715, "grad_norm": 1.0870943069458008, "learning_rate": 2.9016532718896873e-05, "loss": 1.3913, "step": 1068 }, { "epoch": 2.518256772673734, "grad_norm": 0.564160168170929, "learning_rate": 2.8975898596426343e-05, "loss": 1.4081, "step": 1069 }, { "epoch": 2.5206124852767964, "grad_norm": 0.5838961601257324, "learning_rate": 2.893525369496241e-05, "loss": 1.3925, "step": 1070 }, { "epoch": 2.522968197879859, "grad_norm": 0.6044654250144958, "learning_rate": 2.889459812469681e-05, "loss": 1.4733, "step": 1071 }, { "epoch": 2.5253239104829213, "grad_norm": 0.5877034068107605, "learning_rate": 2.8853931995850185e-05, "loss": 1.5732, "step": 1072 }, { "epoch": 2.5276796230859837, "grad_norm": 1.540387511253357, "learning_rate": 2.8813255418671815e-05, "loss": 1.3735, "step": 1073 }, { "epoch": 2.530035335689046, "grad_norm": 0.5829999446868896, "learning_rate": 2.877256850343929e-05, "loss": 1.4713, "step": 1074 }, { "epoch": 2.5323910482921086, "grad_norm": 0.5942326784133911, "learning_rate": 2.8731871360458244e-05, "loss": 1.6029, "step": 1075 }, { "epoch": 2.534746760895171, "grad_norm": 0.5589162707328796, "learning_rate": 2.8691164100062034e-05, "loss": 1.3306, "step": 1076 }, { "epoch": 2.5371024734982335, "grad_norm": 0.6439933180809021, "learning_rate": 2.8650446832611444e-05, "loss": 1.5164, "step": 1077 }, { "epoch": 2.5394581861012955, "grad_norm": 0.7095947265625, "learning_rate": 2.8609719668494394e-05, "loss": 1.3788, "step": 1078 }, { "epoch": 2.541813898704358, "grad_norm": 0.5471096634864807, "learning_rate": 2.8568982718125613e-05, "loss": 1.4425, "step": 1079 }, { "epoch": 2.5441696113074204, "grad_norm": 0.61305832862854, "learning_rate": 2.8528236091946403e-05, "loss": 1.5326, "step": 1080 }, { "epoch": 2.546525323910483, "grad_norm": 0.5582051277160645, "learning_rate": 2.8487479900424253e-05, "loss": 1.4281, "step": 1081 }, { "epoch": 2.5488810365135453, "grad_norm": 0.6375832557678223, "learning_rate": 2.8446714254052613e-05, "loss": 1.2987, "step": 1082 }, { "epoch": 2.5512367491166077, "grad_norm": 0.633604109287262, "learning_rate": 2.8405939263350555e-05, "loss": 1.2102, "step": 1083 }, { "epoch": 2.55359246171967, "grad_norm": 0.6218820810317993, "learning_rate": 2.836515503886249e-05, "loss": 1.5743, "step": 1084 }, { "epoch": 2.5559481743227326, "grad_norm": 0.4978408217430115, "learning_rate": 2.8324361691157858e-05, "loss": 1.3387, "step": 1085 }, { "epoch": 2.558303886925795, "grad_norm": 0.7007297873497009, "learning_rate": 2.8283559330830834e-05, "loss": 1.3511, "step": 1086 }, { "epoch": 2.5606595995288575, "grad_norm": 1.2089507579803467, "learning_rate": 2.8242748068500023e-05, "loss": 1.6043, "step": 1087 }, { "epoch": 2.56301531213192, "grad_norm": 0.5819362998008728, "learning_rate": 2.820192801480817e-05, "loss": 1.2654, "step": 1088 }, { "epoch": 2.5653710247349824, "grad_norm": 0.5378157496452332, "learning_rate": 2.8161099280421866e-05, "loss": 1.4836, "step": 1089 }, { "epoch": 2.567726737338045, "grad_norm": 0.6355511546134949, "learning_rate": 2.81202619760312e-05, "loss": 1.2275, "step": 1090 }, { "epoch": 2.5700824499411072, "grad_norm": 0.5766435265541077, "learning_rate": 2.8079416212349528e-05, "loss": 1.3587, "step": 1091 }, { "epoch": 2.5724381625441697, "grad_norm": 0.6991839408874512, "learning_rate": 2.8038562100113137e-05, "loss": 1.4488, "step": 1092 }, { "epoch": 2.574793875147232, "grad_norm": 0.6051741242408752, "learning_rate": 2.7997699750080918e-05, "loss": 1.5575, "step": 1093 }, { "epoch": 2.5771495877502946, "grad_norm": 0.5664882659912109, "learning_rate": 2.7956829273034148e-05, "loss": 1.3902, "step": 1094 }, { "epoch": 2.579505300353357, "grad_norm": 1.066573143005371, "learning_rate": 2.791595077977608e-05, "loss": 1.3618, "step": 1095 }, { "epoch": 2.5818610129564195, "grad_norm": 0.5405884385108948, "learning_rate": 2.7875064381131733e-05, "loss": 1.4761, "step": 1096 }, { "epoch": 2.584216725559482, "grad_norm": 0.6015415191650391, "learning_rate": 2.7834170187947554e-05, "loss": 1.6498, "step": 1097 }, { "epoch": 2.586572438162544, "grad_norm": 0.6578870415687561, "learning_rate": 2.7793268311091125e-05, "loss": 1.5703, "step": 1098 }, { "epoch": 2.5889281507656063, "grad_norm": 1.2897908687591553, "learning_rate": 2.7752358861450834e-05, "loss": 1.3373, "step": 1099 }, { "epoch": 2.591283863368669, "grad_norm": 0.5923646688461304, "learning_rate": 2.7711441949935642e-05, "loss": 1.2714, "step": 1100 }, { "epoch": 2.5936395759717312, "grad_norm": 0.5072523951530457, "learning_rate": 2.7670517687474697e-05, "loss": 1.3823, "step": 1101 }, { "epoch": 2.5959952885747937, "grad_norm": 0.5902401208877563, "learning_rate": 2.7629586185017097e-05, "loss": 1.6154, "step": 1102 }, { "epoch": 2.598351001177856, "grad_norm": 0.6004310250282288, "learning_rate": 2.7588647553531576e-05, "loss": 1.4616, "step": 1103 }, { "epoch": 2.6007067137809186, "grad_norm": 0.6233294010162354, "learning_rate": 2.754770190400618e-05, "loss": 1.3804, "step": 1104 }, { "epoch": 2.603062426383981, "grad_norm": 0.5980051159858704, "learning_rate": 2.750674934744798e-05, "loss": 1.5092, "step": 1105 }, { "epoch": 2.6054181389870434, "grad_norm": 0.5091908574104309, "learning_rate": 2.7465789994882796e-05, "loss": 1.3794, "step": 1106 }, { "epoch": 2.607773851590106, "grad_norm": 0.5852126479148865, "learning_rate": 2.7424823957354843e-05, "loss": 1.4863, "step": 1107 }, { "epoch": 2.6101295641931683, "grad_norm": 0.635771632194519, "learning_rate": 2.7383851345926477e-05, "loss": 1.3856, "step": 1108 }, { "epoch": 2.6124852767962308, "grad_norm": 0.7647398710250854, "learning_rate": 2.734287227167787e-05, "loss": 1.3272, "step": 1109 }, { "epoch": 2.614840989399293, "grad_norm": 0.5250750184059143, "learning_rate": 2.7301886845706726e-05, "loss": 1.3357, "step": 1110 }, { "epoch": 2.6171967020023557, "grad_norm": 0.8658527135848999, "learning_rate": 2.7260895179127944e-05, "loss": 1.6286, "step": 1111 }, { "epoch": 2.619552414605418, "grad_norm": 0.6015082597732544, "learning_rate": 2.7219897383073373e-05, "loss": 1.3297, "step": 1112 }, { "epoch": 2.6219081272084805, "grad_norm": 1.702039361000061, "learning_rate": 2.717889356869146e-05, "loss": 1.3959, "step": 1113 }, { "epoch": 2.624263839811543, "grad_norm": 0.6131090521812439, "learning_rate": 2.7137883847146968e-05, "loss": 1.5957, "step": 1114 }, { "epoch": 2.6266195524146054, "grad_norm": 0.6263604164123535, "learning_rate": 2.7096868329620683e-05, "loss": 1.6613, "step": 1115 }, { "epoch": 2.628975265017668, "grad_norm": 0.6203348636627197, "learning_rate": 2.7055847127309107e-05, "loss": 1.441, "step": 1116 }, { "epoch": 2.6313309776207303, "grad_norm": 0.5773054361343384, "learning_rate": 2.7014820351424136e-05, "loss": 1.4526, "step": 1117 }, { "epoch": 2.6336866902237928, "grad_norm": 1.79023015499115, "learning_rate": 2.69737881131928e-05, "loss": 1.4395, "step": 1118 }, { "epoch": 2.636042402826855, "grad_norm": 0.5615230798721313, "learning_rate": 2.693275052385692e-05, "loss": 1.4519, "step": 1119 }, { "epoch": 2.6383981154299176, "grad_norm": 0.5263004302978516, "learning_rate": 2.689170769467283e-05, "loss": 1.3311, "step": 1120 }, { "epoch": 2.64075382803298, "grad_norm": 0.6184524893760681, "learning_rate": 2.6850659736911073e-05, "loss": 1.4421, "step": 1121 }, { "epoch": 2.6431095406360425, "grad_norm": 1.3448091745376587, "learning_rate": 2.6809606761856095e-05, "loss": 1.4422, "step": 1122 }, { "epoch": 2.645465253239105, "grad_norm": 0.8934986591339111, "learning_rate": 2.6768548880805934e-05, "loss": 1.411, "step": 1123 }, { "epoch": 2.6478209658421674, "grad_norm": 0.6491956114768982, "learning_rate": 2.672748620507195e-05, "loss": 1.5365, "step": 1124 }, { "epoch": 2.65017667844523, "grad_norm": 0.558304488658905, "learning_rate": 2.6686418845978478e-05, "loss": 1.3775, "step": 1125 }, { "epoch": 2.6525323910482923, "grad_norm": 0.49869948625564575, "learning_rate": 2.664534691486257e-05, "loss": 1.3711, "step": 1126 }, { "epoch": 2.6548881036513547, "grad_norm": 0.644311785697937, "learning_rate": 2.6604270523073653e-05, "loss": 1.4386, "step": 1127 }, { "epoch": 2.657243816254417, "grad_norm": 0.6112444400787354, "learning_rate": 2.6563189781973268e-05, "loss": 1.373, "step": 1128 }, { "epoch": 2.6595995288574796, "grad_norm": 0.5796204805374146, "learning_rate": 2.6522104802934727e-05, "loss": 1.3879, "step": 1129 }, { "epoch": 2.661955241460542, "grad_norm": 0.781163215637207, "learning_rate": 2.648101569734286e-05, "loss": 1.3723, "step": 1130 }, { "epoch": 2.664310954063604, "grad_norm": 0.5815576910972595, "learning_rate": 2.643992257659365e-05, "loss": 1.4162, "step": 1131 }, { "epoch": 2.6666666666666665, "grad_norm": 0.5650984644889832, "learning_rate": 2.639882555209398e-05, "loss": 1.2472, "step": 1132 }, { "epoch": 2.669022379269729, "grad_norm": 0.5878518223762512, "learning_rate": 2.6357724735261335e-05, "loss": 1.4725, "step": 1133 }, { "epoch": 2.6713780918727914, "grad_norm": 0.5541209578514099, "learning_rate": 2.631662023752346e-05, "loss": 1.4535, "step": 1134 }, { "epoch": 2.673733804475854, "grad_norm": 2.8227009773254395, "learning_rate": 2.627551217031806e-05, "loss": 1.3212, "step": 1135 }, { "epoch": 2.6760895170789163, "grad_norm": 0.5463438034057617, "learning_rate": 2.623440064509258e-05, "loss": 1.42, "step": 1136 }, { "epoch": 2.6784452296819787, "grad_norm": 0.512425422668457, "learning_rate": 2.6193285773303772e-05, "loss": 1.2512, "step": 1137 }, { "epoch": 2.680800942285041, "grad_norm": 0.6076460480690002, "learning_rate": 2.6152167666417493e-05, "loss": 1.364, "step": 1138 }, { "epoch": 2.6831566548881036, "grad_norm": 0.5870344638824463, "learning_rate": 2.6111046435908383e-05, "loss": 1.4711, "step": 1139 }, { "epoch": 2.685512367491166, "grad_norm": 0.6945620775222778, "learning_rate": 2.606992219325952e-05, "loss": 1.6676, "step": 1140 }, { "epoch": 2.6878680800942285, "grad_norm": 0.6135069131851196, "learning_rate": 2.6028795049962167e-05, "loss": 1.5074, "step": 1141 }, { "epoch": 2.690223792697291, "grad_norm": 0.5198838710784912, "learning_rate": 2.5987665117515446e-05, "loss": 1.5432, "step": 1142 }, { "epoch": 2.6925795053003534, "grad_norm": 1.1001906394958496, "learning_rate": 2.594653250742605e-05, "loss": 1.3635, "step": 1143 }, { "epoch": 2.694935217903416, "grad_norm": 0.6138401627540588, "learning_rate": 2.590539733120791e-05, "loss": 1.4754, "step": 1144 }, { "epoch": 2.6972909305064783, "grad_norm": 0.5274995565414429, "learning_rate": 2.586425970038194e-05, "loss": 1.4835, "step": 1145 }, { "epoch": 2.6996466431095407, "grad_norm": 0.5955519080162048, "learning_rate": 2.5823119726475682e-05, "loss": 1.3335, "step": 1146 }, { "epoch": 2.702002355712603, "grad_norm": 0.5496350526809692, "learning_rate": 2.5781977521023043e-05, "loss": 1.4003, "step": 1147 }, { "epoch": 2.7043580683156656, "grad_norm": 0.5441713333129883, "learning_rate": 2.5740833195563996e-05, "loss": 1.4622, "step": 1148 }, { "epoch": 2.706713780918728, "grad_norm": 0.6476238965988159, "learning_rate": 2.5699686861644234e-05, "loss": 1.5439, "step": 1149 }, { "epoch": 2.7090694935217905, "grad_norm": 0.5159591436386108, "learning_rate": 2.5658538630814915e-05, "loss": 1.4267, "step": 1150 }, { "epoch": 2.7114252061248525, "grad_norm": 0.5663313269615173, "learning_rate": 2.561738861463232e-05, "loss": 1.4383, "step": 1151 }, { "epoch": 2.713780918727915, "grad_norm": 0.8264306783676147, "learning_rate": 2.5576236924657597e-05, "loss": 1.4084, "step": 1152 }, { "epoch": 2.7161366313309774, "grad_norm": 0.5401167869567871, "learning_rate": 2.5535083672456407e-05, "loss": 1.3513, "step": 1153 }, { "epoch": 2.71849234393404, "grad_norm": 0.5620750188827515, "learning_rate": 2.5493928969598662e-05, "loss": 1.5404, "step": 1154 }, { "epoch": 2.7208480565371023, "grad_norm": 0.6937330365180969, "learning_rate": 2.5452772927658196e-05, "loss": 1.3922, "step": 1155 }, { "epoch": 2.7232037691401647, "grad_norm": 0.9959506392478943, "learning_rate": 2.5411615658212478e-05, "loss": 1.4035, "step": 1156 }, { "epoch": 2.725559481743227, "grad_norm": 0.5836624503135681, "learning_rate": 2.537045727284232e-05, "loss": 1.4136, "step": 1157 }, { "epoch": 2.7279151943462896, "grad_norm": 0.5300898551940918, "learning_rate": 2.5329297883131526e-05, "loss": 1.4435, "step": 1158 }, { "epoch": 2.730270906949352, "grad_norm": 0.5761305689811707, "learning_rate": 2.528813760066664e-05, "loss": 1.5409, "step": 1159 }, { "epoch": 2.7326266195524145, "grad_norm": 0.5720075964927673, "learning_rate": 2.5246976537036644e-05, "loss": 1.4908, "step": 1160 }, { "epoch": 2.734982332155477, "grad_norm": 0.5551438331604004, "learning_rate": 2.5205814803832617e-05, "loss": 1.4704, "step": 1161 }, { "epoch": 2.7373380447585394, "grad_norm": 0.6081617474555969, "learning_rate": 2.5164652512647445e-05, "loss": 1.4664, "step": 1162 }, { "epoch": 2.739693757361602, "grad_norm": 0.5442171692848206, "learning_rate": 2.5123489775075558e-05, "loss": 1.4103, "step": 1163 }, { "epoch": 2.7420494699646643, "grad_norm": 0.49790769815444946, "learning_rate": 2.5082326702712567e-05, "loss": 1.3624, "step": 1164 }, { "epoch": 2.7444051825677267, "grad_norm": 0.4954708516597748, "learning_rate": 2.5041163407155e-05, "loss": 1.2112, "step": 1165 }, { "epoch": 2.746760895170789, "grad_norm": 0.5391804575920105, "learning_rate": 2.5e-05, "loss": 1.5388, "step": 1166 }, { "epoch": 2.7491166077738516, "grad_norm": 0.5193619132041931, "learning_rate": 2.4958836592845005e-05, "loss": 1.3637, "step": 1167 }, { "epoch": 2.751472320376914, "grad_norm": 0.7949517369270325, "learning_rate": 2.4917673297287446e-05, "loss": 1.2491, "step": 1168 }, { "epoch": 2.7538280329799765, "grad_norm": 0.5275465250015259, "learning_rate": 2.4876510224924445e-05, "loss": 1.3202, "step": 1169 }, { "epoch": 2.756183745583039, "grad_norm": 1.3937264680862427, "learning_rate": 2.4835347487352557e-05, "loss": 1.5668, "step": 1170 }, { "epoch": 2.7585394581861014, "grad_norm": 0.6405896544456482, "learning_rate": 2.4794185196167392e-05, "loss": 1.4507, "step": 1171 }, { "epoch": 2.760895170789164, "grad_norm": 0.7239024639129639, "learning_rate": 2.475302346296336e-05, "loss": 1.5262, "step": 1172 }, { "epoch": 2.7632508833922262, "grad_norm": 0.6344488859176636, "learning_rate": 2.4711862399333367e-05, "loss": 1.2562, "step": 1173 }, { "epoch": 2.7656065959952887, "grad_norm": 0.5562710165977478, "learning_rate": 2.467070211686849e-05, "loss": 1.3081, "step": 1174 }, { "epoch": 2.767962308598351, "grad_norm": 0.6612924337387085, "learning_rate": 2.4629542727157684e-05, "loss": 1.4687, "step": 1175 }, { "epoch": 2.7703180212014136, "grad_norm": 0.612334132194519, "learning_rate": 2.4588384341787518e-05, "loss": 1.4571, "step": 1176 }, { "epoch": 2.772673733804476, "grad_norm": 0.6006629467010498, "learning_rate": 2.4547227072341806e-05, "loss": 1.4426, "step": 1177 }, { "epoch": 2.7750294464075385, "grad_norm": 0.5404090881347656, "learning_rate": 2.4506071030401344e-05, "loss": 1.1143, "step": 1178 }, { "epoch": 2.777385159010601, "grad_norm": 0.7058479189872742, "learning_rate": 2.4464916327543596e-05, "loss": 1.2888, "step": 1179 }, { "epoch": 2.7797408716136633, "grad_norm": 1.4653067588806152, "learning_rate": 2.4423763075342405e-05, "loss": 1.3814, "step": 1180 }, { "epoch": 2.782096584216726, "grad_norm": 0.8745965957641602, "learning_rate": 2.4382611385367678e-05, "loss": 1.353, "step": 1181 }, { "epoch": 2.7844522968197882, "grad_norm": 0.9978426694869995, "learning_rate": 2.434146136918509e-05, "loss": 1.3583, "step": 1182 }, { "epoch": 2.7868080094228507, "grad_norm": 0.6602436900138855, "learning_rate": 2.4300313138355772e-05, "loss": 1.6583, "step": 1183 }, { "epoch": 2.7891637220259127, "grad_norm": 0.572460949420929, "learning_rate": 2.4259166804436006e-05, "loss": 1.4881, "step": 1184 }, { "epoch": 2.791519434628975, "grad_norm": 0.5319108366966248, "learning_rate": 2.421802247897696e-05, "loss": 1.3371, "step": 1185 }, { "epoch": 2.7938751472320376, "grad_norm": 0.613919734954834, "learning_rate": 2.417688027352433e-05, "loss": 1.3159, "step": 1186 }, { "epoch": 2.7962308598351, "grad_norm": 0.9045611619949341, "learning_rate": 2.413574029961807e-05, "loss": 1.4436, "step": 1187 }, { "epoch": 2.7985865724381624, "grad_norm": 0.5695676207542419, "learning_rate": 2.409460266879209e-05, "loss": 1.3647, "step": 1188 }, { "epoch": 2.800942285041225, "grad_norm": 0.9738927483558655, "learning_rate": 2.4053467492573954e-05, "loss": 1.4537, "step": 1189 }, { "epoch": 2.8032979976442873, "grad_norm": 0.5320029258728027, "learning_rate": 2.4012334882484556e-05, "loss": 1.3334, "step": 1190 }, { "epoch": 2.8056537102473498, "grad_norm": 0.6225754618644714, "learning_rate": 2.397120495003784e-05, "loss": 1.3363, "step": 1191 }, { "epoch": 2.808009422850412, "grad_norm": 0.5864959359169006, "learning_rate": 2.3930077806740488e-05, "loss": 1.5432, "step": 1192 }, { "epoch": 2.8103651354534747, "grad_norm": 0.560234546661377, "learning_rate": 2.388895356409162e-05, "loss": 1.2429, "step": 1193 }, { "epoch": 2.812720848056537, "grad_norm": 0.7584865689277649, "learning_rate": 2.3847832333582506e-05, "loss": 1.463, "step": 1194 }, { "epoch": 2.8150765606595995, "grad_norm": 0.52528315782547, "learning_rate": 2.3806714226696234e-05, "loss": 1.4248, "step": 1195 }, { "epoch": 2.817432273262662, "grad_norm": 0.5571629405021667, "learning_rate": 2.3765599354907427e-05, "loss": 1.4476, "step": 1196 }, { "epoch": 2.8197879858657244, "grad_norm": 0.5430626273155212, "learning_rate": 2.3724487829681942e-05, "loss": 1.4657, "step": 1197 }, { "epoch": 2.822143698468787, "grad_norm": 1.4348475933074951, "learning_rate": 2.3683379762476555e-05, "loss": 1.4039, "step": 1198 }, { "epoch": 2.8244994110718493, "grad_norm": 0.5515361428260803, "learning_rate": 2.364227526473866e-05, "loss": 1.4146, "step": 1199 }, { "epoch": 2.8268551236749118, "grad_norm": 0.5965813398361206, "learning_rate": 2.3601174447906017e-05, "loss": 1.4503, "step": 1200 }, { "epoch": 2.829210836277974, "grad_norm": 0.5185278654098511, "learning_rate": 2.3560077423406355e-05, "loss": 1.347, "step": 1201 }, { "epoch": 2.8315665488810366, "grad_norm": 0.5760292410850525, "learning_rate": 2.3518984302657146e-05, "loss": 1.6304, "step": 1202 }, { "epoch": 2.833922261484099, "grad_norm": 0.5719389915466309, "learning_rate": 2.3477895197065275e-05, "loss": 1.4253, "step": 1203 }, { "epoch": 2.836277974087161, "grad_norm": 0.537956714630127, "learning_rate": 2.343681021802674e-05, "loss": 1.4111, "step": 1204 }, { "epoch": 2.8386336866902235, "grad_norm": 0.5078034400939941, "learning_rate": 2.339572947692635e-05, "loss": 1.3181, "step": 1205 }, { "epoch": 2.840989399293286, "grad_norm": 0.5178635716438293, "learning_rate": 2.3354653085137433e-05, "loss": 1.335, "step": 1206 }, { "epoch": 2.8433451118963484, "grad_norm": 3.0602333545684814, "learning_rate": 2.3313581154021525e-05, "loss": 1.4954, "step": 1207 }, { "epoch": 2.845700824499411, "grad_norm": 0.5223456025123596, "learning_rate": 2.3272513794928054e-05, "loss": 1.3841, "step": 1208 }, { "epoch": 2.8480565371024733, "grad_norm": 0.679471492767334, "learning_rate": 2.3231451119194072e-05, "loss": 1.2963, "step": 1209 }, { "epoch": 2.8504122497055357, "grad_norm": 0.799528181552887, "learning_rate": 2.319039323814391e-05, "loss": 1.5088, "step": 1210 }, { "epoch": 2.852767962308598, "grad_norm": 0.6547077894210815, "learning_rate": 2.314934026308893e-05, "loss": 1.4945, "step": 1211 }, { "epoch": 2.8551236749116606, "grad_norm": 0.6009865999221802, "learning_rate": 2.3108292305327176e-05, "loss": 1.3006, "step": 1212 }, { "epoch": 2.857479387514723, "grad_norm": 0.5352737307548523, "learning_rate": 2.3067249476143087e-05, "loss": 1.3311, "step": 1213 }, { "epoch": 2.8598351001177855, "grad_norm": 1.4562902450561523, "learning_rate": 2.3026211886807202e-05, "loss": 1.6263, "step": 1214 }, { "epoch": 2.862190812720848, "grad_norm": 0.6507624387741089, "learning_rate": 2.298517964857587e-05, "loss": 1.5971, "step": 1215 }, { "epoch": 2.8645465253239104, "grad_norm": 0.5146584510803223, "learning_rate": 2.29441528726909e-05, "loss": 1.2692, "step": 1216 }, { "epoch": 2.866902237926973, "grad_norm": 0.6005557775497437, "learning_rate": 2.2903131670379323e-05, "loss": 1.1569, "step": 1217 }, { "epoch": 2.8692579505300353, "grad_norm": 0.5266910791397095, "learning_rate": 2.2862116152853034e-05, "loss": 1.248, "step": 1218 }, { "epoch": 2.8716136631330977, "grad_norm": 0.6851381063461304, "learning_rate": 2.2821106431308544e-05, "loss": 1.3817, "step": 1219 }, { "epoch": 2.87396937573616, "grad_norm": 0.8671360611915588, "learning_rate": 2.2780102616926633e-05, "loss": 1.3426, "step": 1220 }, { "epoch": 2.8763250883392226, "grad_norm": 0.6154194474220276, "learning_rate": 2.2739104820872062e-05, "loss": 1.5015, "step": 1221 }, { "epoch": 2.878680800942285, "grad_norm": 0.6097437739372253, "learning_rate": 2.2698113154293286e-05, "loss": 1.3454, "step": 1222 }, { "epoch": 2.8810365135453475, "grad_norm": 0.6132227182388306, "learning_rate": 2.2657127728322124e-05, "loss": 1.4427, "step": 1223 }, { "epoch": 2.88339222614841, "grad_norm": 0.5828686952590942, "learning_rate": 2.2616148654073522e-05, "loss": 1.5408, "step": 1224 }, { "epoch": 2.8857479387514724, "grad_norm": 0.5046601891517639, "learning_rate": 2.257517604264516e-05, "loss": 1.2329, "step": 1225 }, { "epoch": 2.888103651354535, "grad_norm": 0.5515561103820801, "learning_rate": 2.253421000511721e-05, "loss": 1.2603, "step": 1226 }, { "epoch": 2.8904593639575973, "grad_norm": 0.6231259107589722, "learning_rate": 2.249325065255202e-05, "loss": 1.4912, "step": 1227 }, { "epoch": 2.8928150765606597, "grad_norm": 0.6217333078384399, "learning_rate": 2.245229809599383e-05, "loss": 1.5408, "step": 1228 }, { "epoch": 2.895170789163722, "grad_norm": 1.6421300172805786, "learning_rate": 2.2411352446468426e-05, "loss": 1.4892, "step": 1229 }, { "epoch": 2.8975265017667846, "grad_norm": 0.7665801048278809, "learning_rate": 2.2370413814982905e-05, "loss": 1.4712, "step": 1230 }, { "epoch": 2.899882214369847, "grad_norm": 0.8724063038825989, "learning_rate": 2.232948231252531e-05, "loss": 1.3972, "step": 1231 }, { "epoch": 2.9022379269729095, "grad_norm": 0.5302340388298035, "learning_rate": 2.2288558050064367e-05, "loss": 1.4426, "step": 1232 }, { "epoch": 2.904593639575972, "grad_norm": 0.5424681901931763, "learning_rate": 2.2247641138549168e-05, "loss": 1.4029, "step": 1233 }, { "epoch": 2.9069493521790344, "grad_norm": 0.5629146695137024, "learning_rate": 2.2206731688908884e-05, "loss": 1.4571, "step": 1234 }, { "epoch": 2.909305064782097, "grad_norm": 0.5085024237632751, "learning_rate": 2.216582981205244e-05, "loss": 1.286, "step": 1235 }, { "epoch": 2.9116607773851593, "grad_norm": 0.5798412561416626, "learning_rate": 2.2124935618868266e-05, "loss": 1.5775, "step": 1236 }, { "epoch": 2.9140164899882217, "grad_norm": 0.4974880516529083, "learning_rate": 2.2084049220223926e-05, "loss": 1.3738, "step": 1237 }, { "epoch": 2.9163722025912837, "grad_norm": 0.5843273401260376, "learning_rate": 2.2043170726965858e-05, "loss": 1.3893, "step": 1238 }, { "epoch": 2.918727915194346, "grad_norm": 0.5889483094215393, "learning_rate": 2.2002300249919085e-05, "loss": 1.2464, "step": 1239 }, { "epoch": 2.9210836277974086, "grad_norm": 0.6125523447990417, "learning_rate": 2.196143789988687e-05, "loss": 1.5329, "step": 1240 }, { "epoch": 2.923439340400471, "grad_norm": 0.5362994074821472, "learning_rate": 2.192058378765047e-05, "loss": 1.2789, "step": 1241 }, { "epoch": 2.9257950530035335, "grad_norm": 0.5131523013114929, "learning_rate": 2.18797380239688e-05, "loss": 1.4799, "step": 1242 }, { "epoch": 2.928150765606596, "grad_norm": 0.5265060067176819, "learning_rate": 2.1838900719578143e-05, "loss": 1.4074, "step": 1243 }, { "epoch": 2.9305064782096584, "grad_norm": 0.5171497464179993, "learning_rate": 2.1798071985191832e-05, "loss": 1.1754, "step": 1244 }, { "epoch": 2.932862190812721, "grad_norm": 1.3039119243621826, "learning_rate": 2.175725193149998e-05, "loss": 1.4333, "step": 1245 }, { "epoch": 2.9352179034157833, "grad_norm": 0.6204612255096436, "learning_rate": 2.1716440669169175e-05, "loss": 1.5648, "step": 1246 }, { "epoch": 2.9375736160188457, "grad_norm": 0.5697060823440552, "learning_rate": 2.1675638308842145e-05, "loss": 1.3114, "step": 1247 }, { "epoch": 2.939929328621908, "grad_norm": 0.5261526107788086, "learning_rate": 2.1634844961137517e-05, "loss": 1.3483, "step": 1248 }, { "epoch": 2.9422850412249706, "grad_norm": 0.553205668926239, "learning_rate": 2.1594060736649448e-05, "loss": 1.4475, "step": 1249 }, { "epoch": 2.944640753828033, "grad_norm": 0.5770396590232849, "learning_rate": 2.1553285745947393e-05, "loss": 1.3799, "step": 1250 }, { "epoch": 2.9469964664310955, "grad_norm": 0.535224199295044, "learning_rate": 2.1512520099575756e-05, "loss": 1.5188, "step": 1251 }, { "epoch": 2.949352179034158, "grad_norm": 0.6847383975982666, "learning_rate": 2.147176390805361e-05, "loss": 1.6264, "step": 1252 }, { "epoch": 2.9517078916372204, "grad_norm": 0.7912705540657043, "learning_rate": 2.1431017281874386e-05, "loss": 1.3852, "step": 1253 }, { "epoch": 2.954063604240283, "grad_norm": 0.5189955234527588, "learning_rate": 2.139028033150561e-05, "loss": 1.4456, "step": 1254 }, { "epoch": 2.9564193168433452, "grad_norm": 0.6734246611595154, "learning_rate": 2.134955316738856e-05, "loss": 1.5886, "step": 1255 }, { "epoch": 2.9587750294464077, "grad_norm": 0.5744507908821106, "learning_rate": 2.1308835899937972e-05, "loss": 1.3918, "step": 1256 }, { "epoch": 2.9611307420494697, "grad_norm": 0.5711458325386047, "learning_rate": 2.1268128639541765e-05, "loss": 1.4921, "step": 1257 }, { "epoch": 2.963486454652532, "grad_norm": 0.5772138237953186, "learning_rate": 2.122743149656072e-05, "loss": 1.3957, "step": 1258 }, { "epoch": 2.9658421672555946, "grad_norm": 0.5750999450683594, "learning_rate": 2.1186744581328187e-05, "loss": 1.3495, "step": 1259 }, { "epoch": 2.968197879858657, "grad_norm": 0.576246976852417, "learning_rate": 2.1146068004149818e-05, "loss": 1.4132, "step": 1260 }, { "epoch": 2.9705535924617195, "grad_norm": 0.6137480735778809, "learning_rate": 2.1105401875303193e-05, "loss": 1.5503, "step": 1261 }, { "epoch": 2.972909305064782, "grad_norm": 0.6405668258666992, "learning_rate": 2.1064746305037595e-05, "loss": 1.5534, "step": 1262 }, { "epoch": 2.9752650176678443, "grad_norm": 0.5568289756774902, "learning_rate": 2.102410140357367e-05, "loss": 1.4686, "step": 1263 }, { "epoch": 2.977620730270907, "grad_norm": 0.5381885170936584, "learning_rate": 2.0983467281103143e-05, "loss": 1.3768, "step": 1264 }, { "epoch": 2.9799764428739692, "grad_norm": 0.5691659450531006, "learning_rate": 2.09428440477885e-05, "loss": 1.4297, "step": 1265 }, { "epoch": 2.9823321554770317, "grad_norm": 0.7221799492835999, "learning_rate": 2.0902231813762753e-05, "loss": 1.4994, "step": 1266 }, { "epoch": 2.984687868080094, "grad_norm": 0.475161075592041, "learning_rate": 2.0861630689129044e-05, "loss": 1.3312, "step": 1267 }, { "epoch": 2.9870435806831566, "grad_norm": 0.5715131759643555, "learning_rate": 2.0821040783960423e-05, "loss": 1.5467, "step": 1268 }, { "epoch": 2.989399293286219, "grad_norm": 0.5962476134300232, "learning_rate": 2.0780462208299505e-05, "loss": 1.6128, "step": 1269 }, { "epoch": 2.9917550058892814, "grad_norm": 0.5651288628578186, "learning_rate": 2.0739895072158212e-05, "loss": 1.3377, "step": 1270 }, { "epoch": 2.994110718492344, "grad_norm": 0.53815758228302, "learning_rate": 2.0699339485517422e-05, "loss": 1.4875, "step": 1271 }, { "epoch": 2.9964664310954063, "grad_norm": 0.5455276370048523, "learning_rate": 2.0658795558326743e-05, "loss": 1.4948, "step": 1272 }, { "epoch": 2.9964664310954063, "eval_loss": 1.3289984464645386, "eval_runtime": 5.7613, "eval_samples_per_second": 433.929, "eval_steps_per_second": 6.943, "step": 1272 }, { "epoch": 2.998822143698469, "grad_norm": 0.8064414262771606, "learning_rate": 2.061826340050414e-05, "loss": 1.4821, "step": 1273 }, { "epoch": 3.001177856301531, "grad_norm": 1.5746431350708008, "learning_rate": 2.0577743121935684e-05, "loss": 1.4556, "step": 1274 }, { "epoch": 3.0035335689045937, "grad_norm": 0.52943354845047, "learning_rate": 2.0537234832475223e-05, "loss": 1.3544, "step": 1275 }, { "epoch": 3.005889281507656, "grad_norm": 0.5273077487945557, "learning_rate": 2.0496738641944133e-05, "loss": 1.4935, "step": 1276 }, { "epoch": 3.0082449941107186, "grad_norm": 0.4758285582065582, "learning_rate": 2.0456254660130957e-05, "loss": 1.1904, "step": 1277 }, { "epoch": 3.010600706713781, "grad_norm": 0.5620205998420715, "learning_rate": 2.041578299679117e-05, "loss": 1.3844, "step": 1278 }, { "epoch": 3.0129564193168434, "grad_norm": 1.4259063005447388, "learning_rate": 2.0375323761646816e-05, "loss": 1.5371, "step": 1279 }, { "epoch": 3.015312131919906, "grad_norm": 0.5497833490371704, "learning_rate": 2.0334877064386276e-05, "loss": 1.4912, "step": 1280 }, { "epoch": 3.0176678445229683, "grad_norm": 0.6348953247070312, "learning_rate": 2.0294443014663923e-05, "loss": 1.215, "step": 1281 }, { "epoch": 3.0200235571260308, "grad_norm": 0.7117160558700562, "learning_rate": 2.0254021722099843e-05, "loss": 1.3811, "step": 1282 }, { "epoch": 3.022379269729093, "grad_norm": 0.5006508827209473, "learning_rate": 2.0213613296279533e-05, "loss": 1.2833, "step": 1283 }, { "epoch": 3.0247349823321557, "grad_norm": 0.6176966428756714, "learning_rate": 2.0173217846753628e-05, "loss": 1.459, "step": 1284 }, { "epoch": 3.027090694935218, "grad_norm": 0.598811686038971, "learning_rate": 2.0132835483037557e-05, "loss": 1.4587, "step": 1285 }, { "epoch": 3.0294464075382805, "grad_norm": 0.5975748300552368, "learning_rate": 2.0092466314611287e-05, "loss": 1.4464, "step": 1286 }, { "epoch": 3.0318021201413425, "grad_norm": 0.5336679816246033, "learning_rate": 2.005211045091901e-05, "loss": 1.3708, "step": 1287 }, { "epoch": 3.034157832744405, "grad_norm": 0.7059223055839539, "learning_rate": 2.0011768001368835e-05, "loss": 1.4464, "step": 1288 }, { "epoch": 3.0365135453474674, "grad_norm": 0.5105994939804077, "learning_rate": 1.9971439075332515e-05, "loss": 1.2699, "step": 1289 }, { "epoch": 3.03886925795053, "grad_norm": 0.5213927626609802, "learning_rate": 1.9931123782145154e-05, "loss": 1.4053, "step": 1290 }, { "epoch": 3.0412249705535923, "grad_norm": 0.5305792093276978, "learning_rate": 1.9890822231104872e-05, "loss": 1.3489, "step": 1291 }, { "epoch": 3.0435806831566548, "grad_norm": 0.508683979511261, "learning_rate": 1.9850534531472546e-05, "loss": 1.3586, "step": 1292 }, { "epoch": 3.045936395759717, "grad_norm": 0.6176059246063232, "learning_rate": 1.9810260792471492e-05, "loss": 1.4205, "step": 1293 }, { "epoch": 3.0482921083627796, "grad_norm": 0.6968875527381897, "learning_rate": 1.9770001123287182e-05, "loss": 1.5333, "step": 1294 }, { "epoch": 3.050647820965842, "grad_norm": 0.621222198009491, "learning_rate": 1.972975563306694e-05, "loss": 1.4062, "step": 1295 }, { "epoch": 3.0530035335689045, "grad_norm": 1.0966851711273193, "learning_rate": 1.9689524430919664e-05, "loss": 1.4585, "step": 1296 }, { "epoch": 3.055359246171967, "grad_norm": 0.6453889012336731, "learning_rate": 1.9649307625915505e-05, "loss": 1.6143, "step": 1297 }, { "epoch": 3.0577149587750294, "grad_norm": 0.839283287525177, "learning_rate": 1.960910532708558e-05, "loss": 1.4204, "step": 1298 }, { "epoch": 3.060070671378092, "grad_norm": 0.5251245498657227, "learning_rate": 1.956891764342168e-05, "loss": 1.2758, "step": 1299 }, { "epoch": 3.0624263839811543, "grad_norm": 0.600619912147522, "learning_rate": 1.9528744683875976e-05, "loss": 1.5262, "step": 1300 }, { "epoch": 3.0647820965842167, "grad_norm": 0.7909916639328003, "learning_rate": 1.9488586557360703e-05, "loss": 1.3919, "step": 1301 }, { "epoch": 3.067137809187279, "grad_norm": 0.7494365572929382, "learning_rate": 1.9448443372747927e-05, "loss": 1.2665, "step": 1302 }, { "epoch": 3.0694935217903416, "grad_norm": 0.7061365842819214, "learning_rate": 1.940831523886916e-05, "loss": 1.3756, "step": 1303 }, { "epoch": 3.071849234393404, "grad_norm": 0.5802088975906372, "learning_rate": 1.936820226451513e-05, "loss": 1.4296, "step": 1304 }, { "epoch": 3.0742049469964665, "grad_norm": 0.549069344997406, "learning_rate": 1.9328104558435456e-05, "loss": 1.4359, "step": 1305 }, { "epoch": 3.076560659599529, "grad_norm": 0.6913076043128967, "learning_rate": 1.9288022229338384e-05, "loss": 1.5051, "step": 1306 }, { "epoch": 3.0789163722025914, "grad_norm": 0.9083677530288696, "learning_rate": 1.924795538589045e-05, "loss": 1.2506, "step": 1307 }, { "epoch": 3.081272084805654, "grad_norm": 0.5917038321495056, "learning_rate": 1.9207904136716214e-05, "loss": 1.3732, "step": 1308 }, { "epoch": 3.0836277974087163, "grad_norm": 0.7237012386322021, "learning_rate": 1.9167868590397965e-05, "loss": 1.4187, "step": 1309 }, { "epoch": 3.0859835100117787, "grad_norm": 0.6573251485824585, "learning_rate": 1.912784885547541e-05, "loss": 1.522, "step": 1310 }, { "epoch": 3.088339222614841, "grad_norm": 0.541488766670227, "learning_rate": 1.90878450404454e-05, "loss": 1.3936, "step": 1311 }, { "epoch": 3.0906949352179036, "grad_norm": 0.7221812605857849, "learning_rate": 1.904785725376162e-05, "loss": 1.2884, "step": 1312 }, { "epoch": 3.0930506478209656, "grad_norm": 0.5733212828636169, "learning_rate": 1.9007885603834298e-05, "loss": 1.3281, "step": 1313 }, { "epoch": 3.095406360424028, "grad_norm": 0.4476734697818756, "learning_rate": 1.8967930199029917e-05, "loss": 1.2715, "step": 1314 }, { "epoch": 3.0977620730270905, "grad_norm": 0.55702805519104, "learning_rate": 1.8927991147670932e-05, "loss": 1.3468, "step": 1315 }, { "epoch": 3.100117785630153, "grad_norm": 0.5909210443496704, "learning_rate": 1.8888068558035435e-05, "loss": 1.3609, "step": 1316 }, { "epoch": 3.1024734982332154, "grad_norm": 0.5068781971931458, "learning_rate": 1.8848162538356912e-05, "loss": 1.2983, "step": 1317 }, { "epoch": 3.104829210836278, "grad_norm": 0.5593430399894714, "learning_rate": 1.880827319682391e-05, "loss": 1.4266, "step": 1318 }, { "epoch": 3.1071849234393403, "grad_norm": 0.6221286654472351, "learning_rate": 1.876840064157976e-05, "loss": 1.515, "step": 1319 }, { "epoch": 3.1095406360424027, "grad_norm": 0.5307830572128296, "learning_rate": 1.8728544980722318e-05, "loss": 1.3719, "step": 1320 }, { "epoch": 3.111896348645465, "grad_norm": 0.587023913860321, "learning_rate": 1.8688706322303595e-05, "loss": 1.4426, "step": 1321 }, { "epoch": 3.1142520612485276, "grad_norm": 0.6117364168167114, "learning_rate": 1.8648884774329524e-05, "loss": 1.3837, "step": 1322 }, { "epoch": 3.11660777385159, "grad_norm": 0.5979878902435303, "learning_rate": 1.8609080444759653e-05, "loss": 1.5695, "step": 1323 }, { "epoch": 3.1189634864546525, "grad_norm": 0.6785897016525269, "learning_rate": 1.8569293441506846e-05, "loss": 1.4317, "step": 1324 }, { "epoch": 3.121319199057715, "grad_norm": 0.5352964997291565, "learning_rate": 1.852952387243698e-05, "loss": 1.4158, "step": 1325 }, { "epoch": 3.1236749116607774, "grad_norm": 0.5435373187065125, "learning_rate": 1.84897718453687e-05, "loss": 1.4518, "step": 1326 }, { "epoch": 3.12603062426384, "grad_norm": 0.5248343348503113, "learning_rate": 1.845003746807308e-05, "loss": 1.2991, "step": 1327 }, { "epoch": 3.1283863368669023, "grad_norm": 0.513097882270813, "learning_rate": 1.8410320848273315e-05, "loss": 1.3004, "step": 1328 }, { "epoch": 3.1307420494699647, "grad_norm": 0.50612473487854, "learning_rate": 1.837062209364449e-05, "loss": 1.4265, "step": 1329 }, { "epoch": 3.133097762073027, "grad_norm": 0.5337306261062622, "learning_rate": 1.8330941311813245e-05, "loss": 1.297, "step": 1330 }, { "epoch": 3.1354534746760896, "grad_norm": 0.8820793032646179, "learning_rate": 1.82912786103575e-05, "loss": 1.3281, "step": 1331 }, { "epoch": 3.137809187279152, "grad_norm": 0.5290091633796692, "learning_rate": 1.825163409680616e-05, "loss": 1.423, "step": 1332 }, { "epoch": 3.1401648998822145, "grad_norm": 0.557161808013916, "learning_rate": 1.82120078786388e-05, "loss": 1.3853, "step": 1333 }, { "epoch": 3.142520612485277, "grad_norm": 0.5861290097236633, "learning_rate": 1.8172400063285422e-05, "loss": 1.3586, "step": 1334 }, { "epoch": 3.1448763250883394, "grad_norm": 0.575081467628479, "learning_rate": 1.8132810758126117e-05, "loss": 1.5177, "step": 1335 }, { "epoch": 3.147232037691402, "grad_norm": 0.9091634154319763, "learning_rate": 1.809324007049079e-05, "loss": 1.3196, "step": 1336 }, { "epoch": 3.1495877502944643, "grad_norm": 0.5268704891204834, "learning_rate": 1.8053688107658908e-05, "loss": 1.313, "step": 1337 }, { "epoch": 3.1519434628975267, "grad_norm": 0.5759544372558594, "learning_rate": 1.8014154976859126e-05, "loss": 1.4961, "step": 1338 }, { "epoch": 3.154299175500589, "grad_norm": 0.5821539163589478, "learning_rate": 1.797464078526908e-05, "loss": 1.2991, "step": 1339 }, { "epoch": 3.1566548881036516, "grad_norm": 0.5893937945365906, "learning_rate": 1.793514564001503e-05, "loss": 1.4235, "step": 1340 }, { "epoch": 3.1590106007067136, "grad_norm": 0.5790331363677979, "learning_rate": 1.789566964817163e-05, "loss": 1.3952, "step": 1341 }, { "epoch": 3.161366313309776, "grad_norm": 0.5490583181381226, "learning_rate": 1.785621291676159e-05, "loss": 1.4421, "step": 1342 }, { "epoch": 3.1637220259128385, "grad_norm": 0.607954740524292, "learning_rate": 1.7816775552755406e-05, "loss": 1.331, "step": 1343 }, { "epoch": 3.166077738515901, "grad_norm": 0.5901190638542175, "learning_rate": 1.777735766307106e-05, "loss": 1.4943, "step": 1344 }, { "epoch": 3.1684334511189634, "grad_norm": 0.5580378174781799, "learning_rate": 1.7737959354573765e-05, "loss": 1.4446, "step": 1345 }, { "epoch": 3.170789163722026, "grad_norm": 0.5773802995681763, "learning_rate": 1.769858073407561e-05, "loss": 1.2687, "step": 1346 }, { "epoch": 3.1731448763250882, "grad_norm": 0.5486554503440857, "learning_rate": 1.765922190833533e-05, "loss": 1.35, "step": 1347 }, { "epoch": 3.1755005889281507, "grad_norm": 0.5860403776168823, "learning_rate": 1.7619882984057987e-05, "loss": 1.4668, "step": 1348 }, { "epoch": 3.177856301531213, "grad_norm": 0.5680065751075745, "learning_rate": 1.7580564067894706e-05, "loss": 1.3801, "step": 1349 }, { "epoch": 3.1802120141342756, "grad_norm": 0.6463327407836914, "learning_rate": 1.7541265266442354e-05, "loss": 1.5029, "step": 1350 }, { "epoch": 3.182567726737338, "grad_norm": 0.5454993844032288, "learning_rate": 1.7501986686243256e-05, "loss": 1.4265, "step": 1351 }, { "epoch": 3.1849234393404005, "grad_norm": 0.6180981993675232, "learning_rate": 1.746272843378493e-05, "loss": 1.4004, "step": 1352 }, { "epoch": 3.187279151943463, "grad_norm": 2.9292774200439453, "learning_rate": 1.742349061549978e-05, "loss": 1.42, "step": 1353 }, { "epoch": 3.1896348645465253, "grad_norm": 0.5759695768356323, "learning_rate": 1.73842733377648e-05, "loss": 1.4452, "step": 1354 }, { "epoch": 3.191990577149588, "grad_norm": 0.5503639578819275, "learning_rate": 1.734507670690133e-05, "loss": 1.388, "step": 1355 }, { "epoch": 3.1943462897526502, "grad_norm": 0.8000882863998413, "learning_rate": 1.7305900829174697e-05, "loss": 1.3449, "step": 1356 }, { "epoch": 3.1967020023557127, "grad_norm": 0.5106770992279053, "learning_rate": 1.7266745810793986e-05, "loss": 1.3401, "step": 1357 }, { "epoch": 3.199057714958775, "grad_norm": 0.5212957859039307, "learning_rate": 1.7227611757911723e-05, "loss": 1.3967, "step": 1358 }, { "epoch": 3.2014134275618376, "grad_norm": 0.74335116147995, "learning_rate": 1.7188498776623592e-05, "loss": 1.4514, "step": 1359 }, { "epoch": 3.2037691401649, "grad_norm": 0.5074575543403625, "learning_rate": 1.7149406972968148e-05, "loss": 1.4488, "step": 1360 }, { "epoch": 3.2061248527679624, "grad_norm": 0.49796608090400696, "learning_rate": 1.7110336452926555e-05, "loss": 1.2398, "step": 1361 }, { "epoch": 3.208480565371025, "grad_norm": 0.5332653522491455, "learning_rate": 1.7071287322422253e-05, "loss": 1.4557, "step": 1362 }, { "epoch": 3.2108362779740873, "grad_norm": 0.5015223026275635, "learning_rate": 1.7032259687320695e-05, "loss": 1.4161, "step": 1363 }, { "epoch": 3.2131919905771498, "grad_norm": 0.5666617155075073, "learning_rate": 1.6993253653429063e-05, "loss": 1.4657, "step": 1364 }, { "epoch": 3.215547703180212, "grad_norm": 1.162514328956604, "learning_rate": 1.6954269326495975e-05, "loss": 1.5161, "step": 1365 }, { "epoch": 3.217903415783274, "grad_norm": 1.0753355026245117, "learning_rate": 1.691530681221119e-05, "loss": 1.3423, "step": 1366 }, { "epoch": 3.2202591283863367, "grad_norm": 0.5354429483413696, "learning_rate": 1.6876366216205357e-05, "loss": 1.3942, "step": 1367 }, { "epoch": 3.222614840989399, "grad_norm": 2.6275806427001953, "learning_rate": 1.683744764404968e-05, "loss": 1.4363, "step": 1368 }, { "epoch": 3.2249705535924615, "grad_norm": 0.5558125972747803, "learning_rate": 1.6798551201255663e-05, "loss": 1.2925, "step": 1369 }, { "epoch": 3.227326266195524, "grad_norm": 0.539991021156311, "learning_rate": 1.6759676993274804e-05, "loss": 1.3396, "step": 1370 }, { "epoch": 3.2296819787985864, "grad_norm": 0.5023191571235657, "learning_rate": 1.6720825125498342e-05, "loss": 1.3035, "step": 1371 }, { "epoch": 3.232037691401649, "grad_norm": 0.5547652244567871, "learning_rate": 1.6681995703256924e-05, "loss": 1.4746, "step": 1372 }, { "epoch": 3.2343934040047113, "grad_norm": 0.5805384516716003, "learning_rate": 1.6643188831820375e-05, "loss": 1.5219, "step": 1373 }, { "epoch": 3.2367491166077738, "grad_norm": 0.5085503458976746, "learning_rate": 1.6604404616397355e-05, "loss": 1.2475, "step": 1374 }, { "epoch": 3.239104829210836, "grad_norm": 0.54617840051651, "learning_rate": 1.656564316213512e-05, "loss": 1.3049, "step": 1375 }, { "epoch": 3.2414605418138986, "grad_norm": 0.57000732421875, "learning_rate": 1.6526904574119213e-05, "loss": 1.4789, "step": 1376 }, { "epoch": 3.243816254416961, "grad_norm": 0.5358412265777588, "learning_rate": 1.6488188957373174e-05, "loss": 1.4098, "step": 1377 }, { "epoch": 3.2461719670200235, "grad_norm": 0.502582848072052, "learning_rate": 1.6449496416858284e-05, "loss": 1.3373, "step": 1378 }, { "epoch": 3.248527679623086, "grad_norm": 0.6608543992042542, "learning_rate": 1.6410827057473248e-05, "loss": 1.3467, "step": 1379 }, { "epoch": 3.2508833922261484, "grad_norm": 0.5029400587081909, "learning_rate": 1.6372180984053944e-05, "loss": 1.3198, "step": 1380 }, { "epoch": 3.253239104829211, "grad_norm": 0.6682283878326416, "learning_rate": 1.63335583013731e-05, "loss": 1.4095, "step": 1381 }, { "epoch": 3.2555948174322733, "grad_norm": 0.5291099548339844, "learning_rate": 1.6294959114140034e-05, "loss": 1.3653, "step": 1382 }, { "epoch": 3.2579505300353357, "grad_norm": 0.47150886058807373, "learning_rate": 1.6256383527000372e-05, "loss": 1.1882, "step": 1383 }, { "epoch": 3.260306242638398, "grad_norm": 0.517058789730072, "learning_rate": 1.6217831644535742e-05, "loss": 1.3979, "step": 1384 }, { "epoch": 3.2626619552414606, "grad_norm": 0.6233232617378235, "learning_rate": 1.617930357126354e-05, "loss": 1.5281, "step": 1385 }, { "epoch": 3.265017667844523, "grad_norm": 0.705585241317749, "learning_rate": 1.6140799411636586e-05, "loss": 1.4857, "step": 1386 }, { "epoch": 3.2673733804475855, "grad_norm": 0.6202475428581238, "learning_rate": 1.610231927004287e-05, "loss": 1.4311, "step": 1387 }, { "epoch": 3.269729093050648, "grad_norm": 0.9263437390327454, "learning_rate": 1.606386325080528e-05, "loss": 1.2289, "step": 1388 }, { "epoch": 3.2720848056537104, "grad_norm": 0.5581469535827637, "learning_rate": 1.6025431458181282e-05, "loss": 1.3587, "step": 1389 }, { "epoch": 3.274440518256773, "grad_norm": 0.5049009323120117, "learning_rate": 1.5987023996362688e-05, "loss": 1.3287, "step": 1390 }, { "epoch": 3.2767962308598353, "grad_norm": 0.6443287134170532, "learning_rate": 1.5948640969475346e-05, "loss": 1.4733, "step": 1391 }, { "epoch": 3.2791519434628977, "grad_norm": 0.5874758362770081, "learning_rate": 1.591028248157884e-05, "loss": 1.3254, "step": 1392 }, { "epoch": 3.28150765606596, "grad_norm": 3.150019645690918, "learning_rate": 1.587194863666624e-05, "loss": 1.3997, "step": 1393 }, { "epoch": 3.2838633686690226, "grad_norm": 0.6312074661254883, "learning_rate": 1.58336395386638e-05, "loss": 1.3127, "step": 1394 }, { "epoch": 3.2862190812720846, "grad_norm": 0.5711130499839783, "learning_rate": 1.5795355291430693e-05, "loss": 1.3469, "step": 1395 }, { "epoch": 3.288574793875147, "grad_norm": 0.8136545419692993, "learning_rate": 1.575709599875869e-05, "loss": 1.4985, "step": 1396 }, { "epoch": 3.2909305064782095, "grad_norm": 0.5527499318122864, "learning_rate": 1.5718861764371967e-05, "loss": 1.3599, "step": 1397 }, { "epoch": 3.293286219081272, "grad_norm": 0.6098716855049133, "learning_rate": 1.5680652691926706e-05, "loss": 1.3421, "step": 1398 }, { "epoch": 3.2956419316843344, "grad_norm": 0.5192484855651855, "learning_rate": 1.5642468885010907e-05, "loss": 1.2901, "step": 1399 }, { "epoch": 3.297997644287397, "grad_norm": 1.1790953874588013, "learning_rate": 1.560431044714405e-05, "loss": 1.3214, "step": 1400 }, { "epoch": 3.3003533568904593, "grad_norm": 0.5919063687324524, "learning_rate": 1.5566177481776857e-05, "loss": 1.442, "step": 1401 }, { "epoch": 3.3027090694935217, "grad_norm": 0.6503802537918091, "learning_rate": 1.552807009229098e-05, "loss": 1.6523, "step": 1402 }, { "epoch": 3.305064782096584, "grad_norm": 0.6544208526611328, "learning_rate": 1.5489988381998743e-05, "loss": 1.2712, "step": 1403 }, { "epoch": 3.3074204946996466, "grad_norm": 0.5870060920715332, "learning_rate": 1.5451932454142845e-05, "loss": 1.46, "step": 1404 }, { "epoch": 3.309776207302709, "grad_norm": 1.398591160774231, "learning_rate": 1.541390241189608e-05, "loss": 1.4083, "step": 1405 }, { "epoch": 3.3121319199057715, "grad_norm": 0.705450177192688, "learning_rate": 1.537589835836108e-05, "loss": 1.3816, "step": 1406 }, { "epoch": 3.314487632508834, "grad_norm": 0.5327766537666321, "learning_rate": 1.5337920396570003e-05, "loss": 1.3933, "step": 1407 }, { "epoch": 3.3168433451118964, "grad_norm": 0.5579720139503479, "learning_rate": 1.5299968629484276e-05, "loss": 1.4239, "step": 1408 }, { "epoch": 3.319199057714959, "grad_norm": 0.5527592301368713, "learning_rate": 1.5262043159994317e-05, "loss": 1.4498, "step": 1409 }, { "epoch": 3.3215547703180213, "grad_norm": 0.605015218257904, "learning_rate": 1.5224144090919245e-05, "loss": 1.3533, "step": 1410 }, { "epoch": 3.3239104829210837, "grad_norm": 0.6016724705696106, "learning_rate": 1.5186271525006607e-05, "loss": 1.3669, "step": 1411 }, { "epoch": 3.326266195524146, "grad_norm": 0.5761764049530029, "learning_rate": 1.5148425564932084e-05, "loss": 1.4764, "step": 1412 }, { "epoch": 3.3286219081272086, "grad_norm": 2.6630613803863525, "learning_rate": 1.5110606313299248e-05, "loss": 1.4857, "step": 1413 }, { "epoch": 3.330977620730271, "grad_norm": 0.6027582287788391, "learning_rate": 1.5072813872639227e-05, "loss": 1.2908, "step": 1414 }, { "epoch": 3.3333333333333335, "grad_norm": 0.6258671283721924, "learning_rate": 1.5035048345410519e-05, "loss": 1.4996, "step": 1415 }, { "epoch": 3.335689045936396, "grad_norm": 0.6116472482681274, "learning_rate": 1.4997309833998607e-05, "loss": 1.6155, "step": 1416 }, { "epoch": 3.3380447585394584, "grad_norm": 0.5575313568115234, "learning_rate": 1.4959598440715755e-05, "loss": 1.6238, "step": 1417 }, { "epoch": 3.3404004711425204, "grad_norm": 0.5386831164360046, "learning_rate": 1.49219142678007e-05, "loss": 1.2874, "step": 1418 }, { "epoch": 3.342756183745583, "grad_norm": 0.7191012501716614, "learning_rate": 1.4884257417418382e-05, "loss": 1.3056, "step": 1419 }, { "epoch": 3.3451118963486453, "grad_norm": 0.57406085729599, "learning_rate": 1.4846627991659662e-05, "loss": 1.2388, "step": 1420 }, { "epoch": 3.3474676089517077, "grad_norm": 0.6344390511512756, "learning_rate": 1.4809026092541078e-05, "loss": 1.1887, "step": 1421 }, { "epoch": 3.34982332155477, "grad_norm": 0.5386343002319336, "learning_rate": 1.4771451822004511e-05, "loss": 1.3436, "step": 1422 }, { "epoch": 3.3521790341578326, "grad_norm": 0.5470718741416931, "learning_rate": 1.4733905281916954e-05, "loss": 1.4987, "step": 1423 }, { "epoch": 3.354534746760895, "grad_norm": 0.5848893523216248, "learning_rate": 1.4696386574070204e-05, "loss": 1.3642, "step": 1424 }, { "epoch": 3.3568904593639575, "grad_norm": 0.5322478413581848, "learning_rate": 1.4658895800180622e-05, "loss": 1.4008, "step": 1425 }, { "epoch": 3.35924617196702, "grad_norm": 0.5439946055412292, "learning_rate": 1.462143306188882e-05, "loss": 1.2942, "step": 1426 }, { "epoch": 3.3616018845700824, "grad_norm": 0.7155500650405884, "learning_rate": 1.4583998460759424e-05, "loss": 1.5271, "step": 1427 }, { "epoch": 3.363957597173145, "grad_norm": 0.627501368522644, "learning_rate": 1.454659209828077e-05, "loss": 1.3519, "step": 1428 }, { "epoch": 3.3663133097762072, "grad_norm": 0.5159199237823486, "learning_rate": 1.450921407586462e-05, "loss": 1.1298, "step": 1429 }, { "epoch": 3.3686690223792697, "grad_norm": 0.7754887938499451, "learning_rate": 1.447186449484593e-05, "loss": 1.4185, "step": 1430 }, { "epoch": 3.371024734982332, "grad_norm": 0.5428597331047058, "learning_rate": 1.443454345648252e-05, "loss": 1.3942, "step": 1431 }, { "epoch": 3.3733804475853946, "grad_norm": 0.5677019953727722, "learning_rate": 1.4397251061954847e-05, "loss": 1.336, "step": 1432 }, { "epoch": 3.375736160188457, "grad_norm": 0.5576009154319763, "learning_rate": 1.4359987412365721e-05, "loss": 1.3929, "step": 1433 }, { "epoch": 3.3780918727915195, "grad_norm": 0.5403046011924744, "learning_rate": 1.4322752608740014e-05, "loss": 1.3173, "step": 1434 }, { "epoch": 3.380447585394582, "grad_norm": 0.5300245881080627, "learning_rate": 1.428554675202437e-05, "loss": 1.328, "step": 1435 }, { "epoch": 3.3828032979976443, "grad_norm": 0.5451343059539795, "learning_rate": 1.4248369943086998e-05, "loss": 1.3615, "step": 1436 }, { "epoch": 3.385159010600707, "grad_norm": 0.5598456263542175, "learning_rate": 1.4211222282717313e-05, "loss": 1.4134, "step": 1437 }, { "epoch": 3.3875147232037692, "grad_norm": 0.5133709907531738, "learning_rate": 1.4174103871625738e-05, "loss": 1.2203, "step": 1438 }, { "epoch": 3.3898704358068317, "grad_norm": 0.5987529754638672, "learning_rate": 1.4137014810443386e-05, "loss": 1.5542, "step": 1439 }, { "epoch": 3.392226148409894, "grad_norm": 0.6169204115867615, "learning_rate": 1.4099955199721815e-05, "loss": 1.6228, "step": 1440 }, { "epoch": 3.3945818610129566, "grad_norm": 0.610907256603241, "learning_rate": 1.4062925139932703e-05, "loss": 1.4642, "step": 1441 }, { "epoch": 3.396937573616019, "grad_norm": 0.6104577779769897, "learning_rate": 1.402592473146766e-05, "loss": 1.3593, "step": 1442 }, { "epoch": 3.3992932862190814, "grad_norm": 0.5737777948379517, "learning_rate": 1.3988954074637867e-05, "loss": 1.3611, "step": 1443 }, { "epoch": 3.401648998822144, "grad_norm": 0.5312981605529785, "learning_rate": 1.3952013269673872e-05, "loss": 1.3455, "step": 1444 }, { "epoch": 3.4040047114252063, "grad_norm": 0.5598698854446411, "learning_rate": 1.3915102416725287e-05, "loss": 1.4254, "step": 1445 }, { "epoch": 3.4063604240282688, "grad_norm": 0.567198634147644, "learning_rate": 1.3878221615860527e-05, "loss": 1.535, "step": 1446 }, { "epoch": 3.408716136631331, "grad_norm": 0.5648866295814514, "learning_rate": 1.3841370967066502e-05, "loss": 1.4002, "step": 1447 }, { "epoch": 3.411071849234393, "grad_norm": 0.5179278254508972, "learning_rate": 1.3804550570248431e-05, "loss": 1.2782, "step": 1448 }, { "epoch": 3.4134275618374557, "grad_norm": 0.565303385257721, "learning_rate": 1.3767760525229462e-05, "loss": 1.3628, "step": 1449 }, { "epoch": 3.415783274440518, "grad_norm": 0.583527684211731, "learning_rate": 1.3731000931750487e-05, "loss": 1.5259, "step": 1450 }, { "epoch": 3.4181389870435805, "grad_norm": 1.9099422693252563, "learning_rate": 1.3694271889469844e-05, "loss": 1.4754, "step": 1451 }, { "epoch": 3.420494699646643, "grad_norm": 0.7335624098777771, "learning_rate": 1.3657573497963042e-05, "loss": 1.3536, "step": 1452 }, { "epoch": 3.4228504122497054, "grad_norm": 0.5374651551246643, "learning_rate": 1.3620905856722468e-05, "loss": 1.3518, "step": 1453 }, { "epoch": 3.425206124852768, "grad_norm": 0.5788171887397766, "learning_rate": 1.3584269065157174e-05, "loss": 1.4736, "step": 1454 }, { "epoch": 3.4275618374558303, "grad_norm": 0.49140051007270813, "learning_rate": 1.3547663222592572e-05, "loss": 1.3466, "step": 1455 }, { "epoch": 3.4299175500588928, "grad_norm": 1.974959373474121, "learning_rate": 1.3511088428270142e-05, "loss": 1.3923, "step": 1456 }, { "epoch": 3.432273262661955, "grad_norm": 0.5198172926902771, "learning_rate": 1.3474544781347217e-05, "loss": 1.4944, "step": 1457 }, { "epoch": 3.4346289752650176, "grad_norm": 0.575279176235199, "learning_rate": 1.3438032380896681e-05, "loss": 1.4396, "step": 1458 }, { "epoch": 3.43698468786808, "grad_norm": 0.47884616255760193, "learning_rate": 1.3401551325906709e-05, "loss": 1.2722, "step": 1459 }, { "epoch": 3.4393404004711425, "grad_norm": 0.6098414063453674, "learning_rate": 1.3365101715280473e-05, "loss": 1.3479, "step": 1460 }, { "epoch": 3.441696113074205, "grad_norm": 0.5703315734863281, "learning_rate": 1.3328683647835933e-05, "loss": 1.3597, "step": 1461 }, { "epoch": 3.4440518256772674, "grad_norm": 0.5171709656715393, "learning_rate": 1.3292297222305483e-05, "loss": 1.3257, "step": 1462 }, { "epoch": 3.44640753828033, "grad_norm": 0.8379916548728943, "learning_rate": 1.3255942537335805e-05, "loss": 1.303, "step": 1463 }, { "epoch": 3.4487632508833923, "grad_norm": 0.5670388340950012, "learning_rate": 1.3219619691487459e-05, "loss": 1.3824, "step": 1464 }, { "epoch": 3.4511189634864547, "grad_norm": 0.5228069424629211, "learning_rate": 1.3183328783234734e-05, "loss": 1.4437, "step": 1465 }, { "epoch": 3.453474676089517, "grad_norm": 0.5543954372406006, "learning_rate": 1.3147069910965298e-05, "loss": 1.4452, "step": 1466 }, { "epoch": 3.4558303886925796, "grad_norm": 0.5134137272834778, "learning_rate": 1.311084317298e-05, "loss": 1.4283, "step": 1467 }, { "epoch": 3.458186101295642, "grad_norm": 0.6205701231956482, "learning_rate": 1.3074648667492528e-05, "loss": 1.5731, "step": 1468 }, { "epoch": 3.4605418138987045, "grad_norm": 0.6763115525245667, "learning_rate": 1.3038486492629252e-05, "loss": 1.1827, "step": 1469 }, { "epoch": 3.462897526501767, "grad_norm": 0.5907649993896484, "learning_rate": 1.3002356746428817e-05, "loss": 1.5255, "step": 1470 }, { "epoch": 3.465253239104829, "grad_norm": 0.5283946394920349, "learning_rate": 1.2966259526842006e-05, "loss": 1.3149, "step": 1471 }, { "epoch": 3.4676089517078914, "grad_norm": 0.49043506383895874, "learning_rate": 1.2930194931731382e-05, "loss": 1.327, "step": 1472 }, { "epoch": 3.469964664310954, "grad_norm": 0.5440202355384827, "learning_rate": 1.2894163058871092e-05, "loss": 1.292, "step": 1473 }, { "epoch": 3.4723203769140163, "grad_norm": 0.5152914524078369, "learning_rate": 1.2858164005946527e-05, "loss": 1.306, "step": 1474 }, { "epoch": 3.4746760895170787, "grad_norm": 1.150414228439331, "learning_rate": 1.2822197870554165e-05, "loss": 1.2175, "step": 1475 }, { "epoch": 3.477031802120141, "grad_norm": 0.5440114736557007, "learning_rate": 1.2786264750201182e-05, "loss": 1.3291, "step": 1476 }, { "epoch": 3.4793875147232036, "grad_norm": 0.5625450611114502, "learning_rate": 1.2750364742305283e-05, "loss": 1.2301, "step": 1477 }, { "epoch": 3.481743227326266, "grad_norm": 0.6205968260765076, "learning_rate": 1.2714497944194376e-05, "loss": 1.4163, "step": 1478 }, { "epoch": 3.4840989399293285, "grad_norm": 0.6145076751708984, "learning_rate": 1.2678664453106351e-05, "loss": 1.3531, "step": 1479 }, { "epoch": 3.486454652532391, "grad_norm": 0.58487468957901, "learning_rate": 1.2642864366188806e-05, "loss": 1.4781, "step": 1480 }, { "epoch": 3.4888103651354534, "grad_norm": 0.5669450163841248, "learning_rate": 1.2607097780498772e-05, "loss": 1.3216, "step": 1481 }, { "epoch": 3.491166077738516, "grad_norm": 0.5319912433624268, "learning_rate": 1.2571364793002434e-05, "loss": 1.5266, "step": 1482 }, { "epoch": 3.4935217903415783, "grad_norm": 0.5175240635871887, "learning_rate": 1.2535665500574922e-05, "loss": 1.4284, "step": 1483 }, { "epoch": 3.4958775029446407, "grad_norm": 0.6224169135093689, "learning_rate": 1.2500000000000006e-05, "loss": 1.2899, "step": 1484 }, { "epoch": 3.4958775029446407, "eval_loss": 1.3198578357696533, "eval_runtime": 5.7593, "eval_samples_per_second": 434.079, "eval_steps_per_second": 6.945, "step": 1484 }, { "epoch": 3.498233215547703, "grad_norm": 0.6093937754631042, "learning_rate": 1.246436838796982e-05, "loss": 1.3291, "step": 1485 }, { "epoch": 3.5005889281507656, "grad_norm": 0.5102196335792542, "learning_rate": 1.2428770761084655e-05, "loss": 1.2465, "step": 1486 }, { "epoch": 3.502944640753828, "grad_norm": 0.5676569938659668, "learning_rate": 1.2393207215852651e-05, "loss": 1.3374, "step": 1487 }, { "epoch": 3.5053003533568905, "grad_norm": 0.5132775902748108, "learning_rate": 1.2357677848689558e-05, "loss": 1.3267, "step": 1488 }, { "epoch": 3.507656065959953, "grad_norm": 0.5167083144187927, "learning_rate": 1.2322182755918446e-05, "loss": 1.1602, "step": 1489 }, { "epoch": 3.5100117785630154, "grad_norm": 0.5659974217414856, "learning_rate": 1.2286722033769493e-05, "loss": 1.3662, "step": 1490 }, { "epoch": 3.512367491166078, "grad_norm": 0.5449172854423523, "learning_rate": 1.2251295778379657e-05, "loss": 1.3875, "step": 1491 }, { "epoch": 3.5147232037691403, "grad_norm": 0.45780497789382935, "learning_rate": 1.221590408579251e-05, "loss": 1.2549, "step": 1492 }, { "epoch": 3.5170789163722027, "grad_norm": 0.5225306749343872, "learning_rate": 1.2180547051957863e-05, "loss": 1.3494, "step": 1493 }, { "epoch": 3.519434628975265, "grad_norm": 0.49667471647262573, "learning_rate": 1.214522477273161e-05, "loss": 1.3116, "step": 1494 }, { "epoch": 3.5217903415783276, "grad_norm": 0.5128898620605469, "learning_rate": 1.2109937343875377e-05, "loss": 1.3758, "step": 1495 }, { "epoch": 3.52414605418139, "grad_norm": 0.5075840353965759, "learning_rate": 1.207468486105636e-05, "loss": 1.3766, "step": 1496 }, { "epoch": 3.5265017667844525, "grad_norm": 1.3918946981430054, "learning_rate": 1.2039467419846951e-05, "loss": 1.3215, "step": 1497 }, { "epoch": 3.528857479387515, "grad_norm": 0.5519576072692871, "learning_rate": 1.200428511572462e-05, "loss": 1.3493, "step": 1498 }, { "epoch": 3.5312131919905774, "grad_norm": 0.5368751883506775, "learning_rate": 1.1969138044071501e-05, "loss": 1.4376, "step": 1499 }, { "epoch": 3.53356890459364, "grad_norm": 0.5913774371147156, "learning_rate": 1.1934026300174264e-05, "loss": 1.5059, "step": 1500 }, { "epoch": 3.5359246171967023, "grad_norm": 1.0488969087600708, "learning_rate": 1.1898949979223765e-05, "loss": 1.3994, "step": 1501 }, { "epoch": 3.5382803297997647, "grad_norm": 0.5071121454238892, "learning_rate": 1.1863909176314856e-05, "loss": 1.3312, "step": 1502 }, { "epoch": 3.5406360424028267, "grad_norm": 0.53867107629776, "learning_rate": 1.1828903986446055e-05, "loss": 1.5312, "step": 1503 }, { "epoch": 3.542991755005889, "grad_norm": 0.873878002166748, "learning_rate": 1.1793934504519399e-05, "loss": 1.2307, "step": 1504 }, { "epoch": 3.5453474676089516, "grad_norm": 0.55849689245224, "learning_rate": 1.1759000825340041e-05, "loss": 1.3971, "step": 1505 }, { "epoch": 3.547703180212014, "grad_norm": 0.6240206956863403, "learning_rate": 1.1724103043616134e-05, "loss": 1.3392, "step": 1506 }, { "epoch": 3.5500588928150765, "grad_norm": 0.5950769782066345, "learning_rate": 1.168924125395845e-05, "loss": 1.3391, "step": 1507 }, { "epoch": 3.552414605418139, "grad_norm": 0.5146418809890747, "learning_rate": 1.1654415550880243e-05, "loss": 1.4735, "step": 1508 }, { "epoch": 3.5547703180212014, "grad_norm": 0.6084526181221008, "learning_rate": 1.1619626028796871e-05, "loss": 1.5066, "step": 1509 }, { "epoch": 3.557126030624264, "grad_norm": 0.5984584093093872, "learning_rate": 1.1584872782025677e-05, "loss": 1.4896, "step": 1510 }, { "epoch": 3.5594817432273262, "grad_norm": 0.8357718586921692, "learning_rate": 1.1550155904785587e-05, "loss": 1.313, "step": 1511 }, { "epoch": 3.5618374558303887, "grad_norm": 0.6249478459358215, "learning_rate": 1.1515475491196976e-05, "loss": 1.6323, "step": 1512 }, { "epoch": 3.564193168433451, "grad_norm": 0.6016179919242859, "learning_rate": 1.1480831635281328e-05, "loss": 1.3298, "step": 1513 }, { "epoch": 3.5665488810365136, "grad_norm": 0.6053820848464966, "learning_rate": 1.1446224430961036e-05, "loss": 1.4586, "step": 1514 }, { "epoch": 3.568904593639576, "grad_norm": 0.5422496795654297, "learning_rate": 1.1411653972059128e-05, "loss": 1.514, "step": 1515 }, { "epoch": 3.5712603062426385, "grad_norm": 0.517810583114624, "learning_rate": 1.1377120352299014e-05, "loss": 1.328, "step": 1516 }, { "epoch": 3.573616018845701, "grad_norm": 0.5480679869651794, "learning_rate": 1.1342623665304209e-05, "loss": 1.4115, "step": 1517 }, { "epoch": 3.5759717314487633, "grad_norm": 0.5266749858856201, "learning_rate": 1.1308164004598118e-05, "loss": 1.2451, "step": 1518 }, { "epoch": 3.578327444051826, "grad_norm": 0.5854389071464539, "learning_rate": 1.1273741463603774e-05, "loss": 1.386, "step": 1519 }, { "epoch": 3.5806831566548882, "grad_norm": 0.6050906181335449, "learning_rate": 1.1239356135643545e-05, "loss": 1.4089, "step": 1520 }, { "epoch": 3.5830388692579507, "grad_norm": 0.5736445188522339, "learning_rate": 1.1205008113938934e-05, "loss": 1.3517, "step": 1521 }, { "epoch": 3.585394581861013, "grad_norm": 0.6180756688117981, "learning_rate": 1.1170697491610304e-05, "loss": 1.4594, "step": 1522 }, { "epoch": 3.587750294464075, "grad_norm": 2.390347480773926, "learning_rate": 1.1136424361676626e-05, "loss": 1.3577, "step": 1523 }, { "epoch": 3.5901060070671376, "grad_norm": 1.0292620658874512, "learning_rate": 1.11021888170552e-05, "loss": 1.5227, "step": 1524 }, { "epoch": 3.5924617196702, "grad_norm": 0.5263658165931702, "learning_rate": 1.1067990950561472e-05, "loss": 1.3565, "step": 1525 }, { "epoch": 3.5948174322732624, "grad_norm": 3.016988754272461, "learning_rate": 1.1033830854908691e-05, "loss": 1.3947, "step": 1526 }, { "epoch": 3.597173144876325, "grad_norm": 0.5369905233383179, "learning_rate": 1.0999708622707746e-05, "loss": 1.2735, "step": 1527 }, { "epoch": 3.5995288574793873, "grad_norm": 0.54857337474823, "learning_rate": 1.096562434646686e-05, "loss": 1.3866, "step": 1528 }, { "epoch": 3.6018845700824498, "grad_norm": 0.5661123991012573, "learning_rate": 1.0931578118591362e-05, "loss": 1.3195, "step": 1529 }, { "epoch": 3.604240282685512, "grad_norm": 0.603049635887146, "learning_rate": 1.0897570031383406e-05, "loss": 1.2883, "step": 1530 }, { "epoch": 3.6065959952885747, "grad_norm": 0.5742252469062805, "learning_rate": 1.0863600177041772e-05, "loss": 1.3854, "step": 1531 }, { "epoch": 3.608951707891637, "grad_norm": 0.8406847715377808, "learning_rate": 1.0829668647661559e-05, "loss": 1.3353, "step": 1532 }, { "epoch": 3.6113074204946995, "grad_norm": 0.5295700430870056, "learning_rate": 1.0795775535233988e-05, "loss": 1.3537, "step": 1533 }, { "epoch": 3.613663133097762, "grad_norm": 0.5314053893089294, "learning_rate": 1.0761920931646122e-05, "loss": 1.3499, "step": 1534 }, { "epoch": 3.6160188457008244, "grad_norm": 0.5523396134376526, "learning_rate": 1.0728104928680624e-05, "loss": 1.4026, "step": 1535 }, { "epoch": 3.618374558303887, "grad_norm": 0.6266639828681946, "learning_rate": 1.0694327618015493e-05, "loss": 1.2495, "step": 1536 }, { "epoch": 3.6207302709069493, "grad_norm": 0.6351761817932129, "learning_rate": 1.0660589091223855e-05, "loss": 1.5139, "step": 1537 }, { "epoch": 3.6230859835100118, "grad_norm": 0.5728114247322083, "learning_rate": 1.062688943977366e-05, "loss": 1.4779, "step": 1538 }, { "epoch": 3.625441696113074, "grad_norm": 0.5696753263473511, "learning_rate": 1.0593228755027487e-05, "loss": 1.4547, "step": 1539 }, { "epoch": 3.6277974087161367, "grad_norm": 0.6083020567893982, "learning_rate": 1.0559607128242266e-05, "loss": 1.5139, "step": 1540 }, { "epoch": 3.630153121319199, "grad_norm": 0.5795078277587891, "learning_rate": 1.0526024650569047e-05, "loss": 1.3624, "step": 1541 }, { "epoch": 3.6325088339222615, "grad_norm": 0.6599566340446472, "learning_rate": 1.0492481413052716e-05, "loss": 1.4593, "step": 1542 }, { "epoch": 3.634864546525324, "grad_norm": 0.5426474809646606, "learning_rate": 1.0458977506631808e-05, "loss": 1.3444, "step": 1543 }, { "epoch": 3.6372202591283864, "grad_norm": 0.625540018081665, "learning_rate": 1.0425513022138203e-05, "loss": 1.4544, "step": 1544 }, { "epoch": 3.639575971731449, "grad_norm": 0.5271183252334595, "learning_rate": 1.0392088050296919e-05, "loss": 1.2443, "step": 1545 }, { "epoch": 3.6419316843345113, "grad_norm": 0.5795171856880188, "learning_rate": 1.0358702681725848e-05, "loss": 1.3794, "step": 1546 }, { "epoch": 3.6442873969375738, "grad_norm": 0.5803889632225037, "learning_rate": 1.0325357006935525e-05, "loss": 1.3453, "step": 1547 }, { "epoch": 3.646643109540636, "grad_norm": 0.5772535800933838, "learning_rate": 1.0292051116328844e-05, "loss": 1.5251, "step": 1548 }, { "epoch": 3.6489988221436986, "grad_norm": 0.6485162377357483, "learning_rate": 1.0258785100200865e-05, "loss": 1.381, "step": 1549 }, { "epoch": 3.651354534746761, "grad_norm": 3.400617837905884, "learning_rate": 1.0225559048738547e-05, "loss": 1.4251, "step": 1550 }, { "epoch": 3.6537102473498235, "grad_norm": 1.1564207077026367, "learning_rate": 1.019237305202048e-05, "loss": 1.4155, "step": 1551 }, { "epoch": 3.656065959952886, "grad_norm": 0.5722847580909729, "learning_rate": 1.0159227200016678e-05, "loss": 1.4272, "step": 1552 }, { "epoch": 3.6584216725559484, "grad_norm": 0.767595648765564, "learning_rate": 1.0126121582588316e-05, "loss": 1.4453, "step": 1553 }, { "epoch": 3.660777385159011, "grad_norm": 0.5896347165107727, "learning_rate": 1.00930562894875e-05, "loss": 1.455, "step": 1554 }, { "epoch": 3.6631330977620733, "grad_norm": 0.687151312828064, "learning_rate": 1.0060031410356988e-05, "loss": 1.2509, "step": 1555 }, { "epoch": 3.6654888103651353, "grad_norm": 0.5129477381706238, "learning_rate": 1.002704703473e-05, "loss": 1.3866, "step": 1556 }, { "epoch": 3.6678445229681977, "grad_norm": 1.0665055513381958, "learning_rate": 9.994103252029915e-06, "loss": 1.4229, "step": 1557 }, { "epoch": 3.67020023557126, "grad_norm": 0.4948142468929291, "learning_rate": 9.96120015157011e-06, "loss": 1.2257, "step": 1558 }, { "epoch": 3.6725559481743226, "grad_norm": 0.4843747019767761, "learning_rate": 9.928337822553621e-06, "loss": 1.2616, "step": 1559 }, { "epoch": 3.674911660777385, "grad_norm": 0.4766373932361603, "learning_rate": 9.895516354072981e-06, "loss": 1.2153, "step": 1560 }, { "epoch": 3.6772673733804475, "grad_norm": 0.5438255667686462, "learning_rate": 9.862735835109915e-06, "loss": 1.3507, "step": 1561 }, { "epoch": 3.67962308598351, "grad_norm": 0.5528162717819214, "learning_rate": 9.829996354535172e-06, "loss": 1.4282, "step": 1562 }, { "epoch": 3.6819787985865724, "grad_norm": 0.5559258460998535, "learning_rate": 9.797298001108193e-06, "loss": 1.175, "step": 1563 }, { "epoch": 3.684334511189635, "grad_norm": 0.556128740310669, "learning_rate": 9.764640863476981e-06, "loss": 1.4422, "step": 1564 }, { "epoch": 3.6866902237926973, "grad_norm": 1.7282902002334595, "learning_rate": 9.73202503017774e-06, "loss": 1.4483, "step": 1565 }, { "epoch": 3.6890459363957597, "grad_norm": 0.5606308579444885, "learning_rate": 9.699450589634736e-06, "loss": 1.5951, "step": 1566 }, { "epoch": 3.691401648998822, "grad_norm": 1.9709892272949219, "learning_rate": 9.666917630159985e-06, "loss": 1.4162, "step": 1567 }, { "epoch": 3.6937573616018846, "grad_norm": 0.6790780425071716, "learning_rate": 9.634426239953073e-06, "loss": 1.5256, "step": 1568 }, { "epoch": 3.696113074204947, "grad_norm": 3.1522786617279053, "learning_rate": 9.601976507100851e-06, "loss": 1.3028, "step": 1569 }, { "epoch": 3.6984687868080095, "grad_norm": 0.5627315640449524, "learning_rate": 9.569568519577294e-06, "loss": 1.2272, "step": 1570 }, { "epoch": 3.700824499411072, "grad_norm": 0.5687552094459534, "learning_rate": 9.53720236524313e-06, "loss": 1.4729, "step": 1571 }, { "epoch": 3.7031802120141344, "grad_norm": 0.5212887525558472, "learning_rate": 9.504878131845738e-06, "loss": 1.3897, "step": 1572 }, { "epoch": 3.705535924617197, "grad_norm": 0.6088081002235413, "learning_rate": 9.472595907018788e-06, "loss": 1.3792, "step": 1573 }, { "epoch": 3.7078916372202593, "grad_norm": 0.5294075608253479, "learning_rate": 9.440355778282106e-06, "loss": 1.2968, "step": 1574 }, { "epoch": 3.7102473498233217, "grad_norm": 0.5786322951316833, "learning_rate": 9.408157833041372e-06, "loss": 1.3609, "step": 1575 }, { "epoch": 3.7126030624263837, "grad_norm": 1.9893484115600586, "learning_rate": 9.376002158587915e-06, "loss": 1.2678, "step": 1576 }, { "epoch": 3.714958775029446, "grad_norm": 0.5370575189590454, "learning_rate": 9.343888842098441e-06, "loss": 1.4869, "step": 1577 }, { "epoch": 3.7173144876325086, "grad_norm": 0.5635150671005249, "learning_rate": 9.311817970634854e-06, "loss": 1.4788, "step": 1578 }, { "epoch": 3.719670200235571, "grad_norm": 0.6371703147888184, "learning_rate": 9.279789631143943e-06, "loss": 1.3279, "step": 1579 }, { "epoch": 3.7220259128386335, "grad_norm": 0.5337600708007812, "learning_rate": 9.247803910457226e-06, "loss": 1.3639, "step": 1580 }, { "epoch": 3.724381625441696, "grad_norm": 0.5267266631126404, "learning_rate": 9.215860895290662e-06, "loss": 1.2879, "step": 1581 }, { "epoch": 3.7267373380447584, "grad_norm": 0.6066449880599976, "learning_rate": 9.183960672244452e-06, "loss": 1.2964, "step": 1582 }, { "epoch": 3.729093050647821, "grad_norm": 0.5447261333465576, "learning_rate": 9.152103327802738e-06, "loss": 1.3524, "step": 1583 }, { "epoch": 3.7314487632508833, "grad_norm": 0.8154910802841187, "learning_rate": 9.120288948333463e-06, "loss": 1.3775, "step": 1584 }, { "epoch": 3.7338044758539457, "grad_norm": 0.5758695602416992, "learning_rate": 9.08851762008807e-06, "loss": 1.5762, "step": 1585 }, { "epoch": 3.736160188457008, "grad_norm": 0.618125855922699, "learning_rate": 9.05678942920127e-06, "loss": 1.4094, "step": 1586 }, { "epoch": 3.7385159010600706, "grad_norm": 0.5250685214996338, "learning_rate": 9.02510446169085e-06, "loss": 1.4613, "step": 1587 }, { "epoch": 3.740871613663133, "grad_norm": 0.5450623035430908, "learning_rate": 8.993462803457404e-06, "loss": 1.4149, "step": 1588 }, { "epoch": 3.7432273262661955, "grad_norm": 0.6390373110771179, "learning_rate": 8.96186454028412e-06, "loss": 1.309, "step": 1589 }, { "epoch": 3.745583038869258, "grad_norm": 0.5647222995758057, "learning_rate": 8.930309757836517e-06, "loss": 1.442, "step": 1590 }, { "epoch": 3.7479387514723204, "grad_norm": 0.5668995976448059, "learning_rate": 8.898798541662259e-06, "loss": 1.2888, "step": 1591 }, { "epoch": 3.750294464075383, "grad_norm": 0.5729898810386658, "learning_rate": 8.867330977190877e-06, "loss": 1.4071, "step": 1592 }, { "epoch": 3.7526501766784452, "grad_norm": 0.5865924954414368, "learning_rate": 8.835907149733569e-06, "loss": 1.3682, "step": 1593 }, { "epoch": 3.7550058892815077, "grad_norm": 0.7294045686721802, "learning_rate": 8.80452714448296e-06, "loss": 1.4365, "step": 1594 }, { "epoch": 3.75736160188457, "grad_norm": 0.5744823813438416, "learning_rate": 8.77319104651288e-06, "loss": 1.4535, "step": 1595 }, { "epoch": 3.7597173144876326, "grad_norm": 0.5517726540565491, "learning_rate": 8.741898940778088e-06, "loss": 1.3832, "step": 1596 }, { "epoch": 3.762073027090695, "grad_norm": 0.5168753266334534, "learning_rate": 8.710650912114115e-06, "loss": 1.2112, "step": 1597 }, { "epoch": 3.7644287396937575, "grad_norm": 0.6060011386871338, "learning_rate": 8.679447045236962e-06, "loss": 1.3701, "step": 1598 }, { "epoch": 3.76678445229682, "grad_norm": 0.4838341176509857, "learning_rate": 8.648287424742933e-06, "loss": 1.1495, "step": 1599 }, { "epoch": 3.7691401648998824, "grad_norm": 6.105593681335449, "learning_rate": 8.617172135108356e-06, "loss": 1.4825, "step": 1600 }, { "epoch": 3.771495877502945, "grad_norm": 0.5856305956840515, "learning_rate": 8.586101260689397e-06, "loss": 1.4147, "step": 1601 }, { "epoch": 3.7738515901060072, "grad_norm": 0.5920904278755188, "learning_rate": 8.55507488572177e-06, "loss": 1.3733, "step": 1602 }, { "epoch": 3.7762073027090697, "grad_norm": 0.517911434173584, "learning_rate": 8.524093094320593e-06, "loss": 1.2877, "step": 1603 }, { "epoch": 3.778563015312132, "grad_norm": 0.6693894267082214, "learning_rate": 8.493155970480073e-06, "loss": 1.3542, "step": 1604 }, { "epoch": 3.7809187279151946, "grad_norm": 0.5315267443656921, "learning_rate": 8.462263598073348e-06, "loss": 1.3634, "step": 1605 }, { "epoch": 3.783274440518257, "grad_norm": 0.8495927453041077, "learning_rate": 8.431416060852218e-06, "loss": 1.3592, "step": 1606 }, { "epoch": 3.7856301531213195, "grad_norm": 0.7102304100990295, "learning_rate": 8.400613442446948e-06, "loss": 1.6697, "step": 1607 }, { "epoch": 3.787985865724382, "grad_norm": 0.546316921710968, "learning_rate": 8.369855826365988e-06, "loss": 1.3866, "step": 1608 }, { "epoch": 3.790341578327444, "grad_norm": 0.5456357598304749, "learning_rate": 8.339143295995821e-06, "loss": 1.3726, "step": 1609 }, { "epoch": 3.7926972909305063, "grad_norm": 0.5593120455741882, "learning_rate": 8.30847593460069e-06, "loss": 1.4174, "step": 1610 }, { "epoch": 3.795053003533569, "grad_norm": 0.5836553573608398, "learning_rate": 8.277853825322355e-06, "loss": 1.4501, "step": 1611 }, { "epoch": 3.7974087161366312, "grad_norm": 0.571354866027832, "learning_rate": 8.247277051179925e-06, "loss": 1.3025, "step": 1612 }, { "epoch": 3.7997644287396937, "grad_norm": 0.5629730820655823, "learning_rate": 8.216745695069589e-06, "loss": 1.3419, "step": 1613 }, { "epoch": 3.802120141342756, "grad_norm": 0.6141170859336853, "learning_rate": 8.186259839764415e-06, "loss": 1.372, "step": 1614 }, { "epoch": 3.8044758539458186, "grad_norm": 0.5113803148269653, "learning_rate": 8.15581956791409e-06, "loss": 1.4669, "step": 1615 }, { "epoch": 3.806831566548881, "grad_norm": 0.567540168762207, "learning_rate": 8.125424962044742e-06, "loss": 1.1749, "step": 1616 }, { "epoch": 3.8091872791519434, "grad_norm": 0.5002404451370239, "learning_rate": 8.095076104558674e-06, "loss": 1.3284, "step": 1617 }, { "epoch": 3.811542991755006, "grad_norm": 0.507181704044342, "learning_rate": 8.064773077734206e-06, "loss": 1.4255, "step": 1618 }, { "epoch": 3.8138987043580683, "grad_norm": 0.5527018904685974, "learning_rate": 8.034515963725348e-06, "loss": 1.2971, "step": 1619 }, { "epoch": 3.8162544169611308, "grad_norm": 0.6221270561218262, "learning_rate": 8.004304844561683e-06, "loss": 1.3414, "step": 1620 }, { "epoch": 3.818610129564193, "grad_norm": 0.5122028589248657, "learning_rate": 7.974139802148065e-06, "loss": 1.2977, "step": 1621 }, { "epoch": 3.8209658421672557, "grad_norm": 0.6230779886245728, "learning_rate": 7.944020918264458e-06, "loss": 1.6261, "step": 1622 }, { "epoch": 3.823321554770318, "grad_norm": 0.6021052598953247, "learning_rate": 7.913948274565652e-06, "loss": 1.2674, "step": 1623 }, { "epoch": 3.8256772673733805, "grad_norm": 0.5739334225654602, "learning_rate": 7.88392195258113e-06, "loss": 1.4297, "step": 1624 }, { "epoch": 3.828032979976443, "grad_norm": 0.5318486094474792, "learning_rate": 7.853942033714736e-06, "loss": 1.405, "step": 1625 }, { "epoch": 3.8303886925795054, "grad_norm": 0.5569553971290588, "learning_rate": 7.824008599244553e-06, "loss": 1.4557, "step": 1626 }, { "epoch": 3.832744405182568, "grad_norm": 1.0495333671569824, "learning_rate": 7.794121730322606e-06, "loss": 1.5212, "step": 1627 }, { "epoch": 3.8351001177856303, "grad_norm": 0.5602095127105713, "learning_rate": 7.76428150797471e-06, "loss": 1.4202, "step": 1628 }, { "epoch": 3.8374558303886923, "grad_norm": 0.5447658896446228, "learning_rate": 7.734488013100177e-06, "loss": 1.2872, "step": 1629 }, { "epoch": 3.8398115429917548, "grad_norm": 0.6150215268135071, "learning_rate": 7.704741326471699e-06, "loss": 1.2264, "step": 1630 }, { "epoch": 3.842167255594817, "grad_norm": 0.5768173336982727, "learning_rate": 7.675041528735e-06, "loss": 1.4427, "step": 1631 }, { "epoch": 3.8445229681978796, "grad_norm": 0.7380544543266296, "learning_rate": 7.645388700408732e-06, "loss": 1.2967, "step": 1632 }, { "epoch": 3.846878680800942, "grad_norm": 1.131364345550537, "learning_rate": 7.615782921884174e-06, "loss": 1.4847, "step": 1633 }, { "epoch": 3.8492343934040045, "grad_norm": 0.6518924832344055, "learning_rate": 7.586224273425082e-06, "loss": 1.4532, "step": 1634 }, { "epoch": 3.851590106007067, "grad_norm": 0.6469493508338928, "learning_rate": 7.556712835167401e-06, "loss": 1.4791, "step": 1635 }, { "epoch": 3.8539458186101294, "grad_norm": 0.5271226763725281, "learning_rate": 7.5272486871191375e-06, "loss": 1.3643, "step": 1636 }, { "epoch": 3.856301531213192, "grad_norm": 0.5962862968444824, "learning_rate": 7.497831909160036e-06, "loss": 1.6548, "step": 1637 }, { "epoch": 3.8586572438162543, "grad_norm": 0.8989675641059875, "learning_rate": 7.468462581041452e-06, "loss": 1.3742, "step": 1638 }, { "epoch": 3.8610129564193167, "grad_norm": 0.5813814401626587, "learning_rate": 7.439140782386078e-06, "loss": 1.3651, "step": 1639 }, { "epoch": 3.863368669022379, "grad_norm": 0.54359370470047, "learning_rate": 7.409866592687767e-06, "loss": 1.4294, "step": 1640 }, { "epoch": 3.8657243816254416, "grad_norm": 0.5756585597991943, "learning_rate": 7.380640091311291e-06, "loss": 1.5024, "step": 1641 }, { "epoch": 3.868080094228504, "grad_norm": 0.7016811370849609, "learning_rate": 7.351461357492151e-06, "loss": 1.2882, "step": 1642 }, { "epoch": 3.8704358068315665, "grad_norm": 0.49998876452445984, "learning_rate": 7.3223304703363135e-06, "loss": 1.3307, "step": 1643 }, { "epoch": 3.872791519434629, "grad_norm": 0.5909798741340637, "learning_rate": 7.293247508820058e-06, "loss": 1.496, "step": 1644 }, { "epoch": 3.8751472320376914, "grad_norm": 0.5867403745651245, "learning_rate": 7.264212551789731e-06, "loss": 1.4894, "step": 1645 }, { "epoch": 3.877502944640754, "grad_norm": 0.5128086805343628, "learning_rate": 7.235225677961513e-06, "loss": 1.3051, "step": 1646 }, { "epoch": 3.8798586572438163, "grad_norm": 1.9520083665847778, "learning_rate": 7.206286965921249e-06, "loss": 1.448, "step": 1647 }, { "epoch": 3.8822143698468787, "grad_norm": 0.5792246460914612, "learning_rate": 7.177396494124206e-06, "loss": 1.3893, "step": 1648 }, { "epoch": 3.884570082449941, "grad_norm": 0.5184505581855774, "learning_rate": 7.1485543408948755e-06, "loss": 1.2293, "step": 1649 }, { "epoch": 3.8869257950530036, "grad_norm": 0.5740927457809448, "learning_rate": 7.1197605844267294e-06, "loss": 1.2462, "step": 1650 }, { "epoch": 3.889281507656066, "grad_norm": 0.5681979060173035, "learning_rate": 7.091015302782064e-06, "loss": 1.5397, "step": 1651 }, { "epoch": 3.8916372202591285, "grad_norm": 0.5753186345100403, "learning_rate": 7.062318573891716e-06, "loss": 1.3718, "step": 1652 }, { "epoch": 3.893992932862191, "grad_norm": 0.7387382388114929, "learning_rate": 7.033670475554949e-06, "loss": 1.3307, "step": 1653 }, { "epoch": 3.8963486454652534, "grad_norm": 0.5926812887191772, "learning_rate": 7.00507108543913e-06, "loss": 1.5168, "step": 1654 }, { "epoch": 3.898704358068316, "grad_norm": 0.5354231595993042, "learning_rate": 6.976520481079604e-06, "loss": 1.3793, "step": 1655 }, { "epoch": 3.9010600706713783, "grad_norm": 0.5609486103057861, "learning_rate": 6.948018739879439e-06, "loss": 1.5824, "step": 1656 }, { "epoch": 3.9034157832744407, "grad_norm": 0.5547861456871033, "learning_rate": 6.919565939109249e-06, "loss": 1.5102, "step": 1657 }, { "epoch": 3.905771495877503, "grad_norm": 0.5052303671836853, "learning_rate": 6.89116215590693e-06, "loss": 1.4205, "step": 1658 }, { "epoch": 3.9081272084805656, "grad_norm": 0.5321267247200012, "learning_rate": 6.862807467277546e-06, "loss": 1.4613, "step": 1659 }, { "epoch": 3.910482921083628, "grad_norm": 0.5275298357009888, "learning_rate": 6.834501950093006e-06, "loss": 1.2472, "step": 1660 }, { "epoch": 3.9128386336866905, "grad_norm": 0.49170222878456116, "learning_rate": 6.806245681091944e-06, "loss": 1.1899, "step": 1661 }, { "epoch": 3.9151943462897525, "grad_norm": 0.5771868824958801, "learning_rate": 6.778038736879452e-06, "loss": 1.4692, "step": 1662 }, { "epoch": 3.917550058892815, "grad_norm": 0.555927574634552, "learning_rate": 6.749881193926932e-06, "loss": 1.4376, "step": 1663 }, { "epoch": 3.9199057714958774, "grad_norm": 0.8719086050987244, "learning_rate": 6.721773128571812e-06, "loss": 1.3957, "step": 1664 }, { "epoch": 3.92226148409894, "grad_norm": 0.5687438249588013, "learning_rate": 6.693714617017435e-06, "loss": 1.3616, "step": 1665 }, { "epoch": 3.9246171967020023, "grad_norm": 0.5264513492584229, "learning_rate": 6.665705735332753e-06, "loss": 1.3027, "step": 1666 }, { "epoch": 3.9269729093050647, "grad_norm": 2.588770866394043, "learning_rate": 6.637746559452193e-06, "loss": 1.4708, "step": 1667 }, { "epoch": 3.929328621908127, "grad_norm": 0.5120534896850586, "learning_rate": 6.6098371651754085e-06, "loss": 1.4095, "step": 1668 }, { "epoch": 3.9316843345111896, "grad_norm": 0.5838505625724792, "learning_rate": 6.581977628167113e-06, "loss": 1.4091, "step": 1669 }, { "epoch": 3.934040047114252, "grad_norm": 0.5371755361557007, "learning_rate": 6.5541680239568165e-06, "loss": 1.2102, "step": 1670 }, { "epoch": 3.9363957597173145, "grad_norm": 0.6284140348434448, "learning_rate": 6.52640842793871e-06, "loss": 1.5214, "step": 1671 }, { "epoch": 3.938751472320377, "grad_norm": 0.7914584875106812, "learning_rate": 6.498698915371359e-06, "loss": 1.1925, "step": 1672 }, { "epoch": 3.9411071849234394, "grad_norm": 0.5357339978218079, "learning_rate": 6.471039561377581e-06, "loss": 1.3896, "step": 1673 }, { "epoch": 3.943462897526502, "grad_norm": 0.5868813395500183, "learning_rate": 6.443430440944181e-06, "loss": 1.3908, "step": 1674 }, { "epoch": 3.9458186101295643, "grad_norm": 4.1010823249816895, "learning_rate": 6.415871628921799e-06, "loss": 1.4277, "step": 1675 }, { "epoch": 3.9481743227326267, "grad_norm": 0.499858021736145, "learning_rate": 6.38836320002468e-06, "loss": 1.307, "step": 1676 }, { "epoch": 3.950530035335689, "grad_norm": 0.530317485332489, "learning_rate": 6.360905228830483e-06, "loss": 1.4924, "step": 1677 }, { "epoch": 3.9528857479387516, "grad_norm": 0.5147749185562134, "learning_rate": 6.333497789780041e-06, "loss": 1.2786, "step": 1678 }, { "epoch": 3.955241460541814, "grad_norm": 0.5589393377304077, "learning_rate": 6.3061409571772254e-06, "loss": 1.3597, "step": 1679 }, { "epoch": 3.9575971731448765, "grad_norm": 0.515583336353302, "learning_rate": 6.278834805188699e-06, "loss": 1.5172, "step": 1680 }, { "epoch": 3.959952885747939, "grad_norm": 0.7081592679023743, "learning_rate": 6.251579407843713e-06, "loss": 1.3294, "step": 1681 }, { "epoch": 3.962308598351001, "grad_norm": 0.5626111626625061, "learning_rate": 6.224374839033928e-06, "loss": 1.2844, "step": 1682 }, { "epoch": 3.9646643109540634, "grad_norm": 0.5407631397247314, "learning_rate": 6.1972211725132095e-06, "loss": 1.5241, "step": 1683 }, { "epoch": 3.967020023557126, "grad_norm": 0.5663389563560486, "learning_rate": 6.170118481897421e-06, "loss": 1.387, "step": 1684 }, { "epoch": 3.9693757361601882, "grad_norm": 0.5710420608520508, "learning_rate": 6.143066840664211e-06, "loss": 1.423, "step": 1685 }, { "epoch": 3.9717314487632507, "grad_norm": 0.5247495770454407, "learning_rate": 6.11606632215285e-06, "loss": 1.27, "step": 1686 }, { "epoch": 3.974087161366313, "grad_norm": 0.6587856411933899, "learning_rate": 6.089116999563988e-06, "loss": 1.3811, "step": 1687 }, { "epoch": 3.9764428739693756, "grad_norm": 0.4864495098590851, "learning_rate": 6.062218945959497e-06, "loss": 1.3464, "step": 1688 }, { "epoch": 3.978798586572438, "grad_norm": 0.5312260985374451, "learning_rate": 6.035372234262251e-06, "loss": 1.2308, "step": 1689 }, { "epoch": 3.9811542991755005, "grad_norm": 0.6395768523216248, "learning_rate": 6.008576937255933e-06, "loss": 1.6999, "step": 1690 }, { "epoch": 3.983510011778563, "grad_norm": 1.3282980918884277, "learning_rate": 5.981833127584824e-06, "loss": 1.3792, "step": 1691 }, { "epoch": 3.9858657243816253, "grad_norm": 0.5603832602500916, "learning_rate": 5.955140877753635e-06, "loss": 1.3415, "step": 1692 }, { "epoch": 3.988221436984688, "grad_norm": 0.5439705848693848, "learning_rate": 5.928500260127273e-06, "loss": 1.3724, "step": 1693 }, { "epoch": 3.9905771495877502, "grad_norm": 0.5632964372634888, "learning_rate": 5.901911346930689e-06, "loss": 1.4484, "step": 1694 }, { "epoch": 3.9929328621908127, "grad_norm": 0.6542787551879883, "learning_rate": 5.875374210248649e-06, "loss": 1.3596, "step": 1695 }, { "epoch": 3.995288574793875, "grad_norm": 0.9696907997131348, "learning_rate": 5.848888922025553e-06, "loss": 1.3518, "step": 1696 }, { "epoch": 3.995288574793875, "eval_loss": 1.3107149600982666, "eval_runtime": 5.8017, "eval_samples_per_second": 430.907, "eval_steps_per_second": 6.895, "step": 1696 }, { "epoch": 3.9976442873969376, "grad_norm": 0.5222924947738647, "learning_rate": 5.822455554065217e-06, "loss": 1.3688, "step": 1697 }, { "epoch": 4.0, "grad_norm": 0.9021956324577332, "learning_rate": 5.796074178030727e-06, "loss": 1.47, "step": 1698 }, { "epoch": 4.002355712603062, "grad_norm": 0.7636176943778992, "learning_rate": 5.7697448654441845e-06, "loss": 1.5242, "step": 1699 }, { "epoch": 4.004711425206125, "grad_norm": 0.6209522485733032, "learning_rate": 5.743467687686563e-06, "loss": 1.4746, "step": 1700 }, { "epoch": 4.007067137809187, "grad_norm": 0.49633294343948364, "learning_rate": 5.7172427159974865e-06, "loss": 1.1659, "step": 1701 }, { "epoch": 4.00942285041225, "grad_norm": 0.5980811715126038, "learning_rate": 5.691070021475051e-06, "loss": 1.4282, "step": 1702 }, { "epoch": 4.011778563015312, "grad_norm": 0.5320684313774109, "learning_rate": 5.664949675075607e-06, "loss": 1.3982, "step": 1703 }, { "epoch": 4.014134275618375, "grad_norm": 0.5911762714385986, "learning_rate": 5.638881747613603e-06, "loss": 1.5333, "step": 1704 }, { "epoch": 4.016489988221437, "grad_norm": 0.922157347202301, "learning_rate": 5.612866309761377e-06, "loss": 1.5789, "step": 1705 }, { "epoch": 4.0188457008244995, "grad_norm": 0.5106910467147827, "learning_rate": 5.586903432048943e-06, "loss": 1.3807, "step": 1706 }, { "epoch": 4.021201413427562, "grad_norm": 0.5205463171005249, "learning_rate": 5.560993184863833e-06, "loss": 1.3346, "step": 1707 }, { "epoch": 4.023557126030624, "grad_norm": 0.5103801488876343, "learning_rate": 5.5351356384509e-06, "loss": 1.4291, "step": 1708 }, { "epoch": 4.025912838633687, "grad_norm": 0.6833444237709045, "learning_rate": 5.509330862912115e-06, "loss": 1.2681, "step": 1709 }, { "epoch": 4.028268551236749, "grad_norm": 1.1435314416885376, "learning_rate": 5.48357892820637e-06, "loss": 1.3777, "step": 1710 }, { "epoch": 4.030624263839812, "grad_norm": 0.5232929587364197, "learning_rate": 5.457879904149327e-06, "loss": 1.3662, "step": 1711 }, { "epoch": 4.032979976442874, "grad_norm": 0.596541166305542, "learning_rate": 5.4322338604131715e-06, "loss": 1.4367, "step": 1712 }, { "epoch": 4.035335689045937, "grad_norm": 0.5148890614509583, "learning_rate": 5.40664086652648e-06, "loss": 1.3985, "step": 1713 }, { "epoch": 4.037691401648999, "grad_norm": 4.6051411628723145, "learning_rate": 5.3811009918739965e-06, "loss": 1.3901, "step": 1714 }, { "epoch": 4.0400471142520615, "grad_norm": 0.5814567804336548, "learning_rate": 5.355614305696468e-06, "loss": 1.4011, "step": 1715 }, { "epoch": 4.042402826855124, "grad_norm": 0.6472153663635254, "learning_rate": 5.33018087709041e-06, "loss": 1.3932, "step": 1716 }, { "epoch": 4.044758539458186, "grad_norm": 0.5093610882759094, "learning_rate": 5.304800775007992e-06, "loss": 1.4208, "step": 1717 }, { "epoch": 4.047114252061249, "grad_norm": 0.5390185117721558, "learning_rate": 5.279474068256767e-06, "loss": 1.2809, "step": 1718 }, { "epoch": 4.049469964664311, "grad_norm": 0.7668046355247498, "learning_rate": 5.254200825499589e-06, "loss": 1.3739, "step": 1719 }, { "epoch": 4.051825677267374, "grad_norm": 0.5157501101493835, "learning_rate": 5.2289811152543e-06, "loss": 1.2889, "step": 1720 }, { "epoch": 4.054181389870436, "grad_norm": 0.6357651352882385, "learning_rate": 5.203815005893664e-06, "loss": 1.4472, "step": 1721 }, { "epoch": 4.056537102473499, "grad_norm": 0.4992774724960327, "learning_rate": 5.178702565645091e-06, "loss": 1.2521, "step": 1722 }, { "epoch": 4.058892815076561, "grad_norm": 0.5255862474441528, "learning_rate": 5.1536438625905185e-06, "loss": 1.3974, "step": 1723 }, { "epoch": 4.0612485276796235, "grad_norm": 0.5593264102935791, "learning_rate": 5.128638964666166e-06, "loss": 1.3415, "step": 1724 }, { "epoch": 4.063604240282685, "grad_norm": 0.4990921914577484, "learning_rate": 5.103687939662427e-06, "loss": 1.2787, "step": 1725 }, { "epoch": 4.0659599528857475, "grad_norm": 3.3462636470794678, "learning_rate": 5.078790855223595e-06, "loss": 1.3995, "step": 1726 }, { "epoch": 4.06831566548881, "grad_norm": 0.5059384703636169, "learning_rate": 5.053947778847762e-06, "loss": 1.3359, "step": 1727 }, { "epoch": 4.070671378091872, "grad_norm": 0.6696217656135559, "learning_rate": 5.02915877788657e-06, "loss": 1.2423, "step": 1728 }, { "epoch": 4.073027090694935, "grad_norm": 0.5878140926361084, "learning_rate": 5.004423919545087e-06, "loss": 1.306, "step": 1729 }, { "epoch": 4.075382803297997, "grad_norm": 0.5696496963500977, "learning_rate": 4.979743270881559e-06, "loss": 1.4666, "step": 1730 }, { "epoch": 4.07773851590106, "grad_norm": 0.5326704382896423, "learning_rate": 4.955116898807316e-06, "loss": 1.405, "step": 1731 }, { "epoch": 4.080094228504122, "grad_norm": 0.5756763219833374, "learning_rate": 4.93054487008649e-06, "loss": 1.4365, "step": 1732 }, { "epoch": 4.082449941107185, "grad_norm": 0.5938823223114014, "learning_rate": 4.906027251335918e-06, "loss": 1.4148, "step": 1733 }, { "epoch": 4.084805653710247, "grad_norm": 0.47493642568588257, "learning_rate": 4.881564109024903e-06, "loss": 1.3994, "step": 1734 }, { "epoch": 4.0871613663133095, "grad_norm": 0.5050984025001526, "learning_rate": 4.8571555094750696e-06, "loss": 1.2531, "step": 1735 }, { "epoch": 4.089517078916372, "grad_norm": 0.5977907776832581, "learning_rate": 4.832801518860175e-06, "loss": 1.4325, "step": 1736 }, { "epoch": 4.091872791519434, "grad_norm": 1.4090451002120972, "learning_rate": 4.8085022032059275e-06, "loss": 1.374, "step": 1737 }, { "epoch": 4.094228504122497, "grad_norm": 0.6680536866188049, "learning_rate": 4.784257628389794e-06, "loss": 1.4987, "step": 1738 }, { "epoch": 4.096584216725559, "grad_norm": 0.7078143954277039, "learning_rate": 4.760067860140846e-06, "loss": 1.3291, "step": 1739 }, { "epoch": 4.098939929328622, "grad_norm": 0.5328817367553711, "learning_rate": 4.735932964039575e-06, "loss": 1.3773, "step": 1740 }, { "epoch": 4.101295641931684, "grad_norm": 1.1540193557739258, "learning_rate": 4.711853005517686e-06, "loss": 1.3272, "step": 1741 }, { "epoch": 4.103651354534747, "grad_norm": 0.540476381778717, "learning_rate": 4.687828049857967e-06, "loss": 1.3272, "step": 1742 }, { "epoch": 4.106007067137809, "grad_norm": 0.5823096036911011, "learning_rate": 4.6638581621940815e-06, "loss": 1.4448, "step": 1743 }, { "epoch": 4.1083627797408715, "grad_norm": 0.5617446303367615, "learning_rate": 4.639943407510408e-06, "loss": 1.4256, "step": 1744 }, { "epoch": 4.110718492343934, "grad_norm": 0.5515071749687195, "learning_rate": 4.616083850641825e-06, "loss": 1.4141, "step": 1745 }, { "epoch": 4.113074204946996, "grad_norm": 0.6387625336647034, "learning_rate": 4.592279556273604e-06, "loss": 1.4301, "step": 1746 }, { "epoch": 4.115429917550059, "grad_norm": 1.512547492980957, "learning_rate": 4.568530588941161e-06, "loss": 1.5786, "step": 1747 }, { "epoch": 4.117785630153121, "grad_norm": 0.6038934588432312, "learning_rate": 4.54483701302994e-06, "loss": 1.3636, "step": 1748 }, { "epoch": 4.120141342756184, "grad_norm": 0.5443477034568787, "learning_rate": 4.521198892775203e-06, "loss": 1.3643, "step": 1749 }, { "epoch": 4.122497055359246, "grad_norm": 0.5879799127578735, "learning_rate": 4.497616292261877e-06, "loss": 1.3193, "step": 1750 }, { "epoch": 4.124852767962309, "grad_norm": 0.5421321392059326, "learning_rate": 4.474089275424351e-06, "loss": 1.3138, "step": 1751 }, { "epoch": 4.127208480565371, "grad_norm": 0.6581401228904724, "learning_rate": 4.450617906046348e-06, "loss": 1.315, "step": 1752 }, { "epoch": 4.1295641931684335, "grad_norm": 0.4811381995677948, "learning_rate": 4.427202247760695e-06, "loss": 1.3075, "step": 1753 }, { "epoch": 4.131919905771496, "grad_norm": 1.4084199666976929, "learning_rate": 4.40384236404921e-06, "loss": 1.2799, "step": 1754 }, { "epoch": 4.134275618374558, "grad_norm": 1.6066672801971436, "learning_rate": 4.380538318242486e-06, "loss": 1.3781, "step": 1755 }, { "epoch": 4.136631330977621, "grad_norm": 0.5960988998413086, "learning_rate": 4.357290173519746e-06, "loss": 1.3463, "step": 1756 }, { "epoch": 4.138987043580683, "grad_norm": 0.5128644108772278, "learning_rate": 4.334097992908645e-06, "loss": 1.3867, "step": 1757 }, { "epoch": 4.141342756183746, "grad_norm": 1.177811861038208, "learning_rate": 4.31096183928513e-06, "loss": 1.2108, "step": 1758 }, { "epoch": 4.143698468786808, "grad_norm": 0.5613256692886353, "learning_rate": 4.287881775373237e-06, "loss": 1.3302, "step": 1759 }, { "epoch": 4.146054181389871, "grad_norm": 0.5446493029594421, "learning_rate": 4.264857863744956e-06, "loss": 1.5078, "step": 1760 }, { "epoch": 4.148409893992933, "grad_norm": 0.5942068696022034, "learning_rate": 4.241890166820034e-06, "loss": 1.4962, "step": 1761 }, { "epoch": 4.1507656065959955, "grad_norm": 0.5595738887786865, "learning_rate": 4.218978746865823e-06, "loss": 1.4532, "step": 1762 }, { "epoch": 4.153121319199058, "grad_norm": 0.5635387301445007, "learning_rate": 4.196123665997087e-06, "loss": 1.292, "step": 1763 }, { "epoch": 4.15547703180212, "grad_norm": 0.556657612323761, "learning_rate": 4.17332498617587e-06, "loss": 1.4243, "step": 1764 }, { "epoch": 4.157832744405183, "grad_norm": 0.6109320521354675, "learning_rate": 4.150582769211289e-06, "loss": 1.3363, "step": 1765 }, { "epoch": 4.160188457008245, "grad_norm": 0.5171672701835632, "learning_rate": 4.127897076759399e-06, "loss": 1.2995, "step": 1766 }, { "epoch": 4.162544169611308, "grad_norm": 0.5037778615951538, "learning_rate": 4.105267970323007e-06, "loss": 1.2903, "step": 1767 }, { "epoch": 4.16489988221437, "grad_norm": 1.0770846605300903, "learning_rate": 4.082695511251522e-06, "loss": 1.3409, "step": 1768 }, { "epoch": 4.167255594817433, "grad_norm": 0.5107141733169556, "learning_rate": 4.060179760740751e-06, "loss": 1.3242, "step": 1769 }, { "epoch": 4.169611307420495, "grad_norm": 0.9664813280105591, "learning_rate": 4.037720779832785e-06, "loss": 1.5335, "step": 1770 }, { "epoch": 4.1719670200235575, "grad_norm": 0.5541787147521973, "learning_rate": 4.015318629415804e-06, "loss": 1.3117, "step": 1771 }, { "epoch": 4.17432273262662, "grad_norm": 0.6314407587051392, "learning_rate": 3.992973370223896e-06, "loss": 1.2012, "step": 1772 }, { "epoch": 4.176678445229682, "grad_norm": 0.5587241053581238, "learning_rate": 3.970685062836932e-06, "loss": 1.2485, "step": 1773 }, { "epoch": 4.179034157832745, "grad_norm": 0.8921142220497131, "learning_rate": 3.948453767680379e-06, "loss": 1.5864, "step": 1774 }, { "epoch": 4.181389870435807, "grad_norm": 0.6697642803192139, "learning_rate": 3.926279545025138e-06, "loss": 1.4275, "step": 1775 }, { "epoch": 4.18374558303887, "grad_norm": 0.508639931678772, "learning_rate": 3.904162454987373e-06, "loss": 1.3755, "step": 1776 }, { "epoch": 4.186101295641931, "grad_norm": 0.5936905145645142, "learning_rate": 3.882102557528372e-06, "loss": 1.2676, "step": 1777 }, { "epoch": 4.188457008244994, "grad_norm": 0.754548192024231, "learning_rate": 3.860099912454346e-06, "loss": 1.2088, "step": 1778 }, { "epoch": 4.190812720848056, "grad_norm": 0.663715660572052, "learning_rate": 3.838154579416326e-06, "loss": 1.3247, "step": 1779 }, { "epoch": 4.193168433451119, "grad_norm": 0.5958415865898132, "learning_rate": 3.816266617909925e-06, "loss": 1.4586, "step": 1780 }, { "epoch": 4.195524146054181, "grad_norm": 0.561379611492157, "learning_rate": 3.7944360872752495e-06, "loss": 1.6171, "step": 1781 }, { "epoch": 4.1978798586572434, "grad_norm": 0.6449840664863586, "learning_rate": 3.772663046696684e-06, "loss": 1.4057, "step": 1782 }, { "epoch": 4.200235571260306, "grad_norm": 0.5428667068481445, "learning_rate": 3.7509475552027663e-06, "loss": 1.3721, "step": 1783 }, { "epoch": 4.202591283863368, "grad_norm": 0.6467713713645935, "learning_rate": 3.7292896716659974e-06, "loss": 1.2938, "step": 1784 }, { "epoch": 4.204946996466431, "grad_norm": 0.5764541029930115, "learning_rate": 3.707689454802729e-06, "loss": 1.4092, "step": 1785 }, { "epoch": 4.207302709069493, "grad_norm": 3.232020139694214, "learning_rate": 3.68614696317294e-06, "loss": 1.2416, "step": 1786 }, { "epoch": 4.209658421672556, "grad_norm": 0.5982542037963867, "learning_rate": 3.6646622551801345e-06, "loss": 1.3025, "step": 1787 }, { "epoch": 4.212014134275618, "grad_norm": 0.4885265529155731, "learning_rate": 3.6432353890711424e-06, "loss": 1.1864, "step": 1788 }, { "epoch": 4.2143698468786805, "grad_norm": 0.795825719833374, "learning_rate": 3.621866422935999e-06, "loss": 1.5194, "step": 1789 }, { "epoch": 4.216725559481743, "grad_norm": 0.5304363965988159, "learning_rate": 3.6005554147077402e-06, "loss": 1.3359, "step": 1790 }, { "epoch": 4.219081272084805, "grad_norm": 5.552985668182373, "learning_rate": 3.5793024221623147e-06, "loss": 1.4386, "step": 1791 }, { "epoch": 4.221436984687868, "grad_norm": 0.5588003993034363, "learning_rate": 3.5581075029183423e-06, "loss": 1.3139, "step": 1792 }, { "epoch": 4.22379269729093, "grad_norm": 0.6261292695999146, "learning_rate": 3.536970714437032e-06, "loss": 1.3036, "step": 1793 }, { "epoch": 4.226148409893993, "grad_norm": 0.5477218627929688, "learning_rate": 3.515892114021974e-06, "loss": 1.4437, "step": 1794 }, { "epoch": 4.228504122497055, "grad_norm": 0.5441961884498596, "learning_rate": 3.4948717588190267e-06, "loss": 1.4331, "step": 1795 }, { "epoch": 4.230859835100118, "grad_norm": 0.5202053785324097, "learning_rate": 3.4739097058161114e-06, "loss": 1.1759, "step": 1796 }, { "epoch": 4.23321554770318, "grad_norm": 0.6516292095184326, "learning_rate": 3.4530060118431295e-06, "loss": 1.2281, "step": 1797 }, { "epoch": 4.2355712603062425, "grad_norm": 0.5807310342788696, "learning_rate": 3.432160733571729e-06, "loss": 1.4572, "step": 1798 }, { "epoch": 4.237926972909305, "grad_norm": 0.5542933940887451, "learning_rate": 3.4113739275152074e-06, "loss": 1.2927, "step": 1799 }, { "epoch": 4.240282685512367, "grad_norm": 0.9899268746376038, "learning_rate": 3.3906456500283235e-06, "loss": 1.2505, "step": 1800 }, { "epoch": 4.24263839811543, "grad_norm": 0.6394761204719543, "learning_rate": 3.369975957307178e-06, "loss": 1.5737, "step": 1801 }, { "epoch": 4.244994110718492, "grad_norm": 0.5710820555686951, "learning_rate": 3.3493649053890326e-06, "loss": 1.3443, "step": 1802 }, { "epoch": 4.247349823321555, "grad_norm": 0.6869067549705505, "learning_rate": 3.32881255015218e-06, "loss": 1.5385, "step": 1803 }, { "epoch": 4.249705535924617, "grad_norm": 0.5882296562194824, "learning_rate": 3.308318947315758e-06, "loss": 1.4709, "step": 1804 }, { "epoch": 4.25206124852768, "grad_norm": 0.554146945476532, "learning_rate": 3.2878841524396465e-06, "loss": 1.4046, "step": 1805 }, { "epoch": 4.254416961130742, "grad_norm": 0.5895798802375793, "learning_rate": 3.267508220924287e-06, "loss": 1.4102, "step": 1806 }, { "epoch": 4.2567726737338045, "grad_norm": 0.6444280743598938, "learning_rate": 3.247191208010519e-06, "loss": 1.4882, "step": 1807 }, { "epoch": 4.259128386336867, "grad_norm": 0.5712672472000122, "learning_rate": 3.22693316877947e-06, "loss": 1.571, "step": 1808 }, { "epoch": 4.261484098939929, "grad_norm": 0.5907216668128967, "learning_rate": 3.2067341581523776e-06, "loss": 1.2381, "step": 1809 }, { "epoch": 4.263839811542992, "grad_norm": 0.5297447443008423, "learning_rate": 3.1865942308904524e-06, "loss": 1.1893, "step": 1810 }, { "epoch": 4.266195524146054, "grad_norm": 0.5954095125198364, "learning_rate": 3.1665134415947125e-06, "loss": 1.5393, "step": 1811 }, { "epoch": 4.268551236749117, "grad_norm": 0.5536341667175293, "learning_rate": 3.146491844705862e-06, "loss": 1.3776, "step": 1812 }, { "epoch": 4.270906949352179, "grad_norm": 0.47793442010879517, "learning_rate": 3.126529494504113e-06, "loss": 1.1731, "step": 1813 }, { "epoch": 4.273262661955242, "grad_norm": 0.8257054686546326, "learning_rate": 3.1066264451090815e-06, "loss": 1.3505, "step": 1814 }, { "epoch": 4.275618374558304, "grad_norm": 0.5433495044708252, "learning_rate": 3.0867827504795834e-06, "loss": 1.479, "step": 1815 }, { "epoch": 4.2779740871613665, "grad_norm": 1.765730857849121, "learning_rate": 3.066998464413545e-06, "loss": 1.2451, "step": 1816 }, { "epoch": 4.280329799764429, "grad_norm": 0.6123552918434143, "learning_rate": 3.04727364054781e-06, "loss": 1.5141, "step": 1817 }, { "epoch": 4.282685512367491, "grad_norm": 1.7967052459716797, "learning_rate": 3.027608332358034e-06, "loss": 1.2683, "step": 1818 }, { "epoch": 4.285041224970554, "grad_norm": 1.7990590333938599, "learning_rate": 3.0080025931584932e-06, "loss": 1.3185, "step": 1819 }, { "epoch": 4.287396937573616, "grad_norm": 0.5314221382141113, "learning_rate": 2.9884564761020085e-06, "loss": 1.3889, "step": 1820 }, { "epoch": 4.289752650176679, "grad_norm": 0.835995614528656, "learning_rate": 2.968970034179719e-06, "loss": 1.432, "step": 1821 }, { "epoch": 4.292108362779741, "grad_norm": 0.557085394859314, "learning_rate": 2.94954332022101e-06, "loss": 1.2951, "step": 1822 }, { "epoch": 4.294464075382804, "grad_norm": 0.5341096520423889, "learning_rate": 2.9301763868933157e-06, "loss": 1.3005, "step": 1823 }, { "epoch": 4.296819787985866, "grad_norm": 0.47961845993995667, "learning_rate": 2.9108692867020227e-06, "loss": 1.314, "step": 1824 }, { "epoch": 4.2991755005889285, "grad_norm": 0.5045312643051147, "learning_rate": 2.891622071990277e-06, "loss": 1.3812, "step": 1825 }, { "epoch": 4.301531213191991, "grad_norm": 0.5373421311378479, "learning_rate": 2.872434794938905e-06, "loss": 1.4957, "step": 1826 }, { "epoch": 4.303886925795053, "grad_norm": 0.6434303522109985, "learning_rate": 2.853307507566205e-06, "loss": 1.5041, "step": 1827 }, { "epoch": 4.306242638398116, "grad_norm": 0.5580484867095947, "learning_rate": 2.8342402617278606e-06, "loss": 1.294, "step": 1828 }, { "epoch": 4.308598351001178, "grad_norm": 0.5425178408622742, "learning_rate": 2.8152331091167604e-06, "loss": 1.4159, "step": 1829 }, { "epoch": 4.310954063604241, "grad_norm": 0.6118820905685425, "learning_rate": 2.796286101262888e-06, "loss": 1.2488, "step": 1830 }, { "epoch": 4.313309776207303, "grad_norm": 0.5728391408920288, "learning_rate": 2.777399289533164e-06, "loss": 1.5045, "step": 1831 }, { "epoch": 4.315665488810366, "grad_norm": 0.5918765068054199, "learning_rate": 2.75857272513132e-06, "loss": 1.4538, "step": 1832 }, { "epoch": 4.318021201413427, "grad_norm": 0.5906542539596558, "learning_rate": 2.739806459097735e-06, "loss": 1.3849, "step": 1833 }, { "epoch": 4.32037691401649, "grad_norm": 0.6528672575950623, "learning_rate": 2.7211005423093323e-06, "loss": 1.3945, "step": 1834 }, { "epoch": 4.322732626619552, "grad_norm": 0.596585750579834, "learning_rate": 2.7024550254794166e-06, "loss": 1.4151, "step": 1835 }, { "epoch": 4.3250883392226145, "grad_norm": 0.5299412608146667, "learning_rate": 2.683869959157534e-06, "loss": 1.2888, "step": 1836 }, { "epoch": 4.327444051825677, "grad_norm": 0.591060221195221, "learning_rate": 2.6653453937293597e-06, "loss": 1.4346, "step": 1837 }, { "epoch": 4.329799764428739, "grad_norm": 0.6671940088272095, "learning_rate": 2.6468813794165356e-06, "loss": 1.351, "step": 1838 }, { "epoch": 4.332155477031802, "grad_norm": 0.5304927229881287, "learning_rate": 2.628477966276555e-06, "loss": 1.5061, "step": 1839 }, { "epoch": 4.334511189634864, "grad_norm": 0.6004314422607422, "learning_rate": 2.610135204202599e-06, "loss": 1.4883, "step": 1840 }, { "epoch": 4.336866902237927, "grad_norm": 0.5913625955581665, "learning_rate": 2.5918531429234368e-06, "loss": 1.404, "step": 1841 }, { "epoch": 4.339222614840989, "grad_norm": 0.6106451153755188, "learning_rate": 2.573631832003254e-06, "loss": 1.4624, "step": 1842 }, { "epoch": 4.341578327444052, "grad_norm": 0.8274986743927002, "learning_rate": 2.555471320841557e-06, "loss": 1.4146, "step": 1843 }, { "epoch": 4.343934040047114, "grad_norm": 0.5549894571304321, "learning_rate": 2.5373716586730045e-06, "loss": 1.4602, "step": 1844 }, { "epoch": 4.3462897526501765, "grad_norm": 2.361919403076172, "learning_rate": 2.5193328945673e-06, "loss": 1.3038, "step": 1845 }, { "epoch": 4.348645465253239, "grad_norm": 0.5302972197532654, "learning_rate": 2.5013550774290322e-06, "loss": 1.266, "step": 1846 }, { "epoch": 4.351001177856301, "grad_norm": 0.6954131722450256, "learning_rate": 2.483438255997575e-06, "loss": 1.2477, "step": 1847 }, { "epoch": 4.353356890459364, "grad_norm": 0.519315779209137, "learning_rate": 2.4655824788469172e-06, "loss": 1.3659, "step": 1848 }, { "epoch": 4.355712603062426, "grad_norm": 0.5853548645973206, "learning_rate": 2.44778779438557e-06, "loss": 1.4422, "step": 1849 }, { "epoch": 4.358068315665489, "grad_norm": 0.540489912033081, "learning_rate": 2.430054250856412e-06, "loss": 1.2065, "step": 1850 }, { "epoch": 4.360424028268551, "grad_norm": 0.6952165365219116, "learning_rate": 2.41238189633656e-06, "loss": 1.233, "step": 1851 }, { "epoch": 4.362779740871614, "grad_norm": 0.6007671356201172, "learning_rate": 2.39477077873724e-06, "loss": 1.4737, "step": 1852 }, { "epoch": 4.365135453474676, "grad_norm": 3.848316192626953, "learning_rate": 2.3772209458036737e-06, "loss": 1.2964, "step": 1853 }, { "epoch": 4.3674911660777385, "grad_norm": 0.5475996136665344, "learning_rate": 2.359732445114915e-06, "loss": 1.3858, "step": 1854 }, { "epoch": 4.369846878680801, "grad_norm": 0.5624005198478699, "learning_rate": 2.3423053240837515e-06, "loss": 1.4338, "step": 1855 }, { "epoch": 4.372202591283863, "grad_norm": 0.519466757774353, "learning_rate": 2.3249396299565683e-06, "loss": 1.2674, "step": 1856 }, { "epoch": 4.374558303886926, "grad_norm": 0.6249402761459351, "learning_rate": 2.307635409813219e-06, "loss": 1.3799, "step": 1857 }, { "epoch": 4.376914016489988, "grad_norm": 0.5488910675048828, "learning_rate": 2.29039271056688e-06, "loss": 1.4468, "step": 1858 }, { "epoch": 4.379269729093051, "grad_norm": 0.5731770396232605, "learning_rate": 2.2732115789639603e-06, "loss": 1.4935, "step": 1859 }, { "epoch": 4.381625441696113, "grad_norm": 0.7639985680580139, "learning_rate": 2.2560920615839337e-06, "loss": 1.3136, "step": 1860 }, { "epoch": 4.383981154299176, "grad_norm": 0.5105938911437988, "learning_rate": 2.2390342048392467e-06, "loss": 1.2289, "step": 1861 }, { "epoch": 4.386336866902238, "grad_norm": 0.5451468229293823, "learning_rate": 2.2220380549751728e-06, "loss": 1.3795, "step": 1862 }, { "epoch": 4.3886925795053005, "grad_norm": 0.5688575506210327, "learning_rate": 2.2051036580697042e-06, "loss": 1.4971, "step": 1863 }, { "epoch": 4.391048292108363, "grad_norm": 0.5379204154014587, "learning_rate": 2.1882310600333965e-06, "loss": 1.4656, "step": 1864 }, { "epoch": 4.393404004711425, "grad_norm": 0.5878316164016724, "learning_rate": 2.171420306609273e-06, "loss": 1.3373, "step": 1865 }, { "epoch": 4.395759717314488, "grad_norm": 0.5973433256149292, "learning_rate": 2.1546714433726993e-06, "loss": 1.4983, "step": 1866 }, { "epoch": 4.39811542991755, "grad_norm": 0.47319769859313965, "learning_rate": 2.1379845157312338e-06, "loss": 1.2273, "step": 1867 }, { "epoch": 4.400471142520613, "grad_norm": 0.6621816754341125, "learning_rate": 2.1213595689245386e-06, "loss": 1.3411, "step": 1868 }, { "epoch": 4.402826855123675, "grad_norm": 0.5480780005455017, "learning_rate": 2.1047966480242292e-06, "loss": 1.2976, "step": 1869 }, { "epoch": 4.4051825677267376, "grad_norm": 0.5590142011642456, "learning_rate": 2.0882957979337787e-06, "loss": 1.3363, "step": 1870 }, { "epoch": 4.4075382803298, "grad_norm": 0.6572965979576111, "learning_rate": 2.0718570633883576e-06, "loss": 1.4011, "step": 1871 }, { "epoch": 4.409893992932862, "grad_norm": 2.375129461288452, "learning_rate": 2.0554804889547586e-06, "loss": 1.4476, "step": 1872 }, { "epoch": 4.412249705535925, "grad_norm": 0.5405499339103699, "learning_rate": 2.039166119031233e-06, "loss": 1.4241, "step": 1873 }, { "epoch": 4.414605418138987, "grad_norm": 0.5105279088020325, "learning_rate": 2.022913997847417e-06, "loss": 1.3501, "step": 1874 }, { "epoch": 4.41696113074205, "grad_norm": 0.5690165758132935, "learning_rate": 2.0067241694641555e-06, "loss": 1.2709, "step": 1875 }, { "epoch": 4.419316843345112, "grad_norm": 0.5482543706893921, "learning_rate": 1.990596677773435e-06, "loss": 1.4911, "step": 1876 }, { "epoch": 4.421672555948175, "grad_norm": 0.7299177050590515, "learning_rate": 1.9745315664982276e-06, "loss": 1.3588, "step": 1877 }, { "epoch": 4.424028268551237, "grad_norm": 0.6257749795913696, "learning_rate": 1.9585288791924004e-06, "loss": 1.387, "step": 1878 }, { "epoch": 4.4263839811542995, "grad_norm": 0.5491316318511963, "learning_rate": 1.94258865924056e-06, "loss": 1.451, "step": 1879 }, { "epoch": 4.428739693757362, "grad_norm": 0.5066297054290771, "learning_rate": 1.926710949857996e-06, "loss": 1.3225, "step": 1880 }, { "epoch": 4.431095406360424, "grad_norm": 0.5073818564414978, "learning_rate": 1.910895794090492e-06, "loss": 1.3112, "step": 1881 }, { "epoch": 4.433451118963487, "grad_norm": 0.5346611738204956, "learning_rate": 1.8951432348142617e-06, "loss": 1.323, "step": 1882 }, { "epoch": 4.435806831566548, "grad_norm": 0.5573200583457947, "learning_rate": 1.8794533147358074e-06, "loss": 1.3722, "step": 1883 }, { "epoch": 4.438162544169611, "grad_norm": 0.5400240421295166, "learning_rate": 1.8638260763918196e-06, "loss": 1.273, "step": 1884 }, { "epoch": 4.440518256772673, "grad_norm": 0.6530469655990601, "learning_rate": 1.8482615621490313e-06, "loss": 1.4439, "step": 1885 }, { "epoch": 4.442873969375736, "grad_norm": 0.4974363148212433, "learning_rate": 1.8327598142041658e-06, "loss": 1.2706, "step": 1886 }, { "epoch": 4.445229681978798, "grad_norm": 0.5545400381088257, "learning_rate": 1.817320874583739e-06, "loss": 1.3352, "step": 1887 }, { "epoch": 4.447585394581861, "grad_norm": 4.170608043670654, "learning_rate": 1.8019447851440163e-06, "loss": 1.2192, "step": 1888 }, { "epoch": 4.449941107184923, "grad_norm": 0.5207646489143372, "learning_rate": 1.7866315875708529e-06, "loss": 1.4203, "step": 1889 }, { "epoch": 4.4522968197879855, "grad_norm": 0.8415517210960388, "learning_rate": 1.7713813233796173e-06, "loss": 1.3144, "step": 1890 }, { "epoch": 4.454652532391048, "grad_norm": 0.5737892389297485, "learning_rate": 1.7561940339150373e-06, "loss": 1.2982, "step": 1891 }, { "epoch": 4.45700824499411, "grad_norm": 0.49673882126808167, "learning_rate": 1.7410697603511383e-06, "loss": 1.3372, "step": 1892 }, { "epoch": 4.459363957597173, "grad_norm": 2.2103116512298584, "learning_rate": 1.7260085436910794e-06, "loss": 1.473, "step": 1893 }, { "epoch": 4.461719670200235, "grad_norm": 0.5808117985725403, "learning_rate": 1.7110104247670871e-06, "loss": 1.3395, "step": 1894 }, { "epoch": 4.464075382803298, "grad_norm": 0.5760692954063416, "learning_rate": 1.6960754442403054e-06, "loss": 1.3302, "step": 1895 }, { "epoch": 4.46643109540636, "grad_norm": 0.5789401531219482, "learning_rate": 1.6812036426007176e-06, "loss": 1.4811, "step": 1896 }, { "epoch": 4.468786808009423, "grad_norm": 1.6321359872817993, "learning_rate": 1.6663950601670253e-06, "loss": 1.5543, "step": 1897 }, { "epoch": 4.471142520612485, "grad_norm": 0.6832950711250305, "learning_rate": 1.651649737086533e-06, "loss": 1.2313, "step": 1898 }, { "epoch": 4.4734982332155475, "grad_norm": 13.347956657409668, "learning_rate": 1.6369677133350386e-06, "loss": 1.4747, "step": 1899 }, { "epoch": 4.47585394581861, "grad_norm": 0.5401739478111267, "learning_rate": 1.6223490287167352e-06, "loss": 1.4143, "step": 1900 }, { "epoch": 4.478209658421672, "grad_norm": 0.5260170102119446, "learning_rate": 1.6077937228641093e-06, "loss": 1.3797, "step": 1901 }, { "epoch": 4.480565371024735, "grad_norm": 0.7917875051498413, "learning_rate": 1.5933018352377982e-06, "loss": 1.4063, "step": 1902 }, { "epoch": 4.482921083627797, "grad_norm": 0.46747756004333496, "learning_rate": 1.5788734051265268e-06, "loss": 1.413, "step": 1903 }, { "epoch": 4.48527679623086, "grad_norm": 0.8559178709983826, "learning_rate": 1.5645084716469777e-06, "loss": 1.235, "step": 1904 }, { "epoch": 4.487632508833922, "grad_norm": 0.6575620174407959, "learning_rate": 1.550207073743684e-06, "loss": 1.4525, "step": 1905 }, { "epoch": 4.489988221436985, "grad_norm": 0.5338000059127808, "learning_rate": 1.535969250188926e-06, "loss": 1.4644, "step": 1906 }, { "epoch": 4.492343934040047, "grad_norm": 0.5804066061973572, "learning_rate": 1.5217950395826408e-06, "loss": 1.3231, "step": 1907 }, { "epoch": 4.4946996466431095, "grad_norm": 0.5180765390396118, "learning_rate": 1.5076844803522922e-06, "loss": 1.416, "step": 1908 }, { "epoch": 4.4946996466431095, "eval_loss": 1.308779239654541, "eval_runtime": 5.7793, "eval_samples_per_second": 432.576, "eval_steps_per_second": 6.921, "step": 1908 }, { "epoch": 4.497055359246172, "grad_norm": 0.7770318984985352, "learning_rate": 1.4936376107527877e-06, "loss": 1.4627, "step": 1909 }, { "epoch": 4.499411071849234, "grad_norm": 0.728419840335846, "learning_rate": 1.4796544688663622e-06, "loss": 1.3957, "step": 1910 }, { "epoch": 4.501766784452297, "grad_norm": 0.514739453792572, "learning_rate": 1.465735092602491e-06, "loss": 1.3198, "step": 1911 }, { "epoch": 4.504122497055359, "grad_norm": 0.5496022701263428, "learning_rate": 1.4518795196977575e-06, "loss": 1.3178, "step": 1912 }, { "epoch": 4.506478209658422, "grad_norm": 0.5651735067367554, "learning_rate": 1.4380877877157834e-06, "loss": 1.479, "step": 1913 }, { "epoch": 4.508833922261484, "grad_norm": 0.6077483892440796, "learning_rate": 1.424359934047101e-06, "loss": 1.1751, "step": 1914 }, { "epoch": 4.511189634864547, "grad_norm": 0.5971115827560425, "learning_rate": 1.41069599590907e-06, "loss": 1.3967, "step": 1915 }, { "epoch": 4.513545347467609, "grad_norm": 0.5783839225769043, "learning_rate": 1.397096010345772e-06, "loss": 1.4523, "step": 1916 }, { "epoch": 4.5159010600706715, "grad_norm": 0.5992259979248047, "learning_rate": 1.3835600142279082e-06, "loss": 1.6238, "step": 1917 }, { "epoch": 4.518256772673734, "grad_norm": 0.5849583148956299, "learning_rate": 1.3700880442526876e-06, "loss": 1.2789, "step": 1918 }, { "epoch": 4.520612485276796, "grad_norm": 0.6012079119682312, "learning_rate": 1.35668013694375e-06, "loss": 1.3356, "step": 1919 }, { "epoch": 4.522968197879859, "grad_norm": 3.9645302295684814, "learning_rate": 1.3433363286510513e-06, "loss": 1.4715, "step": 1920 }, { "epoch": 4.525323910482921, "grad_norm": 1.2300719022750854, "learning_rate": 1.3300566555507709e-06, "loss": 1.3829, "step": 1921 }, { "epoch": 4.527679623085984, "grad_norm": 0.6704773306846619, "learning_rate": 1.3168411536452152e-06, "loss": 1.3861, "step": 1922 }, { "epoch": 4.530035335689046, "grad_norm": 0.522917628288269, "learning_rate": 1.303689858762716e-06, "loss": 1.2837, "step": 1923 }, { "epoch": 4.532391048292109, "grad_norm": 0.5015364289283752, "learning_rate": 1.2906028065575253e-06, "loss": 1.3298, "step": 1924 }, { "epoch": 4.534746760895171, "grad_norm": 0.5315055251121521, "learning_rate": 1.2775800325097453e-06, "loss": 1.1602, "step": 1925 }, { "epoch": 4.5371024734982335, "grad_norm": 0.5970683097839355, "learning_rate": 1.2646215719251952e-06, "loss": 1.3588, "step": 1926 }, { "epoch": 4.539458186101296, "grad_norm": 0.5525078177452087, "learning_rate": 1.2517274599353507e-06, "loss": 1.2794, "step": 1927 }, { "epoch": 4.541813898704358, "grad_norm": 0.6058875918388367, "learning_rate": 1.2388977314972238e-06, "loss": 1.5115, "step": 1928 }, { "epoch": 4.544169611307421, "grad_norm": 0.5909423232078552, "learning_rate": 1.2261324213932863e-06, "loss": 1.3665, "step": 1929 }, { "epoch": 4.546525323910483, "grad_norm": 0.5407531261444092, "learning_rate": 1.2134315642313514e-06, "loss": 1.3544, "step": 1930 }, { "epoch": 4.548881036513546, "grad_norm": 0.5575234293937683, "learning_rate": 1.2007951944445122e-06, "loss": 1.3037, "step": 1931 }, { "epoch": 4.551236749116608, "grad_norm": 0.554286003112793, "learning_rate": 1.1882233462910258e-06, "loss": 1.3379, "step": 1932 }, { "epoch": 4.553592461719671, "grad_norm": 0.5546988844871521, "learning_rate": 1.1757160538542117e-06, "loss": 1.4517, "step": 1933 }, { "epoch": 4.555948174322733, "grad_norm": 2.648643732070923, "learning_rate": 1.1632733510423933e-06, "loss": 1.2953, "step": 1934 }, { "epoch": 4.5583038869257955, "grad_norm": 1.2795376777648926, "learning_rate": 1.1508952715887755e-06, "loss": 1.4551, "step": 1935 }, { "epoch": 4.560659599528858, "grad_norm": 0.784894585609436, "learning_rate": 1.1385818490513733e-06, "loss": 1.4946, "step": 1936 }, { "epoch": 4.56301531213192, "grad_norm": 0.5555356740951538, "learning_rate": 1.1263331168128916e-06, "loss": 1.2889, "step": 1937 }, { "epoch": 4.565371024734983, "grad_norm": 1.154402256011963, "learning_rate": 1.114149108080678e-06, "loss": 1.3351, "step": 1938 }, { "epoch": 4.567726737338045, "grad_norm": 0.5165104269981384, "learning_rate": 1.1020298558865883e-06, "loss": 1.2371, "step": 1939 }, { "epoch": 4.570082449941108, "grad_norm": 0.5628257989883423, "learning_rate": 1.0899753930869394e-06, "loss": 1.3615, "step": 1940 }, { "epoch": 4.572438162544169, "grad_norm": 0.5067851543426514, "learning_rate": 1.0779857523623815e-06, "loss": 1.3503, "step": 1941 }, { "epoch": 4.574793875147232, "grad_norm": 0.5496648550033569, "learning_rate": 1.0660609662178329e-06, "loss": 1.5686, "step": 1942 }, { "epoch": 4.577149587750294, "grad_norm": 0.5158296823501587, "learning_rate": 1.0542010669823855e-06, "loss": 1.3414, "step": 1943 }, { "epoch": 4.579505300353357, "grad_norm": 0.5562631487846375, "learning_rate": 1.0424060868092195e-06, "loss": 1.4584, "step": 1944 }, { "epoch": 4.581861012956419, "grad_norm": 0.5408490300178528, "learning_rate": 1.0306760576755058e-06, "loss": 1.3393, "step": 1945 }, { "epoch": 4.5842167255594815, "grad_norm": 0.5896109938621521, "learning_rate": 1.0190110113823426e-06, "loss": 1.4175, "step": 1946 }, { "epoch": 4.586572438162544, "grad_norm": 5.684402942657471, "learning_rate": 1.0074109795546406e-06, "loss": 1.3122, "step": 1947 }, { "epoch": 4.588928150765606, "grad_norm": 0.5548268556594849, "learning_rate": 9.958759936410573e-07, "loss": 1.3945, "step": 1948 }, { "epoch": 4.591283863368669, "grad_norm": 0.5605316162109375, "learning_rate": 9.844060849138997e-07, "loss": 1.3913, "step": 1949 }, { "epoch": 4.593639575971731, "grad_norm": 0.5162404179573059, "learning_rate": 9.730012844690522e-07, "loss": 1.4062, "step": 1950 }, { "epoch": 4.595995288574794, "grad_norm": 0.5414873957633972, "learning_rate": 9.61661623225879e-07, "loss": 1.433, "step": 1951 }, { "epoch": 4.598351001177856, "grad_norm": 0.5329586267471313, "learning_rate": 9.503871319271551e-07, "loss": 1.3322, "step": 1952 }, { "epoch": 4.6007067137809186, "grad_norm": 0.6973456144332886, "learning_rate": 9.391778411389634e-07, "loss": 1.4413, "step": 1953 }, { "epoch": 4.603062426383981, "grad_norm": 0.5985949635505676, "learning_rate": 9.28033781250634e-07, "loss": 1.4006, "step": 1954 }, { "epoch": 4.605418138987043, "grad_norm": 0.5078518390655518, "learning_rate": 9.169549824746382e-07, "loss": 1.4029, "step": 1955 }, { "epoch": 4.607773851590106, "grad_norm": 0.569061815738678, "learning_rate": 9.059414748465278e-07, "loss": 1.2725, "step": 1956 }, { "epoch": 4.610129564193168, "grad_norm": 0.56297767162323, "learning_rate": 8.949932882248435e-07, "loss": 1.3302, "step": 1957 }, { "epoch": 4.612485276796231, "grad_norm": 0.5031496286392212, "learning_rate": 8.841104522910343e-07, "loss": 1.2728, "step": 1958 }, { "epoch": 4.614840989399293, "grad_norm": 0.6081288456916809, "learning_rate": 8.732929965493769e-07, "loss": 1.3791, "step": 1959 }, { "epoch": 4.617196702002356, "grad_norm": 0.6075167655944824, "learning_rate": 8.625409503268955e-07, "loss": 1.4588, "step": 1960 }, { "epoch": 4.619552414605418, "grad_norm": 0.5594125986099243, "learning_rate": 8.51854342773295e-07, "loss": 1.3184, "step": 1961 }, { "epoch": 4.6219081272084805, "grad_norm": 0.5300228595733643, "learning_rate": 8.412332028608499e-07, "loss": 1.2987, "step": 1962 }, { "epoch": 4.624263839811543, "grad_norm": 0.5662802457809448, "learning_rate": 8.306775593843657e-07, "loss": 1.4146, "step": 1963 }, { "epoch": 4.626619552414605, "grad_norm": 0.5653339624404907, "learning_rate": 8.201874409610733e-07, "loss": 1.5268, "step": 1964 }, { "epoch": 4.628975265017668, "grad_norm": 0.5869760513305664, "learning_rate": 8.097628760305592e-07, "loss": 1.5129, "step": 1965 }, { "epoch": 4.63133097762073, "grad_norm": 0.5987149477005005, "learning_rate": 7.994038928546887e-07, "loss": 1.2959, "step": 1966 }, { "epoch": 4.633686690223793, "grad_norm": 0.518696129322052, "learning_rate": 7.891105195175358e-07, "loss": 1.4284, "step": 1967 }, { "epoch": 4.636042402826855, "grad_norm": 0.6120498776435852, "learning_rate": 7.788827839252888e-07, "loss": 1.3733, "step": 1968 }, { "epoch": 4.638398115429918, "grad_norm": 0.6187866926193237, "learning_rate": 7.687207138062008e-07, "loss": 1.1905, "step": 1969 }, { "epoch": 4.64075382803298, "grad_norm": 2.480478286743164, "learning_rate": 7.586243367104894e-07, "loss": 1.3442, "step": 1970 }, { "epoch": 4.6431095406360425, "grad_norm": 0.5601136088371277, "learning_rate": 7.485936800102788e-07, "loss": 1.3871, "step": 1971 }, { "epoch": 4.645465253239105, "grad_norm": 0.5742787718772888, "learning_rate": 7.38628770899516e-07, "loss": 1.4146, "step": 1972 }, { "epoch": 4.647820965842167, "grad_norm": 0.611815869808197, "learning_rate": 7.287296363939045e-07, "loss": 1.2895, "step": 1973 }, { "epoch": 4.65017667844523, "grad_norm": 0.625275194644928, "learning_rate": 7.188963033308183e-07, "loss": 1.5098, "step": 1974 }, { "epoch": 4.652532391048292, "grad_norm": 0.7957082986831665, "learning_rate": 7.091287983692546e-07, "loss": 1.5056, "step": 1975 }, { "epoch": 4.654888103651355, "grad_norm": 3.651258707046509, "learning_rate": 6.994271479897314e-07, "loss": 1.446, "step": 1976 }, { "epoch": 4.657243816254417, "grad_norm": 0.5869871973991394, "learning_rate": 6.897913784942339e-07, "loss": 1.3738, "step": 1977 }, { "epoch": 4.65959952885748, "grad_norm": 0.5508609414100647, "learning_rate": 6.802215160061381e-07, "loss": 1.3562, "step": 1978 }, { "epoch": 4.661955241460542, "grad_norm": 0.5590100884437561, "learning_rate": 6.707175864701431e-07, "loss": 1.409, "step": 1979 }, { "epoch": 4.6643109540636045, "grad_norm": 0.591913104057312, "learning_rate": 6.612796156521911e-07, "loss": 1.3951, "step": 1980 }, { "epoch": 4.666666666666667, "grad_norm": 0.6022026538848877, "learning_rate": 6.519076291394172e-07, "loss": 1.3233, "step": 1981 }, { "epoch": 4.669022379269729, "grad_norm": 1.7853941917419434, "learning_rate": 6.426016523400552e-07, "loss": 1.2927, "step": 1982 }, { "epoch": 4.671378091872792, "grad_norm": 0.5891311168670654, "learning_rate": 6.333617104833878e-07, "loss": 1.2923, "step": 1983 }, { "epoch": 4.673733804475854, "grad_norm": 0.6010372638702393, "learning_rate": 6.241878286196684e-07, "loss": 1.3789, "step": 1984 }, { "epoch": 4.676089517078917, "grad_norm": 0.5356823205947876, "learning_rate": 6.150800316200605e-07, "loss": 1.3677, "step": 1985 }, { "epoch": 4.678445229681979, "grad_norm": 0.6064110994338989, "learning_rate": 6.060383441765544e-07, "loss": 1.3906, "step": 1986 }, { "epoch": 4.680800942285041, "grad_norm": 0.6271969079971313, "learning_rate": 5.970627908019333e-07, "loss": 1.5169, "step": 1987 }, { "epoch": 4.683156654888103, "grad_norm": 0.8259994983673096, "learning_rate": 5.881533958296631e-07, "loss": 1.368, "step": 1988 }, { "epoch": 4.685512367491166, "grad_norm": 0.5376898050308228, "learning_rate": 5.793101834138615e-07, "loss": 1.4107, "step": 1989 }, { "epoch": 4.687868080094228, "grad_norm": 0.5075318813323975, "learning_rate": 5.705331775292144e-07, "loss": 1.2154, "step": 1990 }, { "epoch": 4.6902237926972905, "grad_norm": 0.5572957396507263, "learning_rate": 5.618224019709212e-07, "loss": 1.35, "step": 1991 }, { "epoch": 4.692579505300353, "grad_norm": 2.8583261966705322, "learning_rate": 5.531778803546217e-07, "loss": 1.35, "step": 1992 }, { "epoch": 4.694935217903415, "grad_norm": 0.5289627313613892, "learning_rate": 5.445996361163358e-07, "loss": 1.2836, "step": 1993 }, { "epoch": 4.697290930506478, "grad_norm": 0.5760498642921448, "learning_rate": 5.360876925123992e-07, "loss": 1.4039, "step": 1994 }, { "epoch": 4.69964664310954, "grad_norm": 0.6049121618270874, "learning_rate": 5.276420726194053e-07, "loss": 1.318, "step": 1995 }, { "epoch": 4.702002355712603, "grad_norm": 4.852107524871826, "learning_rate": 5.192627993341359e-07, "loss": 1.3605, "step": 1996 }, { "epoch": 4.704358068315665, "grad_norm": 0.5550298690795898, "learning_rate": 5.109498953734971e-07, "loss": 1.3902, "step": 1997 }, { "epoch": 4.706713780918728, "grad_norm": 0.7088170647621155, "learning_rate": 5.027033832744693e-07, "loss": 1.3623, "step": 1998 }, { "epoch": 4.70906949352179, "grad_norm": 0.532736599445343, "learning_rate": 4.945232853940357e-07, "loss": 1.3525, "step": 1999 }, { "epoch": 4.7114252061248525, "grad_norm": 0.5044472813606262, "learning_rate": 4.864096239091287e-07, "loss": 1.4451, "step": 2000 }, { "epoch": 4.713780918727915, "grad_norm": 0.4897443652153015, "learning_rate": 4.783624208165554e-07, "loss": 1.2115, "step": 2001 }, { "epoch": 4.716136631330977, "grad_norm": 0.5601485967636108, "learning_rate": 4.703816979329617e-07, "loss": 1.4014, "step": 2002 }, { "epoch": 4.71849234393404, "grad_norm": 1.9382950067520142, "learning_rate": 4.6246747689474847e-07, "loss": 1.4386, "step": 2003 }, { "epoch": 4.720848056537102, "grad_norm": 0.7967495918273926, "learning_rate": 4.5461977915803864e-07, "loss": 1.2647, "step": 2004 }, { "epoch": 4.723203769140165, "grad_norm": 0.6108955144882202, "learning_rate": 4.468386259985885e-07, "loss": 1.5954, "step": 2005 }, { "epoch": 4.725559481743227, "grad_norm": 0.5450281500816345, "learning_rate": 4.3912403851176234e-07, "loss": 1.4268, "step": 2006 }, { "epoch": 4.72791519434629, "grad_norm": 0.5530911684036255, "learning_rate": 4.314760376124466e-07, "loss": 1.4405, "step": 2007 }, { "epoch": 4.730270906949352, "grad_norm": 0.47590357065200806, "learning_rate": 4.2389464403501113e-07, "loss": 1.1666, "step": 2008 }, { "epoch": 4.7326266195524145, "grad_norm": 0.5035638809204102, "learning_rate": 4.1637987833325067e-07, "loss": 1.2432, "step": 2009 }, { "epoch": 4.734982332155477, "grad_norm": 0.6395200490951538, "learning_rate": 4.0893176088031835e-07, "loss": 1.4506, "step": 2010 }, { "epoch": 4.737338044758539, "grad_norm": 0.5731927752494812, "learning_rate": 4.01550311868687e-07, "loss": 1.3493, "step": 2011 }, { "epoch": 4.739693757361602, "grad_norm": 0.6976472735404968, "learning_rate": 3.9423555131007925e-07, "loss": 1.4717, "step": 2012 }, { "epoch": 4.742049469964664, "grad_norm": 0.7000354528427124, "learning_rate": 3.8698749903542666e-07, "loss": 1.4349, "step": 2013 }, { "epoch": 4.744405182567727, "grad_norm": 0.5687220096588135, "learning_rate": 3.7980617469479953e-07, "loss": 1.3242, "step": 2014 }, { "epoch": 4.746760895170789, "grad_norm": 0.5357676148414612, "learning_rate": 3.726915977573714e-07, "loss": 1.2561, "step": 2015 }, { "epoch": 4.749116607773852, "grad_norm": 0.6018729209899902, "learning_rate": 3.656437875113522e-07, "loss": 1.3238, "step": 2016 }, { "epoch": 4.751472320376914, "grad_norm": 0.5533334016799927, "learning_rate": 3.586627630639466e-07, "loss": 1.3682, "step": 2017 }, { "epoch": 4.7538280329799765, "grad_norm": 0.5604222416877747, "learning_rate": 3.517485433412987e-07, "loss": 1.2287, "step": 2018 }, { "epoch": 4.756183745583039, "grad_norm": 0.6008772253990173, "learning_rate": 3.4490114708843056e-07, "loss": 1.3033, "step": 2019 }, { "epoch": 4.758539458186101, "grad_norm": 0.5649879574775696, "learning_rate": 3.3812059286920937e-07, "loss": 1.4575, "step": 2020 }, { "epoch": 4.760895170789164, "grad_norm": 0.5633196234703064, "learning_rate": 3.3140689906628054e-07, "loss": 1.2859, "step": 2021 }, { "epoch": 4.763250883392226, "grad_norm": 0.49205639958381653, "learning_rate": 3.2476008388102887e-07, "loss": 1.3567, "step": 2022 }, { "epoch": 4.765606595995289, "grad_norm": 0.5292999148368835, "learning_rate": 3.181801653335315e-07, "loss": 1.4489, "step": 2023 }, { "epoch": 4.767962308598351, "grad_norm": 0.5442575216293335, "learning_rate": 3.1166716126249663e-07, "loss": 1.314, "step": 2024 }, { "epoch": 4.770318021201414, "grad_norm": 0.5510631799697876, "learning_rate": 3.0522108932521667e-07, "loss": 1.3884, "step": 2025 }, { "epoch": 4.772673733804476, "grad_norm": 0.6074433922767639, "learning_rate": 2.9884196699753453e-07, "loss": 1.4481, "step": 2026 }, { "epoch": 4.7750294464075385, "grad_norm": 1.5106278657913208, "learning_rate": 2.9252981157378847e-07, "loss": 1.3359, "step": 2027 }, { "epoch": 4.777385159010601, "grad_norm": 0.5945783853530884, "learning_rate": 2.862846401667507e-07, "loss": 1.2999, "step": 2028 }, { "epoch": 4.779740871613663, "grad_norm": 0.5221632719039917, "learning_rate": 2.801064697076111e-07, "loss": 1.3526, "step": 2029 }, { "epoch": 4.782096584216726, "grad_norm": 0.610538899898529, "learning_rate": 2.739953169458992e-07, "loss": 1.3902, "step": 2030 }, { "epoch": 4.784452296819788, "grad_norm": 0.6049677133560181, "learning_rate": 2.6795119844946757e-07, "loss": 1.4577, "step": 2031 }, { "epoch": 4.786808009422851, "grad_norm": 0.5373300909996033, "learning_rate": 2.6197413060442266e-07, "loss": 1.3164, "step": 2032 }, { "epoch": 4.789163722025913, "grad_norm": 0.6083728671073914, "learning_rate": 2.560641296150967e-07, "loss": 1.3056, "step": 2033 }, { "epoch": 4.791519434628976, "grad_norm": 0.6079051494598389, "learning_rate": 2.5022121150399257e-07, "loss": 1.526, "step": 2034 }, { "epoch": 4.793875147232038, "grad_norm": 0.5965662002563477, "learning_rate": 2.4444539211175563e-07, "loss": 1.4582, "step": 2035 }, { "epoch": 4.7962308598351004, "grad_norm": 0.5757274627685547, "learning_rate": 2.387366870971103e-07, "loss": 1.3951, "step": 2036 }, { "epoch": 4.798586572438163, "grad_norm": 0.9061786532402039, "learning_rate": 2.3309511193683466e-07, "loss": 1.3974, "step": 2037 }, { "epoch": 4.800942285041225, "grad_norm": 3.8841683864593506, "learning_rate": 2.2752068192571084e-07, "loss": 1.4181, "step": 2038 }, { "epoch": 4.803297997644288, "grad_norm": 0.53260338306427, "learning_rate": 2.2201341217648331e-07, "loss": 1.3613, "step": 2039 }, { "epoch": 4.80565371024735, "grad_norm": 0.6298583745956421, "learning_rate": 2.165733176198198e-07, "loss": 1.5285, "step": 2040 }, { "epoch": 4.808009422850413, "grad_norm": 0.5340294241905212, "learning_rate": 2.112004130042755e-07, "loss": 1.3955, "step": 2041 }, { "epoch": 4.810365135453475, "grad_norm": 0.5168047547340393, "learning_rate": 2.0589471289624018e-07, "loss": 1.3531, "step": 2042 }, { "epoch": 4.8127208480565375, "grad_norm": 0.5599393248558044, "learning_rate": 2.006562316799132e-07, "loss": 1.4151, "step": 2043 }, { "epoch": 4.8150765606596, "grad_norm": 0.5982971787452698, "learning_rate": 1.9548498355725088e-07, "loss": 1.336, "step": 2044 }, { "epoch": 4.817432273262662, "grad_norm": 0.8993967771530151, "learning_rate": 1.9038098254794136e-07, "loss": 1.3897, "step": 2045 }, { "epoch": 4.819787985865725, "grad_norm": 0.5495524406433105, "learning_rate": 1.8534424248935756e-07, "loss": 1.3488, "step": 2046 }, { "epoch": 4.822143698468786, "grad_norm": 0.5085054039955139, "learning_rate": 1.8037477703652383e-07, "loss": 1.1436, "step": 2047 }, { "epoch": 4.824499411071849, "grad_norm": 0.8685007095336914, "learning_rate": 1.7547259966207708e-07, "loss": 1.2931, "step": 2048 }, { "epoch": 4.826855123674911, "grad_norm": 0.45470675826072693, "learning_rate": 1.7063772365622787e-07, "loss": 1.308, "step": 2049 }, { "epoch": 4.829210836277974, "grad_norm": 0.5802318453788757, "learning_rate": 1.6587016212672724e-07, "loss": 1.4797, "step": 2050 }, { "epoch": 4.831566548881036, "grad_norm": 0.5562822818756104, "learning_rate": 1.61169927998836e-07, "loss": 1.4854, "step": 2051 }, { "epoch": 4.833922261484099, "grad_norm": 0.6536052823066711, "learning_rate": 1.565370340152833e-07, "loss": 1.2102, "step": 2052 }, { "epoch": 4.836277974087161, "grad_norm": 0.5096936225891113, "learning_rate": 1.5197149273623036e-07, "loss": 1.1989, "step": 2053 }, { "epoch": 4.8386336866902235, "grad_norm": 0.6253844499588013, "learning_rate": 1.4747331653923724e-07, "loss": 1.4715, "step": 2054 }, { "epoch": 4.840989399293286, "grad_norm": 0.6470111608505249, "learning_rate": 1.430425176192407e-07, "loss": 1.4099, "step": 2055 }, { "epoch": 4.843345111896348, "grad_norm": 1.183488368988037, "learning_rate": 1.3867910798850692e-07, "loss": 1.3483, "step": 2056 }, { "epoch": 4.845700824499411, "grad_norm": 0.6262606978416443, "learning_rate": 1.343830994765982e-07, "loss": 1.3766, "step": 2057 }, { "epoch": 4.848056537102473, "grad_norm": 0.5454447865486145, "learning_rate": 1.301545037303592e-07, "loss": 1.4124, "step": 2058 }, { "epoch": 4.850412249705536, "grad_norm": 2.6874871253967285, "learning_rate": 1.25993332213864e-07, "loss": 1.3339, "step": 2059 }, { "epoch": 4.852767962308598, "grad_norm": 0.5370191931724548, "learning_rate": 1.2189959620839686e-07, "loss": 1.3391, "step": 2060 }, { "epoch": 4.855123674911661, "grad_norm": 0.5688257813453674, "learning_rate": 1.1787330681241881e-07, "loss": 1.2655, "step": 2061 }, { "epoch": 4.857479387514723, "grad_norm": 0.5321575999259949, "learning_rate": 1.139144749415344e-07, "loss": 1.319, "step": 2062 }, { "epoch": 4.8598351001177855, "grad_norm": 0.6976641416549683, "learning_rate": 1.1002311132846944e-07, "loss": 1.2845, "step": 2063 }, { "epoch": 4.862190812720848, "grad_norm": 0.6108237504959106, "learning_rate": 1.0619922652303494e-07, "loss": 1.2977, "step": 2064 }, { "epoch": 4.86454652532391, "grad_norm": 0.5808330774307251, "learning_rate": 1.0244283089210216e-07, "loss": 1.4585, "step": 2065 }, { "epoch": 4.866902237926973, "grad_norm": 0.5663031935691833, "learning_rate": 9.87539346195776e-08, "loss": 1.3218, "step": 2066 }, { "epoch": 4.869257950530035, "grad_norm": 0.6737905740737915, "learning_rate": 9.513254770636137e-08, "loss": 1.5467, "step": 2067 }, { "epoch": 4.871613663133098, "grad_norm": 0.8440515398979187, "learning_rate": 9.157867997034164e-08, "loss": 1.4143, "step": 2068 }, { "epoch": 4.87396937573616, "grad_norm": 0.5414733290672302, "learning_rate": 8.809234104634468e-08, "loss": 1.291, "step": 2069 }, { "epoch": 4.876325088339223, "grad_norm": 0.548550009727478, "learning_rate": 8.467354038613207e-08, "loss": 1.2752, "step": 2070 }, { "epoch": 4.878680800942285, "grad_norm": 0.508431613445282, "learning_rate": 8.132228725835634e-08, "loss": 1.3456, "step": 2071 }, { "epoch": 4.8810365135453475, "grad_norm": 0.5035206079483032, "learning_rate": 7.803859074854425e-08, "loss": 1.2165, "step": 2072 }, { "epoch": 4.88339222614841, "grad_norm": 0.5174320340156555, "learning_rate": 7.482245975907188e-08, "loss": 1.2895, "step": 2073 }, { "epoch": 4.885747938751472, "grad_norm": 0.5007889270782471, "learning_rate": 7.167390300913957e-08, "loss": 1.3353, "step": 2074 }, { "epoch": 4.888103651354535, "grad_norm": 0.4999377131462097, "learning_rate": 6.859292903474702e-08, "loss": 1.2678, "step": 2075 }, { "epoch": 4.890459363957597, "grad_norm": 0.5601775646209717, "learning_rate": 6.557954618867102e-08, "loss": 1.263, "step": 2076 }, { "epoch": 4.89281507656066, "grad_norm": 0.5824906229972839, "learning_rate": 6.263376264044885e-08, "loss": 1.4603, "step": 2077 }, { "epoch": 4.895170789163722, "grad_norm": 0.562465488910675, "learning_rate": 5.975558637634215e-08, "loss": 1.3562, "step": 2078 }, { "epoch": 4.897526501766785, "grad_norm": 0.5246828198432922, "learning_rate": 5.6945025199325874e-08, "loss": 1.2849, "step": 2079 }, { "epoch": 4.899882214369847, "grad_norm": 0.7847094535827637, "learning_rate": 5.4202086729071564e-08, "loss": 1.2639, "step": 2080 }, { "epoch": 4.9022379269729095, "grad_norm": 0.5356783270835876, "learning_rate": 5.1526778401911334e-08, "loss": 1.2795, "step": 2081 }, { "epoch": 4.904593639575972, "grad_norm": 0.5280889868736267, "learning_rate": 4.891910747082673e-08, "loss": 1.3188, "step": 2082 }, { "epoch": 4.906949352179034, "grad_norm": 0.5300598740577698, "learning_rate": 4.6379081005437644e-08, "loss": 1.344, "step": 2083 }, { "epoch": 4.909305064782097, "grad_norm": 0.5125104784965515, "learning_rate": 4.390670589196622e-08, "loss": 1.3763, "step": 2084 }, { "epoch": 4.911660777385159, "grad_norm": 0.6045590043067932, "learning_rate": 4.150198883322298e-08, "loss": 1.55, "step": 2085 }, { "epoch": 4.914016489988222, "grad_norm": 1.1757795810699463, "learning_rate": 3.916493634860407e-08, "loss": 1.3817, "step": 2086 }, { "epoch": 4.916372202591284, "grad_norm": 0.9464888572692871, "learning_rate": 3.6895554774052375e-08, "loss": 1.2897, "step": 2087 }, { "epoch": 4.918727915194347, "grad_norm": 0.5540003776550293, "learning_rate": 3.4693850262046415e-08, "loss": 1.5529, "step": 2088 }, { "epoch": 4.921083627797409, "grad_norm": 0.4782203137874603, "learning_rate": 3.25598287815948e-08, "loss": 1.2636, "step": 2089 }, { "epoch": 4.9234393404004715, "grad_norm": 0.5390238165855408, "learning_rate": 3.049349611820851e-08, "loss": 1.4047, "step": 2090 }, { "epoch": 4.925795053003534, "grad_norm": 0.5529649257659912, "learning_rate": 2.8494857873889724e-08, "loss": 1.3747, "step": 2091 }, { "epoch": 4.928150765606596, "grad_norm": 0.6802988648414612, "learning_rate": 2.6563919467106903e-08, "loss": 1.3489, "step": 2092 }, { "epoch": 4.930506478209658, "grad_norm": 0.5493934154510498, "learning_rate": 2.4700686132803076e-08, "loss": 1.3456, "step": 2093 }, { "epoch": 4.93286219081272, "grad_norm": 0.5638847351074219, "learning_rate": 2.2905162922354228e-08, "loss": 1.4031, "step": 2094 }, { "epoch": 4.935217903415783, "grad_norm": 0.5345922112464905, "learning_rate": 2.1177354703577623e-08, "loss": 1.2965, "step": 2095 }, { "epoch": 4.937573616018845, "grad_norm": 0.7797779440879822, "learning_rate": 1.9517266160704038e-08, "loss": 1.3101, "step": 2096 }, { "epoch": 4.939929328621908, "grad_norm": 0.5202140808105469, "learning_rate": 1.792490179437223e-08, "loss": 1.2684, "step": 2097 }, { "epoch": 4.94228504122497, "grad_norm": 0.5585792064666748, "learning_rate": 1.6400265921615032e-08, "loss": 1.4345, "step": 2098 }, { "epoch": 4.944640753828033, "grad_norm": 0.595483660697937, "learning_rate": 1.4943362675848283e-08, "loss": 1.4531, "step": 2099 }, { "epoch": 4.946996466431095, "grad_norm": 0.5386458039283752, "learning_rate": 1.3554196006854147e-08, "loss": 1.403, "step": 2100 }, { "epoch": 4.9493521790341575, "grad_norm": 0.682745099067688, "learning_rate": 1.2232769680789457e-08, "loss": 1.3931, "step": 2101 }, { "epoch": 4.95170789163722, "grad_norm": 0.6084470152854919, "learning_rate": 1.0979087280141298e-08, "loss": 1.502, "step": 2102 }, { "epoch": 4.954063604240282, "grad_norm": 0.7520517110824585, "learning_rate": 9.793152203751988e-09, "loss": 1.3457, "step": 2103 }, { "epoch": 4.956419316843345, "grad_norm": 0.5575360059738159, "learning_rate": 8.674967666791323e-09, "loss": 1.3188, "step": 2104 }, { "epoch": 4.958775029446407, "grad_norm": 0.5242128968238831, "learning_rate": 7.624536700751029e-09, "loss": 1.2901, "step": 2105 }, { "epoch": 4.96113074204947, "grad_norm": 0.5550107955932617, "learning_rate": 6.641862153433653e-09, "loss": 1.3033, "step": 2106 }, { "epoch": 4.963486454652532, "grad_norm": 0.5081865787506104, "learning_rate": 5.726946688955348e-09, "loss": 1.2357, "step": 2107 }, { "epoch": 4.965842167255595, "grad_norm": 0.6288586258888245, "learning_rate": 4.87979278772921e-09, "loss": 1.454, "step": 2108 }, { "epoch": 4.968197879858657, "grad_norm": 0.5291398167610168, "learning_rate": 4.100402746456955e-09, "loss": 1.4075, "step": 2109 }, { "epoch": 4.9705535924617195, "grad_norm": 0.5045610666275024, "learning_rate": 3.3887786781316987e-09, "loss": 1.4367, "step": 2110 }, { "epoch": 4.972909305064782, "grad_norm": 0.5701635479927063, "learning_rate": 2.7449225120268484e-09, "loss": 1.4982, "step": 2111 }, { "epoch": 4.975265017667844, "grad_norm": 0.6047322154045105, "learning_rate": 2.1688359936905547e-09, "loss": 1.3846, "step": 2112 }, { "epoch": 4.977620730270907, "grad_norm": 0.5273966789245605, "learning_rate": 1.6605206849373833e-09, "loss": 1.2688, "step": 2113 }, { "epoch": 4.979976442873969, "grad_norm": 0.5633477568626404, "learning_rate": 1.2199779638566444e-09, "loss": 1.4033, "step": 2114 }, { "epoch": 4.982332155477032, "grad_norm": 2.209848642349243, "learning_rate": 8.472090247957364e-10, "loss": 1.4045, "step": 2115 }, { "epoch": 4.984687868080094, "grad_norm": 0.5708813071250916, "learning_rate": 5.422148783629233e-10, "loss": 1.4687, "step": 2116 }, { "epoch": 4.987043580683157, "grad_norm": 0.514054000377655, "learning_rate": 3.0499635141900684e-10, "loss": 1.3755, "step": 2117 }, { "epoch": 4.989399293286219, "grad_norm": 0.6345243453979492, "learning_rate": 1.3555408708842977e-10, "loss": 1.456, "step": 2118 }, { "epoch": 4.9917550058892814, "grad_norm": 0.49476099014282227, "learning_rate": 3.388854473984626e-11, "loss": 1.2984, "step": 2119 }, { "epoch": 4.994110718492344, "grad_norm": 0.6101141571998596, "learning_rate": 0.0, "loss": 1.3852, "step": 2120 }, { "epoch": 4.994110718492344, "eval_loss": 1.3083828687667847, "eval_runtime": 5.7908, "eval_samples_per_second": 431.721, "eval_steps_per_second": 6.908, "step": 2120 } ], "logging_steps": 1, "max_steps": 2120, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 212, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.069759780786176e+16, "train_batch_size": 64, "trial_name": null, "trial_params": null }