diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,22442 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 1600, + "global_step": 3199, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00031259768677711783, + "grad_norm": 0.21875, + "learning_rate": 2e-05, + "loss": 2.2083, + "step": 1 + }, + { + "epoch": 0.00031259768677711783, + "eval_loss": 2.0116653442382812, + "eval_runtime": 1898.4475, + "eval_samples_per_second": 4.813, + "eval_steps_per_second": 2.407, + "step": 1 + }, + { + "epoch": 0.0006251953735542357, + "grad_norm": 0.216796875, + "learning_rate": 4e-05, + "loss": 2.0017, + "step": 2 + }, + { + "epoch": 0.0009377930603313535, + "grad_norm": 0.216796875, + "learning_rate": 6e-05, + "loss": 2.2668, + "step": 3 + }, + { + "epoch": 0.0012503907471084713, + "grad_norm": 0.220703125, + "learning_rate": 8e-05, + "loss": 1.9291, + "step": 4 + }, + { + "epoch": 0.0015629884338855893, + "grad_norm": 0.22265625, + "learning_rate": 0.0001, + "loss": 1.8984, + "step": 5 + }, + { + "epoch": 0.001875586120662707, + "grad_norm": 0.1845703125, + "learning_rate": 0.00012, + "loss": 2.1924, + "step": 6 + }, + { + "epoch": 0.002188183807439825, + "grad_norm": 0.1982421875, + "learning_rate": 0.00014, + "loss": 2.135, + "step": 7 + }, + { + "epoch": 0.0025007814942169426, + "grad_norm": 0.1865234375, + "learning_rate": 0.00016, + "loss": 2.0434, + "step": 8 + }, + { + "epoch": 0.002813379180994061, + "grad_norm": 0.1826171875, + "learning_rate": 0.00018, + "loss": 1.892, + "step": 9 + }, + { + "epoch": 0.0031259768677711786, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 2.0976, + "step": 10 + }, + { + "epoch": 0.0034385745545482964, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001999999969814363, + "loss": 2.257, + "step": 11 + }, + { + "epoch": 0.003751172241325414, + "grad_norm": 0.185546875, + "learning_rate": 0.00019999998792574533, + "loss": 2.0452, + "step": 12 + }, + { + "epoch": 0.004063769928102532, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019999997283292765, + "loss": 1.9728, + "step": 13 + }, + { + "epoch": 0.00437636761487965, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001999999517029842, + "loss": 2.0065, + "step": 14 + }, + { + "epoch": 0.004688965301656768, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019999992453591622, + "loss": 2.0628, + "step": 15 + }, + { + "epoch": 0.005001562988433885, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019999989133172538, + "loss": 2.0292, + "step": 16 + }, + { + "epoch": 0.0053141606752110035, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019999985209041366, + "loss": 1.9125, + "step": 17 + }, + { + "epoch": 0.005626758361988122, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019999980681198345, + "loss": 1.8677, + "step": 18 + }, + { + "epoch": 0.005939356048765239, + "grad_norm": 0.16796875, + "learning_rate": 0.00019999975549643746, + "loss": 1.984, + "step": 19 + }, + { + "epoch": 0.006251953735542357, + "grad_norm": 0.177734375, + "learning_rate": 0.00019999969814377878, + "loss": 1.9886, + "step": 20 + }, + { + "epoch": 0.006564551422319475, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001999996347540109, + "loss": 2.0277, + "step": 21 + }, + { + "epoch": 0.006877149109096593, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001999995653271377, + "loss": 2.0216, + "step": 22 + }, + { + "epoch": 0.00718974679587371, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019999948986316324, + "loss": 1.8909, + "step": 23 + }, + { + "epoch": 0.007502344482650828, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019999940836209215, + "loss": 1.9551, + "step": 24 + }, + { + "epoch": 0.007814942169427946, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019999932082392937, + "loss": 1.9186, + "step": 25 + }, + { + "epoch": 0.008127539856205065, + "grad_norm": 0.18359375, + "learning_rate": 0.00019999922724868015, + "loss": 2.076, + "step": 26 + }, + { + "epoch": 0.008440137542982182, + "grad_norm": 0.181640625, + "learning_rate": 0.00019999912763635016, + "loss": 1.9682, + "step": 27 + }, + { + "epoch": 0.0087527352297593, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019999902198694543, + "loss": 1.8056, + "step": 28 + }, + { + "epoch": 0.009065332916536417, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019999891030047227, + "loss": 1.8404, + "step": 29 + }, + { + "epoch": 0.009377930603313536, + "grad_norm": 0.16015625, + "learning_rate": 0.0001999987925769375, + "loss": 2.113, + "step": 30 + }, + { + "epoch": 0.009690528290090653, + "grad_norm": 0.166015625, + "learning_rate": 0.00019999866881634815, + "loss": 1.8715, + "step": 31 + }, + { + "epoch": 0.01000312597686777, + "grad_norm": 0.8359375, + "learning_rate": 0.00019999853901871175, + "loss": 3.09, + "step": 32 + }, + { + "epoch": 0.01031572366364489, + "grad_norm": 0.1640625, + "learning_rate": 0.00019999840318403613, + "loss": 2.1366, + "step": 33 + }, + { + "epoch": 0.010628321350422007, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019999826131232947, + "loss": 1.9007, + "step": 34 + }, + { + "epoch": 0.010940919037199124, + "grad_norm": 0.173828125, + "learning_rate": 0.00019999811340360034, + "loss": 1.9831, + "step": 35 + }, + { + "epoch": 0.011253516723976243, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001999979594578577, + "loss": 1.9656, + "step": 36 + }, + { + "epoch": 0.01156611441075336, + "grad_norm": 0.48828125, + "learning_rate": 0.0001999977994751108, + "loss": 2.8559, + "step": 37 + }, + { + "epoch": 0.011878712097530478, + "grad_norm": 0.177734375, + "learning_rate": 0.00019999763345536934, + "loss": 2.0553, + "step": 38 + }, + { + "epoch": 0.012191309784307595, + "grad_norm": 0.18359375, + "learning_rate": 0.0001999974613986433, + "loss": 1.8912, + "step": 39 + }, + { + "epoch": 0.012503907471084715, + "grad_norm": 0.169921875, + "learning_rate": 0.0001999972833049431, + "loss": 1.8342, + "step": 40 + }, + { + "epoch": 0.012816505157861832, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019999709917427946, + "loss": 1.9032, + "step": 41 + }, + { + "epoch": 0.01312910284463895, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019999690900666353, + "loss": 1.9228, + "step": 42 + }, + { + "epoch": 0.013441700531416068, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019999671280210676, + "loss": 1.9666, + "step": 43 + }, + { + "epoch": 0.013754298218193186, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019999651056062102, + "loss": 1.901, + "step": 44 + }, + { + "epoch": 0.014066895904970303, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019999630228221852, + "loss": 2.1406, + "step": 45 + }, + { + "epoch": 0.01437949359174742, + "grad_norm": 0.16796875, + "learning_rate": 0.0001999960879669118, + "loss": 1.9389, + "step": 46 + }, + { + "epoch": 0.01469209127852454, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019999586761471384, + "loss": 1.9625, + "step": 47 + }, + { + "epoch": 0.015004688965301657, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019999564122563795, + "loss": 2.0456, + "step": 48 + }, + { + "epoch": 0.015317286652078774, + "grad_norm": 0.171875, + "learning_rate": 0.00019999540879969775, + "loss": 1.9955, + "step": 49 + }, + { + "epoch": 0.01562988433885589, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019999517033690727, + "loss": 1.8969, + "step": 50 + }, + { + "epoch": 0.01594248202563301, + "grad_norm": 0.166015625, + "learning_rate": 0.00019999492583728097, + "loss": 1.8544, + "step": 51 + }, + { + "epoch": 0.01625507971241013, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019999467530083356, + "loss": 1.779, + "step": 52 + }, + { + "epoch": 0.016567677399187245, + "grad_norm": 0.177734375, + "learning_rate": 0.00019999441872758017, + "loss": 1.8127, + "step": 53 + }, + { + "epoch": 0.016880275085964364, + "grad_norm": 0.1796875, + "learning_rate": 0.0001999941561175363, + "loss": 2.0516, + "step": 54 + }, + { + "epoch": 0.017192872772741483, + "grad_norm": 0.173828125, + "learning_rate": 0.0001999938874707178, + "loss": 2.1641, + "step": 55 + }, + { + "epoch": 0.0175054704595186, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019999361278714092, + "loss": 2.0483, + "step": 56 + }, + { + "epoch": 0.017818068146295718, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019999333206682218, + "loss": 1.9931, + "step": 57 + }, + { + "epoch": 0.018130665833072834, + "grad_norm": 0.181640625, + "learning_rate": 0.00019999304530977856, + "loss": 1.9795, + "step": 58 + }, + { + "epoch": 0.018443263519849953, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019999275251602738, + "loss": 2.13, + "step": 59 + }, + { + "epoch": 0.018755861206627072, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001999924536855863, + "loss": 2.0652, + "step": 60 + }, + { + "epoch": 0.019068458893404187, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019999214881847338, + "loss": 1.9731, + "step": 61 + }, + { + "epoch": 0.019381056580181306, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019999183791470702, + "loss": 1.9303, + "step": 62 + }, + { + "epoch": 0.019693654266958426, + "grad_norm": 0.169921875, + "learning_rate": 0.000199991520974306, + "loss": 1.9115, + "step": 63 + }, + { + "epoch": 0.02000625195373554, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001999911979972894, + "loss": 2.1622, + "step": 64 + }, + { + "epoch": 0.02031884964051266, + "grad_norm": 0.169921875, + "learning_rate": 0.00019999086898367678, + "loss": 1.9662, + "step": 65 + }, + { + "epoch": 0.02063144732728978, + "grad_norm": 0.16796875, + "learning_rate": 0.00019999053393348796, + "loss": 1.8382, + "step": 66 + }, + { + "epoch": 0.020944045014066895, + "grad_norm": 0.17578125, + "learning_rate": 0.00019999019284674317, + "loss": 1.9147, + "step": 67 + }, + { + "epoch": 0.021256642700844014, + "grad_norm": 0.171875, + "learning_rate": 0.00019998984572346308, + "loss": 2.0712, + "step": 68 + }, + { + "epoch": 0.021569240387621133, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019998949256366854, + "loss": 2.0207, + "step": 69 + }, + { + "epoch": 0.02188183807439825, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019998913336738094, + "loss": 2.1334, + "step": 70 + }, + { + "epoch": 0.022194435761175368, + "grad_norm": 0.7109375, + "learning_rate": 0.00019998876813462192, + "loss": 2.7085, + "step": 71 + }, + { + "epoch": 0.022507033447952487, + "grad_norm": 0.17578125, + "learning_rate": 0.00019998839686541356, + "loss": 1.7364, + "step": 72 + }, + { + "epoch": 0.022819631134729602, + "grad_norm": 0.18359375, + "learning_rate": 0.0001999880195597783, + "loss": 1.9281, + "step": 73 + }, + { + "epoch": 0.02313222882150672, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019998763621773883, + "loss": 1.9648, + "step": 74 + }, + { + "epoch": 0.023444826508283837, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019998724683931838, + "loss": 1.9874, + "step": 75 + }, + { + "epoch": 0.023757424195060956, + "grad_norm": 0.171875, + "learning_rate": 0.0001999868514245404, + "loss": 1.785, + "step": 76 + }, + { + "epoch": 0.024070021881838075, + "grad_norm": 0.181640625, + "learning_rate": 0.0001999864499734288, + "loss": 1.9094, + "step": 77 + }, + { + "epoch": 0.02438261956861519, + "grad_norm": 0.162109375, + "learning_rate": 0.00019998604248600777, + "loss": 1.9723, + "step": 78 + }, + { + "epoch": 0.02469521725539231, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019998562896230196, + "loss": 1.8739, + "step": 79 + }, + { + "epoch": 0.02500781494216943, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019998520940233636, + "loss": 1.936, + "step": 80 + }, + { + "epoch": 0.025320412628946545, + "grad_norm": 0.16015625, + "learning_rate": 0.0001999847838061362, + "loss": 1.8807, + "step": 81 + }, + { + "epoch": 0.025633010315723664, + "grad_norm": 0.173828125, + "learning_rate": 0.00019998435217372728, + "loss": 1.7412, + "step": 82 + }, + { + "epoch": 0.025945608002500783, + "grad_norm": 0.17578125, + "learning_rate": 0.00019998391450513556, + "loss": 1.8404, + "step": 83 + }, + { + "epoch": 0.0262582056892779, + "grad_norm": 0.169921875, + "learning_rate": 0.00019998347080038754, + "loss": 1.8108, + "step": 84 + }, + { + "epoch": 0.026570803376055017, + "grad_norm": 0.181640625, + "learning_rate": 0.00019998302105950994, + "loss": 2.0934, + "step": 85 + }, + { + "epoch": 0.026883401062832137, + "grad_norm": 0.19140625, + "learning_rate": 0.00019998256528252998, + "loss": 2.0021, + "step": 86 + }, + { + "epoch": 0.027195998749609252, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019998210346947515, + "loss": 1.9675, + "step": 87 + }, + { + "epoch": 0.02750859643638637, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019998163562037332, + "loss": 1.8488, + "step": 88 + }, + { + "epoch": 0.02782119412316349, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019998116173525272, + "loss": 1.9255, + "step": 89 + }, + { + "epoch": 0.028133791809940606, + "grad_norm": 0.16796875, + "learning_rate": 0.000199980681814142, + "loss": 2.1055, + "step": 90 + }, + { + "epoch": 0.028446389496717725, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001999801958570701, + "loss": 2.1303, + "step": 91 + }, + { + "epoch": 0.02875898718349484, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019997970386406637, + "loss": 2.0517, + "step": 92 + }, + { + "epoch": 0.02907158487027196, + "grad_norm": 0.181640625, + "learning_rate": 0.00019997920583516053, + "loss": 1.8314, + "step": 93 + }, + { + "epoch": 0.02938418255704908, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001999787017703826, + "loss": 1.7953, + "step": 94 + }, + { + "epoch": 0.029696780243826194, + "grad_norm": 0.169921875, + "learning_rate": 0.00019997819166976308, + "loss": 1.8238, + "step": 95 + }, + { + "epoch": 0.030009377930603313, + "grad_norm": 0.173828125, + "learning_rate": 0.0001999776755333327, + "loss": 1.8905, + "step": 96 + }, + { + "epoch": 0.030321975617380433, + "grad_norm": 0.169921875, + "learning_rate": 0.00019997715336112263, + "loss": 1.7594, + "step": 97 + }, + { + "epoch": 0.030634573304157548, + "grad_norm": 0.171875, + "learning_rate": 0.0001999766251531644, + "loss": 1.9648, + "step": 98 + }, + { + "epoch": 0.030947170990934667, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019997609090948996, + "loss": 2.1577, + "step": 99 + }, + { + "epoch": 0.03125976867771178, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001999755506301315, + "loss": 2.0563, + "step": 100 + }, + { + "epoch": 0.0315723663644889, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001999750043151216, + "loss": 1.8857, + "step": 101 + }, + { + "epoch": 0.03188496405126602, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019997445196449337, + "loss": 1.8832, + "step": 102 + }, + { + "epoch": 0.03219756173804314, + "grad_norm": 0.189453125, + "learning_rate": 0.00019997389357828, + "loss": 2.0352, + "step": 103 + }, + { + "epoch": 0.03251015942482026, + "grad_norm": 0.18359375, + "learning_rate": 0.00019997332915651532, + "loss": 2.0126, + "step": 104 + }, + { + "epoch": 0.03282275711159737, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019997275869923335, + "loss": 2.0201, + "step": 105 + }, + { + "epoch": 0.03313535479837449, + "grad_norm": 0.177734375, + "learning_rate": 0.00019997218220646853, + "loss": 2.3295, + "step": 106 + }, + { + "epoch": 0.03344795248515161, + "grad_norm": 0.16796875, + "learning_rate": 0.0001999715996782557, + "loss": 1.8919, + "step": 107 + }, + { + "epoch": 0.03376055017192873, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019997101111462998, + "loss": 1.7797, + "step": 108 + }, + { + "epoch": 0.03407314785870585, + "grad_norm": 0.193359375, + "learning_rate": 0.00019997041651562695, + "loss": 1.8956, + "step": 109 + }, + { + "epoch": 0.03438574554548297, + "grad_norm": 0.1640625, + "learning_rate": 0.00019996981588128244, + "loss": 1.9683, + "step": 110 + }, + { + "epoch": 0.03469834323226008, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019996920921163278, + "loss": 1.727, + "step": 111 + }, + { + "epoch": 0.0350109409190372, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019996859650671457, + "loss": 1.8966, + "step": 112 + }, + { + "epoch": 0.03532353860581432, + "grad_norm": 0.16796875, + "learning_rate": 0.0001999679777665648, + "loss": 1.8195, + "step": 113 + }, + { + "epoch": 0.035636136292591436, + "grad_norm": 0.173828125, + "learning_rate": 0.0001999673529912208, + "loss": 1.8785, + "step": 114 + }, + { + "epoch": 0.035948733979368555, + "grad_norm": 0.17578125, + "learning_rate": 0.0001999667221807203, + "loss": 1.9363, + "step": 115 + }, + { + "epoch": 0.03626133166614567, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019996608533510144, + "loss": 2.0314, + "step": 116 + }, + { + "epoch": 0.036573929352922786, + "grad_norm": 0.17578125, + "learning_rate": 0.0001999654424544026, + "loss": 1.9304, + "step": 117 + }, + { + "epoch": 0.036886527039699905, + "grad_norm": 0.181640625, + "learning_rate": 0.0001999647935386626, + "loss": 2.1456, + "step": 118 + }, + { + "epoch": 0.037199124726477024, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001999641385879206, + "loss": 1.6728, + "step": 119 + }, + { + "epoch": 0.037511722413254144, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019996347760221624, + "loss": 1.8201, + "step": 120 + }, + { + "epoch": 0.03782432010003126, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001999628105815893, + "loss": 2.096, + "step": 121 + }, + { + "epoch": 0.038136917786808375, + "grad_norm": 0.171875, + "learning_rate": 0.0001999621375260801, + "loss": 1.9948, + "step": 122 + }, + { + "epoch": 0.038449515473585494, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001999614584357293, + "loss": 1.9562, + "step": 123 + }, + { + "epoch": 0.03876211316036261, + "grad_norm": 0.173828125, + "learning_rate": 0.00019996077331057788, + "loss": 1.8452, + "step": 124 + }, + { + "epoch": 0.03907471084713973, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019996008215066716, + "loss": 1.7615, + "step": 125 + }, + { + "epoch": 0.03938730853391685, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019995938495603893, + "loss": 1.7628, + "step": 126 + }, + { + "epoch": 0.03969990622069397, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019995868172673523, + "loss": 2.0241, + "step": 127 + }, + { + "epoch": 0.04001250390747108, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019995797246279856, + "loss": 2.0807, + "step": 128 + }, + { + "epoch": 0.0403251015942482, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019995725716427169, + "loss": 1.8564, + "step": 129 + }, + { + "epoch": 0.04063769928102532, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019995653583119785, + "loss": 2.1278, + "step": 130 + }, + { + "epoch": 0.04095029696780244, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019995580846362055, + "loss": 2.095, + "step": 131 + }, + { + "epoch": 0.04126289465457956, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019995507506158372, + "loss": 1.6848, + "step": 132 + }, + { + "epoch": 0.04157549234135667, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019995433562513163, + "loss": 1.8979, + "step": 133 + }, + { + "epoch": 0.04188809002813379, + "grad_norm": 0.427734375, + "learning_rate": 0.00019995359015430894, + "loss": 2.9492, + "step": 134 + }, + { + "epoch": 0.04220068771491091, + "grad_norm": 0.173828125, + "learning_rate": 0.0001999528386491606, + "loss": 1.8951, + "step": 135 + }, + { + "epoch": 0.04251328540168803, + "grad_norm": 0.17578125, + "learning_rate": 0.00019995208110973206, + "loss": 1.7656, + "step": 136 + }, + { + "epoch": 0.04282588308846515, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019995131753606902, + "loss": 2.0607, + "step": 137 + }, + { + "epoch": 0.043138480775242266, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019995054792821754, + "loss": 1.6803, + "step": 138 + }, + { + "epoch": 0.04345107846201938, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019994977228622414, + "loss": 2.0165, + "step": 139 + }, + { + "epoch": 0.0437636761487965, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001999489906101356, + "loss": 1.9388, + "step": 140 + }, + { + "epoch": 0.044076273835573616, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019994820289999913, + "loss": 1.6209, + "step": 141 + }, + { + "epoch": 0.044388871522350735, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001999474091558623, + "loss": 1.8157, + "step": 142 + }, + { + "epoch": 0.044701469209127855, + "grad_norm": 0.171875, + "learning_rate": 0.00019994660937777301, + "loss": 1.7581, + "step": 143 + }, + { + "epoch": 0.045014066895904974, + "grad_norm": 0.279296875, + "learning_rate": 0.00019994580356577957, + "loss": 2.6888, + "step": 144 + }, + { + "epoch": 0.045326664582682086, + "grad_norm": 0.1796875, + "learning_rate": 0.00019994499171993056, + "loss": 2.0103, + "step": 145 + }, + { + "epoch": 0.045639262269459205, + "grad_norm": 0.171875, + "learning_rate": 0.00019994417384027507, + "loss": 1.7455, + "step": 146 + }, + { + "epoch": 0.045951859956236324, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019994334992686245, + "loss": 1.9287, + "step": 147 + }, + { + "epoch": 0.04626445764301344, + "grad_norm": 0.1875, + "learning_rate": 0.00019994251997974241, + "loss": 1.8521, + "step": 148 + }, + { + "epoch": 0.04657705532979056, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019994168399896508, + "loss": 2.0915, + "step": 149 + }, + { + "epoch": 0.046889653016567674, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019994084198458097, + "loss": 2.0972, + "step": 150 + }, + { + "epoch": 0.04720225070334479, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019993999393664083, + "loss": 2.2031, + "step": 151 + }, + { + "epoch": 0.04751484839012191, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019993913985519592, + "loss": 1.8532, + "step": 152 + }, + { + "epoch": 0.04782744607689903, + "grad_norm": 0.18359375, + "learning_rate": 0.0001999382797402978, + "loss": 1.9278, + "step": 153 + }, + { + "epoch": 0.04814004376367615, + "grad_norm": 0.1796875, + "learning_rate": 0.00019993741359199834, + "loss": 1.6459, + "step": 154 + }, + { + "epoch": 0.04845264145045327, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001999365414103499, + "loss": 1.8459, + "step": 155 + }, + { + "epoch": 0.04876523913723038, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001999356631954051, + "loss": 1.9833, + "step": 156 + }, + { + "epoch": 0.0490778368240075, + "grad_norm": 0.177734375, + "learning_rate": 0.00019993477894721698, + "loss": 1.8361, + "step": 157 + }, + { + "epoch": 0.04939043451078462, + "grad_norm": 0.17578125, + "learning_rate": 0.0001999338886658389, + "loss": 2.0878, + "step": 158 + }, + { + "epoch": 0.04970303219756174, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001999329923513246, + "loss": 1.9454, + "step": 159 + }, + { + "epoch": 0.05001562988433886, + "grad_norm": 0.177734375, + "learning_rate": 0.00019993209000372818, + "loss": 1.982, + "step": 160 + }, + { + "epoch": 0.05032822757111598, + "grad_norm": 0.1796875, + "learning_rate": 0.00019993118162310415, + "loss": 1.9192, + "step": 161 + }, + { + "epoch": 0.05064082525789309, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001999302672095074, + "loss": 1.8865, + "step": 162 + }, + { + "epoch": 0.05095342294467021, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019992934676299302, + "loss": 1.6733, + "step": 163 + }, + { + "epoch": 0.05126602063144733, + "grad_norm": 0.169921875, + "learning_rate": 0.00019992842028361665, + "loss": 1.9374, + "step": 164 + }, + { + "epoch": 0.051578618318224446, + "grad_norm": 0.1953125, + "learning_rate": 0.0001999274877714342, + "loss": 1.9537, + "step": 165 + }, + { + "epoch": 0.051891216005001566, + "grad_norm": 0.17578125, + "learning_rate": 0.000199926549226502, + "loss": 1.8767, + "step": 166 + }, + { + "epoch": 0.05220381369177868, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019992560464887667, + "loss": 1.8994, + "step": 167 + }, + { + "epoch": 0.0525164113785558, + "grad_norm": 0.185546875, + "learning_rate": 0.00019992465403861524, + "loss": 1.7415, + "step": 168 + }, + { + "epoch": 0.052829009065332916, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019992369739577512, + "loss": 1.7688, + "step": 169 + }, + { + "epoch": 0.053141606752110035, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019992273472041404, + "loss": 1.7507, + "step": 170 + }, + { + "epoch": 0.053454204438887154, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019992176601259015, + "loss": 1.995, + "step": 171 + }, + { + "epoch": 0.05376680212566427, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019992079127236192, + "loss": 1.9025, + "step": 172 + }, + { + "epoch": 0.054079399812441385, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001999198104997882, + "loss": 1.7634, + "step": 173 + }, + { + "epoch": 0.054391997499218504, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019991882369492815, + "loss": 1.8371, + "step": 174 + }, + { + "epoch": 0.05470459518599562, + "grad_norm": 0.17578125, + "learning_rate": 0.0001999178308578414, + "loss": 1.7978, + "step": 175 + }, + { + "epoch": 0.05501719287277274, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001999168319885879, + "loss": 2.0066, + "step": 176 + }, + { + "epoch": 0.05532979055954986, + "grad_norm": 0.17578125, + "learning_rate": 0.00019991582708722792, + "loss": 1.6957, + "step": 177 + }, + { + "epoch": 0.05564238824632698, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001999148161538221, + "loss": 1.8989, + "step": 178 + }, + { + "epoch": 0.05595498593310409, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019991379918843155, + "loss": 2.0687, + "step": 179 + }, + { + "epoch": 0.05626758361988121, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019991277619111763, + "loss": 1.9398, + "step": 180 + }, + { + "epoch": 0.05658018130665833, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019991174716194203, + "loss": 1.7309, + "step": 181 + }, + { + "epoch": 0.05689277899343545, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019991071210096698, + "loss": 1.8865, + "step": 182 + }, + { + "epoch": 0.05720537668021257, + "grad_norm": 0.173828125, + "learning_rate": 0.00019990967100825491, + "loss": 1.8802, + "step": 183 + }, + { + "epoch": 0.05751797436698968, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001999086238838687, + "loss": 1.814, + "step": 184 + }, + { + "epoch": 0.0578305720537668, + "grad_norm": 0.16796875, + "learning_rate": 0.00019990757072787152, + "loss": 1.6507, + "step": 185 + }, + { + "epoch": 0.05814316974054392, + "grad_norm": 0.17578125, + "learning_rate": 0.000199906511540327, + "loss": 1.921, + "step": 186 + }, + { + "epoch": 0.05845576742732104, + "grad_norm": 0.1923828125, + "learning_rate": 0.0001999054463212991, + "loss": 1.9151, + "step": 187 + }, + { + "epoch": 0.05876836511409816, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019990437507085202, + "loss": 2.0727, + "step": 188 + }, + { + "epoch": 0.05908096280087528, + "grad_norm": 0.17578125, + "learning_rate": 0.00019990329778905058, + "loss": 2.0359, + "step": 189 + }, + { + "epoch": 0.05939356048765239, + "grad_norm": 0.19921875, + "learning_rate": 0.00019990221447595968, + "loss": 1.9311, + "step": 190 + }, + { + "epoch": 0.05970615817442951, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019990112513164484, + "loss": 1.8018, + "step": 191 + }, + { + "epoch": 0.06001875586120663, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019990002975617174, + "loss": 1.9104, + "step": 192 + }, + { + "epoch": 0.060331353547983746, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019989892834960656, + "loss": 1.7227, + "step": 193 + }, + { + "epoch": 0.060643951234760865, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019989782091201573, + "loss": 1.7287, + "step": 194 + }, + { + "epoch": 0.06095654892153798, + "grad_norm": 0.181640625, + "learning_rate": 0.0001998967074434662, + "loss": 1.8525, + "step": 195 + }, + { + "epoch": 0.061269146608315096, + "grad_norm": 0.439453125, + "learning_rate": 0.00019989558794402515, + "loss": 2.4259, + "step": 196 + }, + { + "epoch": 0.061581744295092215, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001998944624137601, + "loss": 2.1134, + "step": 197 + }, + { + "epoch": 0.061894341981869334, + "grad_norm": 0.169921875, + "learning_rate": 0.0001998933308527391, + "loss": 1.9239, + "step": 198 + }, + { + "epoch": 0.06220693966864645, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001998921932610304, + "loss": 1.7292, + "step": 199 + }, + { + "epoch": 0.06251953735542357, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001998910496387027, + "loss": 1.7629, + "step": 200 + }, + { + "epoch": 0.06283213504220068, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019988989998582506, + "loss": 2.005, + "step": 201 + }, + { + "epoch": 0.0631447327289778, + "grad_norm": 0.173828125, + "learning_rate": 0.00019988874430246686, + "loss": 1.7605, + "step": 202 + }, + { + "epoch": 0.06345733041575492, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001998875825886979, + "loss": 1.748, + "step": 203 + }, + { + "epoch": 0.06376992810253204, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019988641484458826, + "loss": 2.1037, + "step": 204 + }, + { + "epoch": 0.06408252578930916, + "grad_norm": 0.181640625, + "learning_rate": 0.00019988524107020846, + "loss": 1.9274, + "step": 205 + }, + { + "epoch": 0.06439512347608628, + "grad_norm": 0.173828125, + "learning_rate": 0.00019988406126562937, + "loss": 1.7823, + "step": 206 + }, + { + "epoch": 0.0647077211628634, + "grad_norm": 0.1796875, + "learning_rate": 0.00019988287543092225, + "loss": 2.06, + "step": 207 + }, + { + "epoch": 0.06502031884964052, + "grad_norm": 0.193359375, + "learning_rate": 0.00019988168356615865, + "loss": 1.9327, + "step": 208 + }, + { + "epoch": 0.06533291653641764, + "grad_norm": 0.17578125, + "learning_rate": 0.00019988048567141052, + "loss": 1.9889, + "step": 209 + }, + { + "epoch": 0.06564551422319474, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019987928174675023, + "loss": 1.6262, + "step": 210 + }, + { + "epoch": 0.06595811190997186, + "grad_norm": 0.173828125, + "learning_rate": 0.00019987807179225035, + "loss": 1.8805, + "step": 211 + }, + { + "epoch": 0.06627070959674898, + "grad_norm": 0.181640625, + "learning_rate": 0.00019987685580798403, + "loss": 1.7265, + "step": 212 + }, + { + "epoch": 0.0665833072835261, + "grad_norm": 0.17578125, + "learning_rate": 0.0001998756337940247, + "loss": 1.7049, + "step": 213 + }, + { + "epoch": 0.06689590497030322, + "grad_norm": 0.173828125, + "learning_rate": 0.00019987440575044602, + "loss": 1.7256, + "step": 214 + }, + { + "epoch": 0.06720850265708034, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019987317167732222, + "loss": 1.9469, + "step": 215 + }, + { + "epoch": 0.06752110034385746, + "grad_norm": 0.177734375, + "learning_rate": 0.00019987193157472777, + "loss": 2.0254, + "step": 216 + }, + { + "epoch": 0.06783369803063458, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019987068544273756, + "loss": 2.1006, + "step": 217 + }, + { + "epoch": 0.0681462957174117, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019986943328142678, + "loss": 1.9486, + "step": 218 + }, + { + "epoch": 0.06845889340418881, + "grad_norm": 0.181640625, + "learning_rate": 0.00019986817509087107, + "loss": 1.9707, + "step": 219 + }, + { + "epoch": 0.06877149109096593, + "grad_norm": 0.169921875, + "learning_rate": 0.00019986691087114635, + "loss": 1.868, + "step": 220 + }, + { + "epoch": 0.06908408877774304, + "grad_norm": 0.181640625, + "learning_rate": 0.00019986564062232897, + "loss": 1.9028, + "step": 221 + }, + { + "epoch": 0.06939668646452016, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001998643643444956, + "loss": 1.9136, + "step": 222 + }, + { + "epoch": 0.06970928415129728, + "grad_norm": 0.181640625, + "learning_rate": 0.0001998630820377233, + "loss": 1.8039, + "step": 223 + }, + { + "epoch": 0.0700218818380744, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019986179370208947, + "loss": 1.7326, + "step": 224 + }, + { + "epoch": 0.07033447952485151, + "grad_norm": 0.169921875, + "learning_rate": 0.0001998604993376719, + "loss": 1.7712, + "step": 225 + }, + { + "epoch": 0.07064707721162863, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019985919894454875, + "loss": 1.9061, + "step": 226 + }, + { + "epoch": 0.07095967489840575, + "grad_norm": 0.181640625, + "learning_rate": 0.00019985789252279846, + "loss": 1.8444, + "step": 227 + }, + { + "epoch": 0.07127227258518287, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001998565800725, + "loss": 2.1696, + "step": 228 + }, + { + "epoch": 0.07158487027195999, + "grad_norm": 0.19140625, + "learning_rate": 0.00019985526159373255, + "loss": 1.9888, + "step": 229 + }, + { + "epoch": 0.07189746795873711, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019985393708657568, + "loss": 2.018, + "step": 230 + }, + { + "epoch": 0.07221006564551423, + "grad_norm": 0.18359375, + "learning_rate": 0.0001998526065511094, + "loss": 1.7847, + "step": 231 + }, + { + "epoch": 0.07252266333229133, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019985126998741404, + "loss": 1.879, + "step": 232 + }, + { + "epoch": 0.07283526101906845, + "grad_norm": 0.177734375, + "learning_rate": 0.00019984992739557024, + "loss": 1.7065, + "step": 233 + }, + { + "epoch": 0.07314785870584557, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019984857877565907, + "loss": 1.7451, + "step": 234 + }, + { + "epoch": 0.07346045639262269, + "grad_norm": 0.173828125, + "learning_rate": 0.000199847224127762, + "loss": 1.8228, + "step": 235 + }, + { + "epoch": 0.07377305407939981, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019984586345196074, + "loss": 1.9904, + "step": 236 + }, + { + "epoch": 0.07408565176617693, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001998444967483375, + "loss": 1.8958, + "step": 237 + }, + { + "epoch": 0.07439824945295405, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019984312401697473, + "loss": 1.8913, + "step": 238 + }, + { + "epoch": 0.07471084713973117, + "grad_norm": 0.193359375, + "learning_rate": 0.00019984174525795536, + "loss": 1.9273, + "step": 239 + }, + { + "epoch": 0.07502344482650829, + "grad_norm": 0.189453125, + "learning_rate": 0.00019984036047136257, + "loss": 1.8831, + "step": 240 + }, + { + "epoch": 0.0753360425132854, + "grad_norm": 0.19140625, + "learning_rate": 0.00019983896965728001, + "loss": 1.9506, + "step": 241 + }, + { + "epoch": 0.07564864020006253, + "grad_norm": 0.173828125, + "learning_rate": 0.00019983757281579162, + "loss": 1.971, + "step": 242 + }, + { + "epoch": 0.07596123788683964, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019983616994698173, + "loss": 1.8156, + "step": 243 + }, + { + "epoch": 0.07627383557361675, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019983476105093505, + "loss": 1.9397, + "step": 244 + }, + { + "epoch": 0.07658643326039387, + "grad_norm": 0.177734375, + "learning_rate": 0.00019983334612773662, + "loss": 1.7567, + "step": 245 + }, + { + "epoch": 0.07689903094717099, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019983192517747186, + "loss": 1.8685, + "step": 246 + }, + { + "epoch": 0.0772116286339481, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019983049820022656, + "loss": 2.2285, + "step": 247 + }, + { + "epoch": 0.07752422632072523, + "grad_norm": 0.193359375, + "learning_rate": 0.00019982906519608687, + "loss": 1.9532, + "step": 248 + }, + { + "epoch": 0.07783682400750234, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001998276261651393, + "loss": 1.8775, + "step": 249 + }, + { + "epoch": 0.07814942169427946, + "grad_norm": 0.18359375, + "learning_rate": 0.00019982618110747074, + "loss": 1.892, + "step": 250 + }, + { + "epoch": 0.07846201938105658, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019982473002316838, + "loss": 2.2827, + "step": 251 + }, + { + "epoch": 0.0787746170678337, + "grad_norm": 0.185546875, + "learning_rate": 0.0001998232729123199, + "loss": 2.1452, + "step": 252 + }, + { + "epoch": 0.07908721475461082, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019982180977501322, + "loss": 1.7888, + "step": 253 + }, + { + "epoch": 0.07939981244138794, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019982034061133666, + "loss": 1.7486, + "step": 254 + }, + { + "epoch": 0.07971241012816505, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019981886542137892, + "loss": 1.8143, + "step": 255 + }, + { + "epoch": 0.08002500781494216, + "grad_norm": 0.18359375, + "learning_rate": 0.00019981738420522913, + "loss": 1.839, + "step": 256 + }, + { + "epoch": 0.08033760550171928, + "grad_norm": 0.169921875, + "learning_rate": 0.00019981589696297663, + "loss": 1.918, + "step": 257 + }, + { + "epoch": 0.0806502031884964, + "grad_norm": 0.19140625, + "learning_rate": 0.00019981440369471124, + "loss": 1.9144, + "step": 258 + }, + { + "epoch": 0.08096280087527352, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019981290440052306, + "loss": 1.7846, + "step": 259 + }, + { + "epoch": 0.08127539856205064, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001998113990805027, + "loss": 1.9837, + "step": 260 + }, + { + "epoch": 0.08158799624882776, + "grad_norm": 0.1875, + "learning_rate": 0.00019980988773474098, + "loss": 1.9422, + "step": 261 + }, + { + "epoch": 0.08190059393560488, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019980837036332917, + "loss": 1.7637, + "step": 262 + }, + { + "epoch": 0.082213191622382, + "grad_norm": 0.67578125, + "learning_rate": 0.0001998068469663588, + "loss": 2.5924, + "step": 263 + }, + { + "epoch": 0.08252578930915912, + "grad_norm": 0.185546875, + "learning_rate": 0.0001998053175439219, + "loss": 1.8041, + "step": 264 + }, + { + "epoch": 0.08283838699593624, + "grad_norm": 0.19921875, + "learning_rate": 0.00019980378209611083, + "loss": 2.139, + "step": 265 + }, + { + "epoch": 0.08315098468271334, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001998022406230182, + "loss": 1.8233, + "step": 266 + }, + { + "epoch": 0.08346358236949046, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001998006931247372, + "loss": 1.9227, + "step": 267 + }, + { + "epoch": 0.08377618005626758, + "grad_norm": 0.19140625, + "learning_rate": 0.00019979913960136114, + "loss": 1.7389, + "step": 268 + }, + { + "epoch": 0.0840887777430447, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019979758005298385, + "loss": 1.6342, + "step": 269 + }, + { + "epoch": 0.08440137542982182, + "grad_norm": 0.181640625, + "learning_rate": 0.0001997960144796995, + "loss": 1.9472, + "step": 270 + }, + { + "epoch": 0.08471397311659894, + "grad_norm": 0.18359375, + "learning_rate": 0.00019979444288160253, + "loss": 1.7985, + "step": 271 + }, + { + "epoch": 0.08502657080337606, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019979286525878792, + "loss": 1.8546, + "step": 272 + }, + { + "epoch": 0.08533916849015317, + "grad_norm": 0.19140625, + "learning_rate": 0.00019979128161135083, + "loss": 1.9697, + "step": 273 + }, + { + "epoch": 0.0856517661769303, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019978969193938694, + "loss": 2.095, + "step": 274 + }, + { + "epoch": 0.08596436386370741, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019978809624299218, + "loss": 1.9491, + "step": 275 + }, + { + "epoch": 0.08627696155048453, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019978649452226285, + "loss": 1.9463, + "step": 276 + }, + { + "epoch": 0.08658955923726164, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019978488677729574, + "loss": 1.8981, + "step": 277 + }, + { + "epoch": 0.08690215692403876, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019978327300818784, + "loss": 1.9126, + "step": 278 + }, + { + "epoch": 0.08721475461081588, + "grad_norm": 0.18359375, + "learning_rate": 0.0001997816532150366, + "loss": 1.8987, + "step": 279 + }, + { + "epoch": 0.087527352297593, + "grad_norm": 0.201171875, + "learning_rate": 0.00019978002739793978, + "loss": 1.7486, + "step": 280 + }, + { + "epoch": 0.08783994998437011, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019977839555699553, + "loss": 1.9603, + "step": 281 + }, + { + "epoch": 0.08815254767114723, + "grad_norm": 0.19140625, + "learning_rate": 0.00019977675769230246, + "loss": 1.8714, + "step": 282 + }, + { + "epoch": 0.08846514535792435, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019977511380395933, + "loss": 2.0087, + "step": 283 + }, + { + "epoch": 0.08877774304470147, + "grad_norm": 0.177734375, + "learning_rate": 0.00019977346389206545, + "loss": 2.1653, + "step": 284 + }, + { + "epoch": 0.08909034073147859, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019977180795672044, + "loss": 2.0311, + "step": 285 + }, + { + "epoch": 0.08940293841825571, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019977014599802418, + "loss": 1.8212, + "step": 286 + }, + { + "epoch": 0.08971553610503283, + "grad_norm": 0.193359375, + "learning_rate": 0.00019976847801607712, + "loss": 2.0245, + "step": 287 + }, + { + "epoch": 0.09002813379180995, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001997668040109799, + "loss": 1.8573, + "step": 288 + }, + { + "epoch": 0.09034073147858705, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019976512398283357, + "loss": 1.7208, + "step": 289 + }, + { + "epoch": 0.09065332916536417, + "grad_norm": 0.181640625, + "learning_rate": 0.00019976343793173958, + "loss": 1.7056, + "step": 290 + }, + { + "epoch": 0.09096592685214129, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019976174585779972, + "loss": 1.8874, + "step": 291 + }, + { + "epoch": 0.09127852453891841, + "grad_norm": 0.181640625, + "learning_rate": 0.00019976004776111613, + "loss": 1.5886, + "step": 292 + }, + { + "epoch": 0.09159112222569553, + "grad_norm": 0.181640625, + "learning_rate": 0.00019975834364179134, + "loss": 1.7725, + "step": 293 + }, + { + "epoch": 0.09190371991247265, + "grad_norm": 0.189453125, + "learning_rate": 0.0001997566334999282, + "loss": 1.7855, + "step": 294 + }, + { + "epoch": 0.09221631759924977, + "grad_norm": 0.1875, + "learning_rate": 0.00019975491733563, + "loss": 1.7919, + "step": 295 + }, + { + "epoch": 0.09252891528602689, + "grad_norm": 0.185546875, + "learning_rate": 0.00019975319514900028, + "loss": 1.7353, + "step": 296 + }, + { + "epoch": 0.092841512972804, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019975146694014312, + "loss": 1.8983, + "step": 297 + }, + { + "epoch": 0.09315411065958112, + "grad_norm": 0.185546875, + "learning_rate": 0.00019974973270916273, + "loss": 2.115, + "step": 298 + }, + { + "epoch": 0.09346670834635824, + "grad_norm": 0.177734375, + "learning_rate": 0.00019974799245616387, + "loss": 1.9605, + "step": 299 + }, + { + "epoch": 0.09377930603313535, + "grad_norm": 0.1953125, + "learning_rate": 0.0001997462461812516, + "loss": 1.9963, + "step": 300 + }, + { + "epoch": 0.09409190371991247, + "grad_norm": 0.189453125, + "learning_rate": 0.00019974449388453135, + "loss": 1.8288, + "step": 301 + }, + { + "epoch": 0.09440450140668959, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001997427355661089, + "loss": 1.7948, + "step": 302 + }, + { + "epoch": 0.0947170990934667, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001997409712260904, + "loss": 1.868, + "step": 303 + }, + { + "epoch": 0.09502969678024382, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019973920086458237, + "loss": 1.8929, + "step": 304 + }, + { + "epoch": 0.09534229446702094, + "grad_norm": 0.1796875, + "learning_rate": 0.00019973742448169165, + "loss": 1.6884, + "step": 305 + }, + { + "epoch": 0.09565489215379806, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019973564207752554, + "loss": 1.6901, + "step": 306 + }, + { + "epoch": 0.09596748984057518, + "grad_norm": 0.1875, + "learning_rate": 0.00019973385365219164, + "loss": 1.7943, + "step": 307 + }, + { + "epoch": 0.0962800875273523, + "grad_norm": 0.1875, + "learning_rate": 0.0001997320592057979, + "loss": 1.9581, + "step": 308 + }, + { + "epoch": 0.09659268521412942, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019973025873845263, + "loss": 1.6522, + "step": 309 + }, + { + "epoch": 0.09690528290090654, + "grad_norm": 0.189453125, + "learning_rate": 0.00019972845225026456, + "loss": 1.9327, + "step": 310 + }, + { + "epoch": 0.09721788058768364, + "grad_norm": 0.18359375, + "learning_rate": 0.00019972663974134275, + "loss": 1.9542, + "step": 311 + }, + { + "epoch": 0.09753047827446076, + "grad_norm": 0.189453125, + "learning_rate": 0.00019972482121179664, + "loss": 2.0571, + "step": 312 + }, + { + "epoch": 0.09784307596123788, + "grad_norm": 0.181640625, + "learning_rate": 0.00019972299666173594, + "loss": 2.2707, + "step": 313 + }, + { + "epoch": 0.098155673648015, + "grad_norm": 0.185546875, + "learning_rate": 0.0001997211660912709, + "loss": 1.9587, + "step": 314 + }, + { + "epoch": 0.09846827133479212, + "grad_norm": 0.189453125, + "learning_rate": 0.00019971932950051198, + "loss": 2.0126, + "step": 315 + }, + { + "epoch": 0.09878086902156924, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019971748688957003, + "loss": 1.7935, + "step": 316 + }, + { + "epoch": 0.09909346670834636, + "grad_norm": 0.18359375, + "learning_rate": 0.00019971563825855638, + "loss": 1.8761, + "step": 317 + }, + { + "epoch": 0.09940606439512348, + "grad_norm": 0.19921875, + "learning_rate": 0.00019971378360758254, + "loss": 2.2404, + "step": 318 + }, + { + "epoch": 0.0997186620819006, + "grad_norm": 0.177734375, + "learning_rate": 0.0001997119229367605, + "loss": 1.8394, + "step": 319 + }, + { + "epoch": 0.10003125976867772, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019971005624620265, + "loss": 1.8923, + "step": 320 + }, + { + "epoch": 0.10034385745545484, + "grad_norm": 0.1953125, + "learning_rate": 0.00019970818353602163, + "loss": 1.6077, + "step": 321 + }, + { + "epoch": 0.10065645514223195, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019970630480633047, + "loss": 1.8617, + "step": 322 + }, + { + "epoch": 0.10096905282900906, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001997044200572427, + "loss": 1.892, + "step": 323 + }, + { + "epoch": 0.10128165051578618, + "grad_norm": 0.181640625, + "learning_rate": 0.000199702529288872, + "loss": 1.7457, + "step": 324 + }, + { + "epoch": 0.1015942482025633, + "grad_norm": 0.173828125, + "learning_rate": 0.00019970063250133256, + "loss": 1.9309, + "step": 325 + }, + { + "epoch": 0.10190684588934042, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019969872969473888, + "loss": 1.905, + "step": 326 + }, + { + "epoch": 0.10221944357611754, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019969682086920585, + "loss": 1.697, + "step": 327 + }, + { + "epoch": 0.10253204126289465, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001996949060248487, + "loss": 1.8728, + "step": 328 + }, + { + "epoch": 0.10284463894967177, + "grad_norm": 0.1796875, + "learning_rate": 0.00019969298516178303, + "loss": 1.7783, + "step": 329 + }, + { + "epoch": 0.10315723663644889, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001996910582801248, + "loss": 1.8591, + "step": 330 + }, + { + "epoch": 0.10346983432322601, + "grad_norm": 0.181640625, + "learning_rate": 0.00019968912537999034, + "loss": 1.8009, + "step": 331 + }, + { + "epoch": 0.10378243201000313, + "grad_norm": 0.177734375, + "learning_rate": 0.00019968718646149635, + "loss": 1.6679, + "step": 332 + }, + { + "epoch": 0.10409502969678025, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019968524152475986, + "loss": 1.9598, + "step": 333 + }, + { + "epoch": 0.10440762738355736, + "grad_norm": 0.185546875, + "learning_rate": 0.00019968329056989836, + "loss": 1.7525, + "step": 334 + }, + { + "epoch": 0.10472022507033447, + "grad_norm": 0.1875, + "learning_rate": 0.00019968133359702956, + "loss": 1.9891, + "step": 335 + }, + { + "epoch": 0.1050328227571116, + "grad_norm": 0.27734375, + "learning_rate": 0.00019967937060627163, + "loss": 2.6398, + "step": 336 + }, + { + "epoch": 0.10534542044388871, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019967740159774304, + "loss": 1.8126, + "step": 337 + }, + { + "epoch": 0.10565801813066583, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001996754265715627, + "loss": 1.5844, + "step": 338 + }, + { + "epoch": 0.10597061581744295, + "grad_norm": 0.443359375, + "learning_rate": 0.00019967344552784987, + "loss": 2.6948, + "step": 339 + }, + { + "epoch": 0.10628321350422007, + "grad_norm": 0.1796875, + "learning_rate": 0.00019967145846672412, + "loss": 1.8124, + "step": 340 + }, + { + "epoch": 0.10659581119099719, + "grad_norm": 0.17578125, + "learning_rate": 0.00019966946538830537, + "loss": 1.7512, + "step": 341 + }, + { + "epoch": 0.10690840887777431, + "grad_norm": 0.203125, + "learning_rate": 0.00019966746629271402, + "loss": 1.886, + "step": 342 + }, + { + "epoch": 0.10722100656455143, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001996654611800707, + "loss": 1.8067, + "step": 343 + }, + { + "epoch": 0.10753360425132855, + "grad_norm": 0.185546875, + "learning_rate": 0.0001996634500504965, + "loss": 1.8013, + "step": 344 + }, + { + "epoch": 0.10784620193810565, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019966143290411282, + "loss": 1.701, + "step": 345 + }, + { + "epoch": 0.10815879962488277, + "grad_norm": 0.1953125, + "learning_rate": 0.00019965940974104145, + "loss": 1.6386, + "step": 346 + }, + { + "epoch": 0.10847139731165989, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001996573805614045, + "loss": 1.9652, + "step": 347 + }, + { + "epoch": 0.10878399499843701, + "grad_norm": 0.189453125, + "learning_rate": 0.0001996553453653245, + "loss": 1.8178, + "step": 348 + }, + { + "epoch": 0.10909659268521413, + "grad_norm": 0.177734375, + "learning_rate": 0.00019965330415292428, + "loss": 1.8802, + "step": 349 + }, + { + "epoch": 0.10940919037199125, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001996512569243271, + "loss": 1.6879, + "step": 350 + }, + { + "epoch": 0.10972178805876837, + "grad_norm": 0.1923828125, + "learning_rate": 0.0001996492036796566, + "loss": 1.8288, + "step": 351 + }, + { + "epoch": 0.11003438574554548, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019964714441903663, + "loss": 1.8453, + "step": 352 + }, + { + "epoch": 0.1103469834323226, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019964507914259157, + "loss": 1.8259, + "step": 353 + }, + { + "epoch": 0.11065958111909972, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019964300785044615, + "loss": 1.9748, + "step": 354 + }, + { + "epoch": 0.11097217880587684, + "grad_norm": 0.18359375, + "learning_rate": 0.00019964093054272535, + "loss": 2.0296, + "step": 355 + }, + { + "epoch": 0.11128477649265396, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001996388472195546, + "loss": 2.1065, + "step": 356 + }, + { + "epoch": 0.11159737417943107, + "grad_norm": 0.19140625, + "learning_rate": 0.00019963675788105967, + "loss": 1.712, + "step": 357 + }, + { + "epoch": 0.11190997186620819, + "grad_norm": 0.173828125, + "learning_rate": 0.0001996346625273667, + "loss": 2.178, + "step": 358 + }, + { + "epoch": 0.1122225695529853, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019963256115860219, + "loss": 1.6854, + "step": 359 + }, + { + "epoch": 0.11253516723976242, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019963045377489297, + "loss": 1.7912, + "step": 360 + }, + { + "epoch": 0.11284776492653954, + "grad_norm": 0.181640625, + "learning_rate": 0.00019962834037636634, + "loss": 1.7385, + "step": 361 + }, + { + "epoch": 0.11316036261331666, + "grad_norm": 0.19140625, + "learning_rate": 0.00019962622096314983, + "loss": 1.787, + "step": 362 + }, + { + "epoch": 0.11347296030009378, + "grad_norm": 0.185546875, + "learning_rate": 0.00019962409553537141, + "loss": 1.7083, + "step": 363 + }, + { + "epoch": 0.1137855579868709, + "grad_norm": 0.177734375, + "learning_rate": 0.00019962196409315937, + "loss": 1.7489, + "step": 364 + }, + { + "epoch": 0.11409815567364802, + "grad_norm": 0.1875, + "learning_rate": 0.00019961982663664244, + "loss": 1.8184, + "step": 365 + }, + { + "epoch": 0.11441075336042514, + "grad_norm": 0.181640625, + "learning_rate": 0.0001996176831659496, + "loss": 1.924, + "step": 366 + }, + { + "epoch": 0.11472335104720226, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001996155336812103, + "loss": 2.1837, + "step": 367 + }, + { + "epoch": 0.11503594873397936, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019961337818255424, + "loss": 1.9305, + "step": 368 + }, + { + "epoch": 0.11534854642075648, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019961121667011166, + "loss": 1.9867, + "step": 369 + }, + { + "epoch": 0.1156611441075336, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019960904914401298, + "loss": 1.968, + "step": 370 + }, + { + "epoch": 0.11597374179431072, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019960687560438908, + "loss": 1.6922, + "step": 371 + }, + { + "epoch": 0.11628633948108784, + "grad_norm": 0.169921875, + "learning_rate": 0.00019960469605137114, + "loss": 1.7978, + "step": 372 + }, + { + "epoch": 0.11659893716786496, + "grad_norm": 0.189453125, + "learning_rate": 0.0001996025104850908, + "loss": 1.8674, + "step": 373 + }, + { + "epoch": 0.11691153485464208, + "grad_norm": 0.1796875, + "learning_rate": 0.00019960031890567997, + "loss": 1.7445, + "step": 374 + }, + { + "epoch": 0.1172241325414192, + "grad_norm": 0.185546875, + "learning_rate": 0.00019959812131327095, + "loss": 1.7513, + "step": 375 + }, + { + "epoch": 0.11753673022819631, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019959591770799643, + "loss": 1.7463, + "step": 376 + }, + { + "epoch": 0.11784932791497343, + "grad_norm": 0.189453125, + "learning_rate": 0.00019959370808998945, + "loss": 1.6496, + "step": 377 + }, + { + "epoch": 0.11816192560175055, + "grad_norm": 0.18359375, + "learning_rate": 0.0001995914924593834, + "loss": 1.6407, + "step": 378 + }, + { + "epoch": 0.11847452328852766, + "grad_norm": 0.19140625, + "learning_rate": 0.00019958927081631205, + "loss": 1.9992, + "step": 379 + }, + { + "epoch": 0.11878712097530478, + "grad_norm": 0.1875, + "learning_rate": 0.0001995870431609095, + "loss": 1.7538, + "step": 380 + }, + { + "epoch": 0.1190997186620819, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019958480949331024, + "loss": 1.6851, + "step": 381 + }, + { + "epoch": 0.11941231634885902, + "grad_norm": 0.189453125, + "learning_rate": 0.00019958256981364916, + "loss": 1.7887, + "step": 382 + }, + { + "epoch": 0.11972491403563613, + "grad_norm": 0.181640625, + "learning_rate": 0.00019958032412206142, + "loss": 1.8162, + "step": 383 + }, + { + "epoch": 0.12003751172241325, + "grad_norm": 0.1875, + "learning_rate": 0.0001995780724186826, + "loss": 1.8541, + "step": 384 + }, + { + "epoch": 0.12035010940919037, + "grad_norm": 0.1875, + "learning_rate": 0.00019957581470364869, + "loss": 1.8194, + "step": 385 + }, + { + "epoch": 0.12066270709596749, + "grad_norm": 0.20703125, + "learning_rate": 0.0001995735509770959, + "loss": 1.7891, + "step": 386 + }, + { + "epoch": 0.12097530478274461, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019957128123916103, + "loss": 1.992, + "step": 387 + }, + { + "epoch": 0.12128790246952173, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019956900548998097, + "loss": 1.9259, + "step": 388 + }, + { + "epoch": 0.12160050015629885, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019956672372969315, + "loss": 2.0642, + "step": 389 + }, + { + "epoch": 0.12191309784307595, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001995644359584354, + "loss": 1.6211, + "step": 390 + }, + { + "epoch": 0.12222569552985307, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019956214217634575, + "loss": 1.7604, + "step": 391 + }, + { + "epoch": 0.12253829321663019, + "grad_norm": 0.177734375, + "learning_rate": 0.00019955984238356268, + "loss": 1.8761, + "step": 392 + }, + { + "epoch": 0.12285089090340731, + "grad_norm": 0.193359375, + "learning_rate": 0.0001995575365802251, + "loss": 2.0069, + "step": 393 + }, + { + "epoch": 0.12316348859018443, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001995552247664721, + "loss": 1.7372, + "step": 394 + }, + { + "epoch": 0.12347608627696155, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019955290694244338, + "loss": 1.8025, + "step": 395 + }, + { + "epoch": 0.12378868396373867, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019955058310827878, + "loss": 1.8633, + "step": 396 + }, + { + "epoch": 0.12410128165051579, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019954825326411863, + "loss": 1.9765, + "step": 397 + }, + { + "epoch": 0.1244138793372929, + "grad_norm": 0.197265625, + "learning_rate": 0.0001995459174101036, + "loss": 1.6959, + "step": 398 + }, + { + "epoch": 0.12472647702407003, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001995435755463746, + "loss": 1.6401, + "step": 399 + }, + { + "epoch": 0.12503907471084713, + "grad_norm": 0.185546875, + "learning_rate": 0.00019954122767307318, + "loss": 2.1424, + "step": 400 + }, + { + "epoch": 0.12535167239762426, + "grad_norm": 0.17578125, + "learning_rate": 0.00019953887379034094, + "loss": 1.9393, + "step": 401 + }, + { + "epoch": 0.12566427008440137, + "grad_norm": 0.193359375, + "learning_rate": 0.00019953651389832008, + "loss": 1.8414, + "step": 402 + }, + { + "epoch": 0.1259768677711785, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019953414799715304, + "loss": 1.9348, + "step": 403 + }, + { + "epoch": 0.1262894654579556, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019953177608698263, + "loss": 1.6774, + "step": 404 + }, + { + "epoch": 0.12660206314473274, + "grad_norm": 0.18359375, + "learning_rate": 0.00019952939816795205, + "loss": 1.9635, + "step": 405 + }, + { + "epoch": 0.12691466083150985, + "grad_norm": 0.189453125, + "learning_rate": 0.0001995270142402049, + "loss": 1.788, + "step": 406 + }, + { + "epoch": 0.12722725851828695, + "grad_norm": 0.177734375, + "learning_rate": 0.00019952462430388506, + "loss": 1.7256, + "step": 407 + }, + { + "epoch": 0.12753985620506408, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019952222835913682, + "loss": 1.8476, + "step": 408 + }, + { + "epoch": 0.1278524538918412, + "grad_norm": 0.19140625, + "learning_rate": 0.00019951982640610484, + "loss": 1.9212, + "step": 409 + }, + { + "epoch": 0.12816505157861832, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019951741844493413, + "loss": 1.807, + "step": 410 + }, + { + "epoch": 0.12847764926539543, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019951500447577003, + "loss": 1.6015, + "step": 411 + }, + { + "epoch": 0.12879024695217256, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019951258449875828, + "loss": 1.8802, + "step": 412 + }, + { + "epoch": 0.12910284463894967, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019951015851404504, + "loss": 1.9614, + "step": 413 + }, + { + "epoch": 0.1294154423257268, + "grad_norm": 0.197265625, + "learning_rate": 0.0001995077265217767, + "loss": 1.8907, + "step": 414 + }, + { + "epoch": 0.1297280400125039, + "grad_norm": 0.197265625, + "learning_rate": 0.00019950528852210014, + "loss": 1.8123, + "step": 415 + }, + { + "epoch": 0.13004063769928104, + "grad_norm": 0.18359375, + "learning_rate": 0.00019950284451516245, + "loss": 1.6966, + "step": 416 + }, + { + "epoch": 0.13035323538605814, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019950039450111127, + "loss": 2.0439, + "step": 417 + }, + { + "epoch": 0.13066583307283527, + "grad_norm": 0.185546875, + "learning_rate": 0.00019949793848009448, + "loss": 1.9781, + "step": 418 + }, + { + "epoch": 0.13097843075961238, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019949547645226035, + "loss": 1.9264, + "step": 419 + }, + { + "epoch": 0.13129102844638948, + "grad_norm": 0.197265625, + "learning_rate": 0.00019949300841775753, + "loss": 2.0297, + "step": 420 + }, + { + "epoch": 0.13160362613316662, + "grad_norm": 0.19140625, + "learning_rate": 0.000199490534376735, + "loss": 1.9136, + "step": 421 + }, + { + "epoch": 0.13191622381994372, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019948805432934213, + "loss": 1.8224, + "step": 422 + }, + { + "epoch": 0.13222882150672086, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019948556827572862, + "loss": 1.7871, + "step": 423 + }, + { + "epoch": 0.13254141919349796, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019948307621604457, + "loss": 1.7048, + "step": 424 + }, + { + "epoch": 0.1328540168802751, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019948057815044048, + "loss": 1.9041, + "step": 425 + }, + { + "epoch": 0.1331666145670522, + "grad_norm": 0.1796875, + "learning_rate": 0.0001994780740790671, + "loss": 1.7443, + "step": 426 + }, + { + "epoch": 0.13347921225382933, + "grad_norm": 0.189453125, + "learning_rate": 0.0001994755640020756, + "loss": 1.6474, + "step": 427 + }, + { + "epoch": 0.13379180994060644, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019947304791961758, + "loss": 1.8303, + "step": 428 + }, + { + "epoch": 0.13410440762738357, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019947052583184488, + "loss": 1.64, + "step": 429 + }, + { + "epoch": 0.13441700531416068, + "grad_norm": 0.189453125, + "learning_rate": 0.00019946799773890974, + "loss": 1.7586, + "step": 430 + }, + { + "epoch": 0.13472960300093778, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019946546364096488, + "loss": 1.8402, + "step": 431 + }, + { + "epoch": 0.13504220068771491, + "grad_norm": 0.64453125, + "learning_rate": 0.00019946292353816318, + "loss": 2.2409, + "step": 432 + }, + { + "epoch": 0.13535479837449202, + "grad_norm": 0.193359375, + "learning_rate": 0.0001994603774306581, + "loss": 1.8416, + "step": 433 + }, + { + "epoch": 0.13566739606126915, + "grad_norm": 0.181640625, + "learning_rate": 0.00019945782531860325, + "loss": 1.7372, + "step": 434 + }, + { + "epoch": 0.13597999374804626, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019945526720215273, + "loss": 1.9704, + "step": 435 + }, + { + "epoch": 0.1362925914348234, + "grad_norm": 0.185546875, + "learning_rate": 0.00019945270308146103, + "loss": 1.6651, + "step": 436 + }, + { + "epoch": 0.1366051891216005, + "grad_norm": 0.19921875, + "learning_rate": 0.00019945013295668288, + "loss": 1.7958, + "step": 437 + }, + { + "epoch": 0.13691778680837763, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001994475568279735, + "loss": 2.0826, + "step": 438 + }, + { + "epoch": 0.13723038449515473, + "grad_norm": 0.19140625, + "learning_rate": 0.00019944497469548837, + "loss": 1.8808, + "step": 439 + }, + { + "epoch": 0.13754298218193187, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019944238655938339, + "loss": 2.257, + "step": 440 + }, + { + "epoch": 0.13785557986870897, + "grad_norm": 0.1796875, + "learning_rate": 0.0001994397924198148, + "loss": 2.0791, + "step": 441 + }, + { + "epoch": 0.13816817755548608, + "grad_norm": 0.193359375, + "learning_rate": 0.00019943719227693928, + "loss": 1.8917, + "step": 442 + }, + { + "epoch": 0.1384807752422632, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001994345861309137, + "loss": 1.8261, + "step": 443 + }, + { + "epoch": 0.13879337292904032, + "grad_norm": 0.189453125, + "learning_rate": 0.00019943197398189546, + "loss": 1.626, + "step": 444 + }, + { + "epoch": 0.13910597061581745, + "grad_norm": 0.193359375, + "learning_rate": 0.00019942935583004223, + "loss": 1.7819, + "step": 445 + }, + { + "epoch": 0.13941856830259455, + "grad_norm": 0.19921875, + "learning_rate": 0.0001994267316755121, + "loss": 1.8149, + "step": 446 + }, + { + "epoch": 0.1397311659893717, + "grad_norm": 0.1796875, + "learning_rate": 0.00019942410151846347, + "loss": 1.9703, + "step": 447 + }, + { + "epoch": 0.1400437636761488, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019942146535905514, + "loss": 1.7519, + "step": 448 + }, + { + "epoch": 0.14035636136292592, + "grad_norm": 0.201171875, + "learning_rate": 0.00019941882319744625, + "loss": 1.8088, + "step": 449 + }, + { + "epoch": 0.14066895904970303, + "grad_norm": 0.1953125, + "learning_rate": 0.0001994161750337963, + "loss": 2.0352, + "step": 450 + }, + { + "epoch": 0.14098155673648016, + "grad_norm": 0.19921875, + "learning_rate": 0.0001994135208682652, + "loss": 1.7832, + "step": 451 + }, + { + "epoch": 0.14129415442325727, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019941086070101314, + "loss": 1.7351, + "step": 452 + }, + { + "epoch": 0.14160675211003437, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019940819453220074, + "loss": 1.9127, + "step": 453 + }, + { + "epoch": 0.1419193497968115, + "grad_norm": 0.478515625, + "learning_rate": 0.00019940552236198897, + "loss": 2.6953, + "step": 454 + }, + { + "epoch": 0.1422319474835886, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019940284419053914, + "loss": 2.0053, + "step": 455 + }, + { + "epoch": 0.14254454517036574, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019940016001801294, + "loss": 1.7283, + "step": 456 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.1923828125, + "learning_rate": 0.0001993974698445724, + "loss": 1.7655, + "step": 457 + }, + { + "epoch": 0.14316974054391998, + "grad_norm": 0.19921875, + "learning_rate": 0.00019939477367037994, + "loss": 1.8373, + "step": 458 + }, + { + "epoch": 0.1434823382306971, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019939207149559835, + "loss": 1.8626, + "step": 459 + }, + { + "epoch": 0.14379493591747422, + "grad_norm": 0.1943359375, + "learning_rate": 0.00019938936332039077, + "loss": 1.6125, + "step": 460 + }, + { + "epoch": 0.14410753360425133, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019938664914492062, + "loss": 2.0307, + "step": 461 + }, + { + "epoch": 0.14442013129102846, + "grad_norm": 0.193359375, + "learning_rate": 0.00019938392896935183, + "loss": 1.84, + "step": 462 + }, + { + "epoch": 0.14473272897780556, + "grad_norm": 0.19921875, + "learning_rate": 0.0001993812027938486, + "loss": 1.9634, + "step": 463 + }, + { + "epoch": 0.14504532666458267, + "grad_norm": 0.1953125, + "learning_rate": 0.00019937847061857552, + "loss": 2.0152, + "step": 464 + }, + { + "epoch": 0.1453579243513598, + "grad_norm": 0.201171875, + "learning_rate": 0.00019937573244369753, + "loss": 1.8692, + "step": 465 + }, + { + "epoch": 0.1456705220381369, + "grad_norm": 0.19140625, + "learning_rate": 0.00019937298826937995, + "loss": 1.7805, + "step": 466 + }, + { + "epoch": 0.14598311972491404, + "grad_norm": 0.197265625, + "learning_rate": 0.00019937023809578843, + "loss": 1.9569, + "step": 467 + }, + { + "epoch": 0.14629571741169115, + "grad_norm": 0.1865234375, + "learning_rate": 0.000199367481923089, + "loss": 1.9791, + "step": 468 + }, + { + "epoch": 0.14660831509846828, + "grad_norm": 0.189453125, + "learning_rate": 0.00019936471975144805, + "loss": 1.7193, + "step": 469 + }, + { + "epoch": 0.14692091278524538, + "grad_norm": 0.19140625, + "learning_rate": 0.00019936195158103237, + "loss": 1.7506, + "step": 470 + }, + { + "epoch": 0.14723351047202252, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019935917741200902, + "loss": 1.9867, + "step": 471 + }, + { + "epoch": 0.14754610815879962, + "grad_norm": 0.1953125, + "learning_rate": 0.00019935639724454556, + "loss": 1.8894, + "step": 472 + }, + { + "epoch": 0.14785870584557675, + "grad_norm": 0.197265625, + "learning_rate": 0.00019935361107880977, + "loss": 1.7917, + "step": 473 + }, + { + "epoch": 0.14817130353235386, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019935081891496985, + "loss": 1.9643, + "step": 474 + }, + { + "epoch": 0.14848390121913096, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001993480207531944, + "loss": 1.6624, + "step": 475 + }, + { + "epoch": 0.1487964989059081, + "grad_norm": 0.1943359375, + "learning_rate": 0.00019934521659365235, + "loss": 1.5768, + "step": 476 + }, + { + "epoch": 0.1491090965926852, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019934240643651298, + "loss": 1.8556, + "step": 477 + }, + { + "epoch": 0.14942169427946234, + "grad_norm": 0.189453125, + "learning_rate": 0.00019933959028194592, + "loss": 1.9329, + "step": 478 + }, + { + "epoch": 0.14973429196623944, + "grad_norm": 0.203125, + "learning_rate": 0.0001993367681301212, + "loss": 1.7054, + "step": 479 + }, + { + "epoch": 0.15004688965301657, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001993339399812092, + "loss": 1.8809, + "step": 480 + }, + { + "epoch": 0.15035948733979368, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001993311058353807, + "loss": 1.5983, + "step": 481 + }, + { + "epoch": 0.1506720850265708, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019932826569280673, + "loss": 1.7169, + "step": 482 + }, + { + "epoch": 0.15098468271334792, + "grad_norm": 0.1953125, + "learning_rate": 0.00019932541955365883, + "loss": 1.9345, + "step": 483 + }, + { + "epoch": 0.15129728040012505, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019932256741810874, + "loss": 2.1597, + "step": 484 + }, + { + "epoch": 0.15160987808690216, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001993197092863287, + "loss": 1.5661, + "step": 485 + }, + { + "epoch": 0.1519224757736793, + "grad_norm": 0.19140625, + "learning_rate": 0.0001993168451584912, + "loss": 1.8121, + "step": 486 + }, + { + "epoch": 0.1522350734604564, + "grad_norm": 0.18359375, + "learning_rate": 0.00019931397503476924, + "loss": 1.7365, + "step": 487 + }, + { + "epoch": 0.1525476711472335, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019931109891533605, + "loss": 1.6982, + "step": 488 + }, + { + "epoch": 0.15286026883401063, + "grad_norm": 0.189453125, + "learning_rate": 0.00019930821680036527, + "loss": 1.9638, + "step": 489 + }, + { + "epoch": 0.15317286652078774, + "grad_norm": 0.201171875, + "learning_rate": 0.00019930532869003086, + "loss": 2.1991, + "step": 490 + }, + { + "epoch": 0.15348546420756487, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019930243458450724, + "loss": 1.8095, + "step": 491 + }, + { + "epoch": 0.15379806189434198, + "grad_norm": 0.177734375, + "learning_rate": 0.0001992995344839691, + "loss": 1.9021, + "step": 492 + }, + { + "epoch": 0.1541106595811191, + "grad_norm": 0.19921875, + "learning_rate": 0.0001992966283885915, + "loss": 1.9448, + "step": 493 + }, + { + "epoch": 0.1544232572678962, + "grad_norm": 0.19921875, + "learning_rate": 0.00019929371629854992, + "loss": 1.9806, + "step": 494 + }, + { + "epoch": 0.15473585495467335, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001992907982140202, + "loss": 1.7495, + "step": 495 + }, + { + "epoch": 0.15504845264145045, + "grad_norm": 0.203125, + "learning_rate": 0.00019928787413517842, + "loss": 2.0022, + "step": 496 + }, + { + "epoch": 0.15536105032822758, + "grad_norm": 0.193359375, + "learning_rate": 0.00019928494406220115, + "loss": 1.7185, + "step": 497 + }, + { + "epoch": 0.1556736480150047, + "grad_norm": 0.1943359375, + "learning_rate": 0.00019928200799526532, + "loss": 2.0288, + "step": 498 + }, + { + "epoch": 0.1559862457017818, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019927906593454812, + "loss": 1.7969, + "step": 499 + }, + { + "epoch": 0.15629884338855893, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001992761178802272, + "loss": 2.1816, + "step": 500 + }, + { + "epoch": 0.15661144107533603, + "grad_norm": 0.1953125, + "learning_rate": 0.00019927316383248054, + "loss": 1.8524, + "step": 501 + }, + { + "epoch": 0.15692403876211317, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019927020379148646, + "loss": 1.6543, + "step": 502 + }, + { + "epoch": 0.15723663644889027, + "grad_norm": 0.203125, + "learning_rate": 0.0001992672377574237, + "loss": 1.7662, + "step": 503 + }, + { + "epoch": 0.1575492341356674, + "grad_norm": 0.1953125, + "learning_rate": 0.0001992642657304713, + "loss": 1.8305, + "step": 504 + }, + { + "epoch": 0.1578618318224445, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019926128771080868, + "loss": 1.6887, + "step": 505 + }, + { + "epoch": 0.15817442950922164, + "grad_norm": 0.1953125, + "learning_rate": 0.00019925830369861564, + "loss": 1.9668, + "step": 506 + }, + { + "epoch": 0.15848702719599875, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019925531369407228, + "loss": 1.8739, + "step": 507 + }, + { + "epoch": 0.15879962488277588, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019925231769735917, + "loss": 1.8289, + "step": 508 + }, + { + "epoch": 0.15911222256955299, + "grad_norm": 0.185546875, + "learning_rate": 0.0001992493157086572, + "loss": 1.9057, + "step": 509 + }, + { + "epoch": 0.1594248202563301, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019924630772814753, + "loss": 1.8643, + "step": 510 + }, + { + "epoch": 0.15973741794310722, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019924329375601177, + "loss": 1.8911, + "step": 511 + }, + { + "epoch": 0.16005001562988433, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019924027379243192, + "loss": 1.6922, + "step": 512 + }, + { + "epoch": 0.16036261331666146, + "grad_norm": 0.1923828125, + "learning_rate": 0.0001992372478375903, + "loss": 1.9621, + "step": 513 + }, + { + "epoch": 0.16067521100343857, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019923421589166954, + "loss": 1.8731, + "step": 514 + }, + { + "epoch": 0.1609878086902157, + "grad_norm": 0.201171875, + "learning_rate": 0.00019923117795485272, + "loss": 1.6659, + "step": 515 + }, + { + "epoch": 0.1613004063769928, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019922813402732325, + "loss": 1.9896, + "step": 516 + }, + { + "epoch": 0.16161300406376994, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019922508410926489, + "loss": 1.8087, + "step": 517 + }, + { + "epoch": 0.16192560175054704, + "grad_norm": 0.19921875, + "learning_rate": 0.00019922202820086171, + "loss": 2.0338, + "step": 518 + }, + { + "epoch": 0.16223819943732418, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019921896630229827, + "loss": 1.8984, + "step": 519 + }, + { + "epoch": 0.16255079712410128, + "grad_norm": 0.205078125, + "learning_rate": 0.0001992158984137594, + "loss": 1.7892, + "step": 520 + }, + { + "epoch": 0.1628633948108784, + "grad_norm": 0.19921875, + "learning_rate": 0.00019921282453543032, + "loss": 1.6763, + "step": 521 + }, + { + "epoch": 0.16317599249765552, + "grad_norm": 0.185546875, + "learning_rate": 0.0001992097446674966, + "loss": 1.8474, + "step": 522 + }, + { + "epoch": 0.16348859018443263, + "grad_norm": 0.193359375, + "learning_rate": 0.00019920665881014416, + "loss": 1.9876, + "step": 523 + }, + { + "epoch": 0.16380118787120976, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001992035669635593, + "loss": 1.7454, + "step": 524 + }, + { + "epoch": 0.16411378555798686, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001992004691279287, + "loss": 1.9164, + "step": 525 + }, + { + "epoch": 0.164426383244764, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019919736530343935, + "loss": 1.9096, + "step": 526 + }, + { + "epoch": 0.1647389809315411, + "grad_norm": 0.1953125, + "learning_rate": 0.00019919425549027865, + "loss": 1.9148, + "step": 527 + }, + { + "epoch": 0.16505157861831823, + "grad_norm": 0.1953125, + "learning_rate": 0.00019919113968863437, + "loss": 1.9967, + "step": 528 + }, + { + "epoch": 0.16536417630509534, + "grad_norm": 0.2109375, + "learning_rate": 0.00019918801789869453, + "loss": 1.9329, + "step": 529 + }, + { + "epoch": 0.16567677399187247, + "grad_norm": 0.19921875, + "learning_rate": 0.00019918489012064772, + "loss": 1.9399, + "step": 530 + }, + { + "epoch": 0.16598937167864958, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019918175635468265, + "loss": 1.9082, + "step": 531 + }, + { + "epoch": 0.16630196936542668, + "grad_norm": 0.193359375, + "learning_rate": 0.00019917861660098858, + "loss": 1.9138, + "step": 532 + }, + { + "epoch": 0.16661456705220382, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019917547085975505, + "loss": 1.7534, + "step": 533 + }, + { + "epoch": 0.16692716473898092, + "grad_norm": 0.181640625, + "learning_rate": 0.00019917231913117197, + "loss": 1.8574, + "step": 534 + }, + { + "epoch": 0.16723976242575805, + "grad_norm": 0.19921875, + "learning_rate": 0.0001991691614154296, + "loss": 1.7967, + "step": 535 + }, + { + "epoch": 0.16755236011253516, + "grad_norm": 0.1943359375, + "learning_rate": 0.00019916599771271855, + "loss": 1.765, + "step": 536 + }, + { + "epoch": 0.1678649577993123, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019916282802322989, + "loss": 1.9999, + "step": 537 + }, + { + "epoch": 0.1681775554860894, + "grad_norm": 0.197265625, + "learning_rate": 0.00019915965234715491, + "loss": 1.9353, + "step": 538 + }, + { + "epoch": 0.16849015317286653, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019915647068468538, + "loss": 1.8003, + "step": 539 + }, + { + "epoch": 0.16880275085964364, + "grad_norm": 0.19921875, + "learning_rate": 0.00019915328303601334, + "loss": 2.1542, + "step": 540 + }, + { + "epoch": 0.16911534854642077, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019915008940133127, + "loss": 1.9446, + "step": 541 + }, + { + "epoch": 0.16942794623319787, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019914688978083192, + "loss": 2.0184, + "step": 542 + }, + { + "epoch": 0.16974054391997498, + "grad_norm": 0.1875, + "learning_rate": 0.00019914368417470852, + "loss": 1.8707, + "step": 543 + }, + { + "epoch": 0.1700531416067521, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019914047258315457, + "loss": 1.8503, + "step": 544 + }, + { + "epoch": 0.17036573929352922, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019913725500636393, + "loss": 1.9382, + "step": 545 + }, + { + "epoch": 0.17067833698030635, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019913403144453088, + "loss": 1.6436, + "step": 546 + }, + { + "epoch": 0.17099093466708346, + "grad_norm": 0.197265625, + "learning_rate": 0.00019913080189785002, + "loss": 2.0155, + "step": 547 + }, + { + "epoch": 0.1713035323538606, + "grad_norm": 0.1875, + "learning_rate": 0.00019912756636651638, + "loss": 1.9679, + "step": 548 + }, + { + "epoch": 0.1716161300406377, + "grad_norm": 0.197265625, + "learning_rate": 0.00019912432485072516, + "loss": 1.619, + "step": 549 + }, + { + "epoch": 0.17192872772741483, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001991210773506722, + "loss": 1.8251, + "step": 550 + }, + { + "epoch": 0.17224132541419193, + "grad_norm": 0.197265625, + "learning_rate": 0.00019911782386655341, + "loss": 1.9356, + "step": 551 + }, + { + "epoch": 0.17255392310096906, + "grad_norm": 0.193359375, + "learning_rate": 0.00019911456439856536, + "loss": 1.7967, + "step": 552 + }, + { + "epoch": 0.17286652078774617, + "grad_norm": 0.1953125, + "learning_rate": 0.00019911129894690475, + "loss": 1.7887, + "step": 553 + }, + { + "epoch": 0.17317911847452327, + "grad_norm": 0.201171875, + "learning_rate": 0.00019910802751176867, + "loss": 1.8225, + "step": 554 + }, + { + "epoch": 0.1734917161613004, + "grad_norm": 0.1943359375, + "learning_rate": 0.00019910475009335472, + "loss": 1.7761, + "step": 555 + }, + { + "epoch": 0.1738043138480775, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001991014666918607, + "loss": 1.917, + "step": 556 + }, + { + "epoch": 0.17411691153485465, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019909817730748487, + "loss": 1.707, + "step": 557 + }, + { + "epoch": 0.17442950922163175, + "grad_norm": 0.1953125, + "learning_rate": 0.00019909488194042575, + "loss": 2.2473, + "step": 558 + }, + { + "epoch": 0.17474210690840888, + "grad_norm": 0.1953125, + "learning_rate": 0.00019909158059088235, + "loss": 1.5952, + "step": 559 + }, + { + "epoch": 0.175054704595186, + "grad_norm": 0.1923828125, + "learning_rate": 0.000199088273259054, + "loss": 1.6575, + "step": 560 + }, + { + "epoch": 0.17536730228196312, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019908495994514026, + "loss": 1.9749, + "step": 561 + }, + { + "epoch": 0.17567989996874023, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019908164064934126, + "loss": 1.681, + "step": 562 + }, + { + "epoch": 0.17599249765551736, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019907831537185734, + "loss": 1.7532, + "step": 563 + }, + { + "epoch": 0.17630509534229447, + "grad_norm": 0.19140625, + "learning_rate": 0.00019907498411288925, + "loss": 2.0639, + "step": 564 + }, + { + "epoch": 0.1766176930290716, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019907164687263813, + "loss": 2.1285, + "step": 565 + }, + { + "epoch": 0.1769302907158487, + "grad_norm": 0.189453125, + "learning_rate": 0.00019906830365130546, + "loss": 1.7988, + "step": 566 + }, + { + "epoch": 0.1772428884026258, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019906495444909302, + "loss": 1.6593, + "step": 567 + }, + { + "epoch": 0.17755548608940294, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019906159926620306, + "loss": 1.8094, + "step": 568 + }, + { + "epoch": 0.17786808377618005, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019905823810283812, + "loss": 1.6249, + "step": 569 + }, + { + "epoch": 0.17818068146295718, + "grad_norm": 0.185546875, + "learning_rate": 0.0001990548709592011, + "loss": 1.6268, + "step": 570 + }, + { + "epoch": 0.17849327914973429, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019905149783549532, + "loss": 1.5067, + "step": 571 + }, + { + "epoch": 0.17880587683651142, + "grad_norm": 0.19140625, + "learning_rate": 0.00019904811873192437, + "loss": 1.7792, + "step": 572 + }, + { + "epoch": 0.17911847452328852, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001990447336486923, + "loss": 1.7893, + "step": 573 + }, + { + "epoch": 0.17943107221006566, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001990413425860034, + "loss": 1.7304, + "step": 574 + }, + { + "epoch": 0.17974366989684276, + "grad_norm": 0.193359375, + "learning_rate": 0.00019903794554406248, + "loss": 1.9092, + "step": 575 + }, + { + "epoch": 0.1800562675836199, + "grad_norm": 0.193359375, + "learning_rate": 0.00019903454252307454, + "loss": 1.6916, + "step": 576 + }, + { + "epoch": 0.180368865270397, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001990311335232451, + "loss": 1.7437, + "step": 577 + }, + { + "epoch": 0.1806814629571741, + "grad_norm": 0.203125, + "learning_rate": 0.00019902771854477994, + "loss": 1.7296, + "step": 578 + }, + { + "epoch": 0.18099406064395124, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001990242975878852, + "loss": 1.6257, + "step": 579 + }, + { + "epoch": 0.18130665833072834, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001990208706527674, + "loss": 1.6635, + "step": 580 + }, + { + "epoch": 0.18161925601750548, + "grad_norm": 0.208984375, + "learning_rate": 0.00019901743773963353, + "loss": 1.8428, + "step": 581 + }, + { + "epoch": 0.18193185370428258, + "grad_norm": 0.1953125, + "learning_rate": 0.00019901399884869072, + "loss": 1.7945, + "step": 582 + }, + { + "epoch": 0.18224445139105971, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019901055398014662, + "loss": 1.7858, + "step": 583 + }, + { + "epoch": 0.18255704907783682, + "grad_norm": 0.19921875, + "learning_rate": 0.0001990071031342092, + "loss": 1.62, + "step": 584 + }, + { + "epoch": 0.18286964676461395, + "grad_norm": 0.201171875, + "learning_rate": 0.00019900364631108682, + "loss": 1.8136, + "step": 585 + }, + { + "epoch": 0.18318224445139106, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019900018351098813, + "loss": 1.9074, + "step": 586 + }, + { + "epoch": 0.1834948421381682, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001989967147341222, + "loss": 1.8761, + "step": 587 + }, + { + "epoch": 0.1838074398249453, + "grad_norm": 0.19921875, + "learning_rate": 0.00019899323998069846, + "loss": 1.8516, + "step": 588 + }, + { + "epoch": 0.1841200375117224, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001989897592509267, + "loss": 1.7505, + "step": 589 + }, + { + "epoch": 0.18443263519849953, + "grad_norm": 0.189453125, + "learning_rate": 0.00019898627254501697, + "loss": 1.9066, + "step": 590 + }, + { + "epoch": 0.18474523288527664, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001989827798631799, + "loss": 1.926, + "step": 591 + }, + { + "epoch": 0.18505783057205377, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019897928120562623, + "loss": 1.9225, + "step": 592 + }, + { + "epoch": 0.18537042825883088, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019897577657256724, + "loss": 2.0965, + "step": 593 + }, + { + "epoch": 0.185683025945608, + "grad_norm": 0.20703125, + "learning_rate": 0.00019897226596421447, + "loss": 1.7195, + "step": 594 + }, + { + "epoch": 0.18599562363238512, + "grad_norm": 0.197265625, + "learning_rate": 0.00019896874938077992, + "loss": 1.8197, + "step": 595 + }, + { + "epoch": 0.18630822131916225, + "grad_norm": 0.1884765625, + "learning_rate": 0.0001989652268224758, + "loss": 2.2171, + "step": 596 + }, + { + "epoch": 0.18662081900593935, + "grad_norm": 0.1875, + "learning_rate": 0.00019896169828951488, + "loss": 1.8195, + "step": 597 + }, + { + "epoch": 0.1869334166927165, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019895816378211008, + "loss": 1.6969, + "step": 598 + }, + { + "epoch": 0.1872460143794936, + "grad_norm": 0.19921875, + "learning_rate": 0.00019895462330047484, + "loss": 1.8099, + "step": 599 + }, + { + "epoch": 0.1875586120662707, + "grad_norm": 0.189453125, + "learning_rate": 0.00019895107684482293, + "loss": 1.7597, + "step": 600 + }, + { + "epoch": 0.18787120975304783, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019894752441536838, + "loss": 1.7928, + "step": 601 + }, + { + "epoch": 0.18818380743982493, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019894396601232567, + "loss": 1.7385, + "step": 602 + }, + { + "epoch": 0.18849640512660207, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001989404016359097, + "loss": 1.7216, + "step": 603 + }, + { + "epoch": 0.18880900281337917, + "grad_norm": 0.19140625, + "learning_rate": 0.00019893683128633557, + "loss": 1.749, + "step": 604 + }, + { + "epoch": 0.1891216005001563, + "grad_norm": 0.189453125, + "learning_rate": 0.00019893325496381884, + "loss": 1.8708, + "step": 605 + }, + { + "epoch": 0.1894341981869334, + "grad_norm": 0.197265625, + "learning_rate": 0.00019892967266857547, + "loss": 1.9852, + "step": 606 + }, + { + "epoch": 0.18974679587371054, + "grad_norm": 0.203125, + "learning_rate": 0.0001989260844008217, + "loss": 1.7595, + "step": 607 + }, + { + "epoch": 0.19005939356048765, + "grad_norm": 0.197265625, + "learning_rate": 0.00019892249016077412, + "loss": 1.7231, + "step": 608 + }, + { + "epoch": 0.19037199124726478, + "grad_norm": 0.212890625, + "learning_rate": 0.0001989188899486498, + "loss": 1.7735, + "step": 609 + }, + { + "epoch": 0.1906845889340419, + "grad_norm": 0.1953125, + "learning_rate": 0.00019891528376466598, + "loss": 1.8502, + "step": 610 + }, + { + "epoch": 0.190997186620819, + "grad_norm": 0.19921875, + "learning_rate": 0.00019891167160904046, + "loss": 1.8522, + "step": 611 + }, + { + "epoch": 0.19130978430759613, + "grad_norm": 0.19921875, + "learning_rate": 0.0001989080534819913, + "loss": 2.0308, + "step": 612 + }, + { + "epoch": 0.19162238199437323, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019890442938373686, + "loss": 1.7471, + "step": 613 + }, + { + "epoch": 0.19193497968115036, + "grad_norm": 0.1962890625, + "learning_rate": 0.000198900799314496, + "loss": 1.5426, + "step": 614 + }, + { + "epoch": 0.19224757736792747, + "grad_norm": 0.197265625, + "learning_rate": 0.0001988971632744879, + "loss": 2.0733, + "step": 615 + }, + { + "epoch": 0.1925601750547046, + "grad_norm": 0.193359375, + "learning_rate": 0.00019889352126393198, + "loss": 1.8229, + "step": 616 + }, + { + "epoch": 0.1928727727414817, + "grad_norm": 0.19921875, + "learning_rate": 0.00019888987328304817, + "loss": 1.9119, + "step": 617 + }, + { + "epoch": 0.19318537042825884, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001988862193320567, + "loss": 1.6569, + "step": 618 + }, + { + "epoch": 0.19349796811503595, + "grad_norm": 0.189453125, + "learning_rate": 0.00019888255941117816, + "loss": 2.0652, + "step": 619 + }, + { + "epoch": 0.19381056580181308, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001988788935206335, + "loss": 1.6115, + "step": 620 + }, + { + "epoch": 0.19412316348859018, + "grad_norm": 0.197265625, + "learning_rate": 0.00019887522166064402, + "loss": 1.6017, + "step": 621 + }, + { + "epoch": 0.1944357611753673, + "grad_norm": 0.1875, + "learning_rate": 0.00019887154383143143, + "loss": 1.9108, + "step": 622 + }, + { + "epoch": 0.19474835886214442, + "grad_norm": 0.1943359375, + "learning_rate": 0.00019886786003321772, + "loss": 1.6372, + "step": 623 + }, + { + "epoch": 0.19506095654892153, + "grad_norm": 0.296875, + "learning_rate": 0.0001988641702662253, + "loss": 2.4569, + "step": 624 + }, + { + "epoch": 0.19537355423569866, + "grad_norm": 0.1875, + "learning_rate": 0.000198860474530677, + "loss": 1.6954, + "step": 625 + }, + { + "epoch": 0.19568615192247577, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019885677282679585, + "loss": 1.8825, + "step": 626 + }, + { + "epoch": 0.1959987496092529, + "grad_norm": 0.1943359375, + "learning_rate": 0.00019885306515480533, + "loss": 1.7887, + "step": 627 + }, + { + "epoch": 0.19631134729603, + "grad_norm": 0.2109375, + "learning_rate": 0.00019884935151492933, + "loss": 1.8936, + "step": 628 + }, + { + "epoch": 0.19662394498280714, + "grad_norm": 0.19921875, + "learning_rate": 0.00019884563190739196, + "loss": 1.7583, + "step": 629 + }, + { + "epoch": 0.19693654266958424, + "grad_norm": 0.201171875, + "learning_rate": 0.0001988419063324179, + "loss": 1.898, + "step": 630 + }, + { + "epoch": 0.19724914035636137, + "grad_norm": 0.193359375, + "learning_rate": 0.0001988381747902319, + "loss": 2.0045, + "step": 631 + }, + { + "epoch": 0.19756173804313848, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019883443728105943, + "loss": 1.9453, + "step": 632 + }, + { + "epoch": 0.1978743357299156, + "grad_norm": 0.2001953125, + "learning_rate": 0.000198830693805126, + "loss": 1.8481, + "step": 633 + }, + { + "epoch": 0.19818693341669272, + "grad_norm": 0.203125, + "learning_rate": 0.00019882694436265764, + "loss": 1.8409, + "step": 634 + }, + { + "epoch": 0.19849953110346982, + "grad_norm": 0.208984375, + "learning_rate": 0.00019882318895388072, + "loss": 1.8232, + "step": 635 + }, + { + "epoch": 0.19881212879024696, + "grad_norm": 0.193359375, + "learning_rate": 0.00019881942757902197, + "loss": 1.7768, + "step": 636 + }, + { + "epoch": 0.19912472647702406, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001988156602383084, + "loss": 1.7056, + "step": 637 + }, + { + "epoch": 0.1994373241638012, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019881188693196756, + "loss": 1.5243, + "step": 638 + }, + { + "epoch": 0.1997499218505783, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019880810766022714, + "loss": 2.0564, + "step": 639 + }, + { + "epoch": 0.20006251953735543, + "grad_norm": 0.203125, + "learning_rate": 0.00019880432242331536, + "loss": 1.8789, + "step": 640 + }, + { + "epoch": 0.20037511722413254, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019880053122146073, + "loss": 1.8037, + "step": 641 + }, + { + "epoch": 0.20068771491090967, + "grad_norm": 0.205078125, + "learning_rate": 0.00019879673405489215, + "loss": 1.7692, + "step": 642 + }, + { + "epoch": 0.20100031259768678, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019879293092383882, + "loss": 1.7066, + "step": 643 + }, + { + "epoch": 0.2013129102844639, + "grad_norm": 0.197265625, + "learning_rate": 0.00019878912182853036, + "loss": 1.8715, + "step": 644 + }, + { + "epoch": 0.201625507971241, + "grad_norm": 0.19140625, + "learning_rate": 0.0001987853067691967, + "loss": 1.6647, + "step": 645 + }, + { + "epoch": 0.20193810565801812, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019878148574606824, + "loss": 1.6027, + "step": 646 + }, + { + "epoch": 0.20225070334479525, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019877765875937558, + "loss": 1.6788, + "step": 647 + }, + { + "epoch": 0.20256330103157236, + "grad_norm": 0.205078125, + "learning_rate": 0.00019877382580934977, + "loss": 1.7934, + "step": 648 + }, + { + "epoch": 0.2028758987183495, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019876998689622225, + "loss": 1.6556, + "step": 649 + }, + { + "epoch": 0.2031884964051266, + "grad_norm": 0.1953125, + "learning_rate": 0.00019876614202022475, + "loss": 1.7103, + "step": 650 + }, + { + "epoch": 0.20350109409190373, + "grad_norm": 0.193359375, + "learning_rate": 0.0001987622911815894, + "loss": 1.7654, + "step": 651 + }, + { + "epoch": 0.20381369177868083, + "grad_norm": 0.1875, + "learning_rate": 0.00019875843438054864, + "loss": 1.7043, + "step": 652 + }, + { + "epoch": 0.20412628946545797, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001987545716173354, + "loss": 1.966, + "step": 653 + }, + { + "epoch": 0.20443888715223507, + "grad_norm": 0.19921875, + "learning_rate": 0.0001987507028921828, + "loss": 1.7629, + "step": 654 + }, + { + "epoch": 0.2047514848390122, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019874682820532444, + "loss": 1.766, + "step": 655 + }, + { + "epoch": 0.2050640825257893, + "grad_norm": 0.201171875, + "learning_rate": 0.00019874294755699423, + "loss": 1.6821, + "step": 656 + }, + { + "epoch": 0.20537668021256641, + "grad_norm": 0.21875, + "learning_rate": 0.00019873906094742644, + "loss": 1.806, + "step": 657 + }, + { + "epoch": 0.20568927789934355, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001987351683768557, + "loss": 1.8864, + "step": 658 + }, + { + "epoch": 0.20600187558612065, + "grad_norm": 0.19921875, + "learning_rate": 0.00019873126984551703, + "loss": 1.7406, + "step": 659 + }, + { + "epoch": 0.20631447327289779, + "grad_norm": 0.201171875, + "learning_rate": 0.0001987273653536458, + "loss": 1.7246, + "step": 660 + }, + { + "epoch": 0.2066270709596749, + "grad_norm": 0.1875, + "learning_rate": 0.00019872345490147772, + "loss": 1.9874, + "step": 661 + }, + { + "epoch": 0.20693966864645202, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019871953848924886, + "loss": 1.7792, + "step": 662 + }, + { + "epoch": 0.20725226633322913, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019871561611719564, + "loss": 1.8759, + "step": 663 + }, + { + "epoch": 0.20756486402000626, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019871168778555492, + "loss": 1.9906, + "step": 664 + }, + { + "epoch": 0.20787746170678337, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001987077534945638, + "loss": 1.8973, + "step": 665 + }, + { + "epoch": 0.2081900593935605, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019870381324445978, + "loss": 1.6312, + "step": 666 + }, + { + "epoch": 0.2085026570803376, + "grad_norm": 0.208984375, + "learning_rate": 0.0001986998670354808, + "loss": 1.8406, + "step": 667 + }, + { + "epoch": 0.2088152547671147, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001986959148678651, + "loss": 1.7828, + "step": 668 + }, + { + "epoch": 0.20912785245389184, + "grad_norm": 0.201171875, + "learning_rate": 0.00019869195674185122, + "loss": 1.9185, + "step": 669 + }, + { + "epoch": 0.20944045014066895, + "grad_norm": 0.201171875, + "learning_rate": 0.00019868799265767816, + "loss": 1.7588, + "step": 670 + }, + { + "epoch": 0.20975304782744608, + "grad_norm": 0.203125, + "learning_rate": 0.00019868402261558524, + "loss": 1.7387, + "step": 671 + }, + { + "epoch": 0.2100656455142232, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019868004661581208, + "loss": 1.6164, + "step": 672 + }, + { + "epoch": 0.21037824320100032, + "grad_norm": 0.19140625, + "learning_rate": 0.0001986760646585988, + "loss": 1.8667, + "step": 673 + }, + { + "epoch": 0.21069084088777743, + "grad_norm": 0.189453125, + "learning_rate": 0.00019867207674418568, + "loss": 1.9312, + "step": 674 + }, + { + "epoch": 0.21100343857455456, + "grad_norm": 0.19921875, + "learning_rate": 0.0001986680828728136, + "loss": 1.7665, + "step": 675 + }, + { + "epoch": 0.21131603626133166, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019866408304472364, + "loss": 1.6056, + "step": 676 + }, + { + "epoch": 0.2116286339481088, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019866007726015723, + "loss": 1.5752, + "step": 677 + }, + { + "epoch": 0.2119412316348859, + "grad_norm": 0.208984375, + "learning_rate": 0.00019865606551935626, + "loss": 1.8815, + "step": 678 + }, + { + "epoch": 0.212253829321663, + "grad_norm": 0.203125, + "learning_rate": 0.00019865204782256287, + "loss": 1.7828, + "step": 679 + }, + { + "epoch": 0.21256642700844014, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001986480241700196, + "loss": 1.9457, + "step": 680 + }, + { + "epoch": 0.21287902469521724, + "grad_norm": 0.1953125, + "learning_rate": 0.00019864399456196946, + "loss": 1.9523, + "step": 681 + }, + { + "epoch": 0.21319162238199438, + "grad_norm": 0.19140625, + "learning_rate": 0.00019863995899865565, + "loss": 1.5974, + "step": 682 + }, + { + "epoch": 0.21350422006877148, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019863591748032184, + "loss": 1.8886, + "step": 683 + }, + { + "epoch": 0.21381681775554862, + "grad_norm": 0.201171875, + "learning_rate": 0.00019863187000721197, + "loss": 1.8564, + "step": 684 + }, + { + "epoch": 0.21412941544232572, + "grad_norm": 0.203125, + "learning_rate": 0.00019862781657957045, + "loss": 1.8022, + "step": 685 + }, + { + "epoch": 0.21444201312910285, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019862375719764192, + "loss": 1.855, + "step": 686 + }, + { + "epoch": 0.21475461081587996, + "grad_norm": 0.208984375, + "learning_rate": 0.0001986196918616715, + "loss": 2.0019, + "step": 687 + }, + { + "epoch": 0.2150672085026571, + "grad_norm": 0.1953125, + "learning_rate": 0.00019861562057190462, + "loss": 1.8597, + "step": 688 + }, + { + "epoch": 0.2153798061894342, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019861154332858708, + "loss": 1.9685, + "step": 689 + }, + { + "epoch": 0.2156924038762113, + "grad_norm": 0.197265625, + "learning_rate": 0.00019860746013196495, + "loss": 1.8702, + "step": 690 + }, + { + "epoch": 0.21600500156298844, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019860337098228485, + "loss": 1.6556, + "step": 691 + }, + { + "epoch": 0.21631759924976554, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019859927587979358, + "loss": 2.0366, + "step": 692 + }, + { + "epoch": 0.21663019693654267, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019859517482473838, + "loss": 1.9303, + "step": 693 + }, + { + "epoch": 0.21694279462331978, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019859106781736682, + "loss": 1.6981, + "step": 694 + }, + { + "epoch": 0.2172553923100969, + "grad_norm": 0.205078125, + "learning_rate": 0.00019858695485792686, + "loss": 1.4825, + "step": 695 + }, + { + "epoch": 0.21756798999687402, + "grad_norm": 0.19921875, + "learning_rate": 0.0001985828359466668, + "loss": 1.779, + "step": 696 + }, + { + "epoch": 0.21788058768365115, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019857871108383532, + "loss": 1.7535, + "step": 697 + }, + { + "epoch": 0.21819318537042826, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019857458026968143, + "loss": 1.7039, + "step": 698 + }, + { + "epoch": 0.2185057830572054, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001985704435044545, + "loss": 1.7501, + "step": 699 + }, + { + "epoch": 0.2188183807439825, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001985663007884043, + "loss": 1.8791, + "step": 700 + }, + { + "epoch": 0.2191309784307596, + "grad_norm": 0.20703125, + "learning_rate": 0.00019856215212178094, + "loss": 1.904, + "step": 701 + }, + { + "epoch": 0.21944357611753673, + "grad_norm": 0.2109375, + "learning_rate": 0.00019855799750483484, + "loss": 1.4772, + "step": 702 + }, + { + "epoch": 0.21975617380431384, + "grad_norm": 0.3984375, + "learning_rate": 0.00019855383693781682, + "loss": 2.4316, + "step": 703 + }, + { + "epoch": 0.22006877149109097, + "grad_norm": 0.2109375, + "learning_rate": 0.0001985496704209781, + "loss": 1.6331, + "step": 704 + }, + { + "epoch": 0.22038136917786808, + "grad_norm": 0.22265625, + "learning_rate": 0.0001985454979545702, + "loss": 1.7665, + "step": 705 + }, + { + "epoch": 0.2206939668646452, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019854131953884495, + "loss": 1.9052, + "step": 706 + }, + { + "epoch": 0.2210065645514223, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019853713517405472, + "loss": 1.8316, + "step": 707 + }, + { + "epoch": 0.22131916223819945, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019853294486045208, + "loss": 1.6123, + "step": 708 + }, + { + "epoch": 0.22163175992497655, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019852874859828997, + "loss": 1.8111, + "step": 709 + }, + { + "epoch": 0.22194435761175368, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019852454638782176, + "loss": 1.8234, + "step": 710 + }, + { + "epoch": 0.2222569552985308, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019852033822930114, + "loss": 1.6664, + "step": 711 + }, + { + "epoch": 0.22256955298530792, + "grad_norm": 0.201171875, + "learning_rate": 0.00019851612412298214, + "loss": 1.9896, + "step": 712 + }, + { + "epoch": 0.22288215067208503, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001985119040691192, + "loss": 1.6152, + "step": 713 + }, + { + "epoch": 0.22319474835886213, + "grad_norm": 0.19140625, + "learning_rate": 0.00019850767806796707, + "loss": 2.165, + "step": 714 + }, + { + "epoch": 0.22350734604563927, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019850344611978087, + "loss": 2.1852, + "step": 715 + }, + { + "epoch": 0.22381994373241637, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019849920822481614, + "loss": 1.7914, + "step": 716 + }, + { + "epoch": 0.2241325414191935, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019849496438332866, + "loss": 2.0296, + "step": 717 + }, + { + "epoch": 0.2244451391059706, + "grad_norm": 0.201171875, + "learning_rate": 0.0001984907145955747, + "loss": 1.7981, + "step": 718 + }, + { + "epoch": 0.22475773679274774, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019848645886181074, + "loss": 1.7928, + "step": 719 + }, + { + "epoch": 0.22507033447952485, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019848219718229378, + "loss": 1.8671, + "step": 720 + }, + { + "epoch": 0.22538293216630198, + "grad_norm": 0.203125, + "learning_rate": 0.00019847792955728107, + "loss": 1.8564, + "step": 721 + }, + { + "epoch": 0.22569552985307909, + "grad_norm": 0.20703125, + "learning_rate": 0.0001984736559870303, + "loss": 1.6293, + "step": 722 + }, + { + "epoch": 0.22600812753985622, + "grad_norm": 0.201171875, + "learning_rate": 0.0001984693764717994, + "loss": 1.9545, + "step": 723 + }, + { + "epoch": 0.22632072522663332, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019846509101184679, + "loss": 1.8173, + "step": 724 + }, + { + "epoch": 0.22663332291341043, + "grad_norm": 0.197265625, + "learning_rate": 0.00019846079960743112, + "loss": 1.649, + "step": 725 + }, + { + "epoch": 0.22694592060018756, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019845650225881154, + "loss": 1.8916, + "step": 726 + }, + { + "epoch": 0.22725851828696467, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019845219896624743, + "loss": 1.7553, + "step": 727 + }, + { + "epoch": 0.2275711159737418, + "grad_norm": 0.19140625, + "learning_rate": 0.0001984478897299986, + "loss": 1.6849, + "step": 728 + }, + { + "epoch": 0.2278837136605189, + "grad_norm": 0.19921875, + "learning_rate": 0.00019844357455032526, + "loss": 1.8667, + "step": 729 + }, + { + "epoch": 0.22819631134729604, + "grad_norm": 0.203125, + "learning_rate": 0.00019843925342748783, + "loss": 1.651, + "step": 730 + }, + { + "epoch": 0.22850890903407314, + "grad_norm": 0.19921875, + "learning_rate": 0.00019843492636174728, + "loss": 1.6074, + "step": 731 + }, + { + "epoch": 0.22882150672085028, + "grad_norm": 0.189453125, + "learning_rate": 0.00019843059335336474, + "loss": 1.8431, + "step": 732 + }, + { + "epoch": 0.22913410440762738, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019842625440260188, + "loss": 1.7872, + "step": 733 + }, + { + "epoch": 0.22944670209440451, + "grad_norm": 0.197265625, + "learning_rate": 0.0001984219095097206, + "loss": 1.6808, + "step": 734 + }, + { + "epoch": 0.22975929978118162, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019841755867498322, + "loss": 1.7461, + "step": 735 + }, + { + "epoch": 0.23007189746795872, + "grad_norm": 0.2109375, + "learning_rate": 0.00019841320189865243, + "loss": 1.7838, + "step": 736 + }, + { + "epoch": 0.23038449515473586, + "grad_norm": 0.20703125, + "learning_rate": 0.0001984088391809912, + "loss": 1.9104, + "step": 737 + }, + { + "epoch": 0.23069709284151296, + "grad_norm": 0.197265625, + "learning_rate": 0.00019840447052226298, + "loss": 1.9961, + "step": 738 + }, + { + "epoch": 0.2310096905282901, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019840009592273143, + "loss": 1.987, + "step": 739 + }, + { + "epoch": 0.2313222882150672, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019839571538266072, + "loss": 1.6382, + "step": 740 + }, + { + "epoch": 0.23163488590184433, + "grad_norm": 0.19921875, + "learning_rate": 0.0001983913289023153, + "loss": 1.6738, + "step": 741 + }, + { + "epoch": 0.23194748358862144, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019838693648195995, + "loss": 1.8182, + "step": 742 + }, + { + "epoch": 0.23226008127539857, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019838253812185988, + "loss": 1.598, + "step": 743 + }, + { + "epoch": 0.23257267896217568, + "grad_norm": 0.19140625, + "learning_rate": 0.00019837813382228063, + "loss": 1.7465, + "step": 744 + }, + { + "epoch": 0.2328852766489528, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019837372358348806, + "loss": 1.8831, + "step": 745 + }, + { + "epoch": 0.23319787433572992, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019836930740574845, + "loss": 1.525, + "step": 746 + }, + { + "epoch": 0.23351047202250702, + "grad_norm": 0.84375, + "learning_rate": 0.00019836488528932836, + "loss": 3.4084, + "step": 747 + }, + { + "epoch": 0.23382306970928415, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019836045723449483, + "loss": 1.7993, + "step": 748 + }, + { + "epoch": 0.23413566739606126, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019835602324151514, + "loss": 1.8971, + "step": 749 + }, + { + "epoch": 0.2344482650828384, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019835158331065703, + "loss": 1.7286, + "step": 750 + }, + { + "epoch": 0.2347608627696155, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019834713744218844, + "loss": 1.6018, + "step": 751 + }, + { + "epoch": 0.23507346045639263, + "grad_norm": 0.2109375, + "learning_rate": 0.00019834268563637787, + "loss": 1.8705, + "step": 752 + }, + { + "epoch": 0.23538605814316974, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019833822789349409, + "loss": 1.8121, + "step": 753 + }, + { + "epoch": 0.23569865582994687, + "grad_norm": 0.1953125, + "learning_rate": 0.00019833376421380612, + "loss": 1.6886, + "step": 754 + }, + { + "epoch": 0.23601125351672397, + "grad_norm": 0.205078125, + "learning_rate": 0.00019832929459758352, + "loss": 1.6922, + "step": 755 + }, + { + "epoch": 0.2363238512035011, + "grad_norm": 0.208984375, + "learning_rate": 0.0001983248190450961, + "loss": 1.7953, + "step": 756 + }, + { + "epoch": 0.2366364488902782, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019832033755661405, + "loss": 1.7892, + "step": 757 + }, + { + "epoch": 0.23694904657705532, + "grad_norm": 0.208984375, + "learning_rate": 0.00019831585013240793, + "loss": 1.5738, + "step": 758 + }, + { + "epoch": 0.23726164426383245, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001983113567727487, + "loss": 1.6719, + "step": 759 + }, + { + "epoch": 0.23757424195060955, + "grad_norm": 0.203125, + "learning_rate": 0.00019830685747790748, + "loss": 1.9564, + "step": 760 + }, + { + "epoch": 0.2378868396373867, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001983023522481561, + "loss": 2.1432, + "step": 761 + }, + { + "epoch": 0.2381994373241638, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001982978410837664, + "loss": 1.8179, + "step": 762 + }, + { + "epoch": 0.23851203501094093, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001982933239850108, + "loss": 1.9499, + "step": 763 + }, + { + "epoch": 0.23882463269771803, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019828880095216193, + "loss": 1.5989, + "step": 764 + }, + { + "epoch": 0.23913723038449516, + "grad_norm": 0.2109375, + "learning_rate": 0.00019828427198549293, + "loss": 1.8161, + "step": 765 + }, + { + "epoch": 0.23944982807127227, + "grad_norm": 0.208984375, + "learning_rate": 0.0001982797370852772, + "loss": 1.9312, + "step": 766 + }, + { + "epoch": 0.2397624257580494, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019827519625178846, + "loss": 1.7275, + "step": 767 + }, + { + "epoch": 0.2400750234448265, + "grad_norm": 0.203125, + "learning_rate": 0.0001982706494853009, + "loss": 2.0002, + "step": 768 + }, + { + "epoch": 0.2403876211316036, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019826609678608902, + "loss": 1.8021, + "step": 769 + }, + { + "epoch": 0.24070021881838075, + "grad_norm": 0.205078125, + "learning_rate": 0.00019826153815442763, + "loss": 1.5546, + "step": 770 + }, + { + "epoch": 0.24101281650515785, + "grad_norm": 0.212890625, + "learning_rate": 0.000198256973590592, + "loss": 1.5848, + "step": 771 + }, + { + "epoch": 0.24132541419193498, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019825240309485765, + "loss": 1.7763, + "step": 772 + }, + { + "epoch": 0.2416380118787121, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001982478266675005, + "loss": 1.6906, + "step": 773 + }, + { + "epoch": 0.24195060956548922, + "grad_norm": 0.21484375, + "learning_rate": 0.00019824324430879687, + "loss": 1.9644, + "step": 774 + }, + { + "epoch": 0.24226320725226633, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019823865601902341, + "loss": 1.9122, + "step": 775 + }, + { + "epoch": 0.24257580493904346, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019823406179845707, + "loss": 1.9017, + "step": 776 + }, + { + "epoch": 0.24288840262582057, + "grad_norm": 0.21875, + "learning_rate": 0.00019822946164737526, + "loss": 1.8361, + "step": 777 + }, + { + "epoch": 0.2432010003125977, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019822485556605566, + "loss": 1.7349, + "step": 778 + }, + { + "epoch": 0.2435135979993748, + "grad_norm": 0.212890625, + "learning_rate": 0.00019822024355477637, + "loss": 1.6017, + "step": 779 + }, + { + "epoch": 0.2438261956861519, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001982156256138158, + "loss": 1.8296, + "step": 780 + }, + { + "epoch": 0.24413879337292904, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019821100174345277, + "loss": 1.6754, + "step": 781 + }, + { + "epoch": 0.24445139105970615, + "grad_norm": 0.216796875, + "learning_rate": 0.0001982063719439664, + "loss": 2.0037, + "step": 782 + }, + { + "epoch": 0.24476398874648328, + "grad_norm": 0.2109375, + "learning_rate": 0.00019820173621563623, + "loss": 1.887, + "step": 783 + }, + { + "epoch": 0.24507658643326038, + "grad_norm": 0.19140625, + "learning_rate": 0.0001981970945587421, + "loss": 1.5708, + "step": 784 + }, + { + "epoch": 0.24538918412003752, + "grad_norm": 0.68359375, + "learning_rate": 0.0001981924469735642, + "loss": 2.3282, + "step": 785 + }, + { + "epoch": 0.24570178180681462, + "grad_norm": 0.203125, + "learning_rate": 0.00019818779346038318, + "loss": 1.7515, + "step": 786 + }, + { + "epoch": 0.24601437949359176, + "grad_norm": 0.20703125, + "learning_rate": 0.00019818313401947997, + "loss": 1.7623, + "step": 787 + }, + { + "epoch": 0.24632697718036886, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019817846865113577, + "loss": 1.8036, + "step": 788 + }, + { + "epoch": 0.246639574867146, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001981737973556324, + "loss": 1.8455, + "step": 789 + }, + { + "epoch": 0.2469521725539231, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001981691201332517, + "loss": 1.7791, + "step": 790 + }, + { + "epoch": 0.24726477024070023, + "grad_norm": 0.205078125, + "learning_rate": 0.00019816443698427615, + "loss": 2.0416, + "step": 791 + }, + { + "epoch": 0.24757736792747734, + "grad_norm": 0.2265625, + "learning_rate": 0.00019815974790898846, + "loss": 2.2271, + "step": 792 + }, + { + "epoch": 0.24788996561425444, + "grad_norm": 0.203125, + "learning_rate": 0.00019815505290767172, + "loss": 1.5433, + "step": 793 + }, + { + "epoch": 0.24820256330103158, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001981503519806093, + "loss": 1.7228, + "step": 794 + }, + { + "epoch": 0.24851516098780868, + "grad_norm": 0.20703125, + "learning_rate": 0.00019814564512808512, + "loss": 1.8217, + "step": 795 + }, + { + "epoch": 0.2488277586745858, + "grad_norm": 0.203125, + "learning_rate": 0.00019814093235038323, + "loss": 1.8205, + "step": 796 + }, + { + "epoch": 0.24914035636136292, + "grad_norm": 0.2109375, + "learning_rate": 0.00019813621364778817, + "loss": 1.8541, + "step": 797 + }, + { + "epoch": 0.24945295404814005, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001981314890205849, + "loss": 1.8656, + "step": 798 + }, + { + "epoch": 0.24976555173491716, + "grad_norm": 0.2109375, + "learning_rate": 0.00019812675846905855, + "loss": 1.809, + "step": 799 + }, + { + "epoch": 0.25007814942169426, + "grad_norm": 0.20703125, + "learning_rate": 0.00019812202199349476, + "loss": 2.0585, + "step": 800 + }, + { + "epoch": 0.2503907471084714, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019811727959417945, + "loss": 1.9492, + "step": 801 + }, + { + "epoch": 0.25070334479524853, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019811253127139896, + "loss": 1.8192, + "step": 802 + }, + { + "epoch": 0.25101594248202563, + "grad_norm": 0.2109375, + "learning_rate": 0.0001981077770254399, + "loss": 1.4981, + "step": 803 + }, + { + "epoch": 0.25132854016880274, + "grad_norm": 0.205078125, + "learning_rate": 0.00019810301685658935, + "loss": 1.8598, + "step": 804 + }, + { + "epoch": 0.25164113785557984, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019809825076513464, + "loss": 1.7946, + "step": 805 + }, + { + "epoch": 0.251953735542357, + "grad_norm": 0.23046875, + "learning_rate": 0.00019809347875136352, + "loss": 1.784, + "step": 806 + }, + { + "epoch": 0.2522663332291341, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019808870081556413, + "loss": 1.9401, + "step": 807 + }, + { + "epoch": 0.2525789309159112, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019808391695802483, + "loss": 2.0217, + "step": 808 + }, + { + "epoch": 0.2528915286026883, + "grad_norm": 0.20703125, + "learning_rate": 0.0001980791271790345, + "loss": 1.7354, + "step": 809 + }, + { + "epoch": 0.2532041262894655, + "grad_norm": 0.20703125, + "learning_rate": 0.00019807433147888225, + "loss": 2.1094, + "step": 810 + }, + { + "epoch": 0.2535167239762426, + "grad_norm": 0.8125, + "learning_rate": 0.00019806952985785764, + "loss": 2.8019, + "step": 811 + }, + { + "epoch": 0.2538293216630197, + "grad_norm": 0.193359375, + "learning_rate": 0.00019806472231625056, + "loss": 1.554, + "step": 812 + }, + { + "epoch": 0.2541419193497968, + "grad_norm": 0.1953125, + "learning_rate": 0.0001980599088543512, + "loss": 1.7158, + "step": 813 + }, + { + "epoch": 0.2544545170365739, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019805508947245021, + "loss": 1.934, + "step": 814 + }, + { + "epoch": 0.25476711472335106, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001980502641708385, + "loss": 2.0267, + "step": 815 + }, + { + "epoch": 0.25507971241012817, + "grad_norm": 0.20703125, + "learning_rate": 0.0001980454329498074, + "loss": 1.6819, + "step": 816 + }, + { + "epoch": 0.2553923100969053, + "grad_norm": 0.203125, + "learning_rate": 0.00019804059580964855, + "loss": 1.7279, + "step": 817 + }, + { + "epoch": 0.2557049077836824, + "grad_norm": 0.2109375, + "learning_rate": 0.00019803575275065404, + "loss": 1.6234, + "step": 818 + }, + { + "epoch": 0.25601750547045954, + "grad_norm": 0.205078125, + "learning_rate": 0.0001980309037731162, + "loss": 1.4631, + "step": 819 + }, + { + "epoch": 0.25633010315723664, + "grad_norm": 0.2109375, + "learning_rate": 0.00019802604887732774, + "loss": 1.7769, + "step": 820 + }, + { + "epoch": 0.25664270084401375, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019802118806358182, + "loss": 1.7928, + "step": 821 + }, + { + "epoch": 0.25695529853079085, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019801632133217189, + "loss": 1.639, + "step": 822 + }, + { + "epoch": 0.257267896217568, + "grad_norm": 0.212890625, + "learning_rate": 0.0001980114486833917, + "loss": 1.6918, + "step": 823 + }, + { + "epoch": 0.2575804939043451, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019800657011753548, + "loss": 1.8273, + "step": 824 + }, + { + "epoch": 0.2578930915911222, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001980016856348977, + "loss": 1.9625, + "step": 825 + }, + { + "epoch": 0.25820568927789933, + "grad_norm": 0.201171875, + "learning_rate": 0.00019799679523577332, + "loss": 1.741, + "step": 826 + }, + { + "epoch": 0.25851828696467644, + "grad_norm": 0.19921875, + "learning_rate": 0.00019799189892045748, + "loss": 2.0397, + "step": 827 + }, + { + "epoch": 0.2588308846514536, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019798699668924585, + "loss": 1.7246, + "step": 828 + }, + { + "epoch": 0.2591434823382307, + "grad_norm": 0.203125, + "learning_rate": 0.00019798208854243437, + "loss": 1.5622, + "step": 829 + }, + { + "epoch": 0.2594560800250078, + "grad_norm": 0.1953125, + "learning_rate": 0.00019797717448031936, + "loss": 1.4121, + "step": 830 + }, + { + "epoch": 0.2597686777117849, + "grad_norm": 0.20703125, + "learning_rate": 0.00019797225450319744, + "loss": 1.6693, + "step": 831 + }, + { + "epoch": 0.2600812753985621, + "grad_norm": 0.208984375, + "learning_rate": 0.0001979673286113657, + "loss": 1.6021, + "step": 832 + }, + { + "epoch": 0.2603938730853392, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001979623968051215, + "loss": 1.9199, + "step": 833 + }, + { + "epoch": 0.2607064707721163, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019795745908476254, + "loss": 2.0403, + "step": 834 + }, + { + "epoch": 0.2610190684588934, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019795251545058694, + "loss": 1.8294, + "step": 835 + }, + { + "epoch": 0.26133166614567055, + "grad_norm": 0.20703125, + "learning_rate": 0.00019794756590289317, + "loss": 1.8625, + "step": 836 + }, + { + "epoch": 0.26164426383244765, + "grad_norm": 0.21484375, + "learning_rate": 0.00019794261044198003, + "loss": 1.8086, + "step": 837 + }, + { + "epoch": 0.26195686151922476, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001979376490681467, + "loss": 1.6601, + "step": 838 + }, + { + "epoch": 0.26226945920600186, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019793268178169268, + "loss": 1.5396, + "step": 839 + }, + { + "epoch": 0.26258205689277897, + "grad_norm": 0.20703125, + "learning_rate": 0.00019792770858291788, + "loss": 1.7095, + "step": 840 + }, + { + "epoch": 0.26289465457955613, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019792272947212253, + "loss": 1.8782, + "step": 841 + }, + { + "epoch": 0.26320725226633324, + "grad_norm": 0.19921875, + "learning_rate": 0.00019791774444960717, + "loss": 1.9358, + "step": 842 + }, + { + "epoch": 0.26351984995311034, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019791275351567286, + "loss": 1.7342, + "step": 843 + }, + { + "epoch": 0.26383244763988745, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019790775667062081, + "loss": 1.7846, + "step": 844 + }, + { + "epoch": 0.2641450453266646, + "grad_norm": 0.201171875, + "learning_rate": 0.0001979027539147527, + "loss": 1.845, + "step": 845 + }, + { + "epoch": 0.2644576430134417, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001978977452483706, + "loss": 1.9035, + "step": 846 + }, + { + "epoch": 0.2647702407002188, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001978927306717769, + "loss": 1.5762, + "step": 847 + }, + { + "epoch": 0.2650828383869959, + "grad_norm": 0.1953125, + "learning_rate": 0.0001978877101852743, + "loss": 2.1721, + "step": 848 + }, + { + "epoch": 0.26539543607377303, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019788268378916586, + "loss": 1.7108, + "step": 849 + }, + { + "epoch": 0.2657080337605502, + "grad_norm": 1.0234375, + "learning_rate": 0.00019787765148375508, + "loss": 2.5699, + "step": 850 + }, + { + "epoch": 0.2660206314473273, + "grad_norm": 0.203125, + "learning_rate": 0.00019787261326934577, + "loss": 1.6568, + "step": 851 + }, + { + "epoch": 0.2663332291341044, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019786756914624208, + "loss": 1.8594, + "step": 852 + }, + { + "epoch": 0.2666458268208815, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019786251911474849, + "loss": 1.8597, + "step": 853 + }, + { + "epoch": 0.26695842450765866, + "grad_norm": 0.216796875, + "learning_rate": 0.00019785746317516994, + "loss": 2.0457, + "step": 854 + }, + { + "epoch": 0.26727102219443577, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019785240132781163, + "loss": 1.832, + "step": 855 + }, + { + "epoch": 0.2675836198812129, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019784733357297915, + "loss": 1.68, + "step": 856 + }, + { + "epoch": 0.26789621756799, + "grad_norm": 0.2109375, + "learning_rate": 0.00019784225991097848, + "loss": 1.8997, + "step": 857 + }, + { + "epoch": 0.26820881525476714, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019783718034211586, + "loss": 1.7594, + "step": 858 + }, + { + "epoch": 0.26852141294154425, + "grad_norm": 0.2138671875, + "learning_rate": 0.000197832094866698, + "loss": 1.7918, + "step": 859 + }, + { + "epoch": 0.26883401062832135, + "grad_norm": 0.203125, + "learning_rate": 0.00019782700348503193, + "loss": 1.6616, + "step": 860 + }, + { + "epoch": 0.26914660831509846, + "grad_norm": 0.201171875, + "learning_rate": 0.00019782190619742495, + "loss": 1.8357, + "step": 861 + }, + { + "epoch": 0.26945920600187556, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001978168030041849, + "loss": 1.74, + "step": 862 + }, + { + "epoch": 0.2697718036886527, + "grad_norm": 0.203125, + "learning_rate": 0.00019781169390561975, + "loss": 1.4934, + "step": 863 + }, + { + "epoch": 0.27008440137542983, + "grad_norm": 0.216796875, + "learning_rate": 0.000197806578902038, + "loss": 1.6285, + "step": 864 + }, + { + "epoch": 0.27039699906220693, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019780145799374848, + "loss": 1.5881, + "step": 865 + }, + { + "epoch": 0.27070959674898404, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019779633118106028, + "loss": 1.714, + "step": 866 + }, + { + "epoch": 0.2710221944357612, + "grad_norm": 0.2119140625, + "learning_rate": 0.000197791198464283, + "loss": 1.9303, + "step": 867 + }, + { + "epoch": 0.2713347921225383, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001977860598437264, + "loss": 1.6095, + "step": 868 + }, + { + "epoch": 0.2716473898093154, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019778091531970072, + "loss": 1.7565, + "step": 869 + }, + { + "epoch": 0.2719599874960925, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019777576489251664, + "loss": 1.5668, + "step": 870 + }, + { + "epoch": 0.2722725851828696, + "grad_norm": 0.20703125, + "learning_rate": 0.00019777060856248504, + "loss": 1.6762, + "step": 871 + }, + { + "epoch": 0.2725851828696468, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019776544632991717, + "loss": 1.7808, + "step": 872 + }, + { + "epoch": 0.2728977805564239, + "grad_norm": 0.203125, + "learning_rate": 0.00019776027819512474, + "loss": 1.8983, + "step": 873 + }, + { + "epoch": 0.273210378243201, + "grad_norm": 0.20703125, + "learning_rate": 0.00019775510415841977, + "loss": 1.837, + "step": 874 + }, + { + "epoch": 0.2735229759299781, + "grad_norm": 0.20703125, + "learning_rate": 0.00019774992422011452, + "loss": 1.7363, + "step": 875 + }, + { + "epoch": 0.27383557361675526, + "grad_norm": 0.2109375, + "learning_rate": 0.00019774473838052184, + "loss": 1.8509, + "step": 876 + }, + { + "epoch": 0.27414817130353236, + "grad_norm": 0.23046875, + "learning_rate": 0.00019773954663995476, + "loss": 1.8239, + "step": 877 + }, + { + "epoch": 0.27446076899030947, + "grad_norm": 0.205078125, + "learning_rate": 0.00019773434899872665, + "loss": 2.0633, + "step": 878 + }, + { + "epoch": 0.2747733666770866, + "grad_norm": 0.21484375, + "learning_rate": 0.00019772914545715135, + "loss": 2.0269, + "step": 879 + }, + { + "epoch": 0.27508596436386373, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019772393601554303, + "loss": 1.7389, + "step": 880 + }, + { + "epoch": 0.27539856205064084, + "grad_norm": 0.212890625, + "learning_rate": 0.00019771872067421615, + "loss": 2.0936, + "step": 881 + }, + { + "epoch": 0.27571115973741794, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019771349943348558, + "loss": 1.7132, + "step": 882 + }, + { + "epoch": 0.27602375742419505, + "grad_norm": 0.21484375, + "learning_rate": 0.00019770827229366654, + "loss": 1.6179, + "step": 883 + }, + { + "epoch": 0.27633635511097215, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019770303925507456, + "loss": 1.9907, + "step": 884 + }, + { + "epoch": 0.2766489527977493, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001976978003180256, + "loss": 1.5918, + "step": 885 + }, + { + "epoch": 0.2769615504845264, + "grad_norm": 0.2060546875, + "learning_rate": 0.000197692555482836, + "loss": 1.6069, + "step": 886 + }, + { + "epoch": 0.2772741481713035, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019768730474982227, + "loss": 1.9966, + "step": 887 + }, + { + "epoch": 0.27758674585808063, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019768204811930148, + "loss": 1.7923, + "step": 888 + }, + { + "epoch": 0.2778993435448578, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019767678559159098, + "loss": 1.6497, + "step": 889 + }, + { + "epoch": 0.2782119412316349, + "grad_norm": 0.2109375, + "learning_rate": 0.00019767151716700845, + "loss": 1.9629, + "step": 890 + }, + { + "epoch": 0.278524538918412, + "grad_norm": 0.20703125, + "learning_rate": 0.00019766624284587195, + "loss": 1.8348, + "step": 891 + }, + { + "epoch": 0.2788371366051891, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019766096262849994, + "loss": 1.8409, + "step": 892 + }, + { + "epoch": 0.27914973429196627, + "grad_norm": 0.208984375, + "learning_rate": 0.00019765567651521115, + "loss": 1.7796, + "step": 893 + }, + { + "epoch": 0.2794623319787434, + "grad_norm": 0.201171875, + "learning_rate": 0.00019765038450632476, + "loss": 1.9009, + "step": 894 + }, + { + "epoch": 0.2797749296655205, + "grad_norm": 0.203125, + "learning_rate": 0.00019764508660216019, + "loss": 1.4491, + "step": 895 + }, + { + "epoch": 0.2800875273522976, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001976397828030373, + "loss": 1.5436, + "step": 896 + }, + { + "epoch": 0.2804001250390747, + "grad_norm": 0.2109375, + "learning_rate": 0.0001976344731092763, + "loss": 1.8577, + "step": 897 + }, + { + "epoch": 0.28071272272585185, + "grad_norm": 0.3828125, + "learning_rate": 0.0001976291575211978, + "loss": 2.6341, + "step": 898 + }, + { + "epoch": 0.28102532041262895, + "grad_norm": 0.203125, + "learning_rate": 0.00019762383603912258, + "loss": 1.6624, + "step": 899 + }, + { + "epoch": 0.28133791809940606, + "grad_norm": 0.205078125, + "learning_rate": 0.000197618508663372, + "loss": 1.6193, + "step": 900 + }, + { + "epoch": 0.28165051578618316, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019761317539426765, + "loss": 1.6416, + "step": 901 + }, + { + "epoch": 0.2819631134729603, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019760783623213153, + "loss": 1.5813, + "step": 902 + }, + { + "epoch": 0.28227571115973743, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019760249117728592, + "loss": 2.1245, + "step": 903 + }, + { + "epoch": 0.28258830884651454, + "grad_norm": 0.236328125, + "learning_rate": 0.00019759714023005357, + "loss": 2.0305, + "step": 904 + }, + { + "epoch": 0.28290090653329164, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001975917833907575, + "loss": 1.4689, + "step": 905 + }, + { + "epoch": 0.28321350422006875, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019758642065972112, + "loss": 1.9306, + "step": 906 + }, + { + "epoch": 0.2835261019068459, + "grad_norm": 0.212890625, + "learning_rate": 0.0001975810520372681, + "loss": 1.8309, + "step": 907 + }, + { + "epoch": 0.283838699593623, + "grad_norm": 0.216796875, + "learning_rate": 0.0001975756775237227, + "loss": 1.732, + "step": 908 + }, + { + "epoch": 0.2841512972804001, + "grad_norm": 0.208984375, + "learning_rate": 0.00019757029711940923, + "loss": 1.5233, + "step": 909 + }, + { + "epoch": 0.2844638949671772, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019756491082465263, + "loss": 1.6491, + "step": 910 + }, + { + "epoch": 0.2847764926539544, + "grad_norm": 0.201171875, + "learning_rate": 0.00019755951863977805, + "loss": 2.2236, + "step": 911 + }, + { + "epoch": 0.2850890903407315, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019755412056511097, + "loss": 1.8299, + "step": 912 + }, + { + "epoch": 0.2854016880275086, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019754871660097734, + "loss": 1.5403, + "step": 913 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019754330674770339, + "loss": 1.5712, + "step": 914 + }, + { + "epoch": 0.28602688340106286, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019753789100561569, + "loss": 1.8814, + "step": 915 + }, + { + "epoch": 0.28633948108783996, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001975324693750412, + "loss": 1.7153, + "step": 916 + }, + { + "epoch": 0.28665207877461707, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001975270418563073, + "loss": 2.0221, + "step": 917 + }, + { + "epoch": 0.2869646764613942, + "grad_norm": 0.2109375, + "learning_rate": 0.00019752160844974158, + "loss": 1.7176, + "step": 918 + }, + { + "epoch": 0.2872772741481713, + "grad_norm": 0.208984375, + "learning_rate": 0.0001975161691556721, + "loss": 1.8581, + "step": 919 + }, + { + "epoch": 0.28758987183494844, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019751072397442715, + "loss": 1.8127, + "step": 920 + }, + { + "epoch": 0.28790246952172555, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001975052729063356, + "loss": 1.5183, + "step": 921 + }, + { + "epoch": 0.28821506720850265, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019749981595172647, + "loss": 1.7887, + "step": 922 + }, + { + "epoch": 0.28852766489527976, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019749435311092919, + "loss": 1.7053, + "step": 923 + }, + { + "epoch": 0.2888402625820569, + "grad_norm": 0.212890625, + "learning_rate": 0.00019748888438427358, + "loss": 1.7008, + "step": 924 + }, + { + "epoch": 0.289152860268834, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019748340977208977, + "loss": 1.888, + "step": 925 + }, + { + "epoch": 0.2894654579556111, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001974779292747083, + "loss": 1.708, + "step": 926 + }, + { + "epoch": 0.28977805564238823, + "grad_norm": 0.21484375, + "learning_rate": 0.00019747244289246006, + "loss": 1.8244, + "step": 927 + }, + { + "epoch": 0.29009065332916534, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001974669506256762, + "loss": 1.5614, + "step": 928 + }, + { + "epoch": 0.2904032510159425, + "grad_norm": 0.38671875, + "learning_rate": 0.00019746145247468832, + "loss": 2.2925, + "step": 929 + }, + { + "epoch": 0.2907158487027196, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019745594843982836, + "loss": 1.7933, + "step": 930 + }, + { + "epoch": 0.2910284463894967, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001974504385214286, + "loss": 1.8521, + "step": 931 + }, + { + "epoch": 0.2913410440762738, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019744492271982168, + "loss": 1.6939, + "step": 932 + }, + { + "epoch": 0.291653641763051, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019743940103534062, + "loss": 1.6783, + "step": 933 + }, + { + "epoch": 0.2919662394498281, + "grad_norm": 0.203125, + "learning_rate": 0.00019743387346831876, + "loss": 2.0204, + "step": 934 + }, + { + "epoch": 0.2922788371366052, + "grad_norm": 0.197265625, + "learning_rate": 0.00019742834001908977, + "loss": 1.7812, + "step": 935 + }, + { + "epoch": 0.2925914348233823, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019742280068798775, + "loss": 1.7483, + "step": 936 + }, + { + "epoch": 0.29290403251015945, + "grad_norm": 0.21484375, + "learning_rate": 0.00019741725547534712, + "loss": 1.8223, + "step": 937 + }, + { + "epoch": 0.29321663019693656, + "grad_norm": 0.20703125, + "learning_rate": 0.0001974117043815026, + "loss": 1.8306, + "step": 938 + }, + { + "epoch": 0.29352922788371366, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019740614740678937, + "loss": 1.9111, + "step": 939 + }, + { + "epoch": 0.29384182557049077, + "grad_norm": 0.224609375, + "learning_rate": 0.0001974005845515429, + "loss": 1.7384, + "step": 940 + }, + { + "epoch": 0.29415442325726787, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019739501581609903, + "loss": 1.7809, + "step": 941 + }, + { + "epoch": 0.29446702094404503, + "grad_norm": 0.212890625, + "learning_rate": 0.00019738944120079393, + "loss": 1.8266, + "step": 942 + }, + { + "epoch": 0.29477961863082214, + "grad_norm": 0.203125, + "learning_rate": 0.0001973838607059642, + "loss": 2.0459, + "step": 943 + }, + { + "epoch": 0.29509221631759924, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019737827433194665, + "loss": 1.7519, + "step": 944 + }, + { + "epoch": 0.29540481400437635, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001973726820790786, + "loss": 1.6264, + "step": 945 + }, + { + "epoch": 0.2957174116911535, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019736708394769764, + "loss": 1.6892, + "step": 946 + }, + { + "epoch": 0.2960300093779306, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019736147993814176, + "loss": 1.9491, + "step": 947 + }, + { + "epoch": 0.2963426070647077, + "grad_norm": 0.1953125, + "learning_rate": 0.00019735587005074927, + "loss": 1.7754, + "step": 948 + }, + { + "epoch": 0.2966552047514848, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019735025428585886, + "loss": 1.9126, + "step": 949 + }, + { + "epoch": 0.29696780243826193, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019734463264380953, + "loss": 2.071, + "step": 950 + }, + { + "epoch": 0.2972804001250391, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001973390051249407, + "loss": 1.6336, + "step": 951 + }, + { + "epoch": 0.2975929978118162, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019733337172959204, + "loss": 1.4598, + "step": 952 + }, + { + "epoch": 0.2979055954985933, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001973277324581037, + "loss": 1.5984, + "step": 953 + }, + { + "epoch": 0.2982181931853704, + "grad_norm": 0.21875, + "learning_rate": 0.00019732208731081615, + "loss": 1.9082, + "step": 954 + }, + { + "epoch": 0.29853079087214757, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019731643628807018, + "loss": 1.6075, + "step": 955 + }, + { + "epoch": 0.29884338855892467, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019731077939020693, + "loss": 1.9933, + "step": 956 + }, + { + "epoch": 0.2991559862457018, + "grad_norm": 0.20703125, + "learning_rate": 0.00019730511661756792, + "loss": 1.5719, + "step": 957 + }, + { + "epoch": 0.2994685839324789, + "grad_norm": 0.20703125, + "learning_rate": 0.00019729944797049502, + "loss": 1.6318, + "step": 958 + }, + { + "epoch": 0.29978118161925604, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019729377344933043, + "loss": 1.8574, + "step": 959 + }, + { + "epoch": 0.30009377930603315, + "grad_norm": 0.208984375, + "learning_rate": 0.0001972880930544168, + "loss": 1.9144, + "step": 960 + }, + { + "epoch": 0.30040637699281025, + "grad_norm": 0.2001953125, + "learning_rate": 0.000197282406786097, + "loss": 1.7335, + "step": 961 + }, + { + "epoch": 0.30071897467958736, + "grad_norm": 0.203125, + "learning_rate": 0.00019727671464471436, + "loss": 1.7289, + "step": 962 + }, + { + "epoch": 0.30103157236636446, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019727101663061247, + "loss": 1.994, + "step": 963 + }, + { + "epoch": 0.3013441700531416, + "grad_norm": 0.205078125, + "learning_rate": 0.00019726531274413532, + "loss": 1.7233, + "step": 964 + }, + { + "epoch": 0.30165676773991873, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019725960298562733, + "loss": 1.8961, + "step": 965 + }, + { + "epoch": 0.30196936542669583, + "grad_norm": 0.21484375, + "learning_rate": 0.00019725388735543318, + "loss": 1.6978, + "step": 966 + }, + { + "epoch": 0.30228196311347294, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001972481658538979, + "loss": 1.752, + "step": 967 + }, + { + "epoch": 0.3025945608002501, + "grad_norm": 0.205078125, + "learning_rate": 0.00019724243848136692, + "loss": 2.0531, + "step": 968 + }, + { + "epoch": 0.3029071584870272, + "grad_norm": 0.208984375, + "learning_rate": 0.000197236705238186, + "loss": 1.7117, + "step": 969 + }, + { + "epoch": 0.3032197561738043, + "grad_norm": 0.20703125, + "learning_rate": 0.00019723096612470133, + "loss": 1.5911, + "step": 970 + }, + { + "epoch": 0.3035323538605814, + "grad_norm": 0.20703125, + "learning_rate": 0.00019722522114125929, + "loss": 1.8811, + "step": 971 + }, + { + "epoch": 0.3038449515473586, + "grad_norm": 0.22265625, + "learning_rate": 0.00019721947028820676, + "loss": 1.6444, + "step": 972 + }, + { + "epoch": 0.3041575492341357, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001972137135658909, + "loss": 1.5187, + "step": 973 + }, + { + "epoch": 0.3044701469209128, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001972079509746593, + "loss": 1.6957, + "step": 974 + }, + { + "epoch": 0.3047827446076899, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019720218251485983, + "loss": 1.5887, + "step": 975 + }, + { + "epoch": 0.305095342294467, + "grad_norm": 0.216796875, + "learning_rate": 0.0001971964081868407, + "loss": 1.7837, + "step": 976 + }, + { + "epoch": 0.30540793998124416, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001971906279909506, + "loss": 1.8848, + "step": 977 + }, + { + "epoch": 0.30572053766802126, + "grad_norm": 0.224609375, + "learning_rate": 0.0001971848419275384, + "loss": 1.8966, + "step": 978 + }, + { + "epoch": 0.30603313535479837, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019717904999695348, + "loss": 1.6581, + "step": 979 + }, + { + "epoch": 0.3063457330415755, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019717325219954543, + "loss": 1.6071, + "step": 980 + }, + { + "epoch": 0.30665833072835263, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019716744853566436, + "loss": 1.8169, + "step": 981 + }, + { + "epoch": 0.30697092841512974, + "grad_norm": 0.197265625, + "learning_rate": 0.0001971616390056606, + "loss": 1.6017, + "step": 982 + }, + { + "epoch": 0.30728352610190685, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019715582360988482, + "loss": 1.6999, + "step": 983 + }, + { + "epoch": 0.30759612378868395, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019715000234868821, + "loss": 1.7758, + "step": 984 + }, + { + "epoch": 0.30790872147546106, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019714417522242214, + "loss": 1.9776, + "step": 985 + }, + { + "epoch": 0.3082213191622382, + "grad_norm": 0.2265625, + "learning_rate": 0.00019713834223143844, + "loss": 1.7776, + "step": 986 + }, + { + "epoch": 0.3085339168490153, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019713250337608922, + "loss": 1.8847, + "step": 987 + }, + { + "epoch": 0.3088465145357924, + "grad_norm": 0.2109375, + "learning_rate": 0.000197126658656727, + "loss": 1.8091, + "step": 988 + }, + { + "epoch": 0.30915911222256953, + "grad_norm": 0.212890625, + "learning_rate": 0.00019712080807370464, + "loss": 1.804, + "step": 989 + }, + { + "epoch": 0.3094717099093467, + "grad_norm": 0.22265625, + "learning_rate": 0.00019711495162737529, + "loss": 1.782, + "step": 990 + }, + { + "epoch": 0.3097843075961238, + "grad_norm": 0.201171875, + "learning_rate": 0.0001971090893180926, + "loss": 1.5211, + "step": 991 + }, + { + "epoch": 0.3100969052829009, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001971032211462104, + "loss": 1.4168, + "step": 992 + }, + { + "epoch": 0.310409502969678, + "grad_norm": 0.212890625, + "learning_rate": 0.00019709734711208303, + "loss": 1.5656, + "step": 993 + }, + { + "epoch": 0.31072210065645517, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019709146721606509, + "loss": 1.818, + "step": 994 + }, + { + "epoch": 0.3110346983432323, + "grad_norm": 0.205078125, + "learning_rate": 0.00019708558145851152, + "loss": 1.7158, + "step": 995 + }, + { + "epoch": 0.3113472960300094, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001970796898397777, + "loss": 1.6944, + "step": 996 + }, + { + "epoch": 0.3116598937167865, + "grad_norm": 0.2109375, + "learning_rate": 0.0001970737923602193, + "loss": 1.7961, + "step": 997 + }, + { + "epoch": 0.3119724914035636, + "grad_norm": 0.201171875, + "learning_rate": 0.00019706788902019233, + "loss": 1.8871, + "step": 998 + }, + { + "epoch": 0.31228508909034075, + "grad_norm": 0.205078125, + "learning_rate": 0.00019706197982005322, + "loss": 1.8513, + "step": 999 + }, + { + "epoch": 0.31259768677711786, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001970560647601587, + "loss": 1.6529, + "step": 1000 + }, + { + "epoch": 0.31291028446389496, + "grad_norm": 0.21875, + "learning_rate": 0.0001970501438408659, + "loss": 1.7564, + "step": 1001 + }, + { + "epoch": 0.31322288215067207, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001970442170625322, + "loss": 1.5718, + "step": 1002 + }, + { + "epoch": 0.3135354798374492, + "grad_norm": 0.201171875, + "learning_rate": 0.00019703828442551547, + "loss": 1.9791, + "step": 1003 + }, + { + "epoch": 0.31384807752422633, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019703234593017386, + "loss": 1.5583, + "step": 1004 + }, + { + "epoch": 0.31416067521100344, + "grad_norm": 0.19921875, + "learning_rate": 0.00019702640157686586, + "loss": 1.8005, + "step": 1005 + }, + { + "epoch": 0.31447327289778054, + "grad_norm": 0.216796875, + "learning_rate": 0.00019702045136595032, + "loss": 2.0622, + "step": 1006 + }, + { + "epoch": 0.31478587058455765, + "grad_norm": 0.19921875, + "learning_rate": 0.00019701449529778656, + "loss": 1.6313, + "step": 1007 + }, + { + "epoch": 0.3150984682713348, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019700853337273406, + "loss": 1.7088, + "step": 1008 + }, + { + "epoch": 0.3154110659581119, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001970025655911528, + "loss": 1.7942, + "step": 1009 + }, + { + "epoch": 0.315723663644889, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019699659195340303, + "loss": 1.8139, + "step": 1010 + }, + { + "epoch": 0.3160362613316661, + "grad_norm": 0.212890625, + "learning_rate": 0.0001969906124598454, + "loss": 1.6704, + "step": 1011 + }, + { + "epoch": 0.3163488590184433, + "grad_norm": 0.2109375, + "learning_rate": 0.00019698462711084091, + "loss": 1.9731, + "step": 1012 + }, + { + "epoch": 0.3166614567052204, + "grad_norm": 0.2109375, + "learning_rate": 0.00019697863590675086, + "loss": 1.6923, + "step": 1013 + }, + { + "epoch": 0.3169740543919975, + "grad_norm": 0.21484375, + "learning_rate": 0.00019697263884793702, + "loss": 1.8974, + "step": 1014 + }, + { + "epoch": 0.3172866520787746, + "grad_norm": 0.212890625, + "learning_rate": 0.0001969666359347614, + "loss": 2.0298, + "step": 1015 + }, + { + "epoch": 0.31759924976555176, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019696062716758638, + "loss": 1.6155, + "step": 1016 + }, + { + "epoch": 0.31791184745232887, + "grad_norm": 0.212890625, + "learning_rate": 0.00019695461254677475, + "loss": 1.6622, + "step": 1017 + }, + { + "epoch": 0.31822444513910597, + "grad_norm": 0.201171875, + "learning_rate": 0.00019694859207268958, + "loss": 2.0245, + "step": 1018 + }, + { + "epoch": 0.3185370428258831, + "grad_norm": 0.205078125, + "learning_rate": 0.0001969425657456944, + "loss": 1.7654, + "step": 1019 + }, + { + "epoch": 0.3188496405126602, + "grad_norm": 0.203125, + "learning_rate": 0.00019693653356615297, + "loss": 1.6629, + "step": 1020 + }, + { + "epoch": 0.31916223819943734, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019693049553442952, + "loss": 1.7823, + "step": 1021 + }, + { + "epoch": 0.31947483588621445, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001969244516508885, + "loss": 1.5993, + "step": 1022 + }, + { + "epoch": 0.31978743357299155, + "grad_norm": 0.2109375, + "learning_rate": 0.0001969184019158948, + "loss": 1.7385, + "step": 1023 + }, + { + "epoch": 0.32010003125976866, + "grad_norm": 0.220703125, + "learning_rate": 0.00019691234632981372, + "loss": 2.0781, + "step": 1024 + }, + { + "epoch": 0.3204126289465458, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019690628489301077, + "loss": 1.6396, + "step": 1025 + }, + { + "epoch": 0.3207252266333229, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019690021760585192, + "loss": 1.7066, + "step": 1026 + }, + { + "epoch": 0.32103782432010003, + "grad_norm": 0.216796875, + "learning_rate": 0.00019689414446870344, + "loss": 1.6741, + "step": 1027 + }, + { + "epoch": 0.32135042200687713, + "grad_norm": 0.2109375, + "learning_rate": 0.000196888065481932, + "loss": 1.8628, + "step": 1028 + }, + { + "epoch": 0.32166301969365424, + "grad_norm": 0.234375, + "learning_rate": 0.00019688198064590458, + "loss": 1.8129, + "step": 1029 + }, + { + "epoch": 0.3219756173804314, + "grad_norm": 0.203125, + "learning_rate": 0.00019687588996098853, + "loss": 1.9068, + "step": 1030 + }, + { + "epoch": 0.3222882150672085, + "grad_norm": 0.2109375, + "learning_rate": 0.00019686979342755154, + "loss": 1.8664, + "step": 1031 + }, + { + "epoch": 0.3226008127539856, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001968636910459617, + "loss": 1.7239, + "step": 1032 + }, + { + "epoch": 0.3229134104407627, + "grad_norm": 0.201171875, + "learning_rate": 0.00019685758281658738, + "loss": 1.9294, + "step": 1033 + }, + { + "epoch": 0.3232260081275399, + "grad_norm": 0.20703125, + "learning_rate": 0.00019685146873979736, + "loss": 1.7469, + "step": 1034 + }, + { + "epoch": 0.323538605814317, + "grad_norm": 0.208984375, + "learning_rate": 0.00019684534881596078, + "loss": 1.8425, + "step": 1035 + }, + { + "epoch": 0.3238512035010941, + "grad_norm": 0.208984375, + "learning_rate": 0.00019683922304544705, + "loss": 1.5658, + "step": 1036 + }, + { + "epoch": 0.3241638011878712, + "grad_norm": 0.20703125, + "learning_rate": 0.000196833091428626, + "loss": 1.7025, + "step": 1037 + }, + { + "epoch": 0.32447639887464835, + "grad_norm": 0.20703125, + "learning_rate": 0.00019682695396586785, + "loss": 1.7166, + "step": 1038 + }, + { + "epoch": 0.32478899656142546, + "grad_norm": 0.220703125, + "learning_rate": 0.00019682081065754313, + "loss": 1.8159, + "step": 1039 + }, + { + "epoch": 0.32510159424820256, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019681466150402266, + "loss": 1.7957, + "step": 1040 + }, + { + "epoch": 0.32541419193497967, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001968085065056777, + "loss": 1.6375, + "step": 1041 + }, + { + "epoch": 0.3257267896217568, + "grad_norm": 0.21484375, + "learning_rate": 0.00019680234566287985, + "loss": 2.1855, + "step": 1042 + }, + { + "epoch": 0.32603938730853393, + "grad_norm": 0.20703125, + "learning_rate": 0.00019679617897600102, + "loss": 1.8348, + "step": 1043 + }, + { + "epoch": 0.32635198499531104, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019679000644541356, + "loss": 1.6444, + "step": 1044 + }, + { + "epoch": 0.32666458268208814, + "grad_norm": 0.205078125, + "learning_rate": 0.00019678382807149003, + "loss": 1.8918, + "step": 1045 + }, + { + "epoch": 0.32697718036886525, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019677764385460348, + "loss": 1.6544, + "step": 1046 + }, + { + "epoch": 0.3272897780556424, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019677145379512723, + "loss": 1.8734, + "step": 1047 + }, + { + "epoch": 0.3276023757424195, + "grad_norm": 0.208984375, + "learning_rate": 0.00019676525789343502, + "loss": 1.8792, + "step": 1048 + }, + { + "epoch": 0.3279149734291966, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019675905614990085, + "loss": 1.8914, + "step": 1049 + }, + { + "epoch": 0.3282275711159737, + "grad_norm": 0.20703125, + "learning_rate": 0.0001967528485648992, + "loss": 1.6186, + "step": 1050 + }, + { + "epoch": 0.3285401688027509, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019674663513880475, + "loss": 1.7937, + "step": 1051 + }, + { + "epoch": 0.328852766489528, + "grad_norm": 0.203125, + "learning_rate": 0.00019674041587199268, + "loss": 1.7155, + "step": 1052 + }, + { + "epoch": 0.3291653641763051, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001967341907648384, + "loss": 1.8787, + "step": 1053 + }, + { + "epoch": 0.3294779618630822, + "grad_norm": 0.220703125, + "learning_rate": 0.00019672795981771777, + "loss": 1.6195, + "step": 1054 + }, + { + "epoch": 0.3297905595498593, + "grad_norm": 0.203125, + "learning_rate": 0.00019672172303100696, + "loss": 1.9987, + "step": 1055 + }, + { + "epoch": 0.33010315723663647, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019671548040508244, + "loss": 1.6107, + "step": 1056 + }, + { + "epoch": 0.3304157549234136, + "grad_norm": 0.20703125, + "learning_rate": 0.00019670923194032116, + "loss": 1.6394, + "step": 1057 + }, + { + "epoch": 0.3307283526101907, + "grad_norm": 0.19921875, + "learning_rate": 0.00019670297763710028, + "loss": 1.7142, + "step": 1058 + }, + { + "epoch": 0.3310409502969678, + "grad_norm": 0.205078125, + "learning_rate": 0.00019669671749579742, + "loss": 1.8344, + "step": 1059 + }, + { + "epoch": 0.33135354798374494, + "grad_norm": 0.212890625, + "learning_rate": 0.0001966904515167905, + "loss": 1.933, + "step": 1060 + }, + { + "epoch": 0.33166614567052205, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001966841797004578, + "loss": 1.763, + "step": 1061 + }, + { + "epoch": 0.33197874335729916, + "grad_norm": 0.2041015625, + "learning_rate": 0.000196677902047178, + "loss": 1.8741, + "step": 1062 + }, + { + "epoch": 0.33229134104407626, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019667161855733002, + "loss": 1.8624, + "step": 1063 + }, + { + "epoch": 0.33260393873085337, + "grad_norm": 0.216796875, + "learning_rate": 0.00019666532923129327, + "loss": 1.899, + "step": 1064 + }, + { + "epoch": 0.3329165364176305, + "grad_norm": 0.30078125, + "learning_rate": 0.00019665903406944737, + "loss": 2.3084, + "step": 1065 + }, + { + "epoch": 0.33322913410440763, + "grad_norm": 0.197265625, + "learning_rate": 0.00019665273307217245, + "loss": 1.6737, + "step": 1066 + }, + { + "epoch": 0.33354173179118474, + "grad_norm": 0.216796875, + "learning_rate": 0.00019664642623984886, + "loss": 1.6899, + "step": 1067 + }, + { + "epoch": 0.33385432947796184, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019664011357285735, + "loss": 1.8702, + "step": 1068 + }, + { + "epoch": 0.334166927164739, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019663379507157903, + "loss": 2.0766, + "step": 1069 + }, + { + "epoch": 0.3344795248515161, + "grad_norm": 0.1953125, + "learning_rate": 0.00019662747073639537, + "loss": 1.9336, + "step": 1070 + }, + { + "epoch": 0.3347921225382932, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019662114056768815, + "loss": 1.8872, + "step": 1071 + }, + { + "epoch": 0.3351047202250703, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019661480456583958, + "loss": 1.7719, + "step": 1072 + }, + { + "epoch": 0.3354173179118475, + "grad_norm": 0.220703125, + "learning_rate": 0.00019660846273123213, + "loss": 1.695, + "step": 1073 + }, + { + "epoch": 0.3357299155986246, + "grad_norm": 0.208984375, + "learning_rate": 0.00019660211506424867, + "loss": 1.8269, + "step": 1074 + }, + { + "epoch": 0.3360425132854017, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001965957615652724, + "loss": 1.8746, + "step": 1075 + }, + { + "epoch": 0.3363551109721788, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019658940223468693, + "loss": 1.5041, + "step": 1076 + }, + { + "epoch": 0.3366677086589559, + "grad_norm": 0.22265625, + "learning_rate": 0.00019658303707287617, + "loss": 1.8079, + "step": 1077 + }, + { + "epoch": 0.33698030634573306, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019657666608022438, + "loss": 1.7644, + "step": 1078 + }, + { + "epoch": 0.33729290403251017, + "grad_norm": 0.212890625, + "learning_rate": 0.00019657028925711617, + "loss": 1.759, + "step": 1079 + }, + { + "epoch": 0.33760550171928727, + "grad_norm": 0.220703125, + "learning_rate": 0.00019656390660393659, + "loss": 1.9192, + "step": 1080 + }, + { + "epoch": 0.3379180994060644, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019655751812107085, + "loss": 1.9153, + "step": 1081 + }, + { + "epoch": 0.33823069709284154, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019655112380890475, + "loss": 1.688, + "step": 1082 + }, + { + "epoch": 0.33854329477961864, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019654472366782425, + "loss": 1.907, + "step": 1083 + }, + { + "epoch": 0.33885589246639575, + "grad_norm": 0.212890625, + "learning_rate": 0.00019653831769821575, + "loss": 1.9453, + "step": 1084 + }, + { + "epoch": 0.33916849015317285, + "grad_norm": 0.2099609375, + "learning_rate": 0.000196531905900466, + "loss": 1.6311, + "step": 1085 + }, + { + "epoch": 0.33948108783994996, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019652548827496207, + "loss": 1.9493, + "step": 1086 + }, + { + "epoch": 0.3397936855267271, + "grad_norm": 0.208984375, + "learning_rate": 0.0001965190648220914, + "loss": 1.8175, + "step": 1087 + }, + { + "epoch": 0.3401062832135042, + "grad_norm": 0.19921875, + "learning_rate": 0.0001965126355422418, + "loss": 1.8018, + "step": 1088 + }, + { + "epoch": 0.34041888090028133, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001965062004358014, + "loss": 1.6674, + "step": 1089 + }, + { + "epoch": 0.34073147858705843, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001964997595031587, + "loss": 1.9538, + "step": 1090 + }, + { + "epoch": 0.3410440762738356, + "grad_norm": 0.203125, + "learning_rate": 0.00019649331274470256, + "loss": 1.8417, + "step": 1091 + }, + { + "epoch": 0.3413566739606127, + "grad_norm": 0.21484375, + "learning_rate": 0.00019648686016082216, + "loss": 2.0019, + "step": 1092 + }, + { + "epoch": 0.3416692716473898, + "grad_norm": 0.21484375, + "learning_rate": 0.00019648040175190707, + "loss": 1.7955, + "step": 1093 + }, + { + "epoch": 0.3419818693341669, + "grad_norm": 0.22265625, + "learning_rate": 0.00019647393751834718, + "loss": 1.6747, + "step": 1094 + }, + { + "epoch": 0.34229446702094407, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019646746746053274, + "loss": 1.7818, + "step": 1095 + }, + { + "epoch": 0.3426070647077212, + "grad_norm": 0.20703125, + "learning_rate": 0.00019646099157885437, + "loss": 1.7983, + "step": 1096 + }, + { + "epoch": 0.3429196623944983, + "grad_norm": 0.2265625, + "learning_rate": 0.00019645450987370298, + "loss": 1.677, + "step": 1097 + }, + { + "epoch": 0.3432322600812754, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019644802234546993, + "loss": 1.9241, + "step": 1098 + }, + { + "epoch": 0.3435448577680525, + "grad_norm": 0.212890625, + "learning_rate": 0.0001964415289945469, + "loss": 1.9008, + "step": 1099 + }, + { + "epoch": 0.34385745545482965, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019643502982132581, + "loss": 1.6438, + "step": 1100 + }, + { + "epoch": 0.34417005314160676, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001964285248261991, + "loss": 1.7665, + "step": 1101 + }, + { + "epoch": 0.34448265082838386, + "grad_norm": 0.2109375, + "learning_rate": 0.0001964220140095595, + "loss": 1.7259, + "step": 1102 + }, + { + "epoch": 0.34479524851516097, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019641549737180002, + "loss": 1.7119, + "step": 1103 + }, + { + "epoch": 0.34510784620193813, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019640897491331408, + "loss": 1.6551, + "step": 1104 + }, + { + "epoch": 0.34542044388871523, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001964024466344955, + "loss": 1.9882, + "step": 1105 + }, + { + "epoch": 0.34573304157549234, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019639591253573836, + "loss": 1.7573, + "step": 1106 + }, + { + "epoch": 0.34604563926226944, + "grad_norm": 0.208984375, + "learning_rate": 0.00019638937261743714, + "loss": 1.6814, + "step": 1107 + }, + { + "epoch": 0.34635823694904655, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019638282687998667, + "loss": 1.943, + "step": 1108 + }, + { + "epoch": 0.3466708346358237, + "grad_norm": 0.220703125, + "learning_rate": 0.00019637627532378212, + "loss": 1.6896, + "step": 1109 + }, + { + "epoch": 0.3469834323226008, + "grad_norm": 0.2197265625, + "learning_rate": 0.000196369717949219, + "loss": 1.8984, + "step": 1110 + }, + { + "epoch": 0.3472960300093779, + "grad_norm": 0.201171875, + "learning_rate": 0.00019636315475669324, + "loss": 1.4845, + "step": 1111 + }, + { + "epoch": 0.347608627696155, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019635658574660098, + "loss": 1.7234, + "step": 1112 + }, + { + "epoch": 0.3479212253829322, + "grad_norm": 0.201171875, + "learning_rate": 0.0001963500109193389, + "loss": 1.5583, + "step": 1113 + }, + { + "epoch": 0.3482338230697093, + "grad_norm": 0.220703125, + "learning_rate": 0.00019634343027530383, + "loss": 1.8789, + "step": 1114 + }, + { + "epoch": 0.3485464207564864, + "grad_norm": 0.21484375, + "learning_rate": 0.00019633684381489315, + "loss": 2.0262, + "step": 1115 + }, + { + "epoch": 0.3488590184432635, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019633025153850442, + "loss": 1.7877, + "step": 1116 + }, + { + "epoch": 0.34917161613004066, + "grad_norm": 0.216796875, + "learning_rate": 0.00019632365344653563, + "loss": 1.7381, + "step": 1117 + }, + { + "epoch": 0.34948421381681777, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019631704953938518, + "loss": 1.7758, + "step": 1118 + }, + { + "epoch": 0.3497968115035949, + "grad_norm": 0.212890625, + "learning_rate": 0.0001963104398174517, + "loss": 1.8063, + "step": 1119 + }, + { + "epoch": 0.350109409190372, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019630382428113417, + "loss": 1.8691, + "step": 1120 + }, + { + "epoch": 0.3504220068771491, + "grad_norm": 0.203125, + "learning_rate": 0.00019629720293083214, + "loss": 1.7844, + "step": 1121 + }, + { + "epoch": 0.35073460456392624, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019629057576694522, + "loss": 1.6097, + "step": 1122 + }, + { + "epoch": 0.35104720225070335, + "grad_norm": 0.21875, + "learning_rate": 0.00019628394278987355, + "loss": 1.9393, + "step": 1123 + }, + { + "epoch": 0.35135979993748045, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001962773040000175, + "loss": 1.7556, + "step": 1124 + }, + { + "epoch": 0.35167239762425756, + "grad_norm": 0.220703125, + "learning_rate": 0.000196270659397778, + "loss": 1.7145, + "step": 1125 + }, + { + "epoch": 0.3519849953110347, + "grad_norm": 0.220703125, + "learning_rate": 0.0001962640089835561, + "loss": 1.6505, + "step": 1126 + }, + { + "epoch": 0.3522975929978118, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019625735275775327, + "loss": 1.6953, + "step": 1127 + }, + { + "epoch": 0.35261019068458893, + "grad_norm": 0.224609375, + "learning_rate": 0.00019625069072077138, + "loss": 1.7897, + "step": 1128 + }, + { + "epoch": 0.35292278837136604, + "grad_norm": 0.2109375, + "learning_rate": 0.0001962440228730127, + "loss": 1.8916, + "step": 1129 + }, + { + "epoch": 0.3532353860581432, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019623734921487965, + "loss": 1.5444, + "step": 1130 + }, + { + "epoch": 0.3535479837449203, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019623066974677525, + "loss": 1.6391, + "step": 1131 + }, + { + "epoch": 0.3538605814316974, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019622398446910263, + "loss": 1.6171, + "step": 1132 + }, + { + "epoch": 0.3541731791184745, + "grad_norm": 0.216796875, + "learning_rate": 0.0001962172933822655, + "loss": 1.6352, + "step": 1133 + }, + { + "epoch": 0.3544857768052516, + "grad_norm": 0.220703125, + "learning_rate": 0.00019621059648666772, + "loss": 1.8147, + "step": 1134 + }, + { + "epoch": 0.3547983744920288, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019620389378271366, + "loss": 1.7773, + "step": 1135 + }, + { + "epoch": 0.3551109721788059, + "grad_norm": 0.212890625, + "learning_rate": 0.0001961971852708079, + "loss": 1.7441, + "step": 1136 + }, + { + "epoch": 0.355423569865583, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019619047095135553, + "loss": 1.9931, + "step": 1137 + }, + { + "epoch": 0.3557361675523601, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019618375082476182, + "loss": 1.6723, + "step": 1138 + }, + { + "epoch": 0.35604876523913725, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001961770248914325, + "loss": 1.8312, + "step": 1139 + }, + { + "epoch": 0.35636136292591436, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019617029315177365, + "loss": 2.0553, + "step": 1140 + }, + { + "epoch": 0.35667396061269147, + "grad_norm": 0.20703125, + "learning_rate": 0.00019616355560619163, + "loss": 1.6513, + "step": 1141 + }, + { + "epoch": 0.35698655829946857, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019615681225509325, + "loss": 1.8244, + "step": 1142 + }, + { + "epoch": 0.3572991559862457, + "grad_norm": 0.20703125, + "learning_rate": 0.00019615006309888552, + "loss": 1.9322, + "step": 1143 + }, + { + "epoch": 0.35761175367302284, + "grad_norm": 0.2138671875, + "learning_rate": 0.000196143308137976, + "loss": 1.7572, + "step": 1144 + }, + { + "epoch": 0.35792435135979994, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019613654737277245, + "loss": 1.5536, + "step": 1145 + }, + { + "epoch": 0.35823694904657705, + "grad_norm": 0.2197265625, + "learning_rate": 0.000196129780803683, + "loss": 1.9036, + "step": 1146 + }, + { + "epoch": 0.35854954673335415, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019612300843111622, + "loss": 2.1856, + "step": 1147 + }, + { + "epoch": 0.3588621444201313, + "grad_norm": 0.212890625, + "learning_rate": 0.0001961162302554809, + "loss": 1.6396, + "step": 1148 + }, + { + "epoch": 0.3591747421069084, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019610944627718627, + "loss": 1.8837, + "step": 1149 + }, + { + "epoch": 0.3594873397936855, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019610265649664193, + "loss": 1.7418, + "step": 1150 + }, + { + "epoch": 0.35979993748046263, + "grad_norm": 0.2109375, + "learning_rate": 0.00019609586091425774, + "loss": 1.8848, + "step": 1151 + }, + { + "epoch": 0.3601125351672398, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019608905953044396, + "loss": 1.4857, + "step": 1152 + }, + { + "epoch": 0.3604251328540169, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019608225234561121, + "loss": 1.6741, + "step": 1153 + }, + { + "epoch": 0.360737730540794, + "grad_norm": 0.19921875, + "learning_rate": 0.00019607543936017046, + "loss": 1.6363, + "step": 1154 + }, + { + "epoch": 0.3610503282275711, + "grad_norm": 0.2109375, + "learning_rate": 0.00019606862057453298, + "loss": 1.8323, + "step": 1155 + }, + { + "epoch": 0.3613629259143482, + "grad_norm": 0.21484375, + "learning_rate": 0.00019606179598911049, + "loss": 1.6778, + "step": 1156 + }, + { + "epoch": 0.36167552360112537, + "grad_norm": 0.208984375, + "learning_rate": 0.00019605496560431496, + "loss": 1.8691, + "step": 1157 + }, + { + "epoch": 0.3619881212879025, + "grad_norm": 0.2109375, + "learning_rate": 0.00019604812942055873, + "loss": 1.6175, + "step": 1158 + }, + { + "epoch": 0.3623007189746796, + "grad_norm": 0.212890625, + "learning_rate": 0.00019604128743825453, + "loss": 1.717, + "step": 1159 + }, + { + "epoch": 0.3626133166614567, + "grad_norm": 0.201171875, + "learning_rate": 0.00019603443965781543, + "loss": 1.773, + "step": 1160 + }, + { + "epoch": 0.36292591434823385, + "grad_norm": 0.212890625, + "learning_rate": 0.00019602758607965484, + "loss": 1.8844, + "step": 1161 + }, + { + "epoch": 0.36323851203501095, + "grad_norm": 0.2109375, + "learning_rate": 0.00019602072670418647, + "loss": 1.9545, + "step": 1162 + }, + { + "epoch": 0.36355110972178806, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019601386153182451, + "loss": 1.523, + "step": 1163 + }, + { + "epoch": 0.36386370740856516, + "grad_norm": 0.224609375, + "learning_rate": 0.00019600699056298337, + "loss": 2.0468, + "step": 1164 + }, + { + "epoch": 0.36417630509534227, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019600011379807786, + "loss": 1.9032, + "step": 1165 + }, + { + "epoch": 0.36448890278211943, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019599323123752315, + "loss": 1.3631, + "step": 1166 + }, + { + "epoch": 0.36480150046889653, + "grad_norm": 0.21875, + "learning_rate": 0.00019598634288173474, + "loss": 1.6805, + "step": 1167 + }, + { + "epoch": 0.36511409815567364, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019597944873112852, + "loss": 1.4813, + "step": 1168 + }, + { + "epoch": 0.36542669584245074, + "grad_norm": 0.216796875, + "learning_rate": 0.00019597254878612065, + "loss": 1.7945, + "step": 1169 + }, + { + "epoch": 0.3657392935292279, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001959656430471277, + "loss": 1.5851, + "step": 1170 + }, + { + "epoch": 0.366051891216005, + "grad_norm": 0.21875, + "learning_rate": 0.0001959587315145666, + "loss": 1.8493, + "step": 1171 + }, + { + "epoch": 0.3663644889027821, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001959518141888546, + "loss": 1.7852, + "step": 1172 + }, + { + "epoch": 0.3666770865895592, + "grad_norm": 0.22265625, + "learning_rate": 0.00019594489107040928, + "loss": 1.9668, + "step": 1173 + }, + { + "epoch": 0.3669896842763364, + "grad_norm": 0.216796875, + "learning_rate": 0.00019593796215964867, + "loss": 1.656, + "step": 1174 + }, + { + "epoch": 0.3673022819631135, + "grad_norm": 0.20703125, + "learning_rate": 0.000195931027456991, + "loss": 1.5947, + "step": 1175 + }, + { + "epoch": 0.3676148796498906, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019592408696285496, + "loss": 1.7685, + "step": 1176 + }, + { + "epoch": 0.3679274773366677, + "grad_norm": 0.220703125, + "learning_rate": 0.00019591714067765953, + "loss": 1.6027, + "step": 1177 + }, + { + "epoch": 0.3682400750234448, + "grad_norm": 0.205078125, + "learning_rate": 0.0001959101886018241, + "loss": 2.2013, + "step": 1178 + }, + { + "epoch": 0.36855267271022196, + "grad_norm": 0.208984375, + "learning_rate": 0.0001959032307357684, + "loss": 1.6995, + "step": 1179 + }, + { + "epoch": 0.36886527039699907, + "grad_norm": 0.20703125, + "learning_rate": 0.00019589626707991242, + "loss": 1.7104, + "step": 1180 + }, + { + "epoch": 0.3691778680837762, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019588929763467657, + "loss": 1.6798, + "step": 1181 + }, + { + "epoch": 0.3694904657705533, + "grad_norm": 0.20703125, + "learning_rate": 0.00019588232240048167, + "loss": 1.5464, + "step": 1182 + }, + { + "epoch": 0.36980306345733044, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001958753413777488, + "loss": 1.7789, + "step": 1183 + }, + { + "epoch": 0.37011566114410754, + "grad_norm": 0.2265625, + "learning_rate": 0.00019586835456689934, + "loss": 1.7634, + "step": 1184 + }, + { + "epoch": 0.37042825883088465, + "grad_norm": 0.2109375, + "learning_rate": 0.0001958613619683552, + "loss": 1.9015, + "step": 1185 + }, + { + "epoch": 0.37074085651766175, + "grad_norm": 0.318359375, + "learning_rate": 0.00019585436358253845, + "loss": 2.3964, + "step": 1186 + }, + { + "epoch": 0.37105345420443886, + "grad_norm": 0.216796875, + "learning_rate": 0.00019584735940987163, + "loss": 1.7068, + "step": 1187 + }, + { + "epoch": 0.371366051891216, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019584034945077758, + "loss": 1.9431, + "step": 1188 + }, + { + "epoch": 0.3716786495779931, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001958333337056795, + "loss": 1.6602, + "step": 1189 + }, + { + "epoch": 0.37199124726477023, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019582631217500093, + "loss": 1.9655, + "step": 1190 + }, + { + "epoch": 0.37230384495154734, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001958192848591658, + "loss": 1.7755, + "step": 1191 + }, + { + "epoch": 0.3726164426383245, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019581225175859833, + "loss": 1.7425, + "step": 1192 + }, + { + "epoch": 0.3729290403251016, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019580521287372317, + "loss": 1.8308, + "step": 1193 + }, + { + "epoch": 0.3732416380118787, + "grad_norm": 0.2265625, + "learning_rate": 0.00019579816820496516, + "loss": 1.7996, + "step": 1194 + }, + { + "epoch": 0.3735542356986558, + "grad_norm": 0.224609375, + "learning_rate": 0.0001957911177527497, + "loss": 1.8265, + "step": 1195 + }, + { + "epoch": 0.373866833385433, + "grad_norm": 0.21484375, + "learning_rate": 0.00019578406151750236, + "loss": 1.5686, + "step": 1196 + }, + { + "epoch": 0.3741794310722101, + "grad_norm": 0.212890625, + "learning_rate": 0.0001957769994996492, + "loss": 1.7951, + "step": 1197 + }, + { + "epoch": 0.3744920287589872, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019576993169961653, + "loss": 1.7821, + "step": 1198 + }, + { + "epoch": 0.3748046264457643, + "grad_norm": 0.2158203125, + "learning_rate": 0.000195762858117831, + "loss": 1.7286, + "step": 1199 + }, + { + "epoch": 0.3751172241325414, + "grad_norm": 0.212890625, + "learning_rate": 0.00019575577875471974, + "loss": 1.707, + "step": 1200 + }, + { + "epoch": 0.37542982181931855, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019574869361071006, + "loss": 1.9656, + "step": 1201 + }, + { + "epoch": 0.37574241950609566, + "grad_norm": 0.2265625, + "learning_rate": 0.00019574160268622976, + "loss": 1.7242, + "step": 1202 + }, + { + "epoch": 0.37605501719287276, + "grad_norm": 0.21484375, + "learning_rate": 0.00019573450598170687, + "loss": 1.7001, + "step": 1203 + }, + { + "epoch": 0.37636761487964987, + "grad_norm": 0.21875, + "learning_rate": 0.00019572740349756992, + "loss": 1.8952, + "step": 1204 + }, + { + "epoch": 0.37668021256642703, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019572029523424756, + "loss": 1.8052, + "step": 1205 + }, + { + "epoch": 0.37699281025320414, + "grad_norm": 0.21875, + "learning_rate": 0.00019571318119216904, + "loss": 1.8727, + "step": 1206 + }, + { + "epoch": 0.37730540793998124, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001957060613717638, + "loss": 1.6054, + "step": 1207 + }, + { + "epoch": 0.37761800562675835, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019569893577346168, + "loss": 1.8537, + "step": 1208 + }, + { + "epoch": 0.3779306033135355, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019569180439769283, + "loss": 1.6096, + "step": 1209 + }, + { + "epoch": 0.3782432010003126, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019568466724488782, + "loss": 1.9668, + "step": 1210 + }, + { + "epoch": 0.3785557986870897, + "grad_norm": 0.20703125, + "learning_rate": 0.00019567752431547754, + "loss": 1.6992, + "step": 1211 + }, + { + "epoch": 0.3788683963738668, + "grad_norm": 0.2109375, + "learning_rate": 0.00019567037560989315, + "loss": 1.6169, + "step": 1212 + }, + { + "epoch": 0.37918099406064393, + "grad_norm": 0.21875, + "learning_rate": 0.00019566322112856633, + "loss": 1.7126, + "step": 1213 + }, + { + "epoch": 0.3794935917474211, + "grad_norm": 0.203125, + "learning_rate": 0.0001956560608719289, + "loss": 1.6279, + "step": 1214 + }, + { + "epoch": 0.3798061894341982, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001956488948404132, + "loss": 2.0578, + "step": 1215 + }, + { + "epoch": 0.3801187871209753, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019564172303445182, + "loss": 1.7761, + "step": 1216 + }, + { + "epoch": 0.3804313848077524, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019563454545447773, + "loss": 1.6644, + "step": 1217 + }, + { + "epoch": 0.38074398249452956, + "grad_norm": 0.2109375, + "learning_rate": 0.00019562736210092428, + "loss": 1.8542, + "step": 1218 + }, + { + "epoch": 0.38105658018130667, + "grad_norm": 0.208984375, + "learning_rate": 0.0001956201729742251, + "loss": 1.7917, + "step": 1219 + }, + { + "epoch": 0.3813691778680838, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019561297807481427, + "loss": 1.8474, + "step": 1220 + }, + { + "epoch": 0.3816817755548609, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001956057774031261, + "loss": 1.627, + "step": 1221 + }, + { + "epoch": 0.381994373241638, + "grad_norm": 0.2109375, + "learning_rate": 0.00019559857095959528, + "loss": 1.6842, + "step": 1222 + }, + { + "epoch": 0.38230697092841515, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019559135874465695, + "loss": 1.7735, + "step": 1223 + }, + { + "epoch": 0.38261956861519225, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019558414075874646, + "loss": 1.8281, + "step": 1224 + }, + { + "epoch": 0.38293216630196936, + "grad_norm": 0.21484375, + "learning_rate": 0.00019557691700229957, + "loss": 1.5633, + "step": 1225 + }, + { + "epoch": 0.38324476398874646, + "grad_norm": 0.212890625, + "learning_rate": 0.00019556968747575244, + "loss": 1.8649, + "step": 1226 + }, + { + "epoch": 0.3835573616755236, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019556245217954149, + "loss": 1.6938, + "step": 1227 + }, + { + "epoch": 0.38386995936230073, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001955552111141035, + "loss": 1.6866, + "step": 1228 + }, + { + "epoch": 0.38418255704907783, + "grad_norm": 0.232421875, + "learning_rate": 0.00019554796427987566, + "loss": 1.9343, + "step": 1229 + }, + { + "epoch": 0.38449515473585494, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019554071167729545, + "loss": 1.9785, + "step": 1230 + }, + { + "epoch": 0.3848077524226321, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019553345330680077, + "loss": 1.876, + "step": 1231 + }, + { + "epoch": 0.3851203501094092, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019552618916882973, + "loss": 1.671, + "step": 1232 + }, + { + "epoch": 0.3854329477961863, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019551891926382093, + "loss": 1.6575, + "step": 1233 + }, + { + "epoch": 0.3857455454829634, + "grad_norm": 0.216796875, + "learning_rate": 0.00019551164359221326, + "loss": 1.9775, + "step": 1234 + }, + { + "epoch": 0.3860581431697405, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019550436215444594, + "loss": 1.7329, + "step": 1235 + }, + { + "epoch": 0.3863707408565177, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001954970749509586, + "loss": 1.6745, + "step": 1236 + }, + { + "epoch": 0.3866833385432948, + "grad_norm": 0.21484375, + "learning_rate": 0.00019548978198219113, + "loss": 1.7502, + "step": 1237 + }, + { + "epoch": 0.3869959362300719, + "grad_norm": 0.228515625, + "learning_rate": 0.00019548248324858386, + "loss": 1.6299, + "step": 1238 + }, + { + "epoch": 0.387308533916849, + "grad_norm": 0.21875, + "learning_rate": 0.00019547517875057738, + "loss": 1.6477, + "step": 1239 + }, + { + "epoch": 0.38762113160362616, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019546786848861268, + "loss": 1.8717, + "step": 1240 + }, + { + "epoch": 0.38793372929040326, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019546055246313113, + "loss": 1.5382, + "step": 1241 + }, + { + "epoch": 0.38824632697718037, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019545323067457439, + "loss": 2.0394, + "step": 1242 + }, + { + "epoch": 0.3885589246639575, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019544590312338444, + "loss": 1.8064, + "step": 1243 + }, + { + "epoch": 0.3888715223507346, + "grad_norm": 0.2265625, + "learning_rate": 0.00019543856981000371, + "loss": 1.6846, + "step": 1244 + }, + { + "epoch": 0.38918412003751174, + "grad_norm": 0.203125, + "learning_rate": 0.0001954312307348749, + "loss": 1.7834, + "step": 1245 + }, + { + "epoch": 0.38949671772428884, + "grad_norm": 0.21484375, + "learning_rate": 0.0001954238858984411, + "loss": 1.8043, + "step": 1246 + }, + { + "epoch": 0.38980931541106595, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019541653530114568, + "loss": 1.7905, + "step": 1247 + }, + { + "epoch": 0.39012191309784305, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019540917894343246, + "loss": 1.6521, + "step": 1248 + }, + { + "epoch": 0.3904345107846202, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019540181682574552, + "loss": 1.6881, + "step": 1249 + }, + { + "epoch": 0.3907471084713973, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001953944489485293, + "loss": 2.0565, + "step": 1250 + }, + { + "epoch": 0.3910597061581744, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019538707531222867, + "loss": 1.7884, + "step": 1251 + }, + { + "epoch": 0.39137230384495153, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019537969591728872, + "loss": 1.5153, + "step": 1252 + }, + { + "epoch": 0.3916849015317287, + "grad_norm": 0.220703125, + "learning_rate": 0.000195372310764155, + "loss": 1.8401, + "step": 1253 + }, + { + "epoch": 0.3919974992185058, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019536491985327334, + "loss": 1.5898, + "step": 1254 + }, + { + "epoch": 0.3923100969052829, + "grad_norm": 0.208984375, + "learning_rate": 0.00019535752318508998, + "loss": 1.8118, + "step": 1255 + }, + { + "epoch": 0.39262269459206, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019535012076005138, + "loss": 1.4033, + "step": 1256 + }, + { + "epoch": 0.3929352922788371, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019534271257860448, + "loss": 1.672, + "step": 1257 + }, + { + "epoch": 0.3932478899656143, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019533529864119658, + "loss": 1.752, + "step": 1258 + }, + { + "epoch": 0.3935604876523914, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001953278789482752, + "loss": 1.3813, + "step": 1259 + }, + { + "epoch": 0.3938730853391685, + "grad_norm": 0.2265625, + "learning_rate": 0.00019532045350028826, + "loss": 1.8827, + "step": 1260 + }, + { + "epoch": 0.3941856830259456, + "grad_norm": 0.224609375, + "learning_rate": 0.00019531302229768404, + "loss": 1.9363, + "step": 1261 + }, + { + "epoch": 0.39449828071272275, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019530558534091127, + "loss": 1.8975, + "step": 1262 + }, + { + "epoch": 0.39481087839949985, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019529814263041884, + "loss": 1.7931, + "step": 1263 + }, + { + "epoch": 0.39512347608627696, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001952906941666561, + "loss": 1.7258, + "step": 1264 + }, + { + "epoch": 0.39543607377305406, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001952832399500727, + "loss": 1.8547, + "step": 1265 + }, + { + "epoch": 0.3957486714598312, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019527577998111874, + "loss": 1.7344, + "step": 1266 + }, + { + "epoch": 0.39606126914660833, + "grad_norm": 0.2109375, + "learning_rate": 0.0001952683142602445, + "loss": 1.7313, + "step": 1267 + }, + { + "epoch": 0.39637386683338544, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019526084278790074, + "loss": 1.8261, + "step": 1268 + }, + { + "epoch": 0.39668646452016254, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019525336556453852, + "loss": 1.7306, + "step": 1269 + }, + { + "epoch": 0.39699906220693965, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001952458825906092, + "loss": 1.9536, + "step": 1270 + }, + { + "epoch": 0.3973116598937168, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019523839386656458, + "loss": 1.7486, + "step": 1271 + }, + { + "epoch": 0.3976242575804939, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019523089939285675, + "loss": 1.9232, + "step": 1272 + }, + { + "epoch": 0.397936855267271, + "grad_norm": 0.220703125, + "learning_rate": 0.0001952233991699382, + "loss": 1.5959, + "step": 1273 + }, + { + "epoch": 0.3982494529540481, + "grad_norm": 0.224609375, + "learning_rate": 0.00019521589319826168, + "loss": 1.9811, + "step": 1274 + }, + { + "epoch": 0.3985620506408253, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019520838147828035, + "loss": 1.6908, + "step": 1275 + }, + { + "epoch": 0.3988746483276024, + "grad_norm": 0.208984375, + "learning_rate": 0.00019520086401044772, + "loss": 1.7011, + "step": 1276 + }, + { + "epoch": 0.3991872460143795, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001951933407952176, + "loss": 1.6478, + "step": 1277 + }, + { + "epoch": 0.3994998437011566, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001951858118330442, + "loss": 1.5169, + "step": 1278 + }, + { + "epoch": 0.3998124413879337, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019517827712438207, + "loss": 1.7061, + "step": 1279 + }, + { + "epoch": 0.40012503907471086, + "grad_norm": 0.212890625, + "learning_rate": 0.00019517073666968604, + "loss": 1.7499, + "step": 1280 + }, + { + "epoch": 0.40043763676148797, + "grad_norm": 0.212890625, + "learning_rate": 0.00019516319046941134, + "loss": 2.132, + "step": 1281 + }, + { + "epoch": 0.4007502344482651, + "grad_norm": 0.20703125, + "learning_rate": 0.00019515563852401358, + "loss": 1.56, + "step": 1282 + }, + { + "epoch": 0.4010628321350422, + "grad_norm": 0.216796875, + "learning_rate": 0.00019514808083394866, + "loss": 1.86, + "step": 1283 + }, + { + "epoch": 0.40137542982181934, + "grad_norm": 0.22265625, + "learning_rate": 0.00019514051739967286, + "loss": 1.6877, + "step": 1284 + }, + { + "epoch": 0.40168802750859645, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019513294822164277, + "loss": 1.5612, + "step": 1285 + }, + { + "epoch": 0.40200062519537355, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019512537330031537, + "loss": 1.7812, + "step": 1286 + }, + { + "epoch": 0.40231322288215066, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019511779263614798, + "loss": 1.5228, + "step": 1287 + }, + { + "epoch": 0.4026258205689278, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019511020622959823, + "loss": 1.4276, + "step": 1288 + }, + { + "epoch": 0.4029384182557049, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019510261408112414, + "loss": 1.8561, + "step": 1289 + }, + { + "epoch": 0.403251015942482, + "grad_norm": 0.21875, + "learning_rate": 0.00019509501619118403, + "loss": 1.8674, + "step": 1290 + }, + { + "epoch": 0.40356361362925913, + "grad_norm": 0.20703125, + "learning_rate": 0.0001950874125602366, + "loss": 1.8583, + "step": 1291 + }, + { + "epoch": 0.40387621131603624, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019507980318874096, + "loss": 1.686, + "step": 1292 + }, + { + "epoch": 0.4041888090028134, + "grad_norm": 0.21484375, + "learning_rate": 0.00019507218807715638, + "loss": 1.7897, + "step": 1293 + }, + { + "epoch": 0.4045014066895905, + "grad_norm": 0.228515625, + "learning_rate": 0.00019506456722594265, + "loss": 1.7626, + "step": 1294 + }, + { + "epoch": 0.4048140043763676, + "grad_norm": 0.212890625, + "learning_rate": 0.0001950569406355599, + "loss": 1.9098, + "step": 1295 + }, + { + "epoch": 0.4051266020631447, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001950493083064685, + "loss": 1.5848, + "step": 1296 + }, + { + "epoch": 0.4054391997499219, + "grad_norm": 0.220703125, + "learning_rate": 0.00019504167023912922, + "loss": 1.6362, + "step": 1297 + }, + { + "epoch": 0.405751797436699, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001950340264340032, + "loss": 1.9604, + "step": 1298 + }, + { + "epoch": 0.4060643951234761, + "grad_norm": 0.306640625, + "learning_rate": 0.0001950263768915519, + "loss": 2.5325, + "step": 1299 + }, + { + "epoch": 0.4063769928102532, + "grad_norm": 0.21484375, + "learning_rate": 0.00019501872161223712, + "loss": 1.9979, + "step": 1300 + }, + { + "epoch": 0.4066895904970303, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019501106059652108, + "loss": 1.714, + "step": 1301 + }, + { + "epoch": 0.40700218818380746, + "grad_norm": 0.220703125, + "learning_rate": 0.0001950033938448662, + "loss": 1.7827, + "step": 1302 + }, + { + "epoch": 0.40731478587058456, + "grad_norm": 0.21484375, + "learning_rate": 0.00019499572135773537, + "loss": 1.6062, + "step": 1303 + }, + { + "epoch": 0.40762738355736167, + "grad_norm": 0.21484375, + "learning_rate": 0.0001949880431355918, + "loss": 1.6599, + "step": 1304 + }, + { + "epoch": 0.40793998124413877, + "grad_norm": 0.2197265625, + "learning_rate": 0.000194980359178899, + "loss": 1.5345, + "step": 1305 + }, + { + "epoch": 0.40825257893091593, + "grad_norm": 0.220703125, + "learning_rate": 0.0001949726694881209, + "loss": 1.8149, + "step": 1306 + }, + { + "epoch": 0.40856517661769304, + "grad_norm": 0.220703125, + "learning_rate": 0.00019496497406372174, + "loss": 1.6207, + "step": 1307 + }, + { + "epoch": 0.40887777430447014, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019495727290616606, + "loss": 1.7058, + "step": 1308 + }, + { + "epoch": 0.40919037199124725, + "grad_norm": 0.216796875, + "learning_rate": 0.0001949495660159188, + "loss": 1.5045, + "step": 1309 + }, + { + "epoch": 0.4095029696780244, + "grad_norm": 0.21875, + "learning_rate": 0.00019494185339344523, + "loss": 1.8221, + "step": 1310 + }, + { + "epoch": 0.4098155673648015, + "grad_norm": 0.224609375, + "learning_rate": 0.000194934135039211, + "loss": 1.4478, + "step": 1311 + }, + { + "epoch": 0.4101281650515786, + "grad_norm": 0.228515625, + "learning_rate": 0.0001949264109536821, + "loss": 1.4922, + "step": 1312 + }, + { + "epoch": 0.4104407627383557, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019491868113732474, + "loss": 1.8462, + "step": 1313 + }, + { + "epoch": 0.41075336042513283, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001949109455906057, + "loss": 1.831, + "step": 1314 + }, + { + "epoch": 0.41106595811191, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001949032043139919, + "loss": 1.5742, + "step": 1315 + }, + { + "epoch": 0.4113785557986871, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001948954573079507, + "loss": 1.7099, + "step": 1316 + }, + { + "epoch": 0.4116911534854642, + "grad_norm": 0.21875, + "learning_rate": 0.00019488770457294985, + "loss": 1.8771, + "step": 1317 + }, + { + "epoch": 0.4120037511722413, + "grad_norm": 0.21875, + "learning_rate": 0.00019487994610945734, + "loss": 1.9056, + "step": 1318 + }, + { + "epoch": 0.41231634885901847, + "grad_norm": 0.20703125, + "learning_rate": 0.00019487218191794158, + "loss": 1.7384, + "step": 1319 + }, + { + "epoch": 0.41262894654579557, + "grad_norm": 0.212890625, + "learning_rate": 0.00019486441199887132, + "loss": 1.9079, + "step": 1320 + }, + { + "epoch": 0.4129415442325727, + "grad_norm": 0.224609375, + "learning_rate": 0.00019485663635271562, + "loss": 1.8313, + "step": 1321 + }, + { + "epoch": 0.4132541419193498, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019484885497994387, + "loss": 1.642, + "step": 1322 + }, + { + "epoch": 0.4135667396061269, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019484106788102593, + "loss": 1.7165, + "step": 1323 + }, + { + "epoch": 0.41387933729290405, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001948332750564318, + "loss": 1.6474, + "step": 1324 + }, + { + "epoch": 0.41419193497968115, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019482547650663206, + "loss": 1.5541, + "step": 1325 + }, + { + "epoch": 0.41450453266645826, + "grad_norm": 0.20703125, + "learning_rate": 0.00019481767223209745, + "loss": 2.0118, + "step": 1326 + }, + { + "epoch": 0.41481713035323536, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019480986223329913, + "loss": 1.8306, + "step": 1327 + }, + { + "epoch": 0.4151297280400125, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019480204651070864, + "loss": 1.6828, + "step": 1328 + }, + { + "epoch": 0.41544232572678963, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019479422506479775, + "loss": 1.6071, + "step": 1329 + }, + { + "epoch": 0.41575492341356673, + "grad_norm": 0.203125, + "learning_rate": 0.00019478639789603872, + "loss": 1.6847, + "step": 1330 + }, + { + "epoch": 0.41606752110034384, + "grad_norm": 0.21484375, + "learning_rate": 0.00019477856500490405, + "loss": 1.6309, + "step": 1331 + }, + { + "epoch": 0.416380118787121, + "grad_norm": 0.21484375, + "learning_rate": 0.00019477072639186664, + "loss": 1.9451, + "step": 1332 + }, + { + "epoch": 0.4166927164738981, + "grad_norm": 0.220703125, + "learning_rate": 0.0001947628820573997, + "loss": 1.8675, + "step": 1333 + }, + { + "epoch": 0.4170053141606752, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019475503200197685, + "loss": 1.5601, + "step": 1334 + }, + { + "epoch": 0.4173179118474523, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019474717622607195, + "loss": 1.5294, + "step": 1335 + }, + { + "epoch": 0.4176305095342294, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019473931473015926, + "loss": 1.7433, + "step": 1336 + }, + { + "epoch": 0.4179431072210066, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019473144751471345, + "loss": 1.6771, + "step": 1337 + }, + { + "epoch": 0.4182557049077837, + "grad_norm": 0.212890625, + "learning_rate": 0.0001947235745802094, + "loss": 1.9994, + "step": 1338 + }, + { + "epoch": 0.4185683025945608, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001947156959271225, + "loss": 1.726, + "step": 1339 + }, + { + "epoch": 0.4188809002813379, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019470781155592827, + "loss": 1.8079, + "step": 1340 + }, + { + "epoch": 0.41919349796811506, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019469992146710282, + "loss": 1.8046, + "step": 1341 + }, + { + "epoch": 0.41950609565489216, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001946920256611224, + "loss": 1.619, + "step": 1342 + }, + { + "epoch": 0.41981869334166927, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019468412413846373, + "loss": 1.6015, + "step": 1343 + }, + { + "epoch": 0.4201312910284464, + "grad_norm": 0.212890625, + "learning_rate": 0.00019467621689960385, + "loss": 1.7538, + "step": 1344 + }, + { + "epoch": 0.42044388871522353, + "grad_norm": 0.20703125, + "learning_rate": 0.00019466830394502009, + "loss": 1.8732, + "step": 1345 + }, + { + "epoch": 0.42075648640200064, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001946603852751902, + "loss": 1.7492, + "step": 1346 + }, + { + "epoch": 0.42106908408877775, + "grad_norm": 0.224609375, + "learning_rate": 0.0001946524608905922, + "loss": 1.6893, + "step": 1347 + }, + { + "epoch": 0.42138168177555485, + "grad_norm": 0.220703125, + "learning_rate": 0.00019464453079170454, + "loss": 1.5848, + "step": 1348 + }, + { + "epoch": 0.42169427946233196, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019463659497900593, + "loss": 1.5974, + "step": 1349 + }, + { + "epoch": 0.4220068771491091, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001946286534529755, + "loss": 1.9757, + "step": 1350 + }, + { + "epoch": 0.4223194748358862, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001946207062140927, + "loss": 1.9514, + "step": 1351 + }, + { + "epoch": 0.4226320725226633, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019461275326283724, + "loss": 1.894, + "step": 1352 + }, + { + "epoch": 0.42294467020944043, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019460479459968932, + "loss": 1.5872, + "step": 1353 + }, + { + "epoch": 0.4232572678962176, + "grad_norm": 0.21484375, + "learning_rate": 0.0001945968302251294, + "loss": 1.5275, + "step": 1354 + }, + { + "epoch": 0.4235698655829947, + "grad_norm": 0.220703125, + "learning_rate": 0.0001945888601396383, + "loss": 1.6427, + "step": 1355 + }, + { + "epoch": 0.4238824632697718, + "grad_norm": 0.21875, + "learning_rate": 0.00019458088434369715, + "loss": 1.6407, + "step": 1356 + }, + { + "epoch": 0.4241950609565489, + "grad_norm": 0.224609375, + "learning_rate": 0.00019457290283778747, + "loss": 1.9373, + "step": 1357 + }, + { + "epoch": 0.424507658643326, + "grad_norm": 0.21484375, + "learning_rate": 0.0001945649156223912, + "loss": 1.7385, + "step": 1358 + }, + { + "epoch": 0.4248202563301032, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001945569226979904, + "loss": 1.8262, + "step": 1359 + }, + { + "epoch": 0.4251328540168803, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019454892406506775, + "loss": 1.6286, + "step": 1360 + }, + { + "epoch": 0.4254454517036574, + "grad_norm": 0.22265625, + "learning_rate": 0.00019454091972410603, + "loss": 1.7992, + "step": 1361 + }, + { + "epoch": 0.4257580493904345, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001945329096755885, + "loss": 1.9609, + "step": 1362 + }, + { + "epoch": 0.42607064707721165, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019452489391999874, + "loss": 1.9051, + "step": 1363 + }, + { + "epoch": 0.42638324476398876, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019451687245782072, + "loss": 1.7331, + "step": 1364 + }, + { + "epoch": 0.42669584245076586, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019450884528953864, + "loss": 2.1455, + "step": 1365 + }, + { + "epoch": 0.42700844013754297, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019450081241563716, + "loss": 1.8298, + "step": 1366 + }, + { + "epoch": 0.4273210378243201, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019449277383660118, + "loss": 1.8084, + "step": 1367 + }, + { + "epoch": 0.42763363551109723, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019448472955291605, + "loss": 1.6876, + "step": 1368 + }, + { + "epoch": 0.42794623319787434, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001944766795650674, + "loss": 1.7431, + "step": 1369 + }, + { + "epoch": 0.42825883088465144, + "grad_norm": 0.228515625, + "learning_rate": 0.0001944686238735412, + "loss": 1.7904, + "step": 1370 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019446056247882378, + "loss": 1.8465, + "step": 1371 + }, + { + "epoch": 0.4288840262582057, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019445249538140185, + "loss": 1.6672, + "step": 1372 + }, + { + "epoch": 0.4291966239449828, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001944444225817624, + "loss": 1.9209, + "step": 1373 + }, + { + "epoch": 0.4295092216317599, + "grad_norm": 0.220703125, + "learning_rate": 0.00019443634408039282, + "loss": 1.8336, + "step": 1374 + }, + { + "epoch": 0.429821819318537, + "grad_norm": 0.22265625, + "learning_rate": 0.0001944282598777808, + "loss": 1.9261, + "step": 1375 + }, + { + "epoch": 0.4301344170053142, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001944201699744144, + "loss": 1.6371, + "step": 1376 + }, + { + "epoch": 0.4304470146920913, + "grad_norm": 0.220703125, + "learning_rate": 0.00019441207437078203, + "loss": 1.4774, + "step": 1377 + }, + { + "epoch": 0.4307596123788684, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001944039730673724, + "loss": 1.5849, + "step": 1378 + }, + { + "epoch": 0.4310722100656455, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001943958660646746, + "loss": 1.8103, + "step": 1379 + }, + { + "epoch": 0.4313848077524226, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019438775336317812, + "loss": 1.8946, + "step": 1380 + }, + { + "epoch": 0.43169740543919977, + "grad_norm": 0.20703125, + "learning_rate": 0.00019437963496337266, + "loss": 1.6056, + "step": 1381 + }, + { + "epoch": 0.43201000312597687, + "grad_norm": 0.220703125, + "learning_rate": 0.00019437151086574837, + "loss": 1.6991, + "step": 1382 + }, + { + "epoch": 0.432322600812754, + "grad_norm": 0.2265625, + "learning_rate": 0.00019436338107079574, + "loss": 1.6126, + "step": 1383 + }, + { + "epoch": 0.4326351984995311, + "grad_norm": 0.216796875, + "learning_rate": 0.00019435524557900551, + "loss": 1.4967, + "step": 1384 + }, + { + "epoch": 0.43294779618630824, + "grad_norm": 0.212890625, + "learning_rate": 0.00019434710439086888, + "loss": 1.5868, + "step": 1385 + }, + { + "epoch": 0.43326039387308535, + "grad_norm": 0.2265625, + "learning_rate": 0.00019433895750687734, + "loss": 1.7528, + "step": 1386 + }, + { + "epoch": 0.43357299155986245, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019433080492752268, + "loss": 1.899, + "step": 1387 + }, + { + "epoch": 0.43388558924663956, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019432264665329715, + "loss": 2.0873, + "step": 1388 + }, + { + "epoch": 0.4341981869334167, + "grad_norm": 0.216796875, + "learning_rate": 0.00019431448268469325, + "loss": 1.4453, + "step": 1389 + }, + { + "epoch": 0.4345107846201938, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019430631302220385, + "loss": 1.9314, + "step": 1390 + }, + { + "epoch": 0.43482338230697093, + "grad_norm": 0.21875, + "learning_rate": 0.0001942981376663221, + "loss": 1.5989, + "step": 1391 + }, + { + "epoch": 0.43513597999374803, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019428995661754171, + "loss": 1.8037, + "step": 1392 + }, + { + "epoch": 0.43544857768052514, + "grad_norm": 0.20703125, + "learning_rate": 0.0001942817698763564, + "loss": 1.7903, + "step": 1393 + }, + { + "epoch": 0.4357611753673023, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019427357744326057, + "loss": 1.7809, + "step": 1394 + }, + { + "epoch": 0.4360737730540794, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001942653793187487, + "loss": 1.552, + "step": 1395 + }, + { + "epoch": 0.4363863707408565, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019425717550331572, + "loss": 1.7079, + "step": 1396 + }, + { + "epoch": 0.4366989684276336, + "grad_norm": 0.2119140625, + "learning_rate": 0.000194248965997457, + "loss": 1.8321, + "step": 1397 + }, + { + "epoch": 0.4370115661144108, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019424075080166805, + "loss": 1.6185, + "step": 1398 + }, + { + "epoch": 0.4373241638011879, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019423252991644492, + "loss": 1.7149, + "step": 1399 + }, + { + "epoch": 0.437636761487965, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019422430334228386, + "loss": 1.7048, + "step": 1400 + }, + { + "epoch": 0.4379493591747421, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019421607107968154, + "loss": 1.8062, + "step": 1401 + }, + { + "epoch": 0.4382619568615192, + "grad_norm": 0.2265625, + "learning_rate": 0.00019420783312913494, + "loss": 1.8332, + "step": 1402 + }, + { + "epoch": 0.43857455454829636, + "grad_norm": 0.306640625, + "learning_rate": 0.0001941995894911414, + "loss": 2.397, + "step": 1403 + }, + { + "epoch": 0.43888715223507346, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019419134016619865, + "loss": 1.6672, + "step": 1404 + }, + { + "epoch": 0.43919974992185057, + "grad_norm": 0.2265625, + "learning_rate": 0.0001941830851548046, + "loss": 1.6112, + "step": 1405 + }, + { + "epoch": 0.4395123476086277, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001941748244574577, + "loss": 1.7182, + "step": 1406 + }, + { + "epoch": 0.43982494529540483, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019416655807465667, + "loss": 1.7438, + "step": 1407 + }, + { + "epoch": 0.44013754298218194, + "grad_norm": 0.216796875, + "learning_rate": 0.0001941582860069005, + "loss": 1.8327, + "step": 1408 + }, + { + "epoch": 0.44045014066895904, + "grad_norm": 0.224609375, + "learning_rate": 0.00019415000825468863, + "loss": 2.0563, + "step": 1409 + }, + { + "epoch": 0.44076273835573615, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001941417248185208, + "loss": 1.9451, + "step": 1410 + }, + { + "epoch": 0.4410753360425133, + "grad_norm": 0.224609375, + "learning_rate": 0.00019413343569889702, + "loss": 1.8786, + "step": 1411 + }, + { + "epoch": 0.4413879337292904, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019412514089631785, + "loss": 1.7905, + "step": 1412 + }, + { + "epoch": 0.4417005314160675, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019411684041128392, + "loss": 1.7573, + "step": 1413 + }, + { + "epoch": 0.4420131291028446, + "grad_norm": 0.220703125, + "learning_rate": 0.00019410853424429642, + "loss": 1.6898, + "step": 1414 + }, + { + "epoch": 0.44232572678962173, + "grad_norm": 0.220703125, + "learning_rate": 0.00019410022239585678, + "loss": 1.7676, + "step": 1415 + }, + { + "epoch": 0.4426383244763989, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001940919048664668, + "loss": 1.7774, + "step": 1416 + }, + { + "epoch": 0.442950922163176, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019408358165662866, + "loss": 1.6328, + "step": 1417 + }, + { + "epoch": 0.4432635198499531, + "grad_norm": 0.2265625, + "learning_rate": 0.00019407525276684474, + "loss": 1.7037, + "step": 1418 + }, + { + "epoch": 0.4435761175367302, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019406691819761796, + "loss": 1.81, + "step": 1419 + }, + { + "epoch": 0.44388871522350737, + "grad_norm": 0.2421875, + "learning_rate": 0.00019405857794945147, + "loss": 1.8474, + "step": 1420 + }, + { + "epoch": 0.4442013129102845, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019405023202284874, + "loss": 1.6398, + "step": 1421 + }, + { + "epoch": 0.4445139105970616, + "grad_norm": 0.22265625, + "learning_rate": 0.0001940418804183137, + "loss": 1.5592, + "step": 1422 + }, + { + "epoch": 0.4448265082838387, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019403352313635046, + "loss": 1.6566, + "step": 1423 + }, + { + "epoch": 0.44513910597061584, + "grad_norm": 0.21484375, + "learning_rate": 0.0001940251601774636, + "loss": 1.6928, + "step": 1424 + }, + { + "epoch": 0.44545170365739295, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019401679154215802, + "loss": 2.029, + "step": 1425 + }, + { + "epoch": 0.44576430134417006, + "grad_norm": 0.220703125, + "learning_rate": 0.0001940084172309389, + "loss": 1.9225, + "step": 1426 + }, + { + "epoch": 0.44607689903094716, + "grad_norm": 0.2431640625, + "learning_rate": 0.00019400003724431185, + "loss": 1.9033, + "step": 1427 + }, + { + "epoch": 0.44638949671772427, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019399165158278279, + "loss": 1.9373, + "step": 1428 + }, + { + "epoch": 0.4467020944045014, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019398326024685792, + "loss": 1.8287, + "step": 1429 + }, + { + "epoch": 0.44701469209127853, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019397486323704388, + "loss": 1.4876, + "step": 1430 + }, + { + "epoch": 0.44732728977805564, + "grad_norm": 0.2109375, + "learning_rate": 0.0001939664605538476, + "loss": 1.7532, + "step": 1431 + }, + { + "epoch": 0.44763988746483274, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001939580521977763, + "loss": 1.8811, + "step": 1432 + }, + { + "epoch": 0.4479524851516099, + "grad_norm": 0.22265625, + "learning_rate": 0.00019394963816933772, + "loss": 1.8956, + "step": 1433 + }, + { + "epoch": 0.448265082838387, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019394121846903975, + "loss": 1.7634, + "step": 1434 + }, + { + "epoch": 0.4485776805251641, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001939327930973907, + "loss": 1.5284, + "step": 1435 + }, + { + "epoch": 0.4488902782119412, + "grad_norm": 0.234375, + "learning_rate": 0.00019392436205489924, + "loss": 1.8581, + "step": 1436 + }, + { + "epoch": 0.4492028758987183, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019391592534207436, + "loss": 1.4981, + "step": 1437 + }, + { + "epoch": 0.4495154735854955, + "grad_norm": 0.220703125, + "learning_rate": 0.00019390748295942535, + "loss": 1.6315, + "step": 1438 + }, + { + "epoch": 0.4498280712722726, + "grad_norm": 0.216796875, + "learning_rate": 0.00019389903490746194, + "loss": 1.755, + "step": 1439 + }, + { + "epoch": 0.4501406689590497, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019389058118669418, + "loss": 1.6564, + "step": 1440 + }, + { + "epoch": 0.4504532666458268, + "grad_norm": 0.232421875, + "learning_rate": 0.00019388212179763235, + "loss": 1.8079, + "step": 1441 + }, + { + "epoch": 0.45076586433260396, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001938736567407872, + "loss": 1.7621, + "step": 1442 + }, + { + "epoch": 0.45107846201938107, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019386518601666977, + "loss": 2.0246, + "step": 1443 + }, + { + "epoch": 0.45139105970615817, + "grad_norm": 0.228515625, + "learning_rate": 0.0001938567096257914, + "loss": 1.7006, + "step": 1444 + }, + { + "epoch": 0.4517036573929353, + "grad_norm": 0.23046875, + "learning_rate": 0.00019384822756866394, + "loss": 1.7433, + "step": 1445 + }, + { + "epoch": 0.45201625507971244, + "grad_norm": 0.220703125, + "learning_rate": 0.00019383973984579936, + "loss": 1.6673, + "step": 1446 + }, + { + "epoch": 0.45232885276648954, + "grad_norm": 0.20703125, + "learning_rate": 0.00019383124645771008, + "loss": 1.7402, + "step": 1447 + }, + { + "epoch": 0.45264145045326665, + "grad_norm": 0.220703125, + "learning_rate": 0.00019382274740490892, + "loss": 1.7445, + "step": 1448 + }, + { + "epoch": 0.45295404814004375, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001938142426879089, + "loss": 1.752, + "step": 1449 + }, + { + "epoch": 0.45326664582682086, + "grad_norm": 0.224609375, + "learning_rate": 0.00019380573230722353, + "loss": 1.7653, + "step": 1450 + }, + { + "epoch": 0.453579243513598, + "grad_norm": 0.224609375, + "learning_rate": 0.00019379721626336656, + "loss": 1.4672, + "step": 1451 + }, + { + "epoch": 0.4538918412003751, + "grad_norm": 0.224609375, + "learning_rate": 0.0001937886945568521, + "loss": 1.6907, + "step": 1452 + }, + { + "epoch": 0.45420443888715223, + "grad_norm": 0.2265625, + "learning_rate": 0.00019378016718819466, + "loss": 1.7775, + "step": 1453 + }, + { + "epoch": 0.45451703657392933, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019377163415790902, + "loss": 1.913, + "step": 1454 + }, + { + "epoch": 0.4548296342607065, + "grad_norm": 0.216796875, + "learning_rate": 0.00019376309546651033, + "loss": 1.8471, + "step": 1455 + }, + { + "epoch": 0.4551422319474836, + "grad_norm": 0.228515625, + "learning_rate": 0.00019375455111451405, + "loss": 1.5682, + "step": 1456 + }, + { + "epoch": 0.4554548296342607, + "grad_norm": 0.220703125, + "learning_rate": 0.00019374600110243608, + "loss": 1.7008, + "step": 1457 + }, + { + "epoch": 0.4557674273210378, + "grad_norm": 0.21875, + "learning_rate": 0.00019373744543079257, + "loss": 1.7075, + "step": 1458 + }, + { + "epoch": 0.4560800250078149, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001937288841001, + "loss": 1.6143, + "step": 1459 + }, + { + "epoch": 0.4563926226945921, + "grad_norm": 0.21484375, + "learning_rate": 0.00019372031711087527, + "loss": 1.6665, + "step": 1460 + }, + { + "epoch": 0.4567052203813692, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019371174446363557, + "loss": 1.6533, + "step": 1461 + }, + { + "epoch": 0.4570178180681463, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019370316615889842, + "loss": 1.5501, + "step": 1462 + }, + { + "epoch": 0.4573304157549234, + "grad_norm": 0.232421875, + "learning_rate": 0.00019369458219718175, + "loss": 1.8101, + "step": 1463 + }, + { + "epoch": 0.45764301344170055, + "grad_norm": 0.216796875, + "learning_rate": 0.00019368599257900372, + "loss": 1.6708, + "step": 1464 + }, + { + "epoch": 0.45795561112847766, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019367739730488296, + "loss": 1.6922, + "step": 1465 + }, + { + "epoch": 0.45826820881525476, + "grad_norm": 0.220703125, + "learning_rate": 0.00019366879637533834, + "loss": 1.6808, + "step": 1466 + }, + { + "epoch": 0.45858080650203187, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019366018979088913, + "loss": 1.654, + "step": 1467 + }, + { + "epoch": 0.45889340418880903, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001936515775520549, + "loss": 1.7892, + "step": 1468 + }, + { + "epoch": 0.45920600187558613, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019364295965935562, + "loss": 1.6039, + "step": 1469 + }, + { + "epoch": 0.45951859956236324, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001936343361133115, + "loss": 1.6348, + "step": 1470 + }, + { + "epoch": 0.45983119724914034, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001936257069144432, + "loss": 2.0579, + "step": 1471 + }, + { + "epoch": 0.46014379493591745, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019361707206327168, + "loss": 1.5824, + "step": 1472 + }, + { + "epoch": 0.4604563926226946, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001936084315603182, + "loss": 1.6563, + "step": 1473 + }, + { + "epoch": 0.4607689903094717, + "grad_norm": 0.21484375, + "learning_rate": 0.0001935997854061044, + "loss": 1.7782, + "step": 1474 + }, + { + "epoch": 0.4610815879962488, + "grad_norm": 0.22265625, + "learning_rate": 0.00019359113360115234, + "loss": 1.7625, + "step": 1475 + }, + { + "epoch": 0.4613941856830259, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019358247614598427, + "loss": 1.5607, + "step": 1476 + }, + { + "epoch": 0.4617067833698031, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019357381304112281, + "loss": 1.6091, + "step": 1477 + }, + { + "epoch": 0.4620193810565802, + "grad_norm": 0.21875, + "learning_rate": 0.00019356514428709104, + "loss": 1.5822, + "step": 1478 + }, + { + "epoch": 0.4623319787433573, + "grad_norm": 0.21875, + "learning_rate": 0.0001935564698844123, + "loss": 1.8785, + "step": 1479 + }, + { + "epoch": 0.4626445764301344, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001935477898336102, + "loss": 1.4933, + "step": 1480 + }, + { + "epoch": 0.4629571741169115, + "grad_norm": 0.341796875, + "learning_rate": 0.00019353910413520887, + "loss": 2.2543, + "step": 1481 + }, + { + "epoch": 0.46326977180368867, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001935304127897326, + "loss": 1.6022, + "step": 1482 + }, + { + "epoch": 0.4635823694904658, + "grad_norm": 0.224609375, + "learning_rate": 0.00019352171579770615, + "loss": 1.9542, + "step": 1483 + }, + { + "epoch": 0.4638949671772429, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019351301315965452, + "loss": 1.5863, + "step": 1484 + }, + { + "epoch": 0.46420756486402, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019350430487610312, + "loss": 1.9259, + "step": 1485 + }, + { + "epoch": 0.46452016255079714, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001934955909475777, + "loss": 1.9044, + "step": 1486 + }, + { + "epoch": 0.46483276023757425, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019348687137460432, + "loss": 1.829, + "step": 1487 + }, + { + "epoch": 0.46514535792435135, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019347814615770933, + "loss": 1.5524, + "step": 1488 + }, + { + "epoch": 0.46545795561112846, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019346941529741954, + "loss": 1.683, + "step": 1489 + }, + { + "epoch": 0.4657705532979056, + "grad_norm": 0.220703125, + "learning_rate": 0.0001934606787942621, + "loss": 1.8919, + "step": 1490 + }, + { + "epoch": 0.4660831509846827, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019345193664876433, + "loss": 1.7553, + "step": 1491 + }, + { + "epoch": 0.46639574867145983, + "grad_norm": 0.21875, + "learning_rate": 0.0001934431888614541, + "loss": 1.9543, + "step": 1492 + }, + { + "epoch": 0.46670834635823694, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019343443543285945, + "loss": 1.6919, + "step": 1493 + }, + { + "epoch": 0.46702094404501404, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019342567636350887, + "loss": 1.6121, + "step": 1494 + }, + { + "epoch": 0.4673335417317912, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019341691165393116, + "loss": 1.5772, + "step": 1495 + }, + { + "epoch": 0.4676461394185683, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019340814130465548, + "loss": 1.9449, + "step": 1496 + }, + { + "epoch": 0.4679587371053454, + "grad_norm": 0.224609375, + "learning_rate": 0.00019339936531621122, + "loss": 1.7063, + "step": 1497 + }, + { + "epoch": 0.4682713347921225, + "grad_norm": 0.212890625, + "learning_rate": 0.0001933905836891283, + "loss": 1.7768, + "step": 1498 + }, + { + "epoch": 0.4685839324788997, + "grad_norm": 0.21875, + "learning_rate": 0.00019338179642393685, + "loss": 1.7279, + "step": 1499 + }, + { + "epoch": 0.4688965301656768, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001933730035211673, + "loss": 1.7344, + "step": 1500 + }, + { + "epoch": 0.4692091278524539, + "grad_norm": 0.21484375, + "learning_rate": 0.00019336420498135057, + "loss": 1.6349, + "step": 1501 + }, + { + "epoch": 0.469521725539231, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001933554008050178, + "loss": 1.703, + "step": 1502 + }, + { + "epoch": 0.46983432322600815, + "grad_norm": 0.21875, + "learning_rate": 0.00019334659099270053, + "loss": 1.6039, + "step": 1503 + }, + { + "epoch": 0.47014692091278526, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001933377755449306, + "loss": 1.7018, + "step": 1504 + }, + { + "epoch": 0.47045951859956237, + "grad_norm": 0.22265625, + "learning_rate": 0.00019332895446224022, + "loss": 1.5957, + "step": 1505 + }, + { + "epoch": 0.47077211628633947, + "grad_norm": 0.2265625, + "learning_rate": 0.00019332012774516191, + "loss": 1.6054, + "step": 1506 + }, + { + "epoch": 0.4710847139731166, + "grad_norm": 0.216796875, + "learning_rate": 0.0001933112953942286, + "loss": 1.6822, + "step": 1507 + }, + { + "epoch": 0.47139731165989374, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019330245740997346, + "loss": 1.6045, + "step": 1508 + }, + { + "epoch": 0.47170990934667084, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019329361379293006, + "loss": 1.6817, + "step": 1509 + }, + { + "epoch": 0.47202250703344795, + "grad_norm": 0.26953125, + "learning_rate": 0.00019328476454363237, + "loss": 1.6334, + "step": 1510 + }, + { + "epoch": 0.47233510472022505, + "grad_norm": 0.236328125, + "learning_rate": 0.00019327590966261452, + "loss": 1.9416, + "step": 1511 + }, + { + "epoch": 0.4726477024070022, + "grad_norm": 0.21484375, + "learning_rate": 0.00019326704915041115, + "loss": 1.8148, + "step": 1512 + }, + { + "epoch": 0.4729603000937793, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001932581830075572, + "loss": 1.6804, + "step": 1513 + }, + { + "epoch": 0.4732728977805564, + "grad_norm": 0.224609375, + "learning_rate": 0.00019324931123458784, + "loss": 1.6578, + "step": 1514 + }, + { + "epoch": 0.47358549546733353, + "grad_norm": 0.232421875, + "learning_rate": 0.00019324043383203875, + "loss": 1.7513, + "step": 1515 + }, + { + "epoch": 0.47389809315411063, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019323155080044587, + "loss": 1.8009, + "step": 1516 + }, + { + "epoch": 0.4742106908408878, + "grad_norm": 0.23828125, + "learning_rate": 0.00019322266214034546, + "loss": 1.5399, + "step": 1517 + }, + { + "epoch": 0.4745232885276649, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019321376785227416, + "loss": 1.6751, + "step": 1518 + }, + { + "epoch": 0.474835886214442, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019320486793676889, + "loss": 1.5572, + "step": 1519 + }, + { + "epoch": 0.4751484839012191, + "grad_norm": 0.228515625, + "learning_rate": 0.00019319596239436698, + "loss": 1.6178, + "step": 1520 + }, + { + "epoch": 0.47546108158799627, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019318705122560602, + "loss": 1.5581, + "step": 1521 + }, + { + "epoch": 0.4757736792747734, + "grad_norm": 0.2265625, + "learning_rate": 0.00019317813443102408, + "loss": 1.6904, + "step": 1522 + }, + { + "epoch": 0.4760862769615505, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001931692120111594, + "loss": 1.9162, + "step": 1523 + }, + { + "epoch": 0.4763988746483276, + "grad_norm": 0.2265625, + "learning_rate": 0.0001931602839665507, + "loss": 1.6703, + "step": 1524 + }, + { + "epoch": 0.47671147233510475, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001931513502977369, + "loss": 1.6865, + "step": 1525 + }, + { + "epoch": 0.47702407002188185, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019314241100525738, + "loss": 1.7221, + "step": 1526 + }, + { + "epoch": 0.47733666770865896, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019313346608965183, + "loss": 1.6306, + "step": 1527 + }, + { + "epoch": 0.47764926539543606, + "grad_norm": 0.224609375, + "learning_rate": 0.00019312451555146022, + "loss": 2.0435, + "step": 1528 + }, + { + "epoch": 0.47796186308221317, + "grad_norm": 0.2265625, + "learning_rate": 0.00019311555939122298, + "loss": 1.4892, + "step": 1529 + }, + { + "epoch": 0.47827446076899033, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019310659760948075, + "loss": 1.7291, + "step": 1530 + }, + { + "epoch": 0.47858705845576743, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019309763020677458, + "loss": 1.7014, + "step": 1531 + }, + { + "epoch": 0.47889965614254454, + "grad_norm": 0.23828125, + "learning_rate": 0.00019308865718364583, + "loss": 2.0065, + "step": 1532 + }, + { + "epoch": 0.47921225382932164, + "grad_norm": 0.23046875, + "learning_rate": 0.00019307967854063622, + "loss": 1.5883, + "step": 1533 + }, + { + "epoch": 0.4795248515160988, + "grad_norm": 0.236328125, + "learning_rate": 0.0001930706942782878, + "loss": 1.7971, + "step": 1534 + }, + { + "epoch": 0.4798374492028759, + "grad_norm": 0.224609375, + "learning_rate": 0.00019306170439714298, + "loss": 1.6701, + "step": 1535 + }, + { + "epoch": 0.480150046889653, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019305270889774444, + "loss": 1.611, + "step": 1536 + }, + { + "epoch": 0.4804626445764301, + "grad_norm": 0.240234375, + "learning_rate": 0.00019304370778063534, + "loss": 1.8515, + "step": 1537 + }, + { + "epoch": 0.4807752422632072, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019303470104635898, + "loss": 1.64, + "step": 1538 + }, + { + "epoch": 0.4810878399499844, + "grad_norm": 0.228515625, + "learning_rate": 0.0001930256886954592, + "loss": 1.7283, + "step": 1539 + }, + { + "epoch": 0.4814004376367615, + "grad_norm": 0.244140625, + "learning_rate": 0.00019301667072848004, + "loss": 1.8076, + "step": 1540 + }, + { + "epoch": 0.4817130353235386, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019300764714596594, + "loss": 1.9384, + "step": 1541 + }, + { + "epoch": 0.4820256330103157, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019299861794846166, + "loss": 1.8492, + "step": 1542 + }, + { + "epoch": 0.48233823069709286, + "grad_norm": 0.21875, + "learning_rate": 0.00019298958313651227, + "loss": 1.744, + "step": 1543 + }, + { + "epoch": 0.48265082838386997, + "grad_norm": 0.220703125, + "learning_rate": 0.0001929805427106633, + "loss": 1.7691, + "step": 1544 + }, + { + "epoch": 0.4829634260706471, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019297149667146045, + "loss": 1.6095, + "step": 1545 + }, + { + "epoch": 0.4832760237574242, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001929624450194499, + "loss": 1.8153, + "step": 1546 + }, + { + "epoch": 0.48358862144420134, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019295338775517803, + "loss": 1.8315, + "step": 1547 + }, + { + "epoch": 0.48390121913097844, + "grad_norm": 0.208984375, + "learning_rate": 0.00019294432487919173, + "loss": 1.6651, + "step": 1548 + }, + { + "epoch": 0.48421381681775555, + "grad_norm": 0.20703125, + "learning_rate": 0.0001929352563920381, + "loss": 1.632, + "step": 1549 + }, + { + "epoch": 0.48452641450453265, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001929261822942646, + "loss": 1.5682, + "step": 1550 + }, + { + "epoch": 0.48483901219130976, + "grad_norm": 0.220703125, + "learning_rate": 0.00019291710258641907, + "loss": 1.7631, + "step": 1551 + }, + { + "epoch": 0.4851516098780869, + "grad_norm": 0.212890625, + "learning_rate": 0.00019290801726904962, + "loss": 1.6418, + "step": 1552 + }, + { + "epoch": 0.485464207564864, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001928989263427048, + "loss": 1.4744, + "step": 1553 + }, + { + "epoch": 0.48577680525164113, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001928898298079334, + "loss": 1.7507, + "step": 1554 + }, + { + "epoch": 0.48608940293841824, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019288072766528462, + "loss": 1.5483, + "step": 1555 + }, + { + "epoch": 0.4864020006251954, + "grad_norm": 0.228515625, + "learning_rate": 0.00019287161991530792, + "loss": 1.7318, + "step": 1556 + }, + { + "epoch": 0.4867145983119725, + "grad_norm": 0.228515625, + "learning_rate": 0.0001928625065585532, + "loss": 1.8483, + "step": 1557 + }, + { + "epoch": 0.4870271959987496, + "grad_norm": 0.21875, + "learning_rate": 0.00019285338759557065, + "loss": 1.6431, + "step": 1558 + }, + { + "epoch": 0.4873397936855267, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019284426302691073, + "loss": 1.6648, + "step": 1559 + }, + { + "epoch": 0.4876523913723038, + "grad_norm": 0.21484375, + "learning_rate": 0.00019283513285312437, + "loss": 1.5061, + "step": 1560 + }, + { + "epoch": 0.487964989059081, + "grad_norm": 0.310546875, + "learning_rate": 0.0001928259970747627, + "loss": 2.72, + "step": 1561 + }, + { + "epoch": 0.4882775867458581, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019281685569237734, + "loss": 1.6893, + "step": 1562 + }, + { + "epoch": 0.4885901844326352, + "grad_norm": 0.216796875, + "learning_rate": 0.0001928077087065201, + "loss": 1.6951, + "step": 1563 + }, + { + "epoch": 0.4889027821194123, + "grad_norm": 0.220703125, + "learning_rate": 0.0001927985561177432, + "loss": 1.7366, + "step": 1564 + }, + { + "epoch": 0.48921537980618945, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019278939792659924, + "loss": 1.7637, + "step": 1565 + }, + { + "epoch": 0.48952797749296656, + "grad_norm": 0.212890625, + "learning_rate": 0.00019278023413364106, + "loss": 1.5522, + "step": 1566 + }, + { + "epoch": 0.48984057517974366, + "grad_norm": 0.2373046875, + "learning_rate": 0.00019277106473942194, + "loss": 1.8184, + "step": 1567 + }, + { + "epoch": 0.49015317286652077, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019276188974449543, + "loss": 1.5573, + "step": 1568 + }, + { + "epoch": 0.49046577055329793, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019275270914941538, + "loss": 1.5074, + "step": 1569 + }, + { + "epoch": 0.49077836824007504, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019274352295473612, + "loss": 1.9685, + "step": 1570 + }, + { + "epoch": 0.49109096592685214, + "grad_norm": 0.23046875, + "learning_rate": 0.00019273433116101217, + "loss": 1.8918, + "step": 1571 + }, + { + "epoch": 0.49140356361362925, + "grad_norm": 0.240234375, + "learning_rate": 0.00019272513376879854, + "loss": 1.8173, + "step": 1572 + }, + { + "epoch": 0.49171616130040635, + "grad_norm": 0.220703125, + "learning_rate": 0.00019271593077865035, + "loss": 1.7093, + "step": 1573 + }, + { + "epoch": 0.4920287589871835, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019270672219112332, + "loss": 1.7993, + "step": 1574 + }, + { + "epoch": 0.4923413566739606, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019269750800677331, + "loss": 1.7468, + "step": 1575 + }, + { + "epoch": 0.4926539543607377, + "grad_norm": 0.208984375, + "learning_rate": 0.00019268828822615661, + "loss": 1.4455, + "step": 1576 + }, + { + "epoch": 0.4929665520475148, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019267906284982985, + "loss": 1.9409, + "step": 1577 + }, + { + "epoch": 0.493279149734292, + "grad_norm": 0.220703125, + "learning_rate": 0.00019266983187834995, + "loss": 1.8848, + "step": 1578 + }, + { + "epoch": 0.4935917474210691, + "grad_norm": 0.220703125, + "learning_rate": 0.0001926605953122742, + "loss": 1.5927, + "step": 1579 + }, + { + "epoch": 0.4939043451078462, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019265135315216028, + "loss": 1.7506, + "step": 1580 + }, + { + "epoch": 0.4942169427946233, + "grad_norm": 0.22265625, + "learning_rate": 0.00019264210539856607, + "loss": 1.7024, + "step": 1581 + }, + { + "epoch": 0.49452954048140046, + "grad_norm": 0.228515625, + "learning_rate": 0.0001926328520520499, + "loss": 1.8899, + "step": 1582 + }, + { + "epoch": 0.49484213816817757, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001926235931131704, + "loss": 1.7209, + "step": 1583 + }, + { + "epoch": 0.4951547358549547, + "grad_norm": 0.22265625, + "learning_rate": 0.00019261432858248657, + "loss": 1.582, + "step": 1584 + }, + { + "epoch": 0.4954673335417318, + "grad_norm": 0.216796875, + "learning_rate": 0.0001926050584605577, + "loss": 1.7583, + "step": 1585 + }, + { + "epoch": 0.4957799312285089, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019259578274794344, + "loss": 1.7366, + "step": 1586 + }, + { + "epoch": 0.49609252891528605, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001925865014452038, + "loss": 1.7721, + "step": 1587 + }, + { + "epoch": 0.49640512660206315, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019257721455289906, + "loss": 1.9818, + "step": 1588 + }, + { + "epoch": 0.49671772428884026, + "grad_norm": 0.220703125, + "learning_rate": 0.00019256792207158991, + "loss": 1.719, + "step": 1589 + }, + { + "epoch": 0.49703032197561736, + "grad_norm": 0.220703125, + "learning_rate": 0.00019255862400183733, + "loss": 1.7085, + "step": 1590 + }, + { + "epoch": 0.4973429196623945, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019254932034420266, + "loss": 1.5593, + "step": 1591 + }, + { + "epoch": 0.4976555173491716, + "grad_norm": 0.220703125, + "learning_rate": 0.00019254001109924763, + "loss": 1.6743, + "step": 1592 + }, + { + "epoch": 0.49796811503594873, + "grad_norm": 0.220703125, + "learning_rate": 0.0001925306962675342, + "loss": 1.5977, + "step": 1593 + }, + { + "epoch": 0.49828071272272584, + "grad_norm": 0.216796875, + "learning_rate": 0.00019252137584962472, + "loss": 1.6007, + "step": 1594 + }, + { + "epoch": 0.49859331040950294, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019251204984608184, + "loss": 1.5078, + "step": 1595 + }, + { + "epoch": 0.4989059080962801, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019250271825746866, + "loss": 1.9624, + "step": 1596 + }, + { + "epoch": 0.4992185057830572, + "grad_norm": 0.21484375, + "learning_rate": 0.0001924933810843485, + "loss": 1.6749, + "step": 1597 + }, + { + "epoch": 0.4995311034698343, + "grad_norm": 0.23046875, + "learning_rate": 0.00019248403832728504, + "loss": 1.7965, + "step": 1598 + }, + { + "epoch": 0.4998437011566114, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019247468998684233, + "loss": 1.7333, + "step": 1599 + }, + { + "epoch": 0.5001562988433885, + "grad_norm": 0.21484375, + "learning_rate": 0.00019246533606358476, + "loss": 1.9014, + "step": 1600 + }, + { + "epoch": 0.5001562988433885, + "eval_loss": 1.6468836069107056, + "eval_runtime": 1904.4552, + "eval_samples_per_second": 4.798, + "eval_steps_per_second": 2.399, + "step": 1600 + }, + { + "epoch": 0.5004688965301657, + "grad_norm": 0.2060546875, + "learning_rate": 0.000192455976558077, + "loss": 1.8399, + "step": 1601 + }, + { + "epoch": 0.5007814942169428, + "grad_norm": 0.22265625, + "learning_rate": 0.00019244661147088413, + "loss": 1.7516, + "step": 1602 + }, + { + "epoch": 0.5010940919037199, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019243724080257154, + "loss": 1.6023, + "step": 1603 + }, + { + "epoch": 0.5014066895904971, + "grad_norm": 0.240234375, + "learning_rate": 0.0001924278645537049, + "loss": 1.8678, + "step": 1604 + }, + { + "epoch": 0.5017192872772741, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001924184827248503, + "loss": 1.8877, + "step": 1605 + }, + { + "epoch": 0.5020318849640513, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019240909531657415, + "loss": 1.7109, + "step": 1606 + }, + { + "epoch": 0.5023444826508284, + "grad_norm": 0.21484375, + "learning_rate": 0.00019239970232944314, + "loss": 1.9394, + "step": 1607 + }, + { + "epoch": 0.5026570803376055, + "grad_norm": 0.2265625, + "learning_rate": 0.00019239030376402437, + "loss": 1.6907, + "step": 1608 + }, + { + "epoch": 0.5029696780243826, + "grad_norm": 0.21875, + "learning_rate": 0.00019238089962088522, + "loss": 1.3726, + "step": 1609 + }, + { + "epoch": 0.5032822757111597, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019237148990059342, + "loss": 1.4186, + "step": 1610 + }, + { + "epoch": 0.5035948733979368, + "grad_norm": 0.232421875, + "learning_rate": 0.00019236207460371707, + "loss": 1.8961, + "step": 1611 + }, + { + "epoch": 0.503907471084714, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001923526537308246, + "loss": 1.5122, + "step": 1612 + }, + { + "epoch": 0.5042200687714911, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019234322728248473, + "loss": 1.6718, + "step": 1613 + }, + { + "epoch": 0.5045326664582682, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019233379525926652, + "loss": 1.5157, + "step": 1614 + }, + { + "epoch": 0.5048452641450454, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019232435766173946, + "loss": 1.8013, + "step": 1615 + }, + { + "epoch": 0.5051578618318224, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019231491449047327, + "loss": 1.6126, + "step": 1616 + }, + { + "epoch": 0.5054704595185996, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019230546574603805, + "loss": 1.9199, + "step": 1617 + }, + { + "epoch": 0.5057830572053766, + "grad_norm": 0.216796875, + "learning_rate": 0.00019229601142900426, + "loss": 1.8629, + "step": 1618 + }, + { + "epoch": 0.5060956548921538, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001922865515399426, + "loss": 1.9572, + "step": 1619 + }, + { + "epoch": 0.506408252578931, + "grad_norm": 0.212890625, + "learning_rate": 0.0001922770860794243, + "loss": 1.8666, + "step": 1620 + }, + { + "epoch": 0.506720850265708, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019226761504802066, + "loss": 1.6269, + "step": 1621 + }, + { + "epoch": 0.5070334479524852, + "grad_norm": 0.212890625, + "learning_rate": 0.00019225813844630355, + "loss": 1.4542, + "step": 1622 + }, + { + "epoch": 0.5073460456392622, + "grad_norm": 0.232421875, + "learning_rate": 0.00019224865627484502, + "loss": 1.726, + "step": 1623 + }, + { + "epoch": 0.5076586433260394, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019223916853421756, + "loss": 1.9227, + "step": 1624 + }, + { + "epoch": 0.5079712410128165, + "grad_norm": 0.232421875, + "learning_rate": 0.000192229675224994, + "loss": 1.7876, + "step": 1625 + }, + { + "epoch": 0.5082838386995936, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001922201763477474, + "loss": 1.9213, + "step": 1626 + }, + { + "epoch": 0.5085964363863708, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019221067190305121, + "loss": 1.8536, + "step": 1627 + }, + { + "epoch": 0.5089090340731478, + "grad_norm": 0.236328125, + "learning_rate": 0.00019220116189147928, + "loss": 1.7391, + "step": 1628 + }, + { + "epoch": 0.509221631759925, + "grad_norm": 0.22265625, + "learning_rate": 0.00019219164631360572, + "loss": 1.5871, + "step": 1629 + }, + { + "epoch": 0.5095342294467021, + "grad_norm": 0.236328125, + "learning_rate": 0.00019218212517000497, + "loss": 1.7358, + "step": 1630 + }, + { + "epoch": 0.5098468271334792, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019217259846125186, + "loss": 1.7538, + "step": 1631 + }, + { + "epoch": 0.5101594248202563, + "grad_norm": 0.228515625, + "learning_rate": 0.00019216306618792151, + "loss": 2.0148, + "step": 1632 + }, + { + "epoch": 0.5104720225070335, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019215352835058944, + "loss": 1.655, + "step": 1633 + }, + { + "epoch": 0.5107846201938105, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001921439849498314, + "loss": 1.8552, + "step": 1634 + }, + { + "epoch": 0.5110972178805877, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001921344359862236, + "loss": 2.0283, + "step": 1635 + }, + { + "epoch": 0.5114098155673648, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019212488146034247, + "loss": 1.8859, + "step": 1636 + }, + { + "epoch": 0.5117224132541419, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019211532137276485, + "loss": 1.7173, + "step": 1637 + }, + { + "epoch": 0.5120350109409191, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001921057557240679, + "loss": 1.6262, + "step": 1638 + }, + { + "epoch": 0.5123476086276961, + "grad_norm": 0.234375, + "learning_rate": 0.00019209618451482911, + "loss": 1.6141, + "step": 1639 + }, + { + "epoch": 0.5126602063144733, + "grad_norm": 0.224609375, + "learning_rate": 0.0001920866077456263, + "loss": 1.7475, + "step": 1640 + }, + { + "epoch": 0.5129728040012503, + "grad_norm": 0.23828125, + "learning_rate": 0.0001920770254170376, + "loss": 1.7333, + "step": 1641 + }, + { + "epoch": 0.5132854016880275, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001920674375296416, + "loss": 1.8058, + "step": 1642 + }, + { + "epoch": 0.5135979993748047, + "grad_norm": 0.216796875, + "learning_rate": 0.00019205784408401705, + "loss": 1.5659, + "step": 1643 + }, + { + "epoch": 0.5139105970615817, + "grad_norm": 0.2421875, + "learning_rate": 0.00019204824508074314, + "loss": 1.6922, + "step": 1644 + }, + { + "epoch": 0.5142231947483589, + "grad_norm": 0.20703125, + "learning_rate": 0.00019203864052039937, + "loss": 1.5329, + "step": 1645 + }, + { + "epoch": 0.514535792435136, + "grad_norm": 0.2265625, + "learning_rate": 0.00019202903040356557, + "loss": 1.5799, + "step": 1646 + }, + { + "epoch": 0.5148483901219131, + "grad_norm": 0.216796875, + "learning_rate": 0.00019201941473082196, + "loss": 1.7131, + "step": 1647 + }, + { + "epoch": 0.5151609878086902, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019200979350274898, + "loss": 1.668, + "step": 1648 + }, + { + "epoch": 0.5154735854954673, + "grad_norm": 0.22265625, + "learning_rate": 0.00019200016671992755, + "loss": 1.8212, + "step": 1649 + }, + { + "epoch": 0.5157861831822445, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019199053438293884, + "loss": 1.745, + "step": 1650 + }, + { + "epoch": 0.5160987808690216, + "grad_norm": 0.224609375, + "learning_rate": 0.0001919808964923643, + "loss": 1.9392, + "step": 1651 + }, + { + "epoch": 0.5164113785557987, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019197125304878587, + "loss": 1.8001, + "step": 1652 + }, + { + "epoch": 0.5167239762425758, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019196160405278567, + "loss": 1.6449, + "step": 1653 + }, + { + "epoch": 0.5170365739293529, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019195194950494623, + "loss": 1.7974, + "step": 1654 + }, + { + "epoch": 0.51734917161613, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019194228940585043, + "loss": 1.6213, + "step": 1655 + }, + { + "epoch": 0.5176617693029072, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001919326237560815, + "loss": 1.7459, + "step": 1656 + }, + { + "epoch": 0.5179743669896842, + "grad_norm": 0.23046875, + "learning_rate": 0.00019192295255622286, + "loss": 2.0187, + "step": 1657 + }, + { + "epoch": 0.5182869646764614, + "grad_norm": 0.22265625, + "learning_rate": 0.00019191327580685846, + "loss": 1.465, + "step": 1658 + }, + { + "epoch": 0.5185995623632386, + "grad_norm": 0.21875, + "learning_rate": 0.0001919035935085725, + "loss": 1.7626, + "step": 1659 + }, + { + "epoch": 0.5189121600500156, + "grad_norm": 0.228515625, + "learning_rate": 0.00019189390566194943, + "loss": 1.6333, + "step": 1660 + }, + { + "epoch": 0.5192247577367928, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019188421226757423, + "loss": 1.6854, + "step": 1661 + }, + { + "epoch": 0.5195373554235698, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019187451332603202, + "loss": 1.5598, + "step": 1662 + }, + { + "epoch": 0.519849953110347, + "grad_norm": 0.224609375, + "learning_rate": 0.00019186480883790836, + "loss": 1.7953, + "step": 1663 + }, + { + "epoch": 0.5201625507971241, + "grad_norm": 0.22265625, + "learning_rate": 0.00019185509880378912, + "loss": 1.7901, + "step": 1664 + }, + { + "epoch": 0.5204751484839012, + "grad_norm": 0.23046875, + "learning_rate": 0.00019184538322426054, + "loss": 1.6819, + "step": 1665 + }, + { + "epoch": 0.5207877461706784, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019183566209990911, + "loss": 1.8034, + "step": 1666 + }, + { + "epoch": 0.5211003438574554, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019182593543132174, + "loss": 2.0384, + "step": 1667 + }, + { + "epoch": 0.5214129415442326, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019181620321908564, + "loss": 1.9369, + "step": 1668 + }, + { + "epoch": 0.5217255392310097, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019180646546378832, + "loss": 1.8764, + "step": 1669 + }, + { + "epoch": 0.5220381369177868, + "grad_norm": 0.220703125, + "learning_rate": 0.00019179672216601773, + "loss": 1.6419, + "step": 1670 + }, + { + "epoch": 0.5223507346045639, + "grad_norm": 0.408203125, + "learning_rate": 0.00019178697332636202, + "loss": 2.427, + "step": 1671 + }, + { + "epoch": 0.5226633322913411, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019177721894540975, + "loss": 1.81, + "step": 1672 + }, + { + "epoch": 0.5229759299781181, + "grad_norm": 0.216796875, + "learning_rate": 0.0001917674590237499, + "loss": 1.67, + "step": 1673 + }, + { + "epoch": 0.5232885276648953, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019175769356197153, + "loss": 1.6198, + "step": 1674 + }, + { + "epoch": 0.5236011253516724, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001917479225606643, + "loss": 1.8033, + "step": 1675 + }, + { + "epoch": 0.5239137230384495, + "grad_norm": 0.220703125, + "learning_rate": 0.00019173814602041803, + "loss": 1.6005, + "step": 1676 + }, + { + "epoch": 0.5242263207252267, + "grad_norm": 0.22265625, + "learning_rate": 0.00019172836394182303, + "loss": 1.6983, + "step": 1677 + }, + { + "epoch": 0.5245389184120037, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019171857632546978, + "loss": 1.8186, + "step": 1678 + }, + { + "epoch": 0.5248515160987809, + "grad_norm": 0.220703125, + "learning_rate": 0.00019170878317194924, + "loss": 1.6052, + "step": 1679 + }, + { + "epoch": 0.5251641137855579, + "grad_norm": 0.23828125, + "learning_rate": 0.00019169898448185256, + "loss": 1.7156, + "step": 1680 + }, + { + "epoch": 0.5254767114723351, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019168918025577134, + "loss": 1.7039, + "step": 1681 + }, + { + "epoch": 0.5257893091591123, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019167937049429745, + "loss": 1.8326, + "step": 1682 + }, + { + "epoch": 0.5261019068458893, + "grad_norm": 0.228515625, + "learning_rate": 0.00019166955519802316, + "loss": 1.6872, + "step": 1683 + }, + { + "epoch": 0.5264145045326665, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019165973436754098, + "loss": 1.6172, + "step": 1684 + }, + { + "epoch": 0.5267271022194435, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019164990800344387, + "loss": 1.7482, + "step": 1685 + }, + { + "epoch": 0.5270396999062207, + "grad_norm": 0.224609375, + "learning_rate": 0.000191640076106325, + "loss": 1.6177, + "step": 1686 + }, + { + "epoch": 0.5273522975929978, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019163023867677797, + "loss": 1.6793, + "step": 1687 + }, + { + "epoch": 0.5276648952797749, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019162039571539666, + "loss": 1.6634, + "step": 1688 + }, + { + "epoch": 0.527977492966552, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001916105472227753, + "loss": 1.7808, + "step": 1689 + }, + { + "epoch": 0.5282900906533292, + "grad_norm": 0.228515625, + "learning_rate": 0.00019160069319950845, + "loss": 1.7203, + "step": 1690 + }, + { + "epoch": 0.5286026883401063, + "grad_norm": 0.23046875, + "learning_rate": 0.00019159083364619103, + "loss": 1.6893, + "step": 1691 + }, + { + "epoch": 0.5289152860268834, + "grad_norm": 0.349609375, + "learning_rate": 0.0001915809685634183, + "loss": 2.3232, + "step": 1692 + }, + { + "epoch": 0.5292278837136605, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001915710979517858, + "loss": 1.554, + "step": 1693 + }, + { + "epoch": 0.5295404814004376, + "grad_norm": 0.234375, + "learning_rate": 0.0001915612218118894, + "loss": 1.6621, + "step": 1694 + }, + { + "epoch": 0.5298530790872148, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019155134014432534, + "loss": 1.8881, + "step": 1695 + }, + { + "epoch": 0.5301656767739918, + "grad_norm": 0.22265625, + "learning_rate": 0.00019154145294969022, + "loss": 1.8313, + "step": 1696 + }, + { + "epoch": 0.530478274460769, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019153156022858094, + "loss": 1.7908, + "step": 1697 + }, + { + "epoch": 0.5307908721475461, + "grad_norm": 0.224609375, + "learning_rate": 0.00019152166198159476, + "loss": 1.6425, + "step": 1698 + }, + { + "epoch": 0.5311034698343232, + "grad_norm": 0.21484375, + "learning_rate": 0.00019151175820932917, + "loss": 1.7114, + "step": 1699 + }, + { + "epoch": 0.5314160675211004, + "grad_norm": 0.2109375, + "learning_rate": 0.00019150184891238216, + "loss": 1.5121, + "step": 1700 + }, + { + "epoch": 0.5317286652078774, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019149193409135192, + "loss": 1.7762, + "step": 1701 + }, + { + "epoch": 0.5320412628946546, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019148201374683704, + "loss": 1.8021, + "step": 1702 + }, + { + "epoch": 0.5323538605814317, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019147208787943638, + "loss": 1.8559, + "step": 1703 + }, + { + "epoch": 0.5326664582682088, + "grad_norm": 0.33984375, + "learning_rate": 0.00019146215648974924, + "loss": 2.3382, + "step": 1704 + }, + { + "epoch": 0.532979055954986, + "grad_norm": 0.23046875, + "learning_rate": 0.00019145221957837515, + "loss": 1.6269, + "step": 1705 + }, + { + "epoch": 0.533291653641763, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019144227714591402, + "loss": 1.8329, + "step": 1706 + }, + { + "epoch": 0.5336042513285402, + "grad_norm": 0.23046875, + "learning_rate": 0.0001914323291929661, + "loss": 1.7395, + "step": 1707 + }, + { + "epoch": 0.5339168490153173, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019142237572013197, + "loss": 1.4983, + "step": 1708 + }, + { + "epoch": 0.5342294467020944, + "grad_norm": 0.220703125, + "learning_rate": 0.00019141241672801247, + "loss": 1.7625, + "step": 1709 + }, + { + "epoch": 0.5345420443888715, + "grad_norm": 0.23046875, + "learning_rate": 0.0001914024522172089, + "loss": 1.8429, + "step": 1710 + }, + { + "epoch": 0.5348546420756486, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019139248218832285, + "loss": 1.9247, + "step": 1711 + }, + { + "epoch": 0.5351672397624258, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019138250664195615, + "loss": 1.6563, + "step": 1712 + }, + { + "epoch": 0.5354798374492029, + "grad_norm": 0.216796875, + "learning_rate": 0.0001913725255787111, + "loss": 1.5108, + "step": 1713 + }, + { + "epoch": 0.53579243513598, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019136253899919024, + "loss": 1.8109, + "step": 1714 + }, + { + "epoch": 0.5361050328227571, + "grad_norm": 0.21875, + "learning_rate": 0.00019135254690399648, + "loss": 1.7063, + "step": 1715 + }, + { + "epoch": 0.5364176305095343, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019134254929373303, + "loss": 1.7218, + "step": 1716 + }, + { + "epoch": 0.5367302281963113, + "grad_norm": 0.232421875, + "learning_rate": 0.00019133254616900347, + "loss": 1.6555, + "step": 1717 + }, + { + "epoch": 0.5370428258830885, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019132253753041174, + "loss": 1.9246, + "step": 1718 + }, + { + "epoch": 0.5373554235698655, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019131252337856205, + "loss": 1.818, + "step": 1719 + }, + { + "epoch": 0.5376680212566427, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019130250371405895, + "loss": 1.6691, + "step": 1720 + }, + { + "epoch": 0.5379806189434199, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019129247853750733, + "loss": 1.6272, + "step": 1721 + }, + { + "epoch": 0.5382932166301969, + "grad_norm": 0.2109375, + "learning_rate": 0.0001912824478495125, + "loss": 1.529, + "step": 1722 + }, + { + "epoch": 0.5386058143169741, + "grad_norm": 0.224609375, + "learning_rate": 0.00019127241165067994, + "loss": 1.8957, + "step": 1723 + }, + { + "epoch": 0.5389184120037511, + "grad_norm": 0.216796875, + "learning_rate": 0.00019126236994161558, + "loss": 1.6643, + "step": 1724 + }, + { + "epoch": 0.5392310096905283, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019125232272292563, + "loss": 1.8746, + "step": 1725 + }, + { + "epoch": 0.5395436073773054, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019124226999521672, + "loss": 1.5691, + "step": 1726 + }, + { + "epoch": 0.5398562050640825, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019123221175909567, + "loss": 1.7902, + "step": 1727 + }, + { + "epoch": 0.5401688027508597, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019122214801516973, + "loss": 1.6767, + "step": 1728 + }, + { + "epoch": 0.5404814004376368, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019121207876404648, + "loss": 1.727, + "step": 1729 + }, + { + "epoch": 0.5407939981244139, + "grad_norm": 0.228515625, + "learning_rate": 0.0001912020040063338, + "loss": 1.6355, + "step": 1730 + }, + { + "epoch": 0.541106595811191, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019119192374263992, + "loss": 1.9062, + "step": 1731 + }, + { + "epoch": 0.5414191934979681, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019118183797357338, + "loss": 1.5986, + "step": 1732 + }, + { + "epoch": 0.5417317911847452, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019117174669974312, + "loss": 1.5961, + "step": 1733 + }, + { + "epoch": 0.5420443888715224, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019116164992175828, + "loss": 1.8585, + "step": 1734 + }, + { + "epoch": 0.5423569865582994, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019115154764022852, + "loss": 1.731, + "step": 1735 + }, + { + "epoch": 0.5426695842450766, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019114143985576366, + "loss": 1.9891, + "step": 1736 + }, + { + "epoch": 0.5429821819318537, + "grad_norm": 0.23046875, + "learning_rate": 0.0001911313265689739, + "loss": 1.6551, + "step": 1737 + }, + { + "epoch": 0.5432947796186308, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019112120778046987, + "loss": 2.0219, + "step": 1738 + }, + { + "epoch": 0.543607377305408, + "grad_norm": 0.21875, + "learning_rate": 0.0001911110834908624, + "loss": 1.7808, + "step": 1739 + }, + { + "epoch": 0.543919974992185, + "grad_norm": 0.23046875, + "learning_rate": 0.0001911009537007627, + "loss": 1.7043, + "step": 1740 + }, + { + "epoch": 0.5442325726789622, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019109081841078233, + "loss": 1.7296, + "step": 1741 + }, + { + "epoch": 0.5445451703657392, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001910806776215332, + "loss": 1.6465, + "step": 1742 + }, + { + "epoch": 0.5448577680525164, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019107053133362749, + "loss": 1.8411, + "step": 1743 + }, + { + "epoch": 0.5451703657392936, + "grad_norm": 0.220703125, + "learning_rate": 0.00019106037954767774, + "loss": 1.4522, + "step": 1744 + }, + { + "epoch": 0.5454829634260706, + "grad_norm": 0.212890625, + "learning_rate": 0.00019105022226429682, + "loss": 1.7463, + "step": 1745 + }, + { + "epoch": 0.5457955611128478, + "grad_norm": 0.22265625, + "learning_rate": 0.00019104005948409797, + "loss": 1.622, + "step": 1746 + }, + { + "epoch": 0.5461081587996249, + "grad_norm": 0.234375, + "learning_rate": 0.00019102989120769475, + "loss": 1.8334, + "step": 1747 + }, + { + "epoch": 0.546420756486402, + "grad_norm": 0.236328125, + "learning_rate": 0.00019101971743570094, + "loss": 1.6375, + "step": 1748 + }, + { + "epoch": 0.5467333541731791, + "grad_norm": 0.224609375, + "learning_rate": 0.00019100953816873084, + "loss": 1.4945, + "step": 1749 + }, + { + "epoch": 0.5470459518599562, + "grad_norm": 0.21875, + "learning_rate": 0.00019099935340739893, + "loss": 1.687, + "step": 1750 + }, + { + "epoch": 0.5473585495467334, + "grad_norm": 0.251953125, + "learning_rate": 0.0001909891631523201, + "loss": 1.8769, + "step": 1751 + }, + { + "epoch": 0.5476711472335105, + "grad_norm": 0.228515625, + "learning_rate": 0.00019097896740410955, + "loss": 1.814, + "step": 1752 + }, + { + "epoch": 0.5479837449202876, + "grad_norm": 0.224609375, + "learning_rate": 0.00019096876616338278, + "loss": 1.8215, + "step": 1753 + }, + { + "epoch": 0.5482963426070647, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019095855943075568, + "loss": 1.6682, + "step": 1754 + }, + { + "epoch": 0.5486089402938418, + "grad_norm": 0.234375, + "learning_rate": 0.00019094834720684447, + "loss": 1.8052, + "step": 1755 + }, + { + "epoch": 0.5489215379806189, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001909381294922656, + "loss": 1.7685, + "step": 1756 + }, + { + "epoch": 0.5492341356673961, + "grad_norm": 0.2392578125, + "learning_rate": 0.000190927906287636, + "loss": 1.6704, + "step": 1757 + }, + { + "epoch": 0.5495467333541731, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001909176775935728, + "loss": 1.75, + "step": 1758 + }, + { + "epoch": 0.5498593310409503, + "grad_norm": 0.240234375, + "learning_rate": 0.00019090744341069356, + "loss": 1.5139, + "step": 1759 + }, + { + "epoch": 0.5501719287277275, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019089720373961612, + "loss": 1.5844, + "step": 1760 + }, + { + "epoch": 0.5504845264145045, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019088695858095864, + "loss": 1.7899, + "step": 1761 + }, + { + "epoch": 0.5507971241012817, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019087670793533967, + "loss": 1.7717, + "step": 1762 + }, + { + "epoch": 0.5511097217880587, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019086645180337803, + "loss": 1.7754, + "step": 1763 + }, + { + "epoch": 0.5514223194748359, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001908561901856929, + "loss": 1.8412, + "step": 1764 + }, + { + "epoch": 0.551734917161613, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001908459230829038, + "loss": 1.7254, + "step": 1765 + }, + { + "epoch": 0.5520475148483901, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019083565049563057, + "loss": 1.8097, + "step": 1766 + }, + { + "epoch": 0.5523601125351673, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019082537242449333, + "loss": 1.8441, + "step": 1767 + }, + { + "epoch": 0.5526727102219443, + "grad_norm": 0.328125, + "learning_rate": 0.00019081508887011263, + "loss": 2.4757, + "step": 1768 + }, + { + "epoch": 0.5529853079087215, + "grad_norm": 0.21875, + "learning_rate": 0.0001908047998331093, + "loss": 1.5833, + "step": 1769 + }, + { + "epoch": 0.5532979055954986, + "grad_norm": 0.359375, + "learning_rate": 0.0001907945053141045, + "loss": 2.4293, + "step": 1770 + }, + { + "epoch": 0.5536105032822757, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001907842053137197, + "loss": 1.9397, + "step": 1771 + }, + { + "epoch": 0.5539231009690528, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001907738998325767, + "loss": 2.0662, + "step": 1772 + }, + { + "epoch": 0.55423569865583, + "grad_norm": 0.228515625, + "learning_rate": 0.00019076358887129774, + "loss": 1.8447, + "step": 1773 + }, + { + "epoch": 0.554548296342607, + "grad_norm": 0.32421875, + "learning_rate": 0.00019075327243050526, + "loss": 2.3451, + "step": 1774 + }, + { + "epoch": 0.5548608940293842, + "grad_norm": 0.228515625, + "learning_rate": 0.00019074295051082205, + "loss": 1.623, + "step": 1775 + }, + { + "epoch": 0.5551734917161613, + "grad_norm": 0.240234375, + "learning_rate": 0.0001907326231128713, + "loss": 2.0579, + "step": 1776 + }, + { + "epoch": 0.5554860894029384, + "grad_norm": 0.216796875, + "learning_rate": 0.00019072229023727645, + "loss": 1.6111, + "step": 1777 + }, + { + "epoch": 0.5557986870897156, + "grad_norm": 0.224609375, + "learning_rate": 0.00019071195188466135, + "loss": 1.87, + "step": 1778 + }, + { + "epoch": 0.5561112847764926, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019070160805565012, + "loss": 1.6437, + "step": 1779 + }, + { + "epoch": 0.5564238824632698, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019069125875086722, + "loss": 1.6752, + "step": 1780 + }, + { + "epoch": 0.5567364801500468, + "grad_norm": 0.236328125, + "learning_rate": 0.00019068090397093745, + "loss": 1.7323, + "step": 1781 + }, + { + "epoch": 0.557049077836824, + "grad_norm": 0.228515625, + "learning_rate": 0.000190670543716486, + "loss": 1.7324, + "step": 1782 + }, + { + "epoch": 0.5573616755236012, + "grad_norm": 0.22265625, + "learning_rate": 0.00019066017798813825, + "loss": 1.5224, + "step": 1783 + }, + { + "epoch": 0.5576742732103782, + "grad_norm": 0.326171875, + "learning_rate": 0.00019064980678652, + "loss": 2.3167, + "step": 1784 + }, + { + "epoch": 0.5579868708971554, + "grad_norm": 0.212890625, + "learning_rate": 0.00019063943011225743, + "loss": 1.7731, + "step": 1785 + }, + { + "epoch": 0.5582994685839325, + "grad_norm": 0.23828125, + "learning_rate": 0.00019062904796597697, + "loss": 1.6789, + "step": 1786 + }, + { + "epoch": 0.5586120662707096, + "grad_norm": 0.224609375, + "learning_rate": 0.00019061866034830534, + "loss": 1.7119, + "step": 1787 + }, + { + "epoch": 0.5589246639574867, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019060826725986977, + "loss": 1.6962, + "step": 1788 + }, + { + "epoch": 0.5592372616442638, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019059786870129761, + "loss": 1.6318, + "step": 1789 + }, + { + "epoch": 0.559549859331041, + "grad_norm": 0.21875, + "learning_rate": 0.0001905874646732167, + "loss": 1.8541, + "step": 1790 + }, + { + "epoch": 0.5598624570178181, + "grad_norm": 0.2265625, + "learning_rate": 0.00019057705517625505, + "loss": 1.8081, + "step": 1791 + }, + { + "epoch": 0.5601750547045952, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001905666402110412, + "loss": 1.4779, + "step": 1792 + }, + { + "epoch": 0.5604876523913723, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019055621977820387, + "loss": 1.6657, + "step": 1793 + }, + { + "epoch": 0.5608002500781494, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019054579387837214, + "loss": 1.5665, + "step": 1794 + }, + { + "epoch": 0.5611128477649265, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019053536251217545, + "loss": 1.5586, + "step": 1795 + }, + { + "epoch": 0.5614254454517037, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019052492568024355, + "loss": 1.5323, + "step": 1796 + }, + { + "epoch": 0.5617380431384807, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019051448338320656, + "loss": 1.7868, + "step": 1797 + }, + { + "epoch": 0.5620506408252579, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019050403562169486, + "loss": 1.6351, + "step": 1798 + }, + { + "epoch": 0.562363238512035, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019049358239633916, + "loss": 2.0889, + "step": 1799 + }, + { + "epoch": 0.5626758361988121, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019048312370777062, + "loss": 1.5398, + "step": 1800 + }, + { + "epoch": 0.5629884338855893, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019047265955662054, + "loss": 1.6967, + "step": 1801 + }, + { + "epoch": 0.5633010315723663, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019046218994352076, + "loss": 1.6917, + "step": 1802 + }, + { + "epoch": 0.5636136292591435, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001904517148691033, + "loss": 1.4587, + "step": 1803 + }, + { + "epoch": 0.5639262269459207, + "grad_norm": 0.23046875, + "learning_rate": 0.00019044123433400052, + "loss": 1.8214, + "step": 1804 + }, + { + "epoch": 0.5642388246326977, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001904307483388452, + "loss": 1.6375, + "step": 1805 + }, + { + "epoch": 0.5645514223194749, + "grad_norm": 0.2265625, + "learning_rate": 0.00019042025688427035, + "loss": 1.5963, + "step": 1806 + }, + { + "epoch": 0.5648640200062519, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019040975997090936, + "loss": 1.8623, + "step": 1807 + }, + { + "epoch": 0.5651766176930291, + "grad_norm": 0.220703125, + "learning_rate": 0.00019039925759939597, + "loss": 1.6458, + "step": 1808 + }, + { + "epoch": 0.5654892153798062, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001903887497703642, + "loss": 1.6367, + "step": 1809 + }, + { + "epoch": 0.5658018130665833, + "grad_norm": 0.216796875, + "learning_rate": 0.00019037823648444842, + "loss": 1.6211, + "step": 1810 + }, + { + "epoch": 0.5661144107533604, + "grad_norm": 0.220703125, + "learning_rate": 0.0001903677177422833, + "loss": 1.5955, + "step": 1811 + }, + { + "epoch": 0.5664270084401375, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019035719354450393, + "loss": 1.6509, + "step": 1812 + }, + { + "epoch": 0.5667396061269147, + "grad_norm": 0.2373046875, + "learning_rate": 0.00019034666389174568, + "loss": 1.5193, + "step": 1813 + }, + { + "epoch": 0.5670522038136918, + "grad_norm": 0.23046875, + "learning_rate": 0.00019033612878464412, + "loss": 1.8779, + "step": 1814 + }, + { + "epoch": 0.5673648015004689, + "grad_norm": 0.232421875, + "learning_rate": 0.00019032558822383542, + "loss": 1.746, + "step": 1815 + }, + { + "epoch": 0.567677399187246, + "grad_norm": 0.21875, + "learning_rate": 0.0001903150422099558, + "loss": 1.6802, + "step": 1816 + }, + { + "epoch": 0.5679899968740232, + "grad_norm": 0.2373046875, + "learning_rate": 0.00019030449074364204, + "loss": 1.8168, + "step": 1817 + }, + { + "epoch": 0.5683025945608002, + "grad_norm": 0.23046875, + "learning_rate": 0.00019029393382553108, + "loss": 1.6261, + "step": 1818 + }, + { + "epoch": 0.5686151922475774, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019028337145626028, + "loss": 1.6126, + "step": 1819 + }, + { + "epoch": 0.5689277899343544, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019027280363646728, + "loss": 1.7607, + "step": 1820 + }, + { + "epoch": 0.5692403876211316, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001902622303667901, + "loss": 1.6267, + "step": 1821 + }, + { + "epoch": 0.5695529853079088, + "grad_norm": 0.22265625, + "learning_rate": 0.00019025165164786705, + "loss": 1.7209, + "step": 1822 + }, + { + "epoch": 0.5698655829946858, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019024106748033679, + "loss": 1.4932, + "step": 1823 + }, + { + "epoch": 0.570178180681463, + "grad_norm": 0.236328125, + "learning_rate": 0.00019023047786483828, + "loss": 1.4764, + "step": 1824 + }, + { + "epoch": 0.57049077836824, + "grad_norm": 0.228515625, + "learning_rate": 0.00019021988280201084, + "loss": 1.6664, + "step": 1825 + }, + { + "epoch": 0.5708033760550172, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001902092822924941, + "loss": 1.5628, + "step": 1826 + }, + { + "epoch": 0.5711159737417943, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019019867633692802, + "loss": 1.8942, + "step": 1827 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019018806493595293, + "loss": 1.5664, + "step": 1828 + }, + { + "epoch": 0.5717411691153486, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019017744809020942, + "loss": 1.4663, + "step": 1829 + }, + { + "epoch": 0.5720537668021257, + "grad_norm": 0.22265625, + "learning_rate": 0.00019016682580033848, + "loss": 1.8574, + "step": 1830 + }, + { + "epoch": 0.5723663644889028, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019015619806698135, + "loss": 1.7824, + "step": 1831 + }, + { + "epoch": 0.5726789621756799, + "grad_norm": 0.224609375, + "learning_rate": 0.00019014556489077965, + "loss": 1.5226, + "step": 1832 + }, + { + "epoch": 0.572991559862457, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019013492627237532, + "loss": 1.8333, + "step": 1833 + }, + { + "epoch": 0.5733041575492341, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019012428221241065, + "loss": 1.5824, + "step": 1834 + }, + { + "epoch": 0.5736167552360113, + "grad_norm": 0.232421875, + "learning_rate": 0.00019011363271152822, + "loss": 1.7483, + "step": 1835 + }, + { + "epoch": 0.5739293529227883, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019010297777037093, + "loss": 1.6215, + "step": 1836 + }, + { + "epoch": 0.5742419506095655, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019009231738958206, + "loss": 1.6124, + "step": 1837 + }, + { + "epoch": 0.5745545482963426, + "grad_norm": 0.234375, + "learning_rate": 0.00019008165156980517, + "loss": 1.8104, + "step": 1838 + }, + { + "epoch": 0.5748671459831197, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001900709803116842, + "loss": 1.7839, + "step": 1839 + }, + { + "epoch": 0.5751797436698969, + "grad_norm": 0.216796875, + "learning_rate": 0.0001900603036158634, + "loss": 1.6926, + "step": 1840 + }, + { + "epoch": 0.5754923413566739, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019004962148298725, + "loss": 1.8372, + "step": 1841 + }, + { + "epoch": 0.5758049390434511, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001900389339137007, + "loss": 1.5496, + "step": 1842 + }, + { + "epoch": 0.5761175367302281, + "grad_norm": 0.2275390625, + "learning_rate": 0.000190028240908649, + "loss": 1.7024, + "step": 1843 + }, + { + "epoch": 0.5764301344170053, + "grad_norm": 0.236328125, + "learning_rate": 0.00019001754246847767, + "loss": 1.6237, + "step": 1844 + }, + { + "epoch": 0.5767427321037825, + "grad_norm": 0.23046875, + "learning_rate": 0.00019000683859383258, + "loss": 1.6012, + "step": 1845 + }, + { + "epoch": 0.5770553297905595, + "grad_norm": 0.2119140625, + "learning_rate": 0.00018999612928535995, + "loss": 1.7586, + "step": 1846 + }, + { + "epoch": 0.5773679274773367, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018998541454370632, + "loss": 1.4823, + "step": 1847 + }, + { + "epoch": 0.5776805251641138, + "grad_norm": 0.2265625, + "learning_rate": 0.00018997469436951854, + "loss": 1.5688, + "step": 1848 + }, + { + "epoch": 0.5779931228508909, + "grad_norm": 0.318359375, + "learning_rate": 0.0001899639687634438, + "loss": 2.5108, + "step": 1849 + }, + { + "epoch": 0.578305720537668, + "grad_norm": 0.2490234375, + "learning_rate": 0.00018995323772612964, + "loss": 1.6868, + "step": 1850 + }, + { + "epoch": 0.5786183182244451, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018994250125822386, + "loss": 1.6238, + "step": 1851 + }, + { + "epoch": 0.5789309159112223, + "grad_norm": 0.220703125, + "learning_rate": 0.0001899317593603747, + "loss": 1.5826, + "step": 1852 + }, + { + "epoch": 0.5792435135979994, + "grad_norm": 0.2265625, + "learning_rate": 0.0001899210120332306, + "loss": 1.6792, + "step": 1853 + }, + { + "epoch": 0.5795561112847765, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018991025927744042, + "loss": 1.8574, + "step": 1854 + }, + { + "epoch": 0.5798687089715536, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001898995010936533, + "loss": 1.7686, + "step": 1855 + }, + { + "epoch": 0.5801813066583307, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018988873748251877, + "loss": 1.7198, + "step": 1856 + }, + { + "epoch": 0.5804939043451078, + "grad_norm": 0.2177734375, + "learning_rate": 0.00018987796844468658, + "loss": 1.7134, + "step": 1857 + }, + { + "epoch": 0.580806502031885, + "grad_norm": 0.212890625, + "learning_rate": 0.00018986719398080695, + "loss": 1.5788, + "step": 1858 + }, + { + "epoch": 0.581119099718662, + "grad_norm": 0.2265625, + "learning_rate": 0.00018985641409153026, + "loss": 1.6557, + "step": 1859 + }, + { + "epoch": 0.5814316974054392, + "grad_norm": 0.23046875, + "learning_rate": 0.00018984562877750737, + "loss": 1.719, + "step": 1860 + }, + { + "epoch": 0.5817442950922164, + "grad_norm": 0.2265625, + "learning_rate": 0.00018983483803938932, + "loss": 1.7116, + "step": 1861 + }, + { + "epoch": 0.5820568927789934, + "grad_norm": 0.236328125, + "learning_rate": 0.0001898240418778277, + "loss": 1.9006, + "step": 1862 + }, + { + "epoch": 0.5823694904657706, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018981324029347416, + "loss": 1.3191, + "step": 1863 + }, + { + "epoch": 0.5826820881525476, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018980243328698088, + "loss": 1.7602, + "step": 1864 + }, + { + "epoch": 0.5829946858393248, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018979162085900025, + "loss": 2.0473, + "step": 1865 + }, + { + "epoch": 0.583307283526102, + "grad_norm": 0.234375, + "learning_rate": 0.00018978080301018503, + "loss": 1.7591, + "step": 1866 + }, + { + "epoch": 0.583619881212879, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018976997974118836, + "loss": 1.9532, + "step": 1867 + }, + { + "epoch": 0.5839324788996562, + "grad_norm": 0.234375, + "learning_rate": 0.0001897591510526636, + "loss": 1.8456, + "step": 1868 + }, + { + "epoch": 0.5842450765864332, + "grad_norm": 0.23046875, + "learning_rate": 0.00018974831694526452, + "loss": 1.7148, + "step": 1869 + }, + { + "epoch": 0.5845576742732104, + "grad_norm": 0.2158203125, + "learning_rate": 0.00018973747741964515, + "loss": 1.6221, + "step": 1870 + }, + { + "epoch": 0.5848702719599875, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018972663247645994, + "loss": 2.0677, + "step": 1871 + }, + { + "epoch": 0.5851828696467646, + "grad_norm": 0.2421875, + "learning_rate": 0.00018971578211636359, + "loss": 1.4428, + "step": 1872 + }, + { + "epoch": 0.5854954673335417, + "grad_norm": 0.224609375, + "learning_rate": 0.00018970492634001114, + "loss": 1.6225, + "step": 1873 + }, + { + "epoch": 0.5858080650203189, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018969406514805797, + "loss": 1.5286, + "step": 1874 + }, + { + "epoch": 0.586120662707096, + "grad_norm": 0.2421875, + "learning_rate": 0.00018968319854115978, + "loss": 1.7499, + "step": 1875 + }, + { + "epoch": 0.5864332603938731, + "grad_norm": 0.2197265625, + "learning_rate": 0.00018967232651997265, + "loss": 1.9038, + "step": 1876 + }, + { + "epoch": 0.5867458580806502, + "grad_norm": 0.21484375, + "learning_rate": 0.00018966144908515284, + "loss": 1.5464, + "step": 1877 + }, + { + "epoch": 0.5870584557674273, + "grad_norm": 0.23046875, + "learning_rate": 0.00018965056623735713, + "loss": 1.6405, + "step": 1878 + }, + { + "epoch": 0.5873710534542045, + "grad_norm": 0.2099609375, + "learning_rate": 0.00018963967797724248, + "loss": 1.727, + "step": 1879 + }, + { + "epoch": 0.5876836511409815, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018962878430546626, + "loss": 1.7438, + "step": 1880 + }, + { + "epoch": 0.5879962488277587, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001896178852226861, + "loss": 1.6973, + "step": 1881 + }, + { + "epoch": 0.5883088465145357, + "grad_norm": 0.228515625, + "learning_rate": 0.00018960698072956, + "loss": 1.7813, + "step": 1882 + }, + { + "epoch": 0.5886214442013129, + "grad_norm": 0.224609375, + "learning_rate": 0.00018959607082674632, + "loss": 1.8691, + "step": 1883 + }, + { + "epoch": 0.5889340418880901, + "grad_norm": 0.2265625, + "learning_rate": 0.00018958515551490364, + "loss": 1.8186, + "step": 1884 + }, + { + "epoch": 0.5892466395748671, + "grad_norm": 0.224609375, + "learning_rate": 0.00018957423479469096, + "loss": 1.6628, + "step": 1885 + }, + { + "epoch": 0.5895592372616443, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001895633086667676, + "loss": 1.8004, + "step": 1886 + }, + { + "epoch": 0.5898718349484214, + "grad_norm": 0.24609375, + "learning_rate": 0.00018955237713179314, + "loss": 1.781, + "step": 1887 + }, + { + "epoch": 0.5901844326351985, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018954144019042759, + "loss": 1.7539, + "step": 1888 + }, + { + "epoch": 0.5904970303219756, + "grad_norm": 0.23046875, + "learning_rate": 0.00018953049784333116, + "loss": 1.6668, + "step": 1889 + }, + { + "epoch": 0.5908096280087527, + "grad_norm": 0.228515625, + "learning_rate": 0.00018951955009116449, + "loss": 1.954, + "step": 1890 + }, + { + "epoch": 0.5911222256955299, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001895085969345885, + "loss": 1.8232, + "step": 1891 + }, + { + "epoch": 0.591434823382307, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018949763837426445, + "loss": 1.5966, + "step": 1892 + }, + { + "epoch": 0.5917474210690841, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018948667441085398, + "loss": 1.5623, + "step": 1893 + }, + { + "epoch": 0.5920600187558612, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018947570504501888, + "loss": 1.689, + "step": 1894 + }, + { + "epoch": 0.5923726164426383, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018946473027742146, + "loss": 1.6939, + "step": 1895 + }, + { + "epoch": 0.5926852141294154, + "grad_norm": 0.228515625, + "learning_rate": 0.00018945375010872426, + "loss": 1.7252, + "step": 1896 + }, + { + "epoch": 0.5929978118161926, + "grad_norm": 0.220703125, + "learning_rate": 0.0001894427645395902, + "loss": 1.7894, + "step": 1897 + }, + { + "epoch": 0.5933104095029696, + "grad_norm": 0.234375, + "learning_rate": 0.00018943177357068244, + "loss": 1.8643, + "step": 1898 + }, + { + "epoch": 0.5936230071897468, + "grad_norm": 0.2158203125, + "learning_rate": 0.00018942077720266454, + "loss": 1.6017, + "step": 1899 + }, + { + "epoch": 0.5939356048765239, + "grad_norm": 0.22265625, + "learning_rate": 0.0001894097754362004, + "loss": 1.514, + "step": 1900 + }, + { + "epoch": 0.594248202563301, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018939876827195418, + "loss": 1.8716, + "step": 1901 + }, + { + "epoch": 0.5945608002500782, + "grad_norm": 0.232421875, + "learning_rate": 0.00018938775571059039, + "loss": 1.8103, + "step": 1902 + }, + { + "epoch": 0.5948733979368552, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018937673775277388, + "loss": 1.5777, + "step": 1903 + }, + { + "epoch": 0.5951859956236324, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001893657143991698, + "loss": 1.6428, + "step": 1904 + }, + { + "epoch": 0.5954985933104096, + "grad_norm": 0.224609375, + "learning_rate": 0.00018935468565044368, + "loss": 2.0165, + "step": 1905 + }, + { + "epoch": 0.5958111909971866, + "grad_norm": 0.22265625, + "learning_rate": 0.00018934365150726133, + "loss": 1.5724, + "step": 1906 + }, + { + "epoch": 0.5961237886839638, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018933261197028885, + "loss": 1.9301, + "step": 1907 + }, + { + "epoch": 0.5964363863707408, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001893215670401928, + "loss": 1.6571, + "step": 1908 + }, + { + "epoch": 0.596748984057518, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018931051671763988, + "loss": 1.7479, + "step": 1909 + }, + { + "epoch": 0.5970615817442951, + "grad_norm": 0.228515625, + "learning_rate": 0.00018929946100329725, + "loss": 1.6891, + "step": 1910 + }, + { + "epoch": 0.5973741794310722, + "grad_norm": 0.23046875, + "learning_rate": 0.0001892883998978324, + "loss": 1.646, + "step": 1911 + }, + { + "epoch": 0.5976867771178493, + "grad_norm": 0.2197265625, + "learning_rate": 0.00018927733340191308, + "loss": 1.6963, + "step": 1912 + }, + { + "epoch": 0.5979993748046264, + "grad_norm": 0.2265625, + "learning_rate": 0.00018926626151620732, + "loss": 1.9789, + "step": 1913 + }, + { + "epoch": 0.5983119724914036, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018925518424138361, + "loss": 1.9244, + "step": 1914 + }, + { + "epoch": 0.5986245701781807, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018924410157811073, + "loss": 1.5019, + "step": 1915 + }, + { + "epoch": 0.5989371678649578, + "grad_norm": 0.2265625, + "learning_rate": 0.0001892330135270577, + "loss": 1.7337, + "step": 1916 + }, + { + "epoch": 0.5992497655517349, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001892219200888939, + "loss": 1.6027, + "step": 1917 + }, + { + "epoch": 0.5995623632385121, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018921082126428912, + "loss": 1.6431, + "step": 1918 + }, + { + "epoch": 0.5998749609252891, + "grad_norm": 0.22265625, + "learning_rate": 0.00018919971705391335, + "loss": 1.822, + "step": 1919 + }, + { + "epoch": 0.6001875586120663, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018918860745843703, + "loss": 1.656, + "step": 1920 + }, + { + "epoch": 0.6005001562988433, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018917749247853078, + "loss": 1.6685, + "step": 1921 + }, + { + "epoch": 0.6008127539856205, + "grad_norm": 0.21875, + "learning_rate": 0.0001891663721148657, + "loss": 1.8229, + "step": 1922 + }, + { + "epoch": 0.6011253516723977, + "grad_norm": 0.228515625, + "learning_rate": 0.0001891552463681131, + "loss": 1.7224, + "step": 1923 + }, + { + "epoch": 0.6014379493591747, + "grad_norm": 0.23046875, + "learning_rate": 0.00018914411523894467, + "loss": 1.9986, + "step": 1924 + }, + { + "epoch": 0.6017505470459519, + "grad_norm": 0.22265625, + "learning_rate": 0.0001891329787280324, + "loss": 1.4848, + "step": 1925 + }, + { + "epoch": 0.6020631447327289, + "grad_norm": 0.224609375, + "learning_rate": 0.00018912183683604864, + "loss": 1.7737, + "step": 1926 + }, + { + "epoch": 0.6023757424195061, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018911068956366597, + "loss": 1.7155, + "step": 1927 + }, + { + "epoch": 0.6026883401062832, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018909953691155745, + "loss": 1.7669, + "step": 1928 + }, + { + "epoch": 0.6030009377930603, + "grad_norm": 0.232421875, + "learning_rate": 0.00018908837888039637, + "loss": 1.8628, + "step": 1929 + }, + { + "epoch": 0.6033135354798375, + "grad_norm": 0.23046875, + "learning_rate": 0.0001890772154708563, + "loss": 1.7606, + "step": 1930 + }, + { + "epoch": 0.6036261331666146, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001890660466836112, + "loss": 1.5453, + "step": 1931 + }, + { + "epoch": 0.6039387308533917, + "grad_norm": 0.236328125, + "learning_rate": 0.00018905487251933542, + "loss": 1.7034, + "step": 1932 + }, + { + "epoch": 0.6042513285401688, + "grad_norm": 0.23046875, + "learning_rate": 0.00018904369297870349, + "loss": 1.6582, + "step": 1933 + }, + { + "epoch": 0.6045639262269459, + "grad_norm": 0.2421875, + "learning_rate": 0.0001890325080623903, + "loss": 1.5893, + "step": 1934 + }, + { + "epoch": 0.604876523913723, + "grad_norm": 0.220703125, + "learning_rate": 0.00018902131777107117, + "loss": 1.602, + "step": 1935 + }, + { + "epoch": 0.6051891216005002, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018901012210542165, + "loss": 1.636, + "step": 1936 + }, + { + "epoch": 0.6055017192872773, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018899892106611762, + "loss": 1.8495, + "step": 1937 + }, + { + "epoch": 0.6058143169740544, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018898771465383532, + "loss": 1.7294, + "step": 1938 + }, + { + "epoch": 0.6061269146608315, + "grad_norm": 0.23046875, + "learning_rate": 0.0001889765028692513, + "loss": 1.6063, + "step": 1939 + }, + { + "epoch": 0.6064395123476086, + "grad_norm": 0.23046875, + "learning_rate": 0.0001889652857130424, + "loss": 1.5972, + "step": 1940 + }, + { + "epoch": 0.6067521100343858, + "grad_norm": 0.224609375, + "learning_rate": 0.00018895406318588585, + "loss": 1.9705, + "step": 1941 + }, + { + "epoch": 0.6070647077211628, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018894283528845914, + "loss": 1.9463, + "step": 1942 + }, + { + "epoch": 0.60737730540794, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018893160202144012, + "loss": 1.7365, + "step": 1943 + }, + { + "epoch": 0.6076899030947172, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018892036338550696, + "loss": 1.6313, + "step": 1944 + }, + { + "epoch": 0.6080025007814942, + "grad_norm": 0.228515625, + "learning_rate": 0.00018890911938133814, + "loss": 1.7297, + "step": 1945 + }, + { + "epoch": 0.6083150984682714, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001888978700096125, + "loss": 1.5932, + "step": 1946 + }, + { + "epoch": 0.6086276961550484, + "grad_norm": 0.255859375, + "learning_rate": 0.00018888661527100914, + "loss": 1.7416, + "step": 1947 + }, + { + "epoch": 0.6089402938418256, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001888753551662076, + "loss": 1.5615, + "step": 1948 + }, + { + "epoch": 0.6092528915286027, + "grad_norm": 0.21875, + "learning_rate": 0.00018886408969588756, + "loss": 1.9525, + "step": 1949 + }, + { + "epoch": 0.6095654892153798, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001888528188607292, + "loss": 1.4709, + "step": 1950 + }, + { + "epoch": 0.609878086902157, + "grad_norm": 0.2265625, + "learning_rate": 0.00018884154266141296, + "loss": 1.6341, + "step": 1951 + }, + { + "epoch": 0.610190684588934, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018883026109861955, + "loss": 1.6915, + "step": 1952 + }, + { + "epoch": 0.6105032822757112, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001888189741730301, + "loss": 1.7387, + "step": 1953 + }, + { + "epoch": 0.6108158799624883, + "grad_norm": 0.23828125, + "learning_rate": 0.000188807681885326, + "loss": 1.4454, + "step": 1954 + }, + { + "epoch": 0.6111284776492654, + "grad_norm": 0.22265625, + "learning_rate": 0.00018879638423618893, + "loss": 1.644, + "step": 1955 + }, + { + "epoch": 0.6114410753360425, + "grad_norm": 0.2265625, + "learning_rate": 0.00018878508122630106, + "loss": 1.6955, + "step": 1956 + }, + { + "epoch": 0.6117536730228196, + "grad_norm": 0.228515625, + "learning_rate": 0.00018877377285634464, + "loss": 1.5826, + "step": 1957 + }, + { + "epoch": 0.6120662707095967, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018876245912700243, + "loss": 1.7957, + "step": 1958 + }, + { + "epoch": 0.6123788683963739, + "grad_norm": 0.23046875, + "learning_rate": 0.00018875114003895748, + "loss": 1.5181, + "step": 1959 + }, + { + "epoch": 0.612691466083151, + "grad_norm": 0.23046875, + "learning_rate": 0.00018873981559289308, + "loss": 1.7115, + "step": 1960 + }, + { + "epoch": 0.6130040637699281, + "grad_norm": 0.236328125, + "learning_rate": 0.00018872848578949296, + "loss": 1.9347, + "step": 1961 + }, + { + "epoch": 0.6133166614567053, + "grad_norm": 0.23046875, + "learning_rate": 0.00018871715062944108, + "loss": 1.7506, + "step": 1962 + }, + { + "epoch": 0.6136292591434823, + "grad_norm": 0.29296875, + "learning_rate": 0.00018870581011342174, + "loss": 2.3271, + "step": 1963 + }, + { + "epoch": 0.6139418568302595, + "grad_norm": 0.228515625, + "learning_rate": 0.00018869446424211962, + "loss": 2.0109, + "step": 1964 + }, + { + "epoch": 0.6142544545170365, + "grad_norm": 0.23046875, + "learning_rate": 0.00018868311301621968, + "loss": 1.5306, + "step": 1965 + }, + { + "epoch": 0.6145670522038137, + "grad_norm": 0.224609375, + "learning_rate": 0.00018867175643640717, + "loss": 1.7745, + "step": 1966 + }, + { + "epoch": 0.6148796498905909, + "grad_norm": 0.23046875, + "learning_rate": 0.00018866039450336777, + "loss": 1.7684, + "step": 1967 + }, + { + "epoch": 0.6151922475773679, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018864902721778734, + "loss": 1.738, + "step": 1968 + }, + { + "epoch": 0.6155048452641451, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018863765458035218, + "loss": 1.6707, + "step": 1969 + }, + { + "epoch": 0.6158174429509221, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018862627659174886, + "loss": 1.5577, + "step": 1970 + }, + { + "epoch": 0.6161300406376993, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018861489325266425, + "loss": 1.6428, + "step": 1971 + }, + { + "epoch": 0.6164426383244764, + "grad_norm": 0.2421875, + "learning_rate": 0.00018860350456378566, + "loss": 1.5885, + "step": 1972 + }, + { + "epoch": 0.6167552360112535, + "grad_norm": 0.21875, + "learning_rate": 0.00018859211052580057, + "loss": 1.3899, + "step": 1973 + }, + { + "epoch": 0.6170678336980306, + "grad_norm": 0.23046875, + "learning_rate": 0.0001885807111393969, + "loss": 1.8002, + "step": 1974 + }, + { + "epoch": 0.6173804313848078, + "grad_norm": 0.2265625, + "learning_rate": 0.0001885693064052628, + "loss": 1.7554, + "step": 1975 + }, + { + "epoch": 0.6176930290715849, + "grad_norm": 0.22265625, + "learning_rate": 0.0001885578963240868, + "loss": 1.5717, + "step": 1976 + }, + { + "epoch": 0.618005626758362, + "grad_norm": 0.228515625, + "learning_rate": 0.00018854648089655776, + "loss": 1.6693, + "step": 1977 + }, + { + "epoch": 0.6183182244451391, + "grad_norm": 0.2265625, + "learning_rate": 0.00018853506012336482, + "loss": 1.8787, + "step": 1978 + }, + { + "epoch": 0.6186308221319162, + "grad_norm": 0.220703125, + "learning_rate": 0.00018852363400519745, + "loss": 1.6435, + "step": 1979 + }, + { + "epoch": 0.6189434198186934, + "grad_norm": 0.224609375, + "learning_rate": 0.00018851220254274554, + "loss": 1.7522, + "step": 1980 + }, + { + "epoch": 0.6192560175054704, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018850076573669915, + "loss": 1.5828, + "step": 1981 + }, + { + "epoch": 0.6195686151922476, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001884893235877488, + "loss": 1.457, + "step": 1982 + }, + { + "epoch": 0.6198812128790246, + "grad_norm": 0.22265625, + "learning_rate": 0.00018847787609658516, + "loss": 1.5991, + "step": 1983 + }, + { + "epoch": 0.6201938105658018, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001884664232638994, + "loss": 1.598, + "step": 1984 + }, + { + "epoch": 0.620506408252579, + "grad_norm": 0.228515625, + "learning_rate": 0.00018845496509038294, + "loss": 1.6774, + "step": 1985 + }, + { + "epoch": 0.620819005939356, + "grad_norm": 0.220703125, + "learning_rate": 0.00018844350157672755, + "loss": 1.7232, + "step": 1986 + }, + { + "epoch": 0.6211316036261332, + "grad_norm": 0.228515625, + "learning_rate": 0.00018843203272362523, + "loss": 1.7184, + "step": 1987 + }, + { + "epoch": 0.6214442013129103, + "grad_norm": 0.22265625, + "learning_rate": 0.00018842055853176838, + "loss": 1.6561, + "step": 1988 + }, + { + "epoch": 0.6217567989996874, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001884090790018498, + "loss": 1.5792, + "step": 1989 + }, + { + "epoch": 0.6220693966864645, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001883975941345624, + "loss": 1.9449, + "step": 1990 + }, + { + "epoch": 0.6223819943732416, + "grad_norm": 0.251953125, + "learning_rate": 0.00018838610393059964, + "loss": 2.1031, + "step": 1991 + }, + { + "epoch": 0.6226945920600188, + "grad_norm": 0.228515625, + "learning_rate": 0.00018837460839065515, + "loss": 1.9063, + "step": 1992 + }, + { + "epoch": 0.6230071897467959, + "grad_norm": 0.25390625, + "learning_rate": 0.0001883631075154229, + "loss": 2.1289, + "step": 1993 + }, + { + "epoch": 0.623319787433573, + "grad_norm": 0.23828125, + "learning_rate": 0.0001883516013055973, + "loss": 2.0025, + "step": 1994 + }, + { + "epoch": 0.6236323851203501, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001883400897618729, + "loss": 1.8512, + "step": 1995 + }, + { + "epoch": 0.6239449828071272, + "grad_norm": 0.236328125, + "learning_rate": 0.0001883285728849447, + "loss": 1.8326, + "step": 1996 + }, + { + "epoch": 0.6242575804939043, + "grad_norm": 0.224609375, + "learning_rate": 0.00018831705067550805, + "loss": 1.6852, + "step": 1997 + }, + { + "epoch": 0.6245701781806815, + "grad_norm": 0.2197265625, + "learning_rate": 0.00018830552313425845, + "loss": 1.8256, + "step": 1998 + }, + { + "epoch": 0.6248827758674586, + "grad_norm": 0.23046875, + "learning_rate": 0.0001882939902618919, + "loss": 1.6083, + "step": 1999 + }, + { + "epoch": 0.6251953735542357, + "grad_norm": 0.224609375, + "learning_rate": 0.00018828245205910465, + "loss": 1.7561, + "step": 2000 + }, + { + "epoch": 0.6255079712410128, + "grad_norm": 0.2421875, + "learning_rate": 0.0001882709085265933, + "loss": 1.7635, + "step": 2001 + }, + { + "epoch": 0.6258205689277899, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001882593596650547, + "loss": 1.8553, + "step": 2002 + }, + { + "epoch": 0.6261331666145671, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001882478054751861, + "loss": 1.6012, + "step": 2003 + }, + { + "epoch": 0.6264457643013441, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018823624595768498, + "loss": 1.8742, + "step": 2004 + }, + { + "epoch": 0.6267583619881213, + "grad_norm": 0.23828125, + "learning_rate": 0.0001882246811132493, + "loss": 1.2608, + "step": 2005 + }, + { + "epoch": 0.6270709596748985, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018821311094257716, + "loss": 1.5808, + "step": 2006 + }, + { + "epoch": 0.6273835573616755, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018820153544636713, + "loss": 1.6451, + "step": 2007 + }, + { + "epoch": 0.6276961550484527, + "grad_norm": 0.2392578125, + "learning_rate": 0.000188189954625318, + "loss": 1.6479, + "step": 2008 + }, + { + "epoch": 0.6280087527352297, + "grad_norm": 0.23046875, + "learning_rate": 0.0001881783684801289, + "loss": 1.6755, + "step": 2009 + }, + { + "epoch": 0.6283213504220069, + "grad_norm": 0.228515625, + "learning_rate": 0.00018816677701149939, + "loss": 1.6337, + "step": 2010 + }, + { + "epoch": 0.628633948108784, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018815518022012915, + "loss": 1.648, + "step": 2011 + }, + { + "epoch": 0.6289465457955611, + "grad_norm": 0.234375, + "learning_rate": 0.00018814357810671833, + "loss": 1.586, + "step": 2012 + }, + { + "epoch": 0.6292591434823382, + "grad_norm": 0.232421875, + "learning_rate": 0.0001881319706719674, + "loss": 1.5722, + "step": 2013 + }, + { + "epoch": 0.6295717411691153, + "grad_norm": 0.251953125, + "learning_rate": 0.0001881203579165771, + "loss": 1.946, + "step": 2014 + }, + { + "epoch": 0.6298843388558925, + "grad_norm": 0.228515625, + "learning_rate": 0.0001881087398412485, + "loss": 1.7869, + "step": 2015 + }, + { + "epoch": 0.6301969365426696, + "grad_norm": 0.21875, + "learning_rate": 0.000188097116446683, + "loss": 1.7194, + "step": 2016 + }, + { + "epoch": 0.6305095342294467, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001880854877335823, + "loss": 2.0099, + "step": 2017 + }, + { + "epoch": 0.6308221319162238, + "grad_norm": 0.228515625, + "learning_rate": 0.00018807385370264848, + "loss": 1.8415, + "step": 2018 + }, + { + "epoch": 0.631134729603001, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018806221435458388, + "loss": 1.6398, + "step": 2019 + }, + { + "epoch": 0.631447327289778, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018805056969009115, + "loss": 1.8436, + "step": 2020 + }, + { + "epoch": 0.6317599249765552, + "grad_norm": 0.22265625, + "learning_rate": 0.00018803891970987333, + "loss": 1.5016, + "step": 2021 + }, + { + "epoch": 0.6320725226633322, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018802726441463375, + "loss": 1.5147, + "step": 2022 + }, + { + "epoch": 0.6323851203501094, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018801560380507604, + "loss": 1.5146, + "step": 2023 + }, + { + "epoch": 0.6326977180368866, + "grad_norm": 0.224609375, + "learning_rate": 0.00018800393788190415, + "loss": 1.8504, + "step": 2024 + }, + { + "epoch": 0.6330103157236636, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018799226664582245, + "loss": 1.6024, + "step": 2025 + }, + { + "epoch": 0.6333229134104408, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018798059009753542, + "loss": 1.8456, + "step": 2026 + }, + { + "epoch": 0.6336355110972178, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018796890823774806, + "loss": 1.5829, + "step": 2027 + }, + { + "epoch": 0.633948108783995, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018795722106716562, + "loss": 1.8332, + "step": 2028 + }, + { + "epoch": 0.6342607064707722, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018794552858649366, + "loss": 1.8867, + "step": 2029 + }, + { + "epoch": 0.6345733041575492, + "grad_norm": 0.23828125, + "learning_rate": 0.00018793383079643804, + "loss": 1.7046, + "step": 2030 + }, + { + "epoch": 0.6348859018443264, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018792212769770507, + "loss": 1.4539, + "step": 2031 + }, + { + "epoch": 0.6351984995311035, + "grad_norm": 0.224609375, + "learning_rate": 0.00018791041929100115, + "loss": 1.7966, + "step": 2032 + }, + { + "epoch": 0.6355110972178806, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001878987055770332, + "loss": 1.7888, + "step": 2033 + }, + { + "epoch": 0.6358236949046577, + "grad_norm": 0.24609375, + "learning_rate": 0.0001878869865565084, + "loss": 1.5578, + "step": 2034 + }, + { + "epoch": 0.6361362925914348, + "grad_norm": 0.228515625, + "learning_rate": 0.0001878752622301342, + "loss": 1.7211, + "step": 2035 + }, + { + "epoch": 0.6364488902782119, + "grad_norm": 0.228515625, + "learning_rate": 0.00018786353259861847, + "loss": 1.5837, + "step": 2036 + }, + { + "epoch": 0.6367614879649891, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001878517976626693, + "loss": 1.6654, + "step": 2037 + }, + { + "epoch": 0.6370740856517662, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018784005742299514, + "loss": 1.9085, + "step": 2038 + }, + { + "epoch": 0.6373866833385433, + "grad_norm": 0.275390625, + "learning_rate": 0.0001878283118803048, + "loss": 1.6215, + "step": 2039 + }, + { + "epoch": 0.6376992810253204, + "grad_norm": 0.240234375, + "learning_rate": 0.00018781656103530737, + "loss": 1.9168, + "step": 2040 + }, + { + "epoch": 0.6380118787120975, + "grad_norm": 0.224609375, + "learning_rate": 0.0001878048048887122, + "loss": 1.8944, + "step": 2041 + }, + { + "epoch": 0.6383244763988747, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018779304344122908, + "loss": 1.7528, + "step": 2042 + }, + { + "epoch": 0.6386370740856517, + "grad_norm": 0.228515625, + "learning_rate": 0.00018778127669356805, + "loss": 1.8204, + "step": 2043 + }, + { + "epoch": 0.6389496717724289, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001877695046464395, + "loss": 1.7069, + "step": 2044 + }, + { + "epoch": 0.6392622694592061, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001877577273005541, + "loss": 1.3533, + "step": 2045 + }, + { + "epoch": 0.6395748671459831, + "grad_norm": 0.22265625, + "learning_rate": 0.00018774594465662288, + "loss": 1.6023, + "step": 2046 + }, + { + "epoch": 0.6398874648327603, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018773415671535714, + "loss": 1.9426, + "step": 2047 + }, + { + "epoch": 0.6402000625195373, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018772236347746856, + "loss": 1.7982, + "step": 2048 + }, + { + "epoch": 0.6405126602063145, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018771056494366913, + "loss": 1.7041, + "step": 2049 + }, + { + "epoch": 0.6408252578930916, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018769876111467113, + "loss": 1.7406, + "step": 2050 + }, + { + "epoch": 0.6411378555798687, + "grad_norm": 0.240234375, + "learning_rate": 0.00018768695199118717, + "loss": 1.6077, + "step": 2051 + }, + { + "epoch": 0.6414504532666458, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018767513757393016, + "loss": 1.7813, + "step": 2052 + }, + { + "epoch": 0.6417630509534229, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018766331786361338, + "loss": 1.6976, + "step": 2053 + }, + { + "epoch": 0.6420756486402001, + "grad_norm": 0.2421875, + "learning_rate": 0.00018765149286095037, + "loss": 1.6368, + "step": 2054 + }, + { + "epoch": 0.6423882463269772, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018763966256665505, + "loss": 1.6045, + "step": 2055 + }, + { + "epoch": 0.6427008440137543, + "grad_norm": 0.22265625, + "learning_rate": 0.00018762782698144163, + "loss": 1.5185, + "step": 2056 + }, + { + "epoch": 0.6430134417005314, + "grad_norm": 0.23828125, + "learning_rate": 0.00018761598610602463, + "loss": 1.5806, + "step": 2057 + }, + { + "epoch": 0.6433260393873085, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001876041399411189, + "loss": 1.6609, + "step": 2058 + }, + { + "epoch": 0.6436386370740856, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001875922884874396, + "loss": 1.6643, + "step": 2059 + }, + { + "epoch": 0.6439512347608628, + "grad_norm": 0.236328125, + "learning_rate": 0.00018758043174570222, + "loss": 1.5697, + "step": 2060 + }, + { + "epoch": 0.6442638324476399, + "grad_norm": 0.22265625, + "learning_rate": 0.00018756856971662258, + "loss": 1.6761, + "step": 2061 + }, + { + "epoch": 0.644576430134417, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018755670240091677, + "loss": 1.5763, + "step": 2062 + }, + { + "epoch": 0.6448890278211942, + "grad_norm": 0.240234375, + "learning_rate": 0.0001875448297993013, + "loss": 1.7233, + "step": 2063 + }, + { + "epoch": 0.6452016255079712, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018753295191249286, + "loss": 1.623, + "step": 2064 + }, + { + "epoch": 0.6455142231947484, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018752106874120862, + "loss": 1.5065, + "step": 2065 + }, + { + "epoch": 0.6458268208815254, + "grad_norm": 0.251953125, + "learning_rate": 0.0001875091802861659, + "loss": 2.0689, + "step": 2066 + }, + { + "epoch": 0.6461394185683026, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018749728654808242, + "loss": 1.7316, + "step": 2067 + }, + { + "epoch": 0.6464520162550798, + "grad_norm": 0.224609375, + "learning_rate": 0.0001874853875276763, + "loss": 1.7759, + "step": 2068 + }, + { + "epoch": 0.6467646139418568, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018747348322566582, + "loss": 1.6177, + "step": 2069 + }, + { + "epoch": 0.647077211628634, + "grad_norm": 0.244140625, + "learning_rate": 0.0001874615736427697, + "loss": 1.8813, + "step": 2070 + }, + { + "epoch": 0.647389809315411, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018744965877970696, + "loss": 1.6428, + "step": 2071 + }, + { + "epoch": 0.6477024070021882, + "grad_norm": 0.2890625, + "learning_rate": 0.00018743773863719683, + "loss": 2.3381, + "step": 2072 + }, + { + "epoch": 0.6480150046889653, + "grad_norm": 0.2265625, + "learning_rate": 0.00018742581321595902, + "loss": 1.4568, + "step": 2073 + }, + { + "epoch": 0.6483276023757424, + "grad_norm": 0.220703125, + "learning_rate": 0.00018741388251671345, + "loss": 1.5651, + "step": 2074 + }, + { + "epoch": 0.6486402000625195, + "grad_norm": 0.2421875, + "learning_rate": 0.0001874019465401804, + "loss": 1.8459, + "step": 2075 + }, + { + "epoch": 0.6489527977492967, + "grad_norm": 0.2265625, + "learning_rate": 0.00018739000528708046, + "loss": 1.6691, + "step": 2076 + }, + { + "epoch": 0.6492653954360738, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018737805875813454, + "loss": 1.8378, + "step": 2077 + }, + { + "epoch": 0.6495779931228509, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018736610695406386, + "loss": 1.8245, + "step": 2078 + }, + { + "epoch": 0.649890590809628, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018735414987559, + "loss": 1.7107, + "step": 2079 + }, + { + "epoch": 0.6502031884964051, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018734218752343478, + "loss": 1.7694, + "step": 2080 + }, + { + "epoch": 0.6505157861831823, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018733021989832035, + "loss": 1.7134, + "step": 2081 + }, + { + "epoch": 0.6508283838699593, + "grad_norm": 0.216796875, + "learning_rate": 0.00018731824700096933, + "loss": 1.8064, + "step": 2082 + }, + { + "epoch": 0.6511409815567365, + "grad_norm": 0.23828125, + "learning_rate": 0.00018730626883210443, + "loss": 1.694, + "step": 2083 + }, + { + "epoch": 0.6514535792435135, + "grad_norm": 0.2158203125, + "learning_rate": 0.00018729428539244884, + "loss": 1.7573, + "step": 2084 + }, + { + "epoch": 0.6517661769302907, + "grad_norm": 0.228515625, + "learning_rate": 0.00018728229668272598, + "loss": 1.6263, + "step": 2085 + }, + { + "epoch": 0.6520787746170679, + "grad_norm": 0.2158203125, + "learning_rate": 0.00018727030270365965, + "loss": 1.846, + "step": 2086 + }, + { + "epoch": 0.6523913723038449, + "grad_norm": 0.244140625, + "learning_rate": 0.00018725830345597396, + "loss": 1.7912, + "step": 2087 + }, + { + "epoch": 0.6527039699906221, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001872462989403933, + "loss": 1.777, + "step": 2088 + }, + { + "epoch": 0.6530165676773992, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018723428915764237, + "loss": 1.675, + "step": 2089 + }, + { + "epoch": 0.6533291653641763, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018722227410844625, + "loss": 1.5869, + "step": 2090 + }, + { + "epoch": 0.6536417630509535, + "grad_norm": 0.244140625, + "learning_rate": 0.00018721025379353026, + "loss": 1.8295, + "step": 2091 + }, + { + "epoch": 0.6539543607377305, + "grad_norm": 0.23046875, + "learning_rate": 0.00018719822821362017, + "loss": 1.6437, + "step": 2092 + }, + { + "epoch": 0.6542669584245077, + "grad_norm": 0.2421875, + "learning_rate": 0.0001871861973694419, + "loss": 1.8373, + "step": 2093 + }, + { + "epoch": 0.6545795561112848, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018717416126172177, + "loss": 1.3641, + "step": 2094 + }, + { + "epoch": 0.6548921537980619, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018716211989118646, + "loss": 1.7446, + "step": 2095 + }, + { + "epoch": 0.655204751484839, + "grad_norm": 0.234375, + "learning_rate": 0.00018715007325856292, + "loss": 1.7373, + "step": 2096 + }, + { + "epoch": 0.6555173491716161, + "grad_norm": 0.23828125, + "learning_rate": 0.00018713802136457837, + "loss": 1.6263, + "step": 2097 + }, + { + "epoch": 0.6558299468583932, + "grad_norm": 0.23046875, + "learning_rate": 0.00018712596420996045, + "loss": 1.7508, + "step": 2098 + }, + { + "epoch": 0.6561425445451704, + "grad_norm": 0.232421875, + "learning_rate": 0.00018711390179543703, + "loss": 1.8481, + "step": 2099 + }, + { + "epoch": 0.6564551422319475, + "grad_norm": 0.232421875, + "learning_rate": 0.00018710183412173635, + "loss": 1.7739, + "step": 2100 + }, + { + "epoch": 0.6567677399187246, + "grad_norm": 0.2265625, + "learning_rate": 0.00018708976118958693, + "loss": 1.989, + "step": 2101 + }, + { + "epoch": 0.6570803376055018, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001870776829997177, + "loss": 1.8054, + "step": 2102 + }, + { + "epoch": 0.6573929352922788, + "grad_norm": 0.2265625, + "learning_rate": 0.00018706559955285773, + "loss": 1.665, + "step": 2103 + }, + { + "epoch": 0.657705532979056, + "grad_norm": 0.22265625, + "learning_rate": 0.0001870535108497366, + "loss": 1.703, + "step": 2104 + }, + { + "epoch": 0.658018130665833, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001870414168910841, + "loss": 1.7818, + "step": 2105 + }, + { + "epoch": 0.6583307283526102, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018702931767763028, + "loss": 1.5893, + "step": 2106 + }, + { + "epoch": 0.6586433260393874, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001870172132101057, + "loss": 1.6743, + "step": 2107 + }, + { + "epoch": 0.6589559237261644, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018700510348924106, + "loss": 1.5062, + "step": 2108 + }, + { + "epoch": 0.6592685214129416, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018699298851576743, + "loss": 1.4517, + "step": 2109 + }, + { + "epoch": 0.6595811190997186, + "grad_norm": 0.23828125, + "learning_rate": 0.00018698086829041627, + "loss": 1.7555, + "step": 2110 + }, + { + "epoch": 0.6598937167864958, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001869687428139192, + "loss": 1.7701, + "step": 2111 + }, + { + "epoch": 0.6602063144732729, + "grad_norm": 0.228515625, + "learning_rate": 0.00018695661208700836, + "loss": 1.5693, + "step": 2112 + }, + { + "epoch": 0.66051891216005, + "grad_norm": 0.2265625, + "learning_rate": 0.000186944476110416, + "loss": 1.473, + "step": 2113 + }, + { + "epoch": 0.6608315098468271, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018693233488487483, + "loss": 1.4396, + "step": 2114 + }, + { + "epoch": 0.6611441075336042, + "grad_norm": 0.236328125, + "learning_rate": 0.00018692018841111782, + "loss": 1.9964, + "step": 2115 + }, + { + "epoch": 0.6614567052203814, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018690803668987827, + "loss": 1.6639, + "step": 2116 + }, + { + "epoch": 0.6617693029071585, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001868958797218898, + "loss": 1.7607, + "step": 2117 + }, + { + "epoch": 0.6620819005939356, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018688371750788635, + "loss": 1.5137, + "step": 2118 + }, + { + "epoch": 0.6623944982807127, + "grad_norm": 0.21875, + "learning_rate": 0.00018687155004860215, + "loss": 1.5756, + "step": 2119 + }, + { + "epoch": 0.6627070959674899, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018685937734477177, + "loss": 1.7926, + "step": 2120 + }, + { + "epoch": 0.6630196936542669, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001868471993971301, + "loss": 1.7269, + "step": 2121 + }, + { + "epoch": 0.6633322913410441, + "grad_norm": 0.232421875, + "learning_rate": 0.0001868350162064123, + "loss": 1.6515, + "step": 2122 + }, + { + "epoch": 0.6636448890278211, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018682282777335397, + "loss": 1.5462, + "step": 2123 + }, + { + "epoch": 0.6639574867145983, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018681063409869085, + "loss": 1.7719, + "step": 2124 + }, + { + "epoch": 0.6642700844013755, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018679843518315913, + "loss": 1.9495, + "step": 2125 + }, + { + "epoch": 0.6645826820881525, + "grad_norm": 0.232421875, + "learning_rate": 0.0001867862310274953, + "loss": 1.5323, + "step": 2126 + }, + { + "epoch": 0.6648952797749297, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018677402163243606, + "loss": 1.5997, + "step": 2127 + }, + { + "epoch": 0.6652078774617067, + "grad_norm": 0.2265625, + "learning_rate": 0.0001867618069987186, + "loss": 1.891, + "step": 2128 + }, + { + "epoch": 0.6655204751484839, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018674958712708027, + "loss": 1.7805, + "step": 2129 + }, + { + "epoch": 0.665833072835261, + "grad_norm": 0.228515625, + "learning_rate": 0.00018673736201825882, + "loss": 1.7896, + "step": 2130 + }, + { + "epoch": 0.6661456705220381, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001867251316729923, + "loss": 1.8483, + "step": 2131 + }, + { + "epoch": 0.6664582682088153, + "grad_norm": 0.234375, + "learning_rate": 0.00018671289609201907, + "loss": 1.8642, + "step": 2132 + }, + { + "epoch": 0.6667708658955924, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001867006552760778, + "loss": 1.4944, + "step": 2133 + }, + { + "epoch": 0.6670834635823695, + "grad_norm": 0.2265625, + "learning_rate": 0.00018668840922590746, + "loss": 1.4096, + "step": 2134 + }, + { + "epoch": 0.6673960612691466, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018667615794224743, + "loss": 1.8447, + "step": 2135 + }, + { + "epoch": 0.6677086589559237, + "grad_norm": 0.228515625, + "learning_rate": 0.00018666390142583724, + "loss": 1.7672, + "step": 2136 + }, + { + "epoch": 0.6680212566427008, + "grad_norm": 0.224609375, + "learning_rate": 0.00018665163967741694, + "loss": 1.4677, + "step": 2137 + }, + { + "epoch": 0.668333854329478, + "grad_norm": 0.248046875, + "learning_rate": 0.0001866393726977267, + "loss": 1.9113, + "step": 2138 + }, + { + "epoch": 0.668646452016255, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018662710048750712, + "loss": 1.6074, + "step": 2139 + }, + { + "epoch": 0.6689590497030322, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018661482304749915, + "loss": 1.9865, + "step": 2140 + }, + { + "epoch": 0.6692716473898093, + "grad_norm": 0.234375, + "learning_rate": 0.00018660254037844388, + "loss": 1.5433, + "step": 2141 + }, + { + "epoch": 0.6695842450765864, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018659025248108288, + "loss": 1.7213, + "step": 2142 + }, + { + "epoch": 0.6698968427633636, + "grad_norm": 0.220703125, + "learning_rate": 0.00018657795935615802, + "loss": 1.7668, + "step": 2143 + }, + { + "epoch": 0.6702094404501406, + "grad_norm": 0.240234375, + "learning_rate": 0.00018656566100441144, + "loss": 1.7344, + "step": 2144 + }, + { + "epoch": 0.6705220381369178, + "grad_norm": 0.21875, + "learning_rate": 0.00018655335742658556, + "loss": 1.6451, + "step": 2145 + }, + { + "epoch": 0.670834635823695, + "grad_norm": 0.224609375, + "learning_rate": 0.00018654104862342324, + "loss": 1.6888, + "step": 2146 + }, + { + "epoch": 0.671147233510472, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018652873459566749, + "loss": 1.426, + "step": 2147 + }, + { + "epoch": 0.6714598311972492, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018651641534406178, + "loss": 1.6177, + "step": 2148 + }, + { + "epoch": 0.6717724288840262, + "grad_norm": 0.240234375, + "learning_rate": 0.00018650409086934985, + "loss": 1.6962, + "step": 2149 + }, + { + "epoch": 0.6720850265708034, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001864917611722757, + "loss": 1.6879, + "step": 2150 + }, + { + "epoch": 0.6723976242575805, + "grad_norm": 0.22265625, + "learning_rate": 0.0001864794262535837, + "loss": 1.9992, + "step": 2151 + }, + { + "epoch": 0.6727102219443576, + "grad_norm": 0.23046875, + "learning_rate": 0.0001864670861140186, + "loss": 1.9401, + "step": 2152 + }, + { + "epoch": 0.6730228196311347, + "grad_norm": 0.2421875, + "learning_rate": 0.00018645474075432524, + "loss": 1.8057, + "step": 2153 + }, + { + "epoch": 0.6733354173179118, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018644239017524906, + "loss": 2.0631, + "step": 2154 + }, + { + "epoch": 0.673648015004689, + "grad_norm": 0.234375, + "learning_rate": 0.00018643003437753558, + "loss": 1.6794, + "step": 2155 + }, + { + "epoch": 0.6739606126914661, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018641767336193086, + "loss": 1.7738, + "step": 2156 + }, + { + "epoch": 0.6742732103782432, + "grad_norm": 0.240234375, + "learning_rate": 0.000186405307129181, + "loss": 1.8517, + "step": 2157 + }, + { + "epoch": 0.6745858080650203, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018639293568003268, + "loss": 1.5776, + "step": 2158 + }, + { + "epoch": 0.6748984057517975, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018638055901523277, + "loss": 1.6955, + "step": 2159 + }, + { + "epoch": 0.6752110034385745, + "grad_norm": 0.23828125, + "learning_rate": 0.00018636817713552837, + "loss": 1.6111, + "step": 2160 + }, + { + "epoch": 0.6755236011253517, + "grad_norm": 0.2421875, + "learning_rate": 0.00018635579004166712, + "loss": 1.8155, + "step": 2161 + }, + { + "epoch": 0.6758361988121288, + "grad_norm": 0.2177734375, + "learning_rate": 0.00018634339773439674, + "loss": 1.6656, + "step": 2162 + }, + { + "epoch": 0.6761487964989059, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001863310002144654, + "loss": 1.5922, + "step": 2163 + }, + { + "epoch": 0.6764613941856831, + "grad_norm": 0.220703125, + "learning_rate": 0.0001863185974826216, + "loss": 1.7238, + "step": 2164 + }, + { + "epoch": 0.6767739918724601, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018630618953961408, + "loss": 1.6582, + "step": 2165 + }, + { + "epoch": 0.6770865895592373, + "grad_norm": 0.283203125, + "learning_rate": 0.0001862937763861919, + "loss": 2.3931, + "step": 2166 + }, + { + "epoch": 0.6773991872460143, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018628135802310446, + "loss": 1.7434, + "step": 2167 + }, + { + "epoch": 0.6777117849327915, + "grad_norm": 0.251953125, + "learning_rate": 0.0001862689344511015, + "loss": 2.0366, + "step": 2168 + }, + { + "epoch": 0.6780243826195687, + "grad_norm": 0.232421875, + "learning_rate": 0.000186256505670933, + "loss": 1.6197, + "step": 2169 + }, + { + "epoch": 0.6783369803063457, + "grad_norm": 0.22265625, + "learning_rate": 0.0001862440716833494, + "loss": 1.5561, + "step": 2170 + }, + { + "epoch": 0.6786495779931229, + "grad_norm": 0.248046875, + "learning_rate": 0.00018623163248910127, + "loss": 1.8304, + "step": 2171 + }, + { + "epoch": 0.6789621756798999, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018621918808893958, + "loss": 1.3873, + "step": 2172 + }, + { + "epoch": 0.6792747733666771, + "grad_norm": 0.23046875, + "learning_rate": 0.00018620673848361566, + "loss": 1.4493, + "step": 2173 + }, + { + "epoch": 0.6795873710534542, + "grad_norm": 0.25390625, + "learning_rate": 0.00018619428367388103, + "loss": 1.7057, + "step": 2174 + }, + { + "epoch": 0.6798999687402313, + "grad_norm": 0.232421875, + "learning_rate": 0.0001861818236604877, + "loss": 1.5443, + "step": 2175 + }, + { + "epoch": 0.6802125664270084, + "grad_norm": 0.228515625, + "learning_rate": 0.00018616935844418785, + "loss": 1.651, + "step": 2176 + }, + { + "epoch": 0.6805251641137856, + "grad_norm": 0.2470703125, + "learning_rate": 0.000186156888025734, + "loss": 1.7987, + "step": 2177 + }, + { + "epoch": 0.6808377618005627, + "grad_norm": 0.234375, + "learning_rate": 0.00018614441240587907, + "loss": 1.8154, + "step": 2178 + }, + { + "epoch": 0.6811503594873398, + "grad_norm": 0.232421875, + "learning_rate": 0.0001861319315853762, + "loss": 1.7168, + "step": 2179 + }, + { + "epoch": 0.6814629571741169, + "grad_norm": 0.234375, + "learning_rate": 0.0001861194455649788, + "loss": 1.4816, + "step": 2180 + }, + { + "epoch": 0.681775554860894, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018610695434544074, + "loss": 1.5243, + "step": 2181 + }, + { + "epoch": 0.6820881525476712, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018609445792751618, + "loss": 1.7344, + "step": 2182 + }, + { + "epoch": 0.6824007502344482, + "grad_norm": 0.228515625, + "learning_rate": 0.00018608195631195939, + "loss": 1.8136, + "step": 2183 + }, + { + "epoch": 0.6827133479212254, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018606944949952524, + "loss": 1.7538, + "step": 2184 + }, + { + "epoch": 0.6830259456080024, + "grad_norm": 0.236328125, + "learning_rate": 0.00018605693749096876, + "loss": 1.8747, + "step": 2185 + }, + { + "epoch": 0.6833385432947796, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018604442028704533, + "loss": 1.6926, + "step": 2186 + }, + { + "epoch": 0.6836511409815568, + "grad_norm": 0.228515625, + "learning_rate": 0.00018603189788851055, + "loss": 1.7869, + "step": 2187 + }, + { + "epoch": 0.6839637386683338, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018601937029612048, + "loss": 1.6719, + "step": 2188 + }, + { + "epoch": 0.684276336355111, + "grad_norm": 0.23828125, + "learning_rate": 0.0001860068375106314, + "loss": 1.7719, + "step": 2189 + }, + { + "epoch": 0.6845889340418881, + "grad_norm": 0.2265625, + "learning_rate": 0.00018599429953279994, + "loss": 1.618, + "step": 2190 + }, + { + "epoch": 0.6849015317286652, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018598175636338305, + "loss": 1.7768, + "step": 2191 + }, + { + "epoch": 0.6852141294154424, + "grad_norm": 0.234375, + "learning_rate": 0.00018596920800313798, + "loss": 1.9978, + "step": 2192 + }, + { + "epoch": 0.6855267271022194, + "grad_norm": 0.22265625, + "learning_rate": 0.0001859566544528222, + "loss": 1.3867, + "step": 2193 + }, + { + "epoch": 0.6858393247889966, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001859440957131937, + "loss": 1.5844, + "step": 2194 + }, + { + "epoch": 0.6861519224757737, + "grad_norm": 0.234375, + "learning_rate": 0.00018593153178501063, + "loss": 1.7227, + "step": 2195 + }, + { + "epoch": 0.6864645201625508, + "grad_norm": 0.25390625, + "learning_rate": 0.0001859189626690315, + "loss": 1.8812, + "step": 2196 + }, + { + "epoch": 0.6867771178493279, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018590638836601505, + "loss": 1.5477, + "step": 2197 + }, + { + "epoch": 0.687089715536105, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001858938088767205, + "loss": 1.8684, + "step": 2198 + }, + { + "epoch": 0.6874023132228821, + "grad_norm": 0.236328125, + "learning_rate": 0.00018588122420190722, + "loss": 1.8864, + "step": 2199 + }, + { + "epoch": 0.6877149109096593, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018586863434233504, + "loss": 1.7888, + "step": 2200 + }, + { + "epoch": 0.6880275085964364, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018585603929876395, + "loss": 1.6452, + "step": 2201 + }, + { + "epoch": 0.6883401062832135, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018584343907195437, + "loss": 1.585, + "step": 2202 + }, + { + "epoch": 0.6886527039699907, + "grad_norm": 0.23828125, + "learning_rate": 0.000185830833662667, + "loss": 1.7144, + "step": 2203 + }, + { + "epoch": 0.6889653016567677, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018581822307166281, + "loss": 1.7379, + "step": 2204 + }, + { + "epoch": 0.6892778993435449, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018580560729970313, + "loss": 1.777, + "step": 2205 + }, + { + "epoch": 0.6895904970303219, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018579298634754962, + "loss": 1.902, + "step": 2206 + }, + { + "epoch": 0.6899030947170991, + "grad_norm": 0.220703125, + "learning_rate": 0.00018578036021596415, + "loss": 1.6602, + "step": 2207 + }, + { + "epoch": 0.6902156924038763, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018576772890570905, + "loss": 1.8837, + "step": 2208 + }, + { + "epoch": 0.6905282900906533, + "grad_norm": 0.251953125, + "learning_rate": 0.00018575509241754685, + "loss": 1.6694, + "step": 2209 + }, + { + "epoch": 0.6908408877774305, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018574245075224046, + "loss": 1.7201, + "step": 2210 + }, + { + "epoch": 0.6911534854642075, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018572980391055305, + "loss": 1.4998, + "step": 2211 + }, + { + "epoch": 0.6914660831509847, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018571715189324813, + "loss": 1.4607, + "step": 2212 + }, + { + "epoch": 0.6917786808377618, + "grad_norm": 0.2158203125, + "learning_rate": 0.00018570449470108952, + "loss": 1.8028, + "step": 2213 + }, + { + "epoch": 0.6920912785245389, + "grad_norm": 0.234375, + "learning_rate": 0.00018569183233484133, + "loss": 1.5558, + "step": 2214 + }, + { + "epoch": 0.692403876211316, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018567916479526804, + "loss": 1.5834, + "step": 2215 + }, + { + "epoch": 0.6927164738980931, + "grad_norm": 0.232421875, + "learning_rate": 0.0001856664920831344, + "loss": 1.6607, + "step": 2216 + }, + { + "epoch": 0.6930290715848703, + "grad_norm": 0.236328125, + "learning_rate": 0.00018565381419920546, + "loss": 1.5378, + "step": 2217 + }, + { + "epoch": 0.6933416692716474, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018564113114424662, + "loss": 1.8949, + "step": 2218 + }, + { + "epoch": 0.6936542669584245, + "grad_norm": 0.234375, + "learning_rate": 0.00018562844291902353, + "loss": 1.9261, + "step": 2219 + }, + { + "epoch": 0.6939668646452016, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018561574952430222, + "loss": 2.0413, + "step": 2220 + }, + { + "epoch": 0.6942794623319788, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018560305096084904, + "loss": 1.7628, + "step": 2221 + }, + { + "epoch": 0.6945920600187558, + "grad_norm": 0.240234375, + "learning_rate": 0.00018559034722943056, + "loss": 1.6226, + "step": 2222 + }, + { + "epoch": 0.694904657705533, + "grad_norm": 0.22265625, + "learning_rate": 0.00018557763833081377, + "loss": 1.8693, + "step": 2223 + }, + { + "epoch": 0.69521725539231, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001855649242657659, + "loss": 1.4996, + "step": 2224 + }, + { + "epoch": 0.6955298530790872, + "grad_norm": 0.25390625, + "learning_rate": 0.00018555220503505452, + "loss": 2.2346, + "step": 2225 + }, + { + "epoch": 0.6958424507658644, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018553948063944749, + "loss": 1.773, + "step": 2226 + }, + { + "epoch": 0.6961550484526414, + "grad_norm": 0.2373046875, + "learning_rate": 0.000185526751079713, + "loss": 1.8362, + "step": 2227 + }, + { + "epoch": 0.6964676461394186, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018551401635661958, + "loss": 1.6007, + "step": 2228 + }, + { + "epoch": 0.6967802438261956, + "grad_norm": 0.234375, + "learning_rate": 0.00018550127647093601, + "loss": 1.5875, + "step": 2229 + }, + { + "epoch": 0.6970928415129728, + "grad_norm": 0.228515625, + "learning_rate": 0.00018548853142343142, + "loss": 1.7156, + "step": 2230 + }, + { + "epoch": 0.69740543919975, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018547578121487528, + "loss": 1.784, + "step": 2231 + }, + { + "epoch": 0.697718036886527, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018546302584603727, + "loss": 1.6756, + "step": 2232 + }, + { + "epoch": 0.6980306345733042, + "grad_norm": 0.25, + "learning_rate": 0.0001854502653176875, + "loss": 1.8622, + "step": 2233 + }, + { + "epoch": 0.6983432322600813, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001854374996305963, + "loss": 1.383, + "step": 2234 + }, + { + "epoch": 0.6986558299468584, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001854247287855344, + "loss": 1.516, + "step": 2235 + }, + { + "epoch": 0.6989684276336355, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018541195278327276, + "loss": 1.5284, + "step": 2236 + }, + { + "epoch": 0.6992810253204126, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001853991716245827, + "loss": 1.4208, + "step": 2237 + }, + { + "epoch": 0.6995936230071897, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001853863853102358, + "loss": 1.8169, + "step": 2238 + }, + { + "epoch": 0.6999062206939669, + "grad_norm": 0.23046875, + "learning_rate": 0.000185373593841004, + "loss": 1.686, + "step": 2239 + }, + { + "epoch": 0.700218818380744, + "grad_norm": 0.234375, + "learning_rate": 0.00018536079721765956, + "loss": 1.4067, + "step": 2240 + }, + { + "epoch": 0.7005314160675211, + "grad_norm": 0.228515625, + "learning_rate": 0.00018534799544097505, + "loss": 1.7239, + "step": 2241 + }, + { + "epoch": 0.7008440137542982, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018533518851172325, + "loss": 1.6176, + "step": 2242 + }, + { + "epoch": 0.7011566114410753, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001853223764306774, + "loss": 1.6086, + "step": 2243 + }, + { + "epoch": 0.7014692091278525, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018530955919861096, + "loss": 1.5131, + "step": 2244 + }, + { + "epoch": 0.7017818068146295, + "grad_norm": 0.224609375, + "learning_rate": 0.0001852967368162977, + "loss": 1.685, + "step": 2245 + }, + { + "epoch": 0.7020944045014067, + "grad_norm": 0.232421875, + "learning_rate": 0.00018528390928451173, + "loss": 1.8137, + "step": 2246 + }, + { + "epoch": 0.7024070021881839, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018527107660402752, + "loss": 1.7175, + "step": 2247 + }, + { + "epoch": 0.7027195998749609, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018525823877561974, + "loss": 1.6921, + "step": 2248 + }, + { + "epoch": 0.7030321975617381, + "grad_norm": 0.23046875, + "learning_rate": 0.0001852453958000634, + "loss": 1.9215, + "step": 2249 + }, + { + "epoch": 0.7033447952485151, + "grad_norm": 0.228515625, + "learning_rate": 0.00018523254767813393, + "loss": 1.5655, + "step": 2250 + }, + { + "epoch": 0.7036573929352923, + "grad_norm": 0.2177734375, + "learning_rate": 0.00018521969441060695, + "loss": 1.6418, + "step": 2251 + }, + { + "epoch": 0.7039699906220694, + "grad_norm": 0.240234375, + "learning_rate": 0.0001852068359982584, + "loss": 1.8771, + "step": 2252 + }, + { + "epoch": 0.7042825883088465, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018519397244186458, + "loss": 1.7217, + "step": 2253 + }, + { + "epoch": 0.7045951859956237, + "grad_norm": 0.228515625, + "learning_rate": 0.0001851811037422021, + "loss": 1.8586, + "step": 2254 + }, + { + "epoch": 0.7049077836824007, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018516822990004782, + "loss": 1.5904, + "step": 2255 + }, + { + "epoch": 0.7052203813691779, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018515535091617898, + "loss": 1.6428, + "step": 2256 + }, + { + "epoch": 0.705532979055955, + "grad_norm": 0.234375, + "learning_rate": 0.0001851424667913731, + "loss": 1.7164, + "step": 2257 + }, + { + "epoch": 0.7058455767427321, + "grad_norm": 0.23046875, + "learning_rate": 0.00018512957752640799, + "loss": 1.7193, + "step": 2258 + }, + { + "epoch": 0.7061581744295092, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018511668312206177, + "loss": 1.5025, + "step": 2259 + }, + { + "epoch": 0.7064707721162864, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018510378357911296, + "loss": 1.612, + "step": 2260 + }, + { + "epoch": 0.7067833698030634, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018509087889834031, + "loss": 1.5849, + "step": 2261 + }, + { + "epoch": 0.7070959674898406, + "grad_norm": 0.25, + "learning_rate": 0.00018507796908052285, + "loss": 1.6807, + "step": 2262 + }, + { + "epoch": 0.7074085651766177, + "grad_norm": 0.228515625, + "learning_rate": 0.00018506505412643995, + "loss": 1.6728, + "step": 2263 + }, + { + "epoch": 0.7077211628633948, + "grad_norm": 0.234375, + "learning_rate": 0.00018505213403687137, + "loss": 1.7322, + "step": 2264 + }, + { + "epoch": 0.708033760550172, + "grad_norm": 0.2265625, + "learning_rate": 0.00018503920881259703, + "loss": 1.6204, + "step": 2265 + }, + { + "epoch": 0.708346358236949, + "grad_norm": 0.228515625, + "learning_rate": 0.00018502627845439732, + "loss": 1.5918, + "step": 2266 + }, + { + "epoch": 0.7086589559237262, + "grad_norm": 0.2421875, + "learning_rate": 0.00018501334296305285, + "loss": 1.8249, + "step": 2267 + }, + { + "epoch": 0.7089715536105032, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018500040233934454, + "loss": 1.974, + "step": 2268 + }, + { + "epoch": 0.7092841512972804, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018498745658405356, + "loss": 1.6999, + "step": 2269 + }, + { + "epoch": 0.7095967489840576, + "grad_norm": 0.23046875, + "learning_rate": 0.00018497450569796158, + "loss": 1.9307, + "step": 2270 + }, + { + "epoch": 0.7099093466708346, + "grad_norm": 0.240234375, + "learning_rate": 0.00018496154968185036, + "loss": 1.7392, + "step": 2271 + }, + { + "epoch": 0.7102219443576118, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018494858853650213, + "loss": 1.7068, + "step": 2272 + }, + { + "epoch": 0.7105345420443888, + "grad_norm": 0.232421875, + "learning_rate": 0.0001849356222626994, + "loss": 1.8758, + "step": 2273 + }, + { + "epoch": 0.710847139731166, + "grad_norm": 0.236328125, + "learning_rate": 0.00018492265086122488, + "loss": 1.6345, + "step": 2274 + }, + { + "epoch": 0.7111597374179431, + "grad_norm": 0.24609375, + "learning_rate": 0.0001849096743328617, + "loss": 1.7491, + "step": 2275 + }, + { + "epoch": 0.7114723351047202, + "grad_norm": 0.2265625, + "learning_rate": 0.0001848966926783933, + "loss": 1.5166, + "step": 2276 + }, + { + "epoch": 0.7117849327914973, + "grad_norm": 0.234375, + "learning_rate": 0.0001848837058986034, + "loss": 1.7068, + "step": 2277 + }, + { + "epoch": 0.7120975304782745, + "grad_norm": 0.23046875, + "learning_rate": 0.00018487071399427599, + "loss": 1.7652, + "step": 2278 + }, + { + "epoch": 0.7124101281650516, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018485771696619542, + "loss": 1.7871, + "step": 2279 + }, + { + "epoch": 0.7127227258518287, + "grad_norm": 0.228515625, + "learning_rate": 0.00018484471481514635, + "loss": 1.9055, + "step": 2280 + }, + { + "epoch": 0.7130353235386058, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001848317075419137, + "loss": 1.8693, + "step": 2281 + }, + { + "epoch": 0.7133479212253829, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018481869514728279, + "loss": 1.548, + "step": 2282 + }, + { + "epoch": 0.7136605189121601, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018480567763203918, + "loss": 1.614, + "step": 2283 + }, + { + "epoch": 0.7139731165989371, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001847926549969687, + "loss": 1.4828, + "step": 2284 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018477962724285763, + "loss": 1.8229, + "step": 2285 + }, + { + "epoch": 0.7145983119724914, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018476659437049238, + "loss": 1.877, + "step": 2286 + }, + { + "epoch": 0.7149109096592685, + "grad_norm": 0.23828125, + "learning_rate": 0.00018475355638065984, + "loss": 1.5996, + "step": 2287 + }, + { + "epoch": 0.7152235073460457, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018474051327414709, + "loss": 1.6033, + "step": 2288 + }, + { + "epoch": 0.7155361050328227, + "grad_norm": 0.248046875, + "learning_rate": 0.00018472746505174156, + "loss": 1.6509, + "step": 2289 + }, + { + "epoch": 0.7158487027195999, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018471441171423103, + "loss": 1.8609, + "step": 2290 + }, + { + "epoch": 0.716161300406377, + "grad_norm": 0.224609375, + "learning_rate": 0.00018470135326240347, + "loss": 1.8864, + "step": 2291 + }, + { + "epoch": 0.7164738980931541, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001846882896970473, + "loss": 1.5743, + "step": 2292 + }, + { + "epoch": 0.7167864957799313, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018467522101895116, + "loss": 1.8124, + "step": 2293 + }, + { + "epoch": 0.7170990934667083, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018466214722890402, + "loss": 1.4247, + "step": 2294 + }, + { + "epoch": 0.7174116911534855, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018464906832769517, + "loss": 1.5627, + "step": 2295 + }, + { + "epoch": 0.7177242888402626, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001846359843161142, + "loss": 1.8247, + "step": 2296 + }, + { + "epoch": 0.7180368865270397, + "grad_norm": 0.234375, + "learning_rate": 0.000184622895194951, + "loss": 1.6003, + "step": 2297 + }, + { + "epoch": 0.7183494842138168, + "grad_norm": 0.236328125, + "learning_rate": 0.0001846098009649958, + "loss": 1.6546, + "step": 2298 + }, + { + "epoch": 0.7186620819005939, + "grad_norm": 0.23046875, + "learning_rate": 0.00018459670162703905, + "loss": 1.8521, + "step": 2299 + }, + { + "epoch": 0.718974679587371, + "grad_norm": 0.2421875, + "learning_rate": 0.00018458359718187165, + "loss": 1.7397, + "step": 2300 + }, + { + "epoch": 0.7192872772741482, + "grad_norm": 0.232421875, + "learning_rate": 0.0001845704876302847, + "loss": 1.7336, + "step": 2301 + }, + { + "epoch": 0.7195998749609253, + "grad_norm": 0.2490234375, + "learning_rate": 0.00018455737297306963, + "loss": 1.6112, + "step": 2302 + }, + { + "epoch": 0.7199124726477024, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018454425321101826, + "loss": 1.8522, + "step": 2303 + }, + { + "epoch": 0.7202250703344796, + "grad_norm": 0.236328125, + "learning_rate": 0.0001845311283449225, + "loss": 1.6348, + "step": 2304 + }, + { + "epoch": 0.7205376680212566, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018451799837557485, + "loss": 1.7101, + "step": 2305 + }, + { + "epoch": 0.7208502657080338, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018450486330376793, + "loss": 1.4738, + "step": 2306 + }, + { + "epoch": 0.7211628633948108, + "grad_norm": 0.23828125, + "learning_rate": 0.00018449172313029472, + "loss": 1.6334, + "step": 2307 + }, + { + "epoch": 0.721475461081588, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018447857785594852, + "loss": 1.5218, + "step": 2308 + }, + { + "epoch": 0.7217880587683652, + "grad_norm": 0.2421875, + "learning_rate": 0.00018446542748152292, + "loss": 1.8324, + "step": 2309 + }, + { + "epoch": 0.7221006564551422, + "grad_norm": 0.234375, + "learning_rate": 0.00018445227200781185, + "loss": 1.8051, + "step": 2310 + }, + { + "epoch": 0.7224132541419194, + "grad_norm": 0.228515625, + "learning_rate": 0.0001844391114356095, + "loss": 1.65, + "step": 2311 + }, + { + "epoch": 0.7227258518286964, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018442594576571035, + "loss": 1.8499, + "step": 2312 + }, + { + "epoch": 0.7230384495154736, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001844127749989093, + "loss": 1.671, + "step": 2313 + }, + { + "epoch": 0.7233510472022507, + "grad_norm": 0.22265625, + "learning_rate": 0.0001843995991360014, + "loss": 1.7405, + "step": 2314 + }, + { + "epoch": 0.7236636448890278, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001843864181777822, + "loss": 1.8025, + "step": 2315 + }, + { + "epoch": 0.723976242575805, + "grad_norm": 0.244140625, + "learning_rate": 0.00018437323212504742, + "loss": 1.5695, + "step": 2316 + }, + { + "epoch": 0.7242888402625821, + "grad_norm": 0.22265625, + "learning_rate": 0.00018436004097859308, + "loss": 1.2384, + "step": 2317 + }, + { + "epoch": 0.7246014379493592, + "grad_norm": 0.2265625, + "learning_rate": 0.00018434684473921556, + "loss": 1.6555, + "step": 2318 + }, + { + "epoch": 0.7249140356361363, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018433364340771153, + "loss": 1.6447, + "step": 2319 + }, + { + "epoch": 0.7252266333229134, + "grad_norm": 0.232421875, + "learning_rate": 0.00018432043698487797, + "loss": 1.6859, + "step": 2320 + }, + { + "epoch": 0.7255392310096905, + "grad_norm": 0.232421875, + "learning_rate": 0.0001843072254715122, + "loss": 1.7087, + "step": 2321 + }, + { + "epoch": 0.7258518286964677, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001842940088684118, + "loss": 1.7149, + "step": 2322 + }, + { + "epoch": 0.7261644263832447, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018428078717637467, + "loss": 1.8408, + "step": 2323 + }, + { + "epoch": 0.7264770240700219, + "grad_norm": 0.2294921875, + "learning_rate": 0.000184267560396199, + "loss": 1.7943, + "step": 2324 + }, + { + "epoch": 0.726789621756799, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018425432852868333, + "loss": 1.7252, + "step": 2325 + }, + { + "epoch": 0.7271022194435761, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001842410915746265, + "loss": 1.6914, + "step": 2326 + }, + { + "epoch": 0.7274148171303533, + "grad_norm": 0.232421875, + "learning_rate": 0.0001842278495348276, + "loss": 1.9011, + "step": 2327 + }, + { + "epoch": 0.7277274148171303, + "grad_norm": 0.236328125, + "learning_rate": 0.00018421460241008607, + "loss": 1.8245, + "step": 2328 + }, + { + "epoch": 0.7280400125039075, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018420135020120172, + "loss": 1.8638, + "step": 2329 + }, + { + "epoch": 0.7283526101906845, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018418809290897455, + "loss": 1.7493, + "step": 2330 + }, + { + "epoch": 0.7286652078774617, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001841748305342049, + "loss": 1.5843, + "step": 2331 + }, + { + "epoch": 0.7289778055642389, + "grad_norm": 0.236328125, + "learning_rate": 0.0001841615630776935, + "loss": 1.5289, + "step": 2332 + }, + { + "epoch": 0.7292904032510159, + "grad_norm": 0.240234375, + "learning_rate": 0.00018414829054024128, + "loss": 1.6851, + "step": 2333 + }, + { + "epoch": 0.7296030009377931, + "grad_norm": 0.21875, + "learning_rate": 0.0001841350129226495, + "loss": 1.3236, + "step": 2334 + }, + { + "epoch": 0.7299155986245702, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018412173022571982, + "loss": 1.9465, + "step": 2335 + }, + { + "epoch": 0.7302281963113473, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018410844245025408, + "loss": 1.7362, + "step": 2336 + }, + { + "epoch": 0.7305407939981244, + "grad_norm": 0.228515625, + "learning_rate": 0.00018409514959705448, + "loss": 1.7688, + "step": 2337 + }, + { + "epoch": 0.7308533916849015, + "grad_norm": 0.24609375, + "learning_rate": 0.0001840818516669235, + "loss": 1.658, + "step": 2338 + }, + { + "epoch": 0.7311659893716786, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018406854866066403, + "loss": 1.6786, + "step": 2339 + }, + { + "epoch": 0.7314785870584558, + "grad_norm": 0.23828125, + "learning_rate": 0.00018405524057907915, + "loss": 1.6658, + "step": 2340 + }, + { + "epoch": 0.7317911847452329, + "grad_norm": 0.2578125, + "learning_rate": 0.0001840419274229723, + "loss": 1.6022, + "step": 2341 + }, + { + "epoch": 0.73210378243201, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018402860919314713, + "loss": 1.7735, + "step": 2342 + }, + { + "epoch": 0.7324163801187871, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001840152858904078, + "loss": 1.4977, + "step": 2343 + }, + { + "epoch": 0.7327289778055642, + "grad_norm": 0.236328125, + "learning_rate": 0.00018400195751555858, + "loss": 1.7735, + "step": 2344 + }, + { + "epoch": 0.7330415754923414, + "grad_norm": 0.2421875, + "learning_rate": 0.00018398862406940412, + "loss": 1.5705, + "step": 2345 + }, + { + "epoch": 0.7333541731791184, + "grad_norm": 0.228515625, + "learning_rate": 0.00018397528555274943, + "loss": 1.9914, + "step": 2346 + }, + { + "epoch": 0.7336667708658956, + "grad_norm": 0.224609375, + "learning_rate": 0.00018396194196639972, + "loss": 1.6567, + "step": 2347 + }, + { + "epoch": 0.7339793685526728, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001839485933111606, + "loss": 1.5779, + "step": 2348 + }, + { + "epoch": 0.7342919662394498, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018393523958783788, + "loss": 1.6902, + "step": 2349 + }, + { + "epoch": 0.734604563926227, + "grad_norm": 0.23046875, + "learning_rate": 0.00018392188079723786, + "loss": 1.8415, + "step": 2350 + }, + { + "epoch": 0.734917161613004, + "grad_norm": 0.2421875, + "learning_rate": 0.0001839085169401669, + "loss": 1.7724, + "step": 2351 + }, + { + "epoch": 0.7352297592997812, + "grad_norm": 0.23046875, + "learning_rate": 0.00018389514801743186, + "loss": 1.4619, + "step": 2352 + }, + { + "epoch": 0.7355423569865583, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018388177402983984, + "loss": 1.7035, + "step": 2353 + }, + { + "epoch": 0.7358549546733354, + "grad_norm": 0.23828125, + "learning_rate": 0.00018386839497819821, + "loss": 1.6311, + "step": 2354 + }, + { + "epoch": 0.7361675523601126, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018385501086331472, + "loss": 1.4891, + "step": 2355 + }, + { + "epoch": 0.7364801500468896, + "grad_norm": 0.23046875, + "learning_rate": 0.00018384162168599735, + "loss": 1.7706, + "step": 2356 + }, + { + "epoch": 0.7367927477336668, + "grad_norm": 0.259765625, + "learning_rate": 0.00018382822744705444, + "loss": 1.7342, + "step": 2357 + }, + { + "epoch": 0.7371053454204439, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001838148281472946, + "loss": 1.7338, + "step": 2358 + }, + { + "epoch": 0.737417943107221, + "grad_norm": 0.244140625, + "learning_rate": 0.0001838014237875268, + "loss": 1.6715, + "step": 2359 + }, + { + "epoch": 0.7377305407939981, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018378801436856027, + "loss": 1.8231, + "step": 2360 + }, + { + "epoch": 0.7380431384807753, + "grad_norm": 0.25, + "learning_rate": 0.00018377459989120452, + "loss": 1.6681, + "step": 2361 + }, + { + "epoch": 0.7383557361675523, + "grad_norm": 0.2265625, + "learning_rate": 0.00018376118035626942, + "loss": 1.6599, + "step": 2362 + }, + { + "epoch": 0.7386683338543295, + "grad_norm": 0.265625, + "learning_rate": 0.00018374775576456513, + "loss": 1.8036, + "step": 2363 + }, + { + "epoch": 0.7389809315411066, + "grad_norm": 0.232421875, + "learning_rate": 0.00018373432611690208, + "loss": 1.8082, + "step": 2364 + }, + { + "epoch": 0.7392935292278837, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001837208914140911, + "loss": 1.4781, + "step": 2365 + }, + { + "epoch": 0.7396061269146609, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018370745165694318, + "loss": 1.3993, + "step": 2366 + }, + { + "epoch": 0.7399187246014379, + "grad_norm": 0.23046875, + "learning_rate": 0.00018369400684626976, + "loss": 1.5936, + "step": 2367 + }, + { + "epoch": 0.7402313222882151, + "grad_norm": 0.2265625, + "learning_rate": 0.00018368055698288248, + "loss": 1.4418, + "step": 2368 + }, + { + "epoch": 0.7405439199749921, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018366710206759335, + "loss": 1.5162, + "step": 2369 + }, + { + "epoch": 0.7408565176617693, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018365364210121466, + "loss": 1.9776, + "step": 2370 + }, + { + "epoch": 0.7411691153485465, + "grad_norm": 0.2265625, + "learning_rate": 0.00018364017708455895, + "loss": 1.3729, + "step": 2371 + }, + { + "epoch": 0.7414817130353235, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001836267070184392, + "loss": 1.6031, + "step": 2372 + }, + { + "epoch": 0.7417943107221007, + "grad_norm": 0.251953125, + "learning_rate": 0.0001836132319036686, + "loss": 1.5944, + "step": 2373 + }, + { + "epoch": 0.7421069084088777, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001835997517410606, + "loss": 1.3653, + "step": 2374 + }, + { + "epoch": 0.7424195060956549, + "grad_norm": 0.232421875, + "learning_rate": 0.0001835862665314291, + "loss": 1.7253, + "step": 2375 + }, + { + "epoch": 0.742732103782432, + "grad_norm": 0.234375, + "learning_rate": 0.00018357277627558815, + "loss": 1.326, + "step": 2376 + }, + { + "epoch": 0.7430447014692091, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018355928097435218, + "loss": 1.8161, + "step": 2377 + }, + { + "epoch": 0.7433572991559863, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018354578062853595, + "loss": 1.8656, + "step": 2378 + }, + { + "epoch": 0.7436698968427634, + "grad_norm": 0.23046875, + "learning_rate": 0.0001835322752389545, + "loss": 1.8657, + "step": 2379 + }, + { + "epoch": 0.7439824945295405, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001835187648064231, + "loss": 1.5715, + "step": 2380 + }, + { + "epoch": 0.7442950922163176, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001835052493317575, + "loss": 1.7185, + "step": 2381 + }, + { + "epoch": 0.7446076899030947, + "grad_norm": 0.234375, + "learning_rate": 0.00018349172881577356, + "loss": 1.7779, + "step": 2382 + }, + { + "epoch": 0.7449202875898718, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018347820325928754, + "loss": 1.9479, + "step": 2383 + }, + { + "epoch": 0.745232885276649, + "grad_norm": 0.234375, + "learning_rate": 0.00018346467266311604, + "loss": 1.7667, + "step": 2384 + }, + { + "epoch": 0.745545482963426, + "grad_norm": 0.248046875, + "learning_rate": 0.00018345113702807585, + "loss": 1.4014, + "step": 2385 + }, + { + "epoch": 0.7458580806502032, + "grad_norm": 0.240234375, + "learning_rate": 0.00018343759635498422, + "loss": 1.8576, + "step": 2386 + }, + { + "epoch": 0.7461706783369803, + "grad_norm": 0.23828125, + "learning_rate": 0.00018342405064465856, + "loss": 1.6006, + "step": 2387 + }, + { + "epoch": 0.7464832760237574, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018341049989791666, + "loss": 1.5874, + "step": 2388 + }, + { + "epoch": 0.7467958737105346, + "grad_norm": 0.251953125, + "learning_rate": 0.00018339694411557655, + "loss": 1.6729, + "step": 2389 + }, + { + "epoch": 0.7471084713973116, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018338338329845668, + "loss": 1.5282, + "step": 2390 + }, + { + "epoch": 0.7474210690840888, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018336981744737573, + "loss": 1.5829, + "step": 2391 + }, + { + "epoch": 0.747733666770866, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001833562465631526, + "loss": 1.6278, + "step": 2392 + }, + { + "epoch": 0.748046264457643, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018334267064660668, + "loss": 1.6944, + "step": 2393 + }, + { + "epoch": 0.7483588621444202, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018332908969855753, + "loss": 1.8641, + "step": 2394 + }, + { + "epoch": 0.7486714598311972, + "grad_norm": 0.2421875, + "learning_rate": 0.00018331550371982505, + "loss": 1.6727, + "step": 2395 + }, + { + "epoch": 0.7489840575179744, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018330191271122943, + "loss": 1.6077, + "step": 2396 + }, + { + "epoch": 0.7492966552047515, + "grad_norm": 0.244140625, + "learning_rate": 0.0001832883166735912, + "loss": 1.4713, + "step": 2397 + }, + { + "epoch": 0.7496092528915286, + "grad_norm": 0.248046875, + "learning_rate": 0.00018327471560773112, + "loss": 1.9724, + "step": 2398 + }, + { + "epoch": 0.7499218505783057, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018326110951447037, + "loss": 1.852, + "step": 2399 + }, + { + "epoch": 0.7502344482650828, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018324749839463035, + "loss": 1.7013, + "step": 2400 + }, + { + "epoch": 0.75054704595186, + "grad_norm": 0.234375, + "learning_rate": 0.00018323388224903274, + "loss": 2.0012, + "step": 2401 + }, + { + "epoch": 0.7508596436386371, + "grad_norm": 0.2421875, + "learning_rate": 0.0001832202610784996, + "loss": 1.7133, + "step": 2402 + }, + { + "epoch": 0.7511722413254142, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018320663488385327, + "loss": 1.7841, + "step": 2403 + }, + { + "epoch": 0.7514848390121913, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018319300366591637, + "loss": 1.8134, + "step": 2404 + }, + { + "epoch": 0.7517974366989685, + "grad_norm": 0.23046875, + "learning_rate": 0.00018317936742551178, + "loss": 1.5865, + "step": 2405 + }, + { + "epoch": 0.7521100343857455, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001831657261634628, + "loss": 1.6447, + "step": 2406 + }, + { + "epoch": 0.7524226320725227, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018315207988059298, + "loss": 1.4747, + "step": 2407 + }, + { + "epoch": 0.7527352297592997, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001831384285777261, + "loss": 1.7538, + "step": 2408 + }, + { + "epoch": 0.7530478274460769, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018312477225568635, + "loss": 1.6004, + "step": 2409 + }, + { + "epoch": 0.7533604251328541, + "grad_norm": 0.25, + "learning_rate": 0.00018311111091529818, + "loss": 1.6864, + "step": 2410 + }, + { + "epoch": 0.7536730228196311, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018309744455738633, + "loss": 1.8215, + "step": 2411 + }, + { + "epoch": 0.7539856205064083, + "grad_norm": 0.236328125, + "learning_rate": 0.00018308377318277587, + "loss": 1.672, + "step": 2412 + }, + { + "epoch": 0.7542982181931853, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001830700967922921, + "loss": 1.7247, + "step": 2413 + }, + { + "epoch": 0.7546108158799625, + "grad_norm": 0.24609375, + "learning_rate": 0.00018305641538676079, + "loss": 1.6188, + "step": 2414 + }, + { + "epoch": 0.7549234135667396, + "grad_norm": 0.251953125, + "learning_rate": 0.00018304272896700784, + "loss": 1.8593, + "step": 2415 + }, + { + "epoch": 0.7552360112535167, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001830290375338595, + "loss": 1.5332, + "step": 2416 + }, + { + "epoch": 0.7555486089402939, + "grad_norm": 0.224609375, + "learning_rate": 0.00018301534108814234, + "loss": 1.5756, + "step": 2417 + }, + { + "epoch": 0.755861206627071, + "grad_norm": 0.244140625, + "learning_rate": 0.0001830016396306833, + "loss": 2.0714, + "step": 2418 + }, + { + "epoch": 0.7561738043138481, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018298793316230948, + "loss": 1.64, + "step": 2419 + }, + { + "epoch": 0.7564864020006252, + "grad_norm": 0.220703125, + "learning_rate": 0.00018297422168384836, + "loss": 1.5317, + "step": 2420 + }, + { + "epoch": 0.7567989996874023, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018296050519612777, + "loss": 1.8879, + "step": 2421 + }, + { + "epoch": 0.7571115973741794, + "grad_norm": 0.2421875, + "learning_rate": 0.00018294678369997578, + "loss": 1.7005, + "step": 2422 + }, + { + "epoch": 0.7574241950609566, + "grad_norm": 0.376953125, + "learning_rate": 0.00018293305719622072, + "loss": 2.244, + "step": 2423 + }, + { + "epoch": 0.7577367927477336, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018291932568569134, + "loss": 1.5323, + "step": 2424 + }, + { + "epoch": 0.7580493904345108, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018290558916921659, + "loss": 1.6395, + "step": 2425 + }, + { + "epoch": 0.7583619881212879, + "grad_norm": 0.25, + "learning_rate": 0.00018289184764762575, + "loss": 1.648, + "step": 2426 + }, + { + "epoch": 0.758674585808065, + "grad_norm": 0.234375, + "learning_rate": 0.0001828781011217485, + "loss": 1.5622, + "step": 2427 + }, + { + "epoch": 0.7589871834948422, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018286434959241462, + "loss": 1.5481, + "step": 2428 + }, + { + "epoch": 0.7592997811816192, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018285059306045437, + "loss": 1.77, + "step": 2429 + }, + { + "epoch": 0.7596123788683964, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018283683152669824, + "loss": 1.4071, + "step": 2430 + }, + { + "epoch": 0.7599249765551734, + "grad_norm": 0.2490234375, + "learning_rate": 0.00018282306499197703, + "loss": 2.0644, + "step": 2431 + }, + { + "epoch": 0.7602375742419506, + "grad_norm": 0.2421875, + "learning_rate": 0.00018280929345712186, + "loss": 1.7075, + "step": 2432 + }, + { + "epoch": 0.7605501719287278, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001827955169229641, + "loss": 1.3107, + "step": 2433 + }, + { + "epoch": 0.7608627696155048, + "grad_norm": 0.23046875, + "learning_rate": 0.00018278173539033548, + "loss": 1.7646, + "step": 2434 + }, + { + "epoch": 0.761175367302282, + "grad_norm": 0.23828125, + "learning_rate": 0.00018276794886006804, + "loss": 2.0252, + "step": 2435 + }, + { + "epoch": 0.7614879649890591, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018275415733299402, + "loss": 1.5208, + "step": 2436 + }, + { + "epoch": 0.7618005626758362, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018274036080994605, + "loss": 1.8906, + "step": 2437 + }, + { + "epoch": 0.7621131603626133, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018272655929175708, + "loss": 1.8472, + "step": 2438 + }, + { + "epoch": 0.7624257580493904, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001827127527792603, + "loss": 1.7364, + "step": 2439 + }, + { + "epoch": 0.7627383557361676, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018269894127328926, + "loss": 1.8149, + "step": 2440 + }, + { + "epoch": 0.7630509534229447, + "grad_norm": 0.232421875, + "learning_rate": 0.00018268512477467774, + "loss": 1.8335, + "step": 2441 + }, + { + "epoch": 0.7633635511097218, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018267130328425985, + "loss": 1.7762, + "step": 2442 + }, + { + "epoch": 0.7636761487964989, + "grad_norm": 0.2265625, + "learning_rate": 0.00018265747680287008, + "loss": 1.5251, + "step": 2443 + }, + { + "epoch": 0.763988746483276, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018264364533134304, + "loss": 1.5232, + "step": 2444 + }, + { + "epoch": 0.7643013441700531, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018262980887051385, + "loss": 1.5101, + "step": 2445 + }, + { + "epoch": 0.7646139418568303, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018261596742121777, + "loss": 1.6831, + "step": 2446 + }, + { + "epoch": 0.7649265395436073, + "grad_norm": 0.24609375, + "learning_rate": 0.00018260212098429054, + "loss": 1.8748, + "step": 2447 + }, + { + "epoch": 0.7652391372303845, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018258826956056793, + "loss": 1.7539, + "step": 2448 + }, + { + "epoch": 0.7655517349171617, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018257441315088627, + "loss": 1.5779, + "step": 2449 + }, + { + "epoch": 0.7658643326039387, + "grad_norm": 0.2421875, + "learning_rate": 0.00018256055175608205, + "loss": 1.7147, + "step": 2450 + }, + { + "epoch": 0.7661769302907159, + "grad_norm": 0.2265625, + "learning_rate": 0.00018254668537699212, + "loss": 1.682, + "step": 2451 + }, + { + "epoch": 0.7664895279774929, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001825328140144536, + "loss": 1.8002, + "step": 2452 + }, + { + "epoch": 0.7668021256642701, + "grad_norm": 0.234375, + "learning_rate": 0.0001825189376693039, + "loss": 1.3419, + "step": 2453 + }, + { + "epoch": 0.7671147233510472, + "grad_norm": 0.24609375, + "learning_rate": 0.0001825050563423808, + "loss": 1.7038, + "step": 2454 + }, + { + "epoch": 0.7674273210378243, + "grad_norm": 0.232421875, + "learning_rate": 0.00018249117003452234, + "loss": 1.6925, + "step": 2455 + }, + { + "epoch": 0.7677399187246015, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018247727874656683, + "loss": 1.7601, + "step": 2456 + }, + { + "epoch": 0.7680525164113785, + "grad_norm": 0.23828125, + "learning_rate": 0.00018246338247935285, + "loss": 1.6095, + "step": 2457 + }, + { + "epoch": 0.7683651140981557, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001824494812337194, + "loss": 1.4805, + "step": 2458 + }, + { + "epoch": 0.7686777117849328, + "grad_norm": 0.23046875, + "learning_rate": 0.00018243557501050573, + "loss": 1.6642, + "step": 2459 + }, + { + "epoch": 0.7689903094717099, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018242166381055133, + "loss": 1.4541, + "step": 2460 + }, + { + "epoch": 0.769302907158487, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018240774763469606, + "loss": 1.5884, + "step": 2461 + }, + { + "epoch": 0.7696155048452642, + "grad_norm": 0.224609375, + "learning_rate": 0.00018239382648378006, + "loss": 1.6074, + "step": 2462 + }, + { + "epoch": 0.7699281025320412, + "grad_norm": 0.232421875, + "learning_rate": 0.00018237990035864372, + "loss": 1.7759, + "step": 2463 + }, + { + "epoch": 0.7702407002188184, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018236596926012787, + "loss": 1.6379, + "step": 2464 + }, + { + "epoch": 0.7705532979055955, + "grad_norm": 0.248046875, + "learning_rate": 0.00018235203318907347, + "loss": 1.7159, + "step": 2465 + }, + { + "epoch": 0.7708658955923726, + "grad_norm": 0.251953125, + "learning_rate": 0.00018233809214632184, + "loss": 1.6911, + "step": 2466 + }, + { + "epoch": 0.7711784932791498, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018232414613271475, + "loss": 1.422, + "step": 2467 + }, + { + "epoch": 0.7714910909659268, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018231019514909397, + "loss": 1.551, + "step": 2468 + }, + { + "epoch": 0.771803688652704, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018229623919630188, + "loss": 1.8121, + "step": 2469 + }, + { + "epoch": 0.772116286339481, + "grad_norm": 0.2265625, + "learning_rate": 0.00018228227827518095, + "loss": 1.9086, + "step": 2470 + }, + { + "epoch": 0.7724288840262582, + "grad_norm": 0.330078125, + "learning_rate": 0.000182268312386574, + "loss": 2.5265, + "step": 2471 + }, + { + "epoch": 0.7727414817130354, + "grad_norm": 0.234375, + "learning_rate": 0.0001822543415313242, + "loss": 1.8133, + "step": 2472 + }, + { + "epoch": 0.7730540793998124, + "grad_norm": 0.255859375, + "learning_rate": 0.00018224036571027501, + "loss": 1.9486, + "step": 2473 + }, + { + "epoch": 0.7733666770865896, + "grad_norm": 0.240234375, + "learning_rate": 0.0001822263849242701, + "loss": 1.7464, + "step": 2474 + }, + { + "epoch": 0.7736792747733667, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001822123991741536, + "loss": 1.743, + "step": 2475 + }, + { + "epoch": 0.7739918724601438, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018219840846076977, + "loss": 1.4856, + "step": 2476 + }, + { + "epoch": 0.7743044701469209, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018218441278496328, + "loss": 1.7813, + "step": 2477 + }, + { + "epoch": 0.774617067833698, + "grad_norm": 0.234375, + "learning_rate": 0.00018217041214757903, + "loss": 1.7274, + "step": 2478 + }, + { + "epoch": 0.7749296655204752, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018215640654946233, + "loss": 1.5569, + "step": 2479 + }, + { + "epoch": 0.7752422632072523, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018214239599145866, + "loss": 1.5575, + "step": 2480 + }, + { + "epoch": 0.7755548608940294, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018212838047441387, + "loss": 1.5972, + "step": 2481 + }, + { + "epoch": 0.7758674585808065, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001821143599991741, + "loss": 1.668, + "step": 2482 + }, + { + "epoch": 0.7761800562675836, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018210033456658576, + "loss": 1.646, + "step": 2483 + }, + { + "epoch": 0.7764926539543607, + "grad_norm": 0.26171875, + "learning_rate": 0.00018208630417749561, + "loss": 2.3322, + "step": 2484 + }, + { + "epoch": 0.7768052516411379, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018207226883275069, + "loss": 1.5657, + "step": 2485 + }, + { + "epoch": 0.777117849327915, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001820582285331983, + "loss": 1.4964, + "step": 2486 + }, + { + "epoch": 0.7774304470146921, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018204418327968607, + "loss": 1.5711, + "step": 2487 + }, + { + "epoch": 0.7777430447014692, + "grad_norm": 0.26171875, + "learning_rate": 0.00018203013307306195, + "loss": 1.999, + "step": 2488 + }, + { + "epoch": 0.7780556423882463, + "grad_norm": 0.244140625, + "learning_rate": 0.00018201607791417418, + "loss": 1.5581, + "step": 2489 + }, + { + "epoch": 0.7783682400750235, + "grad_norm": 0.23046875, + "learning_rate": 0.00018200201780387126, + "loss": 1.5618, + "step": 2490 + }, + { + "epoch": 0.7786808377618005, + "grad_norm": 0.251953125, + "learning_rate": 0.00018198795274300205, + "loss": 1.6855, + "step": 2491 + }, + { + "epoch": 0.7789934354485777, + "grad_norm": 0.23046875, + "learning_rate": 0.00018197388273241563, + "loss": 1.4388, + "step": 2492 + }, + { + "epoch": 0.7793060331353548, + "grad_norm": 0.23046875, + "learning_rate": 0.00018195980777296146, + "loss": 1.3961, + "step": 2493 + }, + { + "epoch": 0.7796186308221319, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018194572786548924, + "loss": 1.3543, + "step": 2494 + }, + { + "epoch": 0.7799312285089091, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018193164301084905, + "loss": 1.6291, + "step": 2495 + }, + { + "epoch": 0.7802438261956861, + "grad_norm": 0.244140625, + "learning_rate": 0.00018191755320989112, + "loss": 1.8612, + "step": 2496 + }, + { + "epoch": 0.7805564238824633, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018190345846346613, + "loss": 1.507, + "step": 2497 + }, + { + "epoch": 0.7808690215692404, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018188935877242496, + "loss": 1.4034, + "step": 2498 + }, + { + "epoch": 0.7811816192560175, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018187525413761887, + "loss": 1.3682, + "step": 2499 + }, + { + "epoch": 0.7814942169427946, + "grad_norm": 0.25, + "learning_rate": 0.00018186114455989936, + "loss": 1.3907, + "step": 2500 + }, + { + "epoch": 0.7818068146295717, + "grad_norm": 0.251953125, + "learning_rate": 0.00018184703004011822, + "loss": 1.506, + "step": 2501 + }, + { + "epoch": 0.7821194123163488, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018183291057912758, + "loss": 1.6376, + "step": 2502 + }, + { + "epoch": 0.782432010003126, + "grad_norm": 0.23046875, + "learning_rate": 0.00018181878617777985, + "loss": 1.6524, + "step": 2503 + }, + { + "epoch": 0.7827446076899031, + "grad_norm": 0.251953125, + "learning_rate": 0.00018180465683692774, + "loss": 1.6575, + "step": 2504 + }, + { + "epoch": 0.7830572053766802, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018179052255742423, + "loss": 1.6608, + "step": 2505 + }, + { + "epoch": 0.7833698030634574, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018177638334012267, + "loss": 1.7274, + "step": 2506 + }, + { + "epoch": 0.7836824007502344, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018176223918587664, + "loss": 1.7459, + "step": 2507 + }, + { + "epoch": 0.7839949984370116, + "grad_norm": 0.232421875, + "learning_rate": 0.00018174809009554005, + "loss": 1.366, + "step": 2508 + }, + { + "epoch": 0.7843075961237886, + "grad_norm": 0.234375, + "learning_rate": 0.00018173393606996707, + "loss": 1.7907, + "step": 2509 + }, + { + "epoch": 0.7846201938105658, + "grad_norm": 0.240234375, + "learning_rate": 0.0001817197771100122, + "loss": 1.7705, + "step": 2510 + }, + { + "epoch": 0.784932791497343, + "grad_norm": 0.248046875, + "learning_rate": 0.00018170561321653026, + "loss": 1.4995, + "step": 2511 + }, + { + "epoch": 0.78524538918412, + "grad_norm": 0.24609375, + "learning_rate": 0.00018169144439037632, + "loss": 1.6226, + "step": 2512 + }, + { + "epoch": 0.7855579868708972, + "grad_norm": 0.240234375, + "learning_rate": 0.00018167727063240582, + "loss": 1.619, + "step": 2513 + }, + { + "epoch": 0.7858705845576742, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018166309194347438, + "loss": 1.9021, + "step": 2514 + }, + { + "epoch": 0.7861831822444514, + "grad_norm": 0.2275390625, + "learning_rate": 0.000181648908324438, + "loss": 1.9489, + "step": 2515 + }, + { + "epoch": 0.7864957799312285, + "grad_norm": 0.2421875, + "learning_rate": 0.00018163471977615303, + "loss": 1.5399, + "step": 2516 + }, + { + "epoch": 0.7868083776180056, + "grad_norm": 0.236328125, + "learning_rate": 0.000181620526299476, + "loss": 1.5515, + "step": 2517 + }, + { + "epoch": 0.7871209753047828, + "grad_norm": 0.240234375, + "learning_rate": 0.00018160632789526374, + "loss": 1.4493, + "step": 2518 + }, + { + "epoch": 0.7874335729915599, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018159212456437347, + "loss": 1.6494, + "step": 2519 + }, + { + "epoch": 0.787746170678337, + "grad_norm": 0.23828125, + "learning_rate": 0.0001815779163076627, + "loss": 1.7547, + "step": 2520 + }, + { + "epoch": 0.7880587683651141, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018156370312598914, + "loss": 1.7275, + "step": 2521 + }, + { + "epoch": 0.7883713660518912, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001815494850202109, + "loss": 1.3418, + "step": 2522 + }, + { + "epoch": 0.7886839637386683, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018153526199118634, + "loss": 1.5102, + "step": 2523 + }, + { + "epoch": 0.7889965614254455, + "grad_norm": 0.248046875, + "learning_rate": 0.0001815210340397741, + "loss": 2.0452, + "step": 2524 + }, + { + "epoch": 0.7893091591122225, + "grad_norm": 0.25, + "learning_rate": 0.00018150680116683313, + "loss": 1.5017, + "step": 2525 + }, + { + "epoch": 0.7896217567989997, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018149256337322275, + "loss": 2.0215, + "step": 2526 + }, + { + "epoch": 0.7899343544857768, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018147832065980245, + "loss": 1.7694, + "step": 2527 + }, + { + "epoch": 0.7902469521725539, + "grad_norm": 0.240234375, + "learning_rate": 0.00018146407302743208, + "loss": 1.6186, + "step": 2528 + }, + { + "epoch": 0.7905595498593311, + "grad_norm": 0.232421875, + "learning_rate": 0.00018144982047697185, + "loss": 1.7227, + "step": 2529 + }, + { + "epoch": 0.7908721475461081, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018143556300928215, + "loss": 1.6313, + "step": 2530 + }, + { + "epoch": 0.7911847452328853, + "grad_norm": 0.232421875, + "learning_rate": 0.00018142130062522377, + "loss": 1.4294, + "step": 2531 + }, + { + "epoch": 0.7914973429196624, + "grad_norm": 0.23828125, + "learning_rate": 0.00018140703332565768, + "loss": 1.5747, + "step": 2532 + }, + { + "epoch": 0.7918099406064395, + "grad_norm": 0.25390625, + "learning_rate": 0.00018139276111144525, + "loss": 1.6087, + "step": 2533 + }, + { + "epoch": 0.7921225382932167, + "grad_norm": 0.248046875, + "learning_rate": 0.0001813784839834481, + "loss": 1.6986, + "step": 2534 + }, + { + "epoch": 0.7924351359799937, + "grad_norm": 0.251953125, + "learning_rate": 0.00018136420194252818, + "loss": 1.5952, + "step": 2535 + }, + { + "epoch": 0.7927477336667709, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018134991498954773, + "loss": 1.7808, + "step": 2536 + }, + { + "epoch": 0.793060331353548, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001813356231253692, + "loss": 1.518, + "step": 2537 + }, + { + "epoch": 0.7933729290403251, + "grad_norm": 0.2421875, + "learning_rate": 0.0001813213263508555, + "loss": 1.82, + "step": 2538 + }, + { + "epoch": 0.7936855267271022, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001813070246668697, + "loss": 1.5595, + "step": 2539 + }, + { + "epoch": 0.7939981244138793, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018129271807427517, + "loss": 1.8371, + "step": 2540 + }, + { + "epoch": 0.7943107221006565, + "grad_norm": 0.2265625, + "learning_rate": 0.0001812784065739357, + "loss": 1.5297, + "step": 2541 + }, + { + "epoch": 0.7946233197874336, + "grad_norm": 0.236328125, + "learning_rate": 0.0001812640901667152, + "loss": 1.6262, + "step": 2542 + }, + { + "epoch": 0.7949359174742107, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018124976885347806, + "loss": 1.7128, + "step": 2543 + }, + { + "epoch": 0.7952485151609878, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018123544263508884, + "loss": 1.9219, + "step": 2544 + }, + { + "epoch": 0.7955611128477649, + "grad_norm": 0.228515625, + "learning_rate": 0.00018122111151241241, + "loss": 1.5844, + "step": 2545 + }, + { + "epoch": 0.795873710534542, + "grad_norm": 0.2333984375, + "learning_rate": 0.000181206775486314, + "loss": 1.806, + "step": 2546 + }, + { + "epoch": 0.7961863082213192, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018119243455765903, + "loss": 1.648, + "step": 2547 + }, + { + "epoch": 0.7964989059080962, + "grad_norm": 0.251953125, + "learning_rate": 0.00018117808872731336, + "loss": 1.5256, + "step": 2548 + }, + { + "epoch": 0.7968115035948734, + "grad_norm": 0.2421875, + "learning_rate": 0.000181163737996143, + "loss": 1.491, + "step": 2549 + }, + { + "epoch": 0.7971241012816506, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018114938236501438, + "loss": 1.8205, + "step": 2550 + }, + { + "epoch": 0.7974366989684276, + "grad_norm": 0.234375, + "learning_rate": 0.0001811350218347941, + "loss": 1.6017, + "step": 2551 + }, + { + "epoch": 0.7977492966552048, + "grad_norm": 0.240234375, + "learning_rate": 0.0001811206564063492, + "loss": 1.4423, + "step": 2552 + }, + { + "epoch": 0.7980618943419818, + "grad_norm": 0.255859375, + "learning_rate": 0.00018110628608054686, + "loss": 1.8525, + "step": 2553 + }, + { + "epoch": 0.798374492028759, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001810919108582547, + "loss": 1.7098, + "step": 2554 + }, + { + "epoch": 0.7986870897155361, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018107753074034054, + "loss": 1.7347, + "step": 2555 + }, + { + "epoch": 0.7989996874023132, + "grad_norm": 0.244140625, + "learning_rate": 0.00018106314572767252, + "loss": 1.6353, + "step": 2556 + }, + { + "epoch": 0.7993122850890904, + "grad_norm": 0.244140625, + "learning_rate": 0.00018104875582111913, + "loss": 1.7014, + "step": 2557 + }, + { + "epoch": 0.7996248827758674, + "grad_norm": 0.23046875, + "learning_rate": 0.00018103436102154903, + "loss": 1.5313, + "step": 2558 + }, + { + "epoch": 0.7999374804626446, + "grad_norm": 0.24609375, + "learning_rate": 0.0001810199613298313, + "loss": 1.671, + "step": 2559 + }, + { + "epoch": 0.8002500781494217, + "grad_norm": 0.240234375, + "learning_rate": 0.00018100555674683527, + "loss": 1.5859, + "step": 2560 + }, + { + "epoch": 0.8005626758361988, + "grad_norm": 0.232421875, + "learning_rate": 0.00018099114727343057, + "loss": 1.4992, + "step": 2561 + }, + { + "epoch": 0.8008752735229759, + "grad_norm": 0.232421875, + "learning_rate": 0.00018097673291048706, + "loss": 1.6654, + "step": 2562 + }, + { + "epoch": 0.8011878712097531, + "grad_norm": 0.236328125, + "learning_rate": 0.000180962313658875, + "loss": 1.6192, + "step": 2563 + }, + { + "epoch": 0.8015004688965301, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001809478895194649, + "loss": 1.7311, + "step": 2564 + }, + { + "epoch": 0.8018130665833073, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018093346049312758, + "loss": 1.5685, + "step": 2565 + }, + { + "epoch": 0.8021256642700844, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001809190265807341, + "loss": 1.9562, + "step": 2566 + }, + { + "epoch": 0.8024382619568615, + "grad_norm": 0.251953125, + "learning_rate": 0.00018090458778315588, + "loss": 1.662, + "step": 2567 + }, + { + "epoch": 0.8027508596436387, + "grad_norm": 0.251953125, + "learning_rate": 0.00018089014410126457, + "loss": 1.611, + "step": 2568 + }, + { + "epoch": 0.8030634573304157, + "grad_norm": 0.2265625, + "learning_rate": 0.0001808756955359322, + "loss": 1.7113, + "step": 2569 + }, + { + "epoch": 0.8033760550171929, + "grad_norm": 0.234375, + "learning_rate": 0.00018086124208803103, + "loss": 1.3589, + "step": 2570 + }, + { + "epoch": 0.8036886527039699, + "grad_norm": 0.23828125, + "learning_rate": 0.00018084678375843364, + "loss": 1.819, + "step": 2571 + }, + { + "epoch": 0.8040012503907471, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018083232054801288, + "loss": 1.6764, + "step": 2572 + }, + { + "epoch": 0.8043138480775243, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001808178524576419, + "loss": 1.5922, + "step": 2573 + }, + { + "epoch": 0.8046264457643013, + "grad_norm": 0.251953125, + "learning_rate": 0.0001808033794881942, + "loss": 1.5336, + "step": 2574 + }, + { + "epoch": 0.8049390434510785, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001807889016405435, + "loss": 1.443, + "step": 2575 + }, + { + "epoch": 0.8052516411378556, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001807744189155639, + "loss": 1.7123, + "step": 2576 + }, + { + "epoch": 0.8055642388246327, + "grad_norm": 0.24609375, + "learning_rate": 0.00018075993131412966, + "loss": 1.9127, + "step": 2577 + }, + { + "epoch": 0.8058768365114098, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018074543883711547, + "loss": 1.7716, + "step": 2578 + }, + { + "epoch": 0.8061894341981869, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018073094148539625, + "loss": 1.7905, + "step": 2579 + }, + { + "epoch": 0.806502031884964, + "grad_norm": 0.236328125, + "learning_rate": 0.00018071643925984717, + "loss": 1.5217, + "step": 2580 + }, + { + "epoch": 0.8068146295717412, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018070193216134384, + "loss": 1.6451, + "step": 2581 + }, + { + "epoch": 0.8071272272585183, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018068742019076203, + "loss": 1.7439, + "step": 2582 + }, + { + "epoch": 0.8074398249452954, + "grad_norm": 0.25, + "learning_rate": 0.0001806729033489778, + "loss": 2.0439, + "step": 2583 + }, + { + "epoch": 0.8077524226320725, + "grad_norm": 0.263671875, + "learning_rate": 0.0001806583816368676, + "loss": 1.7726, + "step": 2584 + }, + { + "epoch": 0.8080650203188496, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018064385505530813, + "loss": 1.8142, + "step": 2585 + }, + { + "epoch": 0.8083776180056268, + "grad_norm": 0.234375, + "learning_rate": 0.00018062932360517637, + "loss": 1.8507, + "step": 2586 + }, + { + "epoch": 0.8086902156924038, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001806147872873496, + "loss": 1.8861, + "step": 2587 + }, + { + "epoch": 0.809002813379181, + "grad_norm": 0.24609375, + "learning_rate": 0.00018060024610270538, + "loss": 2.04, + "step": 2588 + }, + { + "epoch": 0.8093154110659581, + "grad_norm": 0.23828125, + "learning_rate": 0.0001805857000521216, + "loss": 1.5433, + "step": 2589 + }, + { + "epoch": 0.8096280087527352, + "grad_norm": 0.23828125, + "learning_rate": 0.00018057114913647642, + "loss": 1.5803, + "step": 2590 + }, + { + "epoch": 0.8099406064395124, + "grad_norm": 0.244140625, + "learning_rate": 0.0001805565933566483, + "loss": 1.7928, + "step": 2591 + }, + { + "epoch": 0.8102532041262894, + "grad_norm": 0.25390625, + "learning_rate": 0.00018054203271351599, + "loss": 1.8568, + "step": 2592 + }, + { + "epoch": 0.8105658018130666, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018052746720795848, + "loss": 1.5727, + "step": 2593 + }, + { + "epoch": 0.8108783994998437, + "grad_norm": 0.251953125, + "learning_rate": 0.00018051289684085518, + "loss": 1.543, + "step": 2594 + }, + { + "epoch": 0.8111909971866208, + "grad_norm": 0.2421875, + "learning_rate": 0.00018049832161308574, + "loss": 1.5196, + "step": 2595 + }, + { + "epoch": 0.811503594873398, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018048374152553, + "loss": 1.592, + "step": 2596 + }, + { + "epoch": 0.811816192560175, + "grad_norm": 0.2421875, + "learning_rate": 0.00018046915657906826, + "loss": 1.6238, + "step": 2597 + }, + { + "epoch": 0.8121287902469522, + "grad_norm": 0.2421875, + "learning_rate": 0.00018045456677458094, + "loss": 1.6494, + "step": 2598 + }, + { + "epoch": 0.8124413879337293, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018043997211294896, + "loss": 1.7159, + "step": 2599 + }, + { + "epoch": 0.8127539856205064, + "grad_norm": 0.244140625, + "learning_rate": 0.00018042537259505332, + "loss": 1.7333, + "step": 2600 + }, + { + "epoch": 0.8130665833072835, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018041076822177546, + "loss": 1.7428, + "step": 2601 + }, + { + "epoch": 0.8133791809940606, + "grad_norm": 0.244140625, + "learning_rate": 0.00018039615899399704, + "loss": 1.5266, + "step": 2602 + }, + { + "epoch": 0.8136917786808378, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018038154491260006, + "loss": 1.4482, + "step": 2603 + }, + { + "epoch": 0.8140043763676149, + "grad_norm": 0.25, + "learning_rate": 0.0001803669259784668, + "loss": 1.8164, + "step": 2604 + }, + { + "epoch": 0.814316974054392, + "grad_norm": 0.24609375, + "learning_rate": 0.00018035230219247978, + "loss": 1.7801, + "step": 2605 + }, + { + "epoch": 0.8146295717411691, + "grad_norm": 0.224609375, + "learning_rate": 0.0001803376735555219, + "loss": 1.5818, + "step": 2606 + }, + { + "epoch": 0.8149421694279463, + "grad_norm": 0.236328125, + "learning_rate": 0.0001803230400684763, + "loss": 2.0025, + "step": 2607 + }, + { + "epoch": 0.8152547671147233, + "grad_norm": 0.240234375, + "learning_rate": 0.0001803084017322264, + "loss": 1.6328, + "step": 2608 + }, + { + "epoch": 0.8155673648015005, + "grad_norm": 0.30859375, + "learning_rate": 0.00018029375854765597, + "loss": 2.289, + "step": 2609 + }, + { + "epoch": 0.8158799624882775, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018027911051564897, + "loss": 1.4681, + "step": 2610 + }, + { + "epoch": 0.8161925601750547, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001802644576370898, + "loss": 1.7437, + "step": 2611 + }, + { + "epoch": 0.8165051578618319, + "grad_norm": 0.228515625, + "learning_rate": 0.00018024979991286303, + "loss": 2.0136, + "step": 2612 + }, + { + "epoch": 0.8168177555486089, + "grad_norm": 0.240234375, + "learning_rate": 0.0001802351373438536, + "loss": 1.6401, + "step": 2613 + }, + { + "epoch": 0.8171303532353861, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018022046993094665, + "loss": 1.5986, + "step": 2614 + }, + { + "epoch": 0.8174429509221631, + "grad_norm": 0.228515625, + "learning_rate": 0.00018020579767502774, + "loss": 1.7392, + "step": 2615 + }, + { + "epoch": 0.8177555486089403, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001801911205769826, + "loss": 1.6622, + "step": 2616 + }, + { + "epoch": 0.8180681462957174, + "grad_norm": 0.232421875, + "learning_rate": 0.0001801764386376973, + "loss": 1.6786, + "step": 2617 + }, + { + "epoch": 0.8183807439824945, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001801617518580583, + "loss": 1.6723, + "step": 2618 + }, + { + "epoch": 0.8186933416692717, + "grad_norm": 0.232421875, + "learning_rate": 0.0001801470602389521, + "loss": 1.6344, + "step": 2619 + }, + { + "epoch": 0.8190059393560488, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001801323637812658, + "loss": 1.8773, + "step": 2620 + }, + { + "epoch": 0.8193185370428259, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018011766248588655, + "loss": 1.7633, + "step": 2621 + }, + { + "epoch": 0.819631134729603, + "grad_norm": 0.24609375, + "learning_rate": 0.00018010295635370192, + "loss": 1.7818, + "step": 2622 + }, + { + "epoch": 0.8199437324163801, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018008824538559977, + "loss": 1.5338, + "step": 2623 + }, + { + "epoch": 0.8202563301031572, + "grad_norm": 0.2890625, + "learning_rate": 0.00018007352958246818, + "loss": 2.1521, + "step": 2624 + }, + { + "epoch": 0.8205689277899344, + "grad_norm": 0.2421875, + "learning_rate": 0.00018005880894519555, + "loss": 1.6819, + "step": 2625 + }, + { + "epoch": 0.8208815254767114, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018004408347467062, + "loss": 1.7966, + "step": 2626 + }, + { + "epoch": 0.8211941231634886, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018002935317178235, + "loss": 1.5681, + "step": 2627 + }, + { + "epoch": 0.8215067208502657, + "grad_norm": 0.2490234375, + "learning_rate": 0.00018001461803742008, + "loss": 1.8119, + "step": 2628 + }, + { + "epoch": 0.8218193185370428, + "grad_norm": 0.259765625, + "learning_rate": 0.00017999987807247334, + "loss": 2.2241, + "step": 2629 + }, + { + "epoch": 0.82213191622382, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017998513327783199, + "loss": 1.5033, + "step": 2630 + }, + { + "epoch": 0.822444513910597, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017997038365438628, + "loss": 1.481, + "step": 2631 + }, + { + "epoch": 0.8227571115973742, + "grad_norm": 0.2578125, + "learning_rate": 0.00017995562920302652, + "loss": 1.7684, + "step": 2632 + }, + { + "epoch": 0.8230697092841514, + "grad_norm": 0.251953125, + "learning_rate": 0.0001799408699246436, + "loss": 1.6599, + "step": 2633 + }, + { + "epoch": 0.8233823069709284, + "grad_norm": 0.255859375, + "learning_rate": 0.00017992610582012847, + "loss": 1.3327, + "step": 2634 + }, + { + "epoch": 0.8236949046577056, + "grad_norm": 0.236328125, + "learning_rate": 0.0001799113368903725, + "loss": 1.7121, + "step": 2635 + }, + { + "epoch": 0.8240075023444826, + "grad_norm": 0.22265625, + "learning_rate": 0.00017989656313626727, + "loss": 1.766, + "step": 2636 + }, + { + "epoch": 0.8243201000312598, + "grad_norm": 0.2265625, + "learning_rate": 0.0001798817845587047, + "loss": 1.85, + "step": 2637 + }, + { + "epoch": 0.8246326977180369, + "grad_norm": 0.240234375, + "learning_rate": 0.000179867001158577, + "loss": 1.8962, + "step": 2638 + }, + { + "epoch": 0.824945295404814, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001798522129367767, + "loss": 1.4497, + "step": 2639 + }, + { + "epoch": 0.8252578930915911, + "grad_norm": 0.2421875, + "learning_rate": 0.00017983741989419655, + "loss": 1.6794, + "step": 2640 + }, + { + "epoch": 0.8255704907783682, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001798226220317296, + "loss": 1.718, + "step": 2641 + }, + { + "epoch": 0.8258830884651454, + "grad_norm": 0.23046875, + "learning_rate": 0.00017980781935026925, + "loss": 1.7489, + "step": 2642 + }, + { + "epoch": 0.8261956861519225, + "grad_norm": 0.25390625, + "learning_rate": 0.0001797930118507091, + "loss": 1.7344, + "step": 2643 + }, + { + "epoch": 0.8265082838386996, + "grad_norm": 0.25, + "learning_rate": 0.0001797781995339432, + "loss": 1.7674, + "step": 2644 + }, + { + "epoch": 0.8268208815254767, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001797633824008657, + "loss": 2.0352, + "step": 2645 + }, + { + "epoch": 0.8271334792122538, + "grad_norm": 0.2265625, + "learning_rate": 0.00017974856045237117, + "loss": 1.6354, + "step": 2646 + }, + { + "epoch": 0.8274460768990309, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017973373368935445, + "loss": 1.737, + "step": 2647 + }, + { + "epoch": 0.8277586745858081, + "grad_norm": 0.25, + "learning_rate": 0.00017971890211271059, + "loss": 1.7081, + "step": 2648 + }, + { + "epoch": 0.8280712722725851, + "grad_norm": 0.251953125, + "learning_rate": 0.000179704065723335, + "loss": 1.3865, + "step": 2649 + }, + { + "epoch": 0.8283838699593623, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017968922452212343, + "loss": 1.5347, + "step": 2650 + }, + { + "epoch": 0.8286964676461395, + "grad_norm": 0.2255859375, + "learning_rate": 0.00017967437850997185, + "loss": 1.7372, + "step": 2651 + }, + { + "epoch": 0.8290090653329165, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017965952768777649, + "loss": 1.5994, + "step": 2652 + }, + { + "epoch": 0.8293216630196937, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001796446720564339, + "loss": 1.8905, + "step": 2653 + }, + { + "epoch": 0.8296342607064707, + "grad_norm": 0.3359375, + "learning_rate": 0.00017962981161684098, + "loss": 2.5074, + "step": 2654 + }, + { + "epoch": 0.8299468583932479, + "grad_norm": 0.2421875, + "learning_rate": 0.00017961494636989486, + "loss": 1.9347, + "step": 2655 + }, + { + "epoch": 0.830259456080025, + "grad_norm": 0.24609375, + "learning_rate": 0.00017960007631649298, + "loss": 1.8819, + "step": 2656 + }, + { + "epoch": 0.8305720537668021, + "grad_norm": 0.240234375, + "learning_rate": 0.00017958520145753307, + "loss": 1.6299, + "step": 2657 + }, + { + "epoch": 0.8308846514535793, + "grad_norm": 0.25390625, + "learning_rate": 0.00017957032179391312, + "loss": 1.7028, + "step": 2658 + }, + { + "epoch": 0.8311972491403563, + "grad_norm": 0.23046875, + "learning_rate": 0.00017955543732653143, + "loss": 1.8788, + "step": 2659 + }, + { + "epoch": 0.8315098468271335, + "grad_norm": 0.228515625, + "learning_rate": 0.0001795405480562866, + "loss": 1.7432, + "step": 2660 + }, + { + "epoch": 0.8318224445139106, + "grad_norm": 0.24609375, + "learning_rate": 0.00017952565398407757, + "loss": 1.583, + "step": 2661 + }, + { + "epoch": 0.8321350422006877, + "grad_norm": 0.240234375, + "learning_rate": 0.00017951075511080347, + "loss": 1.7078, + "step": 2662 + }, + { + "epoch": 0.8324476398874648, + "grad_norm": 0.234375, + "learning_rate": 0.0001794958514373637, + "loss": 1.4488, + "step": 2663 + }, + { + "epoch": 0.832760237574242, + "grad_norm": 0.2421875, + "learning_rate": 0.00017948094296465814, + "loss": 1.6082, + "step": 2664 + }, + { + "epoch": 0.833072835261019, + "grad_norm": 0.244140625, + "learning_rate": 0.00017946602969358673, + "loss": 1.6088, + "step": 2665 + }, + { + "epoch": 0.8333854329477962, + "grad_norm": 0.248046875, + "learning_rate": 0.00017945111162504987, + "loss": 1.7525, + "step": 2666 + }, + { + "epoch": 0.8336980306345733, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017943618875994815, + "loss": 1.8168, + "step": 2667 + }, + { + "epoch": 0.8340106283213504, + "grad_norm": 0.244140625, + "learning_rate": 0.00017942126109918248, + "loss": 1.7631, + "step": 2668 + }, + { + "epoch": 0.8343232260081276, + "grad_norm": 0.234375, + "learning_rate": 0.00017940632864365408, + "loss": 1.665, + "step": 2669 + }, + { + "epoch": 0.8346358236949046, + "grad_norm": 0.2265625, + "learning_rate": 0.00017939139139426443, + "loss": 1.7743, + "step": 2670 + }, + { + "epoch": 0.8349484213816818, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001793764493519153, + "loss": 1.6251, + "step": 2671 + }, + { + "epoch": 0.8352610190684588, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017936150251750876, + "loss": 1.5676, + "step": 2672 + }, + { + "epoch": 0.835573616755236, + "grad_norm": 0.244140625, + "learning_rate": 0.0001793465508919472, + "loss": 1.9198, + "step": 2673 + }, + { + "epoch": 0.8358862144420132, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017933159447613325, + "loss": 1.8999, + "step": 2674 + }, + { + "epoch": 0.8361988121287902, + "grad_norm": 0.232421875, + "learning_rate": 0.00017931663327096985, + "loss": 1.5773, + "step": 2675 + }, + { + "epoch": 0.8365114098155674, + "grad_norm": 0.251953125, + "learning_rate": 0.00017930166727736022, + "loss": 1.5615, + "step": 2676 + }, + { + "epoch": 0.8368240075023445, + "grad_norm": 0.2265625, + "learning_rate": 0.0001792866964962079, + "loss": 1.7466, + "step": 2677 + }, + { + "epoch": 0.8371366051891216, + "grad_norm": 0.236328125, + "learning_rate": 0.00017927172092841665, + "loss": 1.5719, + "step": 2678 + }, + { + "epoch": 0.8374492028758987, + "grad_norm": 0.236328125, + "learning_rate": 0.00017925674057489062, + "loss": 1.8351, + "step": 2679 + }, + { + "epoch": 0.8377618005626758, + "grad_norm": 0.2421875, + "learning_rate": 0.00017924175543653412, + "loss": 1.3423, + "step": 2680 + }, + { + "epoch": 0.838074398249453, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001792267655142519, + "loss": 1.8691, + "step": 2681 + }, + { + "epoch": 0.8383869959362301, + "grad_norm": 0.23046875, + "learning_rate": 0.00017921177080894887, + "loss": 1.5727, + "step": 2682 + }, + { + "epoch": 0.8386995936230072, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001791967713215303, + "loss": 1.5138, + "step": 2683 + }, + { + "epoch": 0.8390121913097843, + "grad_norm": 0.244140625, + "learning_rate": 0.00017918176705290174, + "loss": 1.7783, + "step": 2684 + }, + { + "epoch": 0.8393247889965614, + "grad_norm": 0.23828125, + "learning_rate": 0.00017916675800396897, + "loss": 1.8948, + "step": 2685 + }, + { + "epoch": 0.8396373866833385, + "grad_norm": 0.240234375, + "learning_rate": 0.00017915174417563816, + "loss": 1.6654, + "step": 2686 + }, + { + "epoch": 0.8399499843701157, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017913672556881566, + "loss": 1.8393, + "step": 2687 + }, + { + "epoch": 0.8402625820568927, + "grad_norm": 0.232421875, + "learning_rate": 0.00017912170218440822, + "loss": 1.5724, + "step": 2688 + }, + { + "epoch": 0.8405751797436699, + "grad_norm": 0.236328125, + "learning_rate": 0.0001791066740233228, + "loss": 1.5801, + "step": 2689 + }, + { + "epoch": 0.8408877774304471, + "grad_norm": 0.259765625, + "learning_rate": 0.00017909164108646667, + "loss": 1.6645, + "step": 2690 + }, + { + "epoch": 0.8412003751172241, + "grad_norm": 0.2255859375, + "learning_rate": 0.00017907660337474735, + "loss": 1.6794, + "step": 2691 + }, + { + "epoch": 0.8415129728040013, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001790615608890727, + "loss": 1.6382, + "step": 2692 + }, + { + "epoch": 0.8418255704907783, + "grad_norm": 0.24609375, + "learning_rate": 0.00017904651363035093, + "loss": 1.6977, + "step": 2693 + }, + { + "epoch": 0.8421381681775555, + "grad_norm": 0.23828125, + "learning_rate": 0.00017903146159949036, + "loss": 1.4432, + "step": 2694 + }, + { + "epoch": 0.8424507658643327, + "grad_norm": 0.22265625, + "learning_rate": 0.00017901640479739975, + "loss": 1.7628, + "step": 2695 + }, + { + "epoch": 0.8427633635511097, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001790013432249881, + "loss": 1.6406, + "step": 2696 + }, + { + "epoch": 0.8430759612378869, + "grad_norm": 0.37890625, + "learning_rate": 0.00017898627688316468, + "loss": 2.2605, + "step": 2697 + }, + { + "epoch": 0.8433885589246639, + "grad_norm": 0.255859375, + "learning_rate": 0.00017897120577283908, + "loss": 1.6559, + "step": 2698 + }, + { + "epoch": 0.8437011566114411, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017895612989492113, + "loss": 1.7878, + "step": 2699 + }, + { + "epoch": 0.8440137542982182, + "grad_norm": 0.25, + "learning_rate": 0.000178941049250321, + "loss": 1.7082, + "step": 2700 + }, + { + "epoch": 0.8443263519849953, + "grad_norm": 0.228515625, + "learning_rate": 0.00017892596383994915, + "loss": 1.6265, + "step": 2701 + }, + { + "epoch": 0.8446389496717724, + "grad_norm": 0.2421875, + "learning_rate": 0.00017891087366471632, + "loss": 1.6036, + "step": 2702 + }, + { + "epoch": 0.8449515473585495, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017889577872553343, + "loss": 1.4701, + "step": 2703 + }, + { + "epoch": 0.8452641450453267, + "grad_norm": 0.236328125, + "learning_rate": 0.00017888067902331186, + "loss": 1.7345, + "step": 2704 + }, + { + "epoch": 0.8455767427321038, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001788655745589632, + "loss": 1.7042, + "step": 2705 + }, + { + "epoch": 0.8458893404188809, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001788504653333993, + "loss": 1.9033, + "step": 2706 + }, + { + "epoch": 0.846201938105658, + "grad_norm": 0.216796875, + "learning_rate": 0.0001788353513475323, + "loss": 1.6525, + "step": 2707 + }, + { + "epoch": 0.8465145357924352, + "grad_norm": 0.251953125, + "learning_rate": 0.0001788202326022747, + "loss": 1.6119, + "step": 2708 + }, + { + "epoch": 0.8468271334792122, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001788051090985392, + "loss": 1.7473, + "step": 2709 + }, + { + "epoch": 0.8471397311659894, + "grad_norm": 0.244140625, + "learning_rate": 0.00017878998083723885, + "loss": 1.8992, + "step": 2710 + }, + { + "epoch": 0.8474523288527664, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017877484781928698, + "loss": 1.6285, + "step": 2711 + }, + { + "epoch": 0.8477649265395436, + "grad_norm": 0.24609375, + "learning_rate": 0.00017875971004559712, + "loss": 1.671, + "step": 2712 + }, + { + "epoch": 0.8480775242263208, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001787445675170832, + "loss": 1.639, + "step": 2713 + }, + { + "epoch": 0.8483901219130978, + "grad_norm": 0.263671875, + "learning_rate": 0.00017872942023465944, + "loss": 2.2887, + "step": 2714 + }, + { + "epoch": 0.848702719599875, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017871426819924025, + "loss": 1.6424, + "step": 2715 + }, + { + "epoch": 0.849015317286652, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017869911141174034, + "loss": 1.6615, + "step": 2716 + }, + { + "epoch": 0.8493279149734292, + "grad_norm": 0.251953125, + "learning_rate": 0.00017868394987307482, + "loss": 1.8865, + "step": 2717 + }, + { + "epoch": 0.8496405126602063, + "grad_norm": 0.251953125, + "learning_rate": 0.00017866878358415895, + "loss": 1.4584, + "step": 2718 + }, + { + "epoch": 0.8499531103469834, + "grad_norm": 0.236328125, + "learning_rate": 0.0001786536125459084, + "loss": 1.7852, + "step": 2719 + }, + { + "epoch": 0.8502657080337606, + "grad_norm": 0.2392578125, + "learning_rate": 0.000178638436759239, + "loss": 1.5773, + "step": 2720 + }, + { + "epoch": 0.8505783057205377, + "grad_norm": 0.255859375, + "learning_rate": 0.00017862325622506698, + "loss": 1.5571, + "step": 2721 + }, + { + "epoch": 0.8508909034073148, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017860807094430877, + "loss": 1.6325, + "step": 2722 + }, + { + "epoch": 0.8512035010940919, + "grad_norm": 0.2421875, + "learning_rate": 0.0001785928809178812, + "loss": 2.1872, + "step": 2723 + }, + { + "epoch": 0.851516098780869, + "grad_norm": 0.2421875, + "learning_rate": 0.0001785776861467012, + "loss": 1.7218, + "step": 2724 + }, + { + "epoch": 0.8518286964676461, + "grad_norm": 0.232421875, + "learning_rate": 0.00017856248663168618, + "loss": 1.8967, + "step": 2725 + }, + { + "epoch": 0.8521412941544233, + "grad_norm": 0.234375, + "learning_rate": 0.00017854728237375373, + "loss": 1.412, + "step": 2726 + }, + { + "epoch": 0.8524538918412004, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017853207337382174, + "loss": 1.5824, + "step": 2727 + }, + { + "epoch": 0.8527664895279775, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001785168596328084, + "loss": 1.6068, + "step": 2728 + }, + { + "epoch": 0.8530790872147546, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001785016411516322, + "loss": 1.5164, + "step": 2729 + }, + { + "epoch": 0.8533916849015317, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017848641793121188, + "loss": 1.8491, + "step": 2730 + }, + { + "epoch": 0.8537042825883089, + "grad_norm": 0.24609375, + "learning_rate": 0.0001784711899724665, + "loss": 1.6247, + "step": 2731 + }, + { + "epoch": 0.8540168802750859, + "grad_norm": 0.25390625, + "learning_rate": 0.0001784559572763154, + "loss": 1.4966, + "step": 2732 + }, + { + "epoch": 0.8543294779618631, + "grad_norm": 0.2275390625, + "learning_rate": 0.00017844071984367816, + "loss": 1.5311, + "step": 2733 + }, + { + "epoch": 0.8546420756486403, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001784254776754747, + "loss": 1.521, + "step": 2734 + }, + { + "epoch": 0.8549546733354173, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017841023077262523, + "loss": 1.7637, + "step": 2735 + }, + { + "epoch": 0.8552672710221945, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001783949791360502, + "loss": 1.3663, + "step": 2736 + }, + { + "epoch": 0.8555798687089715, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001783797227666704, + "loss": 1.6854, + "step": 2737 + }, + { + "epoch": 0.8558924663957487, + "grad_norm": 0.232421875, + "learning_rate": 0.00017836446166540683, + "loss": 1.7461, + "step": 2738 + }, + { + "epoch": 0.8562050640825258, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017834919583318087, + "loss": 1.5579, + "step": 2739 + }, + { + "epoch": 0.8565176617693029, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017833392527091412, + "loss": 1.8503, + "step": 2740 + }, + { + "epoch": 0.85683025945608, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017831864997952846, + "loss": 1.7036, + "step": 2741 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017830336995994608, + "loss": 1.546, + "step": 2742 + }, + { + "epoch": 0.8574554548296343, + "grad_norm": 0.2421875, + "learning_rate": 0.00017828808521308949, + "loss": 1.7367, + "step": 2743 + }, + { + "epoch": 0.8577680525164114, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017827279573988145, + "loss": 1.6342, + "step": 2744 + }, + { + "epoch": 0.8580806502031885, + "grad_norm": 0.2421875, + "learning_rate": 0.00017825750154124497, + "loss": 1.4992, + "step": 2745 + }, + { + "epoch": 0.8583932478899656, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017824220261810337, + "loss": 1.6274, + "step": 2746 + }, + { + "epoch": 0.8587058455767427, + "grad_norm": 0.25, + "learning_rate": 0.00017822689897138035, + "loss": 1.4625, + "step": 2747 + }, + { + "epoch": 0.8590184432635198, + "grad_norm": 0.232421875, + "learning_rate": 0.00017821159060199974, + "loss": 1.4388, + "step": 2748 + }, + { + "epoch": 0.859331040950297, + "grad_norm": 0.23828125, + "learning_rate": 0.00017819627751088573, + "loss": 1.4505, + "step": 2749 + }, + { + "epoch": 0.859643638637074, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001781809596989628, + "loss": 1.4593, + "step": 2750 + }, + { + "epoch": 0.8599562363238512, + "grad_norm": 0.224609375, + "learning_rate": 0.0001781656371671557, + "loss": 1.5498, + "step": 2751 + }, + { + "epoch": 0.8602688340106284, + "grad_norm": 0.30859375, + "learning_rate": 0.00017815030991638947, + "loss": 2.1876, + "step": 2752 + }, + { + "epoch": 0.8605814316974054, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017813497794758946, + "loss": 1.4955, + "step": 2753 + }, + { + "epoch": 0.8608940293841826, + "grad_norm": 0.236328125, + "learning_rate": 0.00017811964126168123, + "loss": 1.6525, + "step": 2754 + }, + { + "epoch": 0.8612066270709596, + "grad_norm": 0.24609375, + "learning_rate": 0.00017810429985959077, + "loss": 1.7273, + "step": 2755 + }, + { + "epoch": 0.8615192247577368, + "grad_norm": 0.26171875, + "learning_rate": 0.00017808895374224414, + "loss": 1.6337, + "step": 2756 + }, + { + "epoch": 0.861831822444514, + "grad_norm": 0.232421875, + "learning_rate": 0.0001780736029105679, + "loss": 1.572, + "step": 2757 + }, + { + "epoch": 0.862144420131291, + "grad_norm": 0.2421875, + "learning_rate": 0.00017805824736548872, + "loss": 1.7677, + "step": 2758 + }, + { + "epoch": 0.8624570178180682, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017804288710793374, + "loss": 1.4813, + "step": 2759 + }, + { + "epoch": 0.8627696155048452, + "grad_norm": 0.255859375, + "learning_rate": 0.00017802752213883017, + "loss": 1.863, + "step": 2760 + }, + { + "epoch": 0.8630822131916224, + "grad_norm": 0.232421875, + "learning_rate": 0.00017801215245910569, + "loss": 1.7106, + "step": 2761 + }, + { + "epoch": 0.8633948108783995, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017799677806968811, + "loss": 1.5748, + "step": 2762 + }, + { + "epoch": 0.8637074085651766, + "grad_norm": 0.263671875, + "learning_rate": 0.00017798139897150564, + "loss": 1.7248, + "step": 2763 + }, + { + "epoch": 0.8640200062519537, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017796601516548676, + "loss": 1.7132, + "step": 2764 + }, + { + "epoch": 0.8643326039387309, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001779506266525602, + "loss": 1.742, + "step": 2765 + }, + { + "epoch": 0.864645201625508, + "grad_norm": 0.2431640625, + "learning_rate": 0.000177935233433655, + "loss": 1.8706, + "step": 2766 + }, + { + "epoch": 0.8649577993122851, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001779198355097004, + "loss": 1.5686, + "step": 2767 + }, + { + "epoch": 0.8652703969990622, + "grad_norm": 0.234375, + "learning_rate": 0.00017790443288162605, + "loss": 1.7863, + "step": 2768 + }, + { + "epoch": 0.8655829946858393, + "grad_norm": 0.248046875, + "learning_rate": 0.00017788902555036182, + "loss": 1.6466, + "step": 2769 + }, + { + "epoch": 0.8658955923726165, + "grad_norm": 0.26171875, + "learning_rate": 0.00017787361351683786, + "loss": 1.7133, + "step": 2770 + }, + { + "epoch": 0.8662081900593935, + "grad_norm": 0.2314453125, + "learning_rate": 0.00017785819678198462, + "loss": 1.7669, + "step": 2771 + }, + { + "epoch": 0.8665207877461707, + "grad_norm": 0.23046875, + "learning_rate": 0.0001778427753467328, + "loss": 1.7054, + "step": 2772 + }, + { + "epoch": 0.8668333854329477, + "grad_norm": 0.240234375, + "learning_rate": 0.00017782734921201348, + "loss": 1.5878, + "step": 2773 + }, + { + "epoch": 0.8671459831197249, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017781191837875788, + "loss": 1.5847, + "step": 2774 + }, + { + "epoch": 0.8674585808065021, + "grad_norm": 0.240234375, + "learning_rate": 0.0001777964828478976, + "loss": 1.556, + "step": 2775 + }, + { + "epoch": 0.8677711784932791, + "grad_norm": 0.236328125, + "learning_rate": 0.00017778104262036455, + "loss": 1.481, + "step": 2776 + }, + { + "epoch": 0.8680837761800563, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001777655976970908, + "loss": 1.5842, + "step": 2777 + }, + { + "epoch": 0.8683963738668334, + "grad_norm": 0.251953125, + "learning_rate": 0.00017775014807900884, + "loss": 1.6188, + "step": 2778 + }, + { + "epoch": 0.8687089715536105, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017773469376705138, + "loss": 1.7405, + "step": 2779 + }, + { + "epoch": 0.8690215692403876, + "grad_norm": 0.234375, + "learning_rate": 0.00017771923476215138, + "loss": 2.009, + "step": 2780 + }, + { + "epoch": 0.8693341669271647, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017770377106524215, + "loss": 1.5022, + "step": 2781 + }, + { + "epoch": 0.8696467646139419, + "grad_norm": 0.298828125, + "learning_rate": 0.0001776883026772572, + "loss": 2.3243, + "step": 2782 + }, + { + "epoch": 0.869959362300719, + "grad_norm": 0.25, + "learning_rate": 0.00017767282959913047, + "loss": 1.5778, + "step": 2783 + }, + { + "epoch": 0.8702719599874961, + "grad_norm": 0.244140625, + "learning_rate": 0.00017765735183179602, + "loss": 1.648, + "step": 2784 + }, + { + "epoch": 0.8705845576742732, + "grad_norm": 0.23828125, + "learning_rate": 0.00017764186937618828, + "loss": 1.9461, + "step": 2785 + }, + { + "epoch": 0.8708971553610503, + "grad_norm": 0.23828125, + "learning_rate": 0.00017762638223324192, + "loss": 1.6331, + "step": 2786 + }, + { + "epoch": 0.8712097530478274, + "grad_norm": 0.23046875, + "learning_rate": 0.00017761089040389198, + "loss": 1.5506, + "step": 2787 + }, + { + "epoch": 0.8715223507346046, + "grad_norm": 0.236328125, + "learning_rate": 0.00017759539388907366, + "loss": 1.4817, + "step": 2788 + }, + { + "epoch": 0.8718349484213817, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017757989268972257, + "loss": 1.4606, + "step": 2789 + }, + { + "epoch": 0.8721475461081588, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017756438680677445, + "loss": 1.4484, + "step": 2790 + }, + { + "epoch": 0.872460143794936, + "grad_norm": 0.234375, + "learning_rate": 0.00017754887624116548, + "loss": 1.5865, + "step": 2791 + }, + { + "epoch": 0.872772741481713, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017753336099383203, + "loss": 1.514, + "step": 2792 + }, + { + "epoch": 0.8730853391684902, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017751784106571079, + "loss": 1.3963, + "step": 2793 + }, + { + "epoch": 0.8733979368552672, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017750231645773869, + "loss": 1.8982, + "step": 2794 + }, + { + "epoch": 0.8737105345420444, + "grad_norm": 0.232421875, + "learning_rate": 0.00017748678717085297, + "loss": 1.7107, + "step": 2795 + }, + { + "epoch": 0.8740231322288216, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017747125320599118, + "loss": 1.5219, + "step": 2796 + }, + { + "epoch": 0.8743357299155986, + "grad_norm": 0.236328125, + "learning_rate": 0.0001774557145640911, + "loss": 1.6148, + "step": 2797 + }, + { + "epoch": 0.8746483276023758, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017744017124609083, + "loss": 1.4968, + "step": 2798 + }, + { + "epoch": 0.8749609252891528, + "grad_norm": 0.24609375, + "learning_rate": 0.00017742462325292873, + "loss": 1.6438, + "step": 2799 + }, + { + "epoch": 0.87527352297593, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001774090705855435, + "loss": 1.8157, + "step": 2800 + }, + { + "epoch": 0.8755861206627071, + "grad_norm": 0.2314453125, + "learning_rate": 0.000177393513244874, + "loss": 1.8969, + "step": 2801 + }, + { + "epoch": 0.8758987183494842, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001773779512318595, + "loss": 1.7561, + "step": 2802 + }, + { + "epoch": 0.8762113160362613, + "grad_norm": 0.2421875, + "learning_rate": 0.00017736238454743946, + "loss": 1.8387, + "step": 2803 + }, + { + "epoch": 0.8765239137230384, + "grad_norm": 0.2421875, + "learning_rate": 0.0001773468131925537, + "loss": 1.8426, + "step": 2804 + }, + { + "epoch": 0.8768365114098156, + "grad_norm": 0.25, + "learning_rate": 0.00017733123716814225, + "loss": 1.5613, + "step": 2805 + }, + { + "epoch": 0.8771491090965927, + "grad_norm": 0.255859375, + "learning_rate": 0.0001773156564751455, + "loss": 1.9907, + "step": 2806 + }, + { + "epoch": 0.8774617067833698, + "grad_norm": 0.232421875, + "learning_rate": 0.00017730007111450402, + "loss": 1.3814, + "step": 2807 + }, + { + "epoch": 0.8777743044701469, + "grad_norm": 0.23046875, + "learning_rate": 0.00017728448108715874, + "loss": 1.459, + "step": 2808 + }, + { + "epoch": 0.8780869021569241, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017726888639405086, + "loss": 1.6541, + "step": 2809 + }, + { + "epoch": 0.8783994998437011, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017725328703612183, + "loss": 1.6136, + "step": 2810 + }, + { + "epoch": 0.8787120975304783, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017723768301431344, + "loss": 1.9023, + "step": 2811 + }, + { + "epoch": 0.8790246952172553, + "grad_norm": 0.3203125, + "learning_rate": 0.00017722207432956767, + "loss": 2.4062, + "step": 2812 + }, + { + "epoch": 0.8793372929040325, + "grad_norm": 0.232421875, + "learning_rate": 0.00017720646098282687, + "loss": 1.6481, + "step": 2813 + }, + { + "epoch": 0.8796498905908097, + "grad_norm": 0.232421875, + "learning_rate": 0.00017719084297503367, + "loss": 1.7955, + "step": 2814 + }, + { + "epoch": 0.8799624882775867, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001771752203071309, + "loss": 1.7442, + "step": 2815 + }, + { + "epoch": 0.8802750859643639, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001771595929800617, + "loss": 1.9734, + "step": 2816 + }, + { + "epoch": 0.8805876836511409, + "grad_norm": 0.244140625, + "learning_rate": 0.0001771439609947696, + "loss": 1.651, + "step": 2817 + }, + { + "epoch": 0.8809002813379181, + "grad_norm": 0.240234375, + "learning_rate": 0.00017712832435219823, + "loss": 1.6914, + "step": 2818 + }, + { + "epoch": 0.8812128790246953, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017711268305329166, + "loss": 1.9028, + "step": 2819 + }, + { + "epoch": 0.8815254767114723, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017709703709899413, + "loss": 1.7345, + "step": 2820 + }, + { + "epoch": 0.8818380743982495, + "grad_norm": 0.2255859375, + "learning_rate": 0.00017708138649025023, + "loss": 1.8512, + "step": 2821 + }, + { + "epoch": 0.8821506720850266, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001770657312280048, + "loss": 1.6781, + "step": 2822 + }, + { + "epoch": 0.8824632697718037, + "grad_norm": 0.23828125, + "learning_rate": 0.00017705007131320298, + "loss": 1.5084, + "step": 2823 + }, + { + "epoch": 0.8827758674585808, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017703440674679015, + "loss": 1.5801, + "step": 2824 + }, + { + "epoch": 0.8830884651453579, + "grad_norm": 0.2265625, + "learning_rate": 0.00017701873752971206, + "loss": 1.7738, + "step": 2825 + }, + { + "epoch": 0.883401062832135, + "grad_norm": 0.232421875, + "learning_rate": 0.00017700306366291458, + "loss": 1.7093, + "step": 2826 + }, + { + "epoch": 0.8837136605189122, + "grad_norm": 0.23046875, + "learning_rate": 0.00017698738514734406, + "loss": 1.7994, + "step": 2827 + }, + { + "epoch": 0.8840262582056893, + "grad_norm": 0.2216796875, + "learning_rate": 0.00017697170198394696, + "loss": 1.7524, + "step": 2828 + }, + { + "epoch": 0.8843388558924664, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001769560141736702, + "loss": 1.4667, + "step": 2829 + }, + { + "epoch": 0.8846514535792435, + "grad_norm": 0.23828125, + "learning_rate": 0.00017694032171746072, + "loss": 1.4843, + "step": 2830 + }, + { + "epoch": 0.8849640512660206, + "grad_norm": 0.240234375, + "learning_rate": 0.000176924624616266, + "loss": 1.4988, + "step": 2831 + }, + { + "epoch": 0.8852766489527978, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017690892287103367, + "loss": 1.5816, + "step": 2832 + }, + { + "epoch": 0.8855892466395748, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017689321648271166, + "loss": 1.7245, + "step": 2833 + }, + { + "epoch": 0.885901844326352, + "grad_norm": 0.2216796875, + "learning_rate": 0.00017687750545224815, + "loss": 1.7804, + "step": 2834 + }, + { + "epoch": 0.8862144420131292, + "grad_norm": 0.251953125, + "learning_rate": 0.0001768617897805917, + "loss": 1.5097, + "step": 2835 + }, + { + "epoch": 0.8865270396999062, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017684606946869106, + "loss": 1.5496, + "step": 2836 + }, + { + "epoch": 0.8868396373866834, + "grad_norm": 0.236328125, + "learning_rate": 0.00017683034451749526, + "loss": 1.829, + "step": 2837 + }, + { + "epoch": 0.8871522350734604, + "grad_norm": 0.251953125, + "learning_rate": 0.0001768146149279537, + "loss": 1.4844, + "step": 2838 + }, + { + "epoch": 0.8874648327602376, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017679888070101592, + "loss": 1.7066, + "step": 2839 + }, + { + "epoch": 0.8877774304470147, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017678314183763183, + "loss": 1.5307, + "step": 2840 + }, + { + "epoch": 0.8880900281337918, + "grad_norm": 0.240234375, + "learning_rate": 0.00017676739833875164, + "loss": 1.4304, + "step": 2841 + }, + { + "epoch": 0.888402625820569, + "grad_norm": 0.22265625, + "learning_rate": 0.00017675165020532578, + "loss": 1.6068, + "step": 2842 + }, + { + "epoch": 0.888715223507346, + "grad_norm": 0.244140625, + "learning_rate": 0.000176735897438305, + "loss": 1.4709, + "step": 2843 + }, + { + "epoch": 0.8890278211941232, + "grad_norm": 0.23828125, + "learning_rate": 0.00017672014003864033, + "loss": 1.6562, + "step": 2844 + }, + { + "epoch": 0.8893404188809003, + "grad_norm": 0.2265625, + "learning_rate": 0.000176704378007283, + "loss": 1.8352, + "step": 2845 + }, + { + "epoch": 0.8896530165676774, + "grad_norm": 0.259765625, + "learning_rate": 0.0001766886113451846, + "loss": 1.8639, + "step": 2846 + }, + { + "epoch": 0.8899656142544545, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017667284005329708, + "loss": 1.6163, + "step": 2847 + }, + { + "epoch": 0.8902782119412317, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017665706413257245, + "loss": 1.7933, + "step": 2848 + }, + { + "epoch": 0.8905908096280087, + "grad_norm": 0.232421875, + "learning_rate": 0.0001766412835839632, + "loss": 1.6013, + "step": 2849 + }, + { + "epoch": 0.8909034073147859, + "grad_norm": 0.248046875, + "learning_rate": 0.000176625498408422, + "loss": 1.6694, + "step": 2850 + }, + { + "epoch": 0.891216005001563, + "grad_norm": 0.25390625, + "learning_rate": 0.0001766097086069018, + "loss": 1.6816, + "step": 2851 + }, + { + "epoch": 0.8915286026883401, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017659391418035588, + "loss": 1.7289, + "step": 2852 + }, + { + "epoch": 0.8918412003751173, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001765781151297377, + "loss": 1.4146, + "step": 2853 + }, + { + "epoch": 0.8921537980618943, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001765623114560012, + "loss": 1.6338, + "step": 2854 + }, + { + "epoch": 0.8924663957486715, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017654650316010036, + "loss": 1.623, + "step": 2855 + }, + { + "epoch": 0.8927789934354485, + "grad_norm": 0.24609375, + "learning_rate": 0.00017653069024298957, + "loss": 1.6547, + "step": 2856 + }, + { + "epoch": 0.8930915911222257, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001765148727056235, + "loss": 1.7697, + "step": 2857 + }, + { + "epoch": 0.8934041888090029, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017649905054895705, + "loss": 1.7488, + "step": 2858 + }, + { + "epoch": 0.8937167864957799, + "grad_norm": 0.3046875, + "learning_rate": 0.00017648322377394546, + "loss": 2.1237, + "step": 2859 + }, + { + "epoch": 0.8940293841825571, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017646739238154417, + "loss": 1.6839, + "step": 2860 + }, + { + "epoch": 0.8943419818693341, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017645155637270897, + "loss": 1.6423, + "step": 2861 + }, + { + "epoch": 0.8946545795561113, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017643571574839587, + "loss": 1.7184, + "step": 2862 + }, + { + "epoch": 0.8949671772428884, + "grad_norm": 0.251953125, + "learning_rate": 0.00017641987050956122, + "loss": 1.8355, + "step": 2863 + }, + { + "epoch": 0.8952797749296655, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001764040206571616, + "loss": 1.6686, + "step": 2864 + }, + { + "epoch": 0.8955923726164426, + "grad_norm": 0.23046875, + "learning_rate": 0.00017638816619215388, + "loss": 1.7545, + "step": 2865 + }, + { + "epoch": 0.8959049703032198, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017637230711549525, + "loss": 1.7738, + "step": 2866 + }, + { + "epoch": 0.8962175679899969, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001763564434281431, + "loss": 1.7099, + "step": 2867 + }, + { + "epoch": 0.896530165676774, + "grad_norm": 0.23046875, + "learning_rate": 0.00017634057513105515, + "loss": 1.6731, + "step": 2868 + }, + { + "epoch": 0.8968427633635511, + "grad_norm": 0.244140625, + "learning_rate": 0.0001763247022251894, + "loss": 1.4654, + "step": 2869 + }, + { + "epoch": 0.8971553610503282, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017630882471150413, + "loss": 1.7359, + "step": 2870 + }, + { + "epoch": 0.8974679587371054, + "grad_norm": 0.2421875, + "learning_rate": 0.00017629294259095785, + "loss": 1.5702, + "step": 2871 + }, + { + "epoch": 0.8977805564238824, + "grad_norm": 0.26171875, + "learning_rate": 0.00017627705586450944, + "loss": 2.429, + "step": 2872 + }, + { + "epoch": 0.8980931541106596, + "grad_norm": 0.25, + "learning_rate": 0.00017626116453311794, + "loss": 1.8714, + "step": 2873 + }, + { + "epoch": 0.8984057517974366, + "grad_norm": 0.2421875, + "learning_rate": 0.00017624526859774274, + "loss": 1.592, + "step": 2874 + }, + { + "epoch": 0.8987183494842138, + "grad_norm": 0.234375, + "learning_rate": 0.00017622936805934355, + "loss": 1.9351, + "step": 2875 + }, + { + "epoch": 0.899030947170991, + "grad_norm": 0.244140625, + "learning_rate": 0.00017621346291888025, + "loss": 1.5676, + "step": 2876 + }, + { + "epoch": 0.899343544857768, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001761975531773131, + "loss": 2.0676, + "step": 2877 + }, + { + "epoch": 0.8996561425445452, + "grad_norm": 0.234375, + "learning_rate": 0.00017618163883560255, + "loss": 1.8676, + "step": 2878 + }, + { + "epoch": 0.8999687402313223, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017616571989470937, + "loss": 1.6823, + "step": 2879 + }, + { + "epoch": 0.9002813379180994, + "grad_norm": 0.24609375, + "learning_rate": 0.00017614979635559462, + "loss": 1.6829, + "step": 2880 + }, + { + "epoch": 0.9005939356048765, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017613386821921964, + "loss": 1.3811, + "step": 2881 + }, + { + "epoch": 0.9009065332916536, + "grad_norm": 0.259765625, + "learning_rate": 0.00017611793548654602, + "loss": 1.3734, + "step": 2882 + }, + { + "epoch": 0.9012191309784308, + "grad_norm": 0.220703125, + "learning_rate": 0.00017610199815853563, + "loss": 1.8464, + "step": 2883 + }, + { + "epoch": 0.9015317286652079, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017608605623615063, + "loss": 1.4275, + "step": 2884 + }, + { + "epoch": 0.901844326351985, + "grad_norm": 0.232421875, + "learning_rate": 0.00017607010972035348, + "loss": 1.5875, + "step": 2885 + }, + { + "epoch": 0.9021569240387621, + "grad_norm": 0.236328125, + "learning_rate": 0.00017605415861210685, + "loss": 1.8575, + "step": 2886 + }, + { + "epoch": 0.9024695217255392, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017603820291237375, + "loss": 1.8156, + "step": 2887 + }, + { + "epoch": 0.9027821194123163, + "grad_norm": 0.240234375, + "learning_rate": 0.00017602224262211743, + "loss": 1.4908, + "step": 2888 + }, + { + "epoch": 0.9030947170990935, + "grad_norm": 0.244140625, + "learning_rate": 0.00017600627774230144, + "loss": 1.7584, + "step": 2889 + }, + { + "epoch": 0.9034073147858706, + "grad_norm": 0.25, + "learning_rate": 0.00017599030827388965, + "loss": 1.7706, + "step": 2890 + }, + { + "epoch": 0.9037199124726477, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001759743342178461, + "loss": 1.6771, + "step": 2891 + }, + { + "epoch": 0.9040325101594249, + "grad_norm": 0.251953125, + "learning_rate": 0.00017595835557513516, + "loss": 1.838, + "step": 2892 + }, + { + "epoch": 0.9043451078462019, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017594237234672152, + "loss": 1.7833, + "step": 2893 + }, + { + "epoch": 0.9046577055329791, + "grad_norm": 0.244140625, + "learning_rate": 0.00017592638453357005, + "loss": 1.8564, + "step": 2894 + }, + { + "epoch": 0.9049703032197561, + "grad_norm": 0.236328125, + "learning_rate": 0.000175910392136646, + "loss": 1.4054, + "step": 2895 + }, + { + "epoch": 0.9052829009065333, + "grad_norm": 0.234375, + "learning_rate": 0.00017589439515691487, + "loss": 1.7344, + "step": 2896 + }, + { + "epoch": 0.9055954985933105, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001758783935953424, + "loss": 1.6391, + "step": 2897 + }, + { + "epoch": 0.9059080962800875, + "grad_norm": 0.236328125, + "learning_rate": 0.00017586238745289457, + "loss": 1.6244, + "step": 2898 + }, + { + "epoch": 0.9062206939668647, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017584637673053778, + "loss": 1.6056, + "step": 2899 + }, + { + "epoch": 0.9065332916536417, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017583036142923856, + "loss": 1.7858, + "step": 2900 + }, + { + "epoch": 0.9068458893404189, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001758143415499638, + "loss": 1.6028, + "step": 2901 + }, + { + "epoch": 0.907158487027196, + "grad_norm": 0.23828125, + "learning_rate": 0.0001757983170936806, + "loss": 1.6918, + "step": 2902 + }, + { + "epoch": 0.9074710847139731, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017578228806135643, + "loss": 1.9901, + "step": 2903 + }, + { + "epoch": 0.9077836824007502, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017576625445395893, + "loss": 1.5383, + "step": 2904 + }, + { + "epoch": 0.9080962800875274, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017575021627245612, + "loss": 1.5068, + "step": 2905 + }, + { + "epoch": 0.9084088777743045, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017573417351781625, + "loss": 1.8062, + "step": 2906 + }, + { + "epoch": 0.9087214754610816, + "grad_norm": 0.2275390625, + "learning_rate": 0.00017571812619100778, + "loss": 1.4791, + "step": 2907 + }, + { + "epoch": 0.9090340731478587, + "grad_norm": 0.255859375, + "learning_rate": 0.00017570207429299956, + "loss": 1.7496, + "step": 2908 + }, + { + "epoch": 0.9093466708346358, + "grad_norm": 0.244140625, + "learning_rate": 0.00017568601782476064, + "loss": 1.5202, + "step": 2909 + }, + { + "epoch": 0.909659268521413, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017566995678726038, + "loss": 1.6579, + "step": 2910 + }, + { + "epoch": 0.90997186620819, + "grad_norm": 0.251953125, + "learning_rate": 0.0001756538911814684, + "loss": 1.606, + "step": 2911 + }, + { + "epoch": 0.9102844638949672, + "grad_norm": 0.244140625, + "learning_rate": 0.0001756378210083546, + "loss": 1.6417, + "step": 2912 + }, + { + "epoch": 0.9105970615817442, + "grad_norm": 0.232421875, + "learning_rate": 0.00017562174626888918, + "loss": 1.6654, + "step": 2913 + }, + { + "epoch": 0.9109096592685214, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017560566696404254, + "loss": 1.676, + "step": 2914 + }, + { + "epoch": 0.9112222569552986, + "grad_norm": 0.240234375, + "learning_rate": 0.00017558958309478543, + "loss": 1.5845, + "step": 2915 + }, + { + "epoch": 0.9115348546420756, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001755734946620889, + "loss": 1.5907, + "step": 2916 + }, + { + "epoch": 0.9118474523288528, + "grad_norm": 0.23828125, + "learning_rate": 0.00017555740166692418, + "loss": 1.8526, + "step": 2917 + }, + { + "epoch": 0.9121600500156298, + "grad_norm": 0.255859375, + "learning_rate": 0.00017554130411026283, + "loss": 1.4743, + "step": 2918 + }, + { + "epoch": 0.912472647702407, + "grad_norm": 0.2421875, + "learning_rate": 0.0001755252019930767, + "loss": 1.4929, + "step": 2919 + }, + { + "epoch": 0.9127852453891842, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001755090953163379, + "loss": 1.4583, + "step": 2920 + }, + { + "epoch": 0.9130978430759612, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017549298408101876, + "loss": 1.7967, + "step": 2921 + }, + { + "epoch": 0.9134104407627384, + "grad_norm": 0.244140625, + "learning_rate": 0.00017547686828809196, + "loss": 1.9172, + "step": 2922 + }, + { + "epoch": 0.9137230384495155, + "grad_norm": 0.244140625, + "learning_rate": 0.00017546074793853048, + "loss": 1.5975, + "step": 2923 + }, + { + "epoch": 0.9140356361362926, + "grad_norm": 0.2255859375, + "learning_rate": 0.00017544462303330748, + "loss": 1.8838, + "step": 2924 + }, + { + "epoch": 0.9143482338230697, + "grad_norm": 0.2421875, + "learning_rate": 0.00017542849357339644, + "loss": 1.8619, + "step": 2925 + }, + { + "epoch": 0.9146608315098468, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017541235955977112, + "loss": 1.6366, + "step": 2926 + }, + { + "epoch": 0.9149734291966239, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017539622099340554, + "loss": 1.5817, + "step": 2927 + }, + { + "epoch": 0.9152860268834011, + "grad_norm": 0.248046875, + "learning_rate": 0.000175380077875274, + "loss": 1.5323, + "step": 2928 + }, + { + "epoch": 0.9155986245701782, + "grad_norm": 0.240234375, + "learning_rate": 0.00017536393020635118, + "loss": 1.762, + "step": 2929 + }, + { + "epoch": 0.9159112222569553, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001753477779876118, + "loss": 1.5217, + "step": 2930 + }, + { + "epoch": 0.9162238199437324, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017533162122003107, + "loss": 1.6377, + "step": 2931 + }, + { + "epoch": 0.9165364176305095, + "grad_norm": 0.25, + "learning_rate": 0.00017531545990458436, + "loss": 1.5614, + "step": 2932 + }, + { + "epoch": 0.9168490153172867, + "grad_norm": 0.25390625, + "learning_rate": 0.00017529929404224733, + "loss": 1.9785, + "step": 2933 + }, + { + "epoch": 0.9171616130040637, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017528312363399598, + "loss": 1.6278, + "step": 2934 + }, + { + "epoch": 0.9174742106908409, + "grad_norm": 0.2421875, + "learning_rate": 0.00017526694868080656, + "loss": 1.62, + "step": 2935 + }, + { + "epoch": 0.9177868083776181, + "grad_norm": 0.244140625, + "learning_rate": 0.0001752507691836555, + "loss": 1.66, + "step": 2936 + }, + { + "epoch": 0.9180994060643951, + "grad_norm": 0.234375, + "learning_rate": 0.00017523458514351963, + "loss": 1.711, + "step": 2937 + }, + { + "epoch": 0.9184120037511723, + "grad_norm": 0.236328125, + "learning_rate": 0.00017521839656137598, + "loss": 1.606, + "step": 2938 + }, + { + "epoch": 0.9187246014379493, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017520220343820184, + "loss": 1.8548, + "step": 2939 + }, + { + "epoch": 0.9190371991247265, + "grad_norm": 0.26171875, + "learning_rate": 0.00017518600577497487, + "loss": 1.6217, + "step": 2940 + }, + { + "epoch": 0.9193497968115036, + "grad_norm": 0.326171875, + "learning_rate": 0.00017516980357267295, + "loss": 2.4887, + "step": 2941 + }, + { + "epoch": 0.9196623944982807, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017515359683227416, + "loss": 1.7841, + "step": 2942 + }, + { + "epoch": 0.9199749921850578, + "grad_norm": 0.2314453125, + "learning_rate": 0.00017513738555475697, + "loss": 1.7065, + "step": 2943 + }, + { + "epoch": 0.9202875898718349, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001751211697411001, + "loss": 1.7469, + "step": 2944 + }, + { + "epoch": 0.9206001875586121, + "grad_norm": 0.228515625, + "learning_rate": 0.00017510494939228246, + "loss": 1.5839, + "step": 2945 + }, + { + "epoch": 0.9209127852453892, + "grad_norm": 0.24609375, + "learning_rate": 0.0001750887245092833, + "loss": 1.7413, + "step": 2946 + }, + { + "epoch": 0.9212253829321663, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017507249509308217, + "loss": 1.433, + "step": 2947 + }, + { + "epoch": 0.9215379806189434, + "grad_norm": 0.244140625, + "learning_rate": 0.00017505626114465886, + "loss": 1.5907, + "step": 2948 + }, + { + "epoch": 0.9218505783057206, + "grad_norm": 0.25, + "learning_rate": 0.0001750400226649934, + "loss": 1.6737, + "step": 2949 + }, + { + "epoch": 0.9221631759924976, + "grad_norm": 0.25390625, + "learning_rate": 0.00017502377965506613, + "loss": 1.5084, + "step": 2950 + }, + { + "epoch": 0.9224757736792748, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017500753211585772, + "loss": 1.4999, + "step": 2951 + }, + { + "epoch": 0.9227883713660519, + "grad_norm": 0.2451171875, + "learning_rate": 0.000174991280048349, + "loss": 1.6843, + "step": 2952 + }, + { + "epoch": 0.923100969052829, + "grad_norm": 0.2421875, + "learning_rate": 0.00017497502345352112, + "loss": 1.6222, + "step": 2953 + }, + { + "epoch": 0.9234135667396062, + "grad_norm": 0.23828125, + "learning_rate": 0.00017495876233235554, + "loss": 1.5935, + "step": 2954 + }, + { + "epoch": 0.9237261644263832, + "grad_norm": 0.25390625, + "learning_rate": 0.000174942496685834, + "loss": 1.9363, + "step": 2955 + }, + { + "epoch": 0.9240387621131604, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017492622651493837, + "loss": 1.8212, + "step": 2956 + }, + { + "epoch": 0.9243513597999374, + "grad_norm": 0.255859375, + "learning_rate": 0.000174909951820651, + "loss": 1.8014, + "step": 2957 + }, + { + "epoch": 0.9246639574867146, + "grad_norm": 0.251953125, + "learning_rate": 0.00017489367260395438, + "loss": 1.7982, + "step": 2958 + }, + { + "epoch": 0.9249765551734918, + "grad_norm": 0.240234375, + "learning_rate": 0.0001748773888658313, + "loss": 1.6039, + "step": 2959 + }, + { + "epoch": 0.9252891528602688, + "grad_norm": 0.24609375, + "learning_rate": 0.00017486110060726485, + "loss": 1.8941, + "step": 2960 + }, + { + "epoch": 0.925601750547046, + "grad_norm": 0.25, + "learning_rate": 0.00017484480782923835, + "loss": 2.0574, + "step": 2961 + }, + { + "epoch": 0.925914348233823, + "grad_norm": 0.244140625, + "learning_rate": 0.00017482851053273542, + "loss": 1.404, + "step": 2962 + }, + { + "epoch": 0.9262269459206002, + "grad_norm": 0.23046875, + "learning_rate": 0.00017481220871873996, + "loss": 1.6843, + "step": 2963 + }, + { + "epoch": 0.9265395436073773, + "grad_norm": 0.263671875, + "learning_rate": 0.00017479590238823613, + "loss": 1.61, + "step": 2964 + }, + { + "epoch": 0.9268521412941544, + "grad_norm": 0.388671875, + "learning_rate": 0.00017477959154220835, + "loss": 2.4723, + "step": 2965 + }, + { + "epoch": 0.9271647389809315, + "grad_norm": 0.240234375, + "learning_rate": 0.0001747632761816413, + "loss": 1.6597, + "step": 2966 + }, + { + "epoch": 0.9274773366677087, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017474695630752008, + "loss": 1.5784, + "step": 2967 + }, + { + "epoch": 0.9277899343544858, + "grad_norm": 0.2275390625, + "learning_rate": 0.00017473063192082982, + "loss": 1.8403, + "step": 2968 + }, + { + "epoch": 0.9281025320412629, + "grad_norm": 0.25, + "learning_rate": 0.00017471430302255604, + "loss": 1.8024, + "step": 2969 + }, + { + "epoch": 0.92841512972804, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017469796961368462, + "loss": 1.714, + "step": 2970 + }, + { + "epoch": 0.9287277274148171, + "grad_norm": 0.251953125, + "learning_rate": 0.00017468163169520156, + "loss": 1.4359, + "step": 2971 + }, + { + "epoch": 0.9290403251015943, + "grad_norm": 0.23828125, + "learning_rate": 0.00017466528926809324, + "loss": 1.6177, + "step": 2972 + }, + { + "epoch": 0.9293529227883713, + "grad_norm": 0.2578125, + "learning_rate": 0.00017464894233334627, + "loss": 1.9172, + "step": 2973 + }, + { + "epoch": 0.9296655204751485, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017463259089194752, + "loss": 2.023, + "step": 2974 + }, + { + "epoch": 0.9299781181619255, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017461623494488416, + "loss": 1.3345, + "step": 2975 + }, + { + "epoch": 0.9302907158487027, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001745998744931436, + "loss": 1.6451, + "step": 2976 + }, + { + "epoch": 0.9306033135354799, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017458350953771355, + "loss": 1.4398, + "step": 2977 + }, + { + "epoch": 0.9309159112222569, + "grad_norm": 0.236328125, + "learning_rate": 0.000174567140079582, + "loss": 1.4698, + "step": 2978 + }, + { + "epoch": 0.9312285089090341, + "grad_norm": 0.2421875, + "learning_rate": 0.00017455076611973716, + "loss": 1.586, + "step": 2979 + }, + { + "epoch": 0.9315411065958112, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017453438765916758, + "loss": 1.4608, + "step": 2980 + }, + { + "epoch": 0.9318537042825883, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017451800469886207, + "loss": 1.7327, + "step": 2981 + }, + { + "epoch": 0.9321663019693655, + "grad_norm": 0.232421875, + "learning_rate": 0.0001745016172398096, + "loss": 1.7701, + "step": 2982 + }, + { + "epoch": 0.9324788996561425, + "grad_norm": 0.2421875, + "learning_rate": 0.0001744852252829996, + "loss": 1.6054, + "step": 2983 + }, + { + "epoch": 0.9327914973429197, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017446882882942162, + "loss": 1.7484, + "step": 2984 + }, + { + "epoch": 0.9331040950296968, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017445242788006552, + "loss": 1.6647, + "step": 2985 + }, + { + "epoch": 0.9334166927164739, + "grad_norm": 0.248046875, + "learning_rate": 0.0001744360224359215, + "loss": 1.6536, + "step": 2986 + }, + { + "epoch": 0.933729290403251, + "grad_norm": 0.25, + "learning_rate": 0.00017441961249797995, + "loss": 1.9033, + "step": 2987 + }, + { + "epoch": 0.9340418880900281, + "grad_norm": 0.24609375, + "learning_rate": 0.00017440319806723157, + "loss": 1.5145, + "step": 2988 + }, + { + "epoch": 0.9343544857768052, + "grad_norm": 0.25390625, + "learning_rate": 0.0001743867791446673, + "loss": 1.6766, + "step": 2989 + }, + { + "epoch": 0.9346670834635824, + "grad_norm": 0.232421875, + "learning_rate": 0.00017437035573127836, + "loss": 1.5665, + "step": 2990 + }, + { + "epoch": 0.9349796811503595, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017435392782805628, + "loss": 1.7932, + "step": 2991 + }, + { + "epoch": 0.9352922788371366, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017433749543599287, + "loss": 1.595, + "step": 2992 + }, + { + "epoch": 0.9356048765239138, + "grad_norm": 0.228515625, + "learning_rate": 0.00017432105855608008, + "loss": 1.7333, + "step": 2993 + }, + { + "epoch": 0.9359174742106908, + "grad_norm": 0.240234375, + "learning_rate": 0.0001743046171893103, + "loss": 1.6385, + "step": 2994 + }, + { + "epoch": 0.936230071897468, + "grad_norm": 0.25390625, + "learning_rate": 0.0001742881713366761, + "loss": 1.7989, + "step": 2995 + }, + { + "epoch": 0.936542669584245, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017427172099917032, + "loss": 1.5065, + "step": 2996 + }, + { + "epoch": 0.9368552672710222, + "grad_norm": 0.244140625, + "learning_rate": 0.0001742552661777861, + "loss": 1.6564, + "step": 2997 + }, + { + "epoch": 0.9371678649577994, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017423880687351685, + "loss": 1.5779, + "step": 2998 + }, + { + "epoch": 0.9374804626445764, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001742223430873562, + "loss": 1.7974, + "step": 2999 + }, + { + "epoch": 0.9377930603313536, + "grad_norm": 0.234375, + "learning_rate": 0.0001742058748202981, + "loss": 1.4744, + "step": 3000 + }, + { + "epoch": 0.9381056580181306, + "grad_norm": 0.236328125, + "learning_rate": 0.0001741894020733368, + "loss": 1.6008, + "step": 3001 + }, + { + "epoch": 0.9384182557049078, + "grad_norm": 0.248046875, + "learning_rate": 0.00017417292484746676, + "loss": 1.5435, + "step": 3002 + }, + { + "epoch": 0.9387308533916849, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017415644314368274, + "loss": 1.6641, + "step": 3003 + }, + { + "epoch": 0.939043451078462, + "grad_norm": 0.244140625, + "learning_rate": 0.00017413995696297972, + "loss": 1.661, + "step": 3004 + }, + { + "epoch": 0.9393560487652391, + "grad_norm": 0.248046875, + "learning_rate": 0.00017412346630635303, + "loss": 1.5462, + "step": 3005 + }, + { + "epoch": 0.9396686464520163, + "grad_norm": 0.263671875, + "learning_rate": 0.00017410697117479823, + "loss": 1.7804, + "step": 3006 + }, + { + "epoch": 0.9399812441387934, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017409047156931114, + "loss": 1.8893, + "step": 3007 + }, + { + "epoch": 0.9402938418255705, + "grad_norm": 0.248046875, + "learning_rate": 0.00017407396749088787, + "loss": 1.5371, + "step": 3008 + }, + { + "epoch": 0.9406064395123476, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017405745894052477, + "loss": 1.5866, + "step": 3009 + }, + { + "epoch": 0.9409190371991247, + "grad_norm": 0.24609375, + "learning_rate": 0.00017404094591921853, + "loss": 1.5388, + "step": 3010 + }, + { + "epoch": 0.9412316348859019, + "grad_norm": 0.25390625, + "learning_rate": 0.00017402442842796604, + "loss": 1.438, + "step": 3011 + }, + { + "epoch": 0.9415442325726789, + "grad_norm": 0.251953125, + "learning_rate": 0.00017400790646776443, + "loss": 1.892, + "step": 3012 + }, + { + "epoch": 0.9418568302594561, + "grad_norm": 0.24609375, + "learning_rate": 0.00017399138003961124, + "loss": 1.4763, + "step": 3013 + }, + { + "epoch": 0.9421694279462332, + "grad_norm": 0.25, + "learning_rate": 0.0001739748491445041, + "loss": 1.6418, + "step": 3014 + }, + { + "epoch": 0.9424820256330103, + "grad_norm": 0.240234375, + "learning_rate": 0.00017395831378344112, + "loss": 1.7746, + "step": 3015 + }, + { + "epoch": 0.9427946233197875, + "grad_norm": 0.236328125, + "learning_rate": 0.00017394177395742047, + "loss": 1.8002, + "step": 3016 + }, + { + "epoch": 0.9431072210065645, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017392522966744068, + "loss": 1.686, + "step": 3017 + }, + { + "epoch": 0.9434198186933417, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017390868091450055, + "loss": 1.6964, + "step": 3018 + }, + { + "epoch": 0.9437324163801187, + "grad_norm": 0.240234375, + "learning_rate": 0.00017389212769959922, + "loss": 1.656, + "step": 3019 + }, + { + "epoch": 0.9440450140668959, + "grad_norm": 0.240234375, + "learning_rate": 0.00017387557002373596, + "loss": 1.6357, + "step": 3020 + }, + { + "epoch": 0.944357611753673, + "grad_norm": 0.236328125, + "learning_rate": 0.00017385900788791038, + "loss": 1.8136, + "step": 3021 + }, + { + "epoch": 0.9446702094404501, + "grad_norm": 0.236328125, + "learning_rate": 0.00017384244129312239, + "loss": 1.5841, + "step": 3022 + }, + { + "epoch": 0.9449828071272273, + "grad_norm": 0.23828125, + "learning_rate": 0.00017382587024037212, + "loss": 1.5595, + "step": 3023 + }, + { + "epoch": 0.9452954048140044, + "grad_norm": 0.248046875, + "learning_rate": 0.00017380929473066, + "loss": 1.6447, + "step": 3024 + }, + { + "epoch": 0.9456080025007815, + "grad_norm": 0.2421875, + "learning_rate": 0.00017379271476498665, + "loss": 1.6323, + "step": 3025 + }, + { + "epoch": 0.9459206001875586, + "grad_norm": 0.25, + "learning_rate": 0.00017377613034435315, + "loss": 1.62, + "step": 3026 + }, + { + "epoch": 0.9462331978743357, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017375954146976058, + "loss": 1.4751, + "step": 3027 + }, + { + "epoch": 0.9465457955611128, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017374294814221055, + "loss": 2.2368, + "step": 3028 + }, + { + "epoch": 0.94685839324789, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017372635036270472, + "loss": 1.7495, + "step": 3029 + }, + { + "epoch": 0.9471709909346671, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001737097481322452, + "loss": 1.9299, + "step": 3030 + }, + { + "epoch": 0.9474835886214442, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017369314145183426, + "loss": 1.5842, + "step": 3031 + }, + { + "epoch": 0.9477961863082213, + "grad_norm": 0.240234375, + "learning_rate": 0.00017367653032247446, + "loss": 1.6439, + "step": 3032 + }, + { + "epoch": 0.9481087839949984, + "grad_norm": 0.24609375, + "learning_rate": 0.0001736599147451686, + "loss": 1.7489, + "step": 3033 + }, + { + "epoch": 0.9484213816817756, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017364329472091986, + "loss": 1.6981, + "step": 3034 + }, + { + "epoch": 0.9487339793685526, + "grad_norm": 0.24609375, + "learning_rate": 0.0001736266702507316, + "loss": 1.98, + "step": 3035 + }, + { + "epoch": 0.9490465770553298, + "grad_norm": 0.24609375, + "learning_rate": 0.0001736100413356074, + "loss": 1.5686, + "step": 3036 + }, + { + "epoch": 0.949359174742107, + "grad_norm": 0.2421875, + "learning_rate": 0.00017359340797655116, + "loss": 1.6756, + "step": 3037 + }, + { + "epoch": 0.949671772428884, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017357677017456715, + "loss": 1.6345, + "step": 3038 + }, + { + "epoch": 0.9499843701156612, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017356012793065976, + "loss": 1.6958, + "step": 3039 + }, + { + "epoch": 0.9502969678024382, + "grad_norm": 0.234375, + "learning_rate": 0.0001735434812458337, + "loss": 1.6856, + "step": 3040 + }, + { + "epoch": 0.9506095654892154, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017352683012109395, + "loss": 1.6888, + "step": 3041 + }, + { + "epoch": 0.9509221631759925, + "grad_norm": 0.25, + "learning_rate": 0.0001735101745574458, + "loss": 1.7944, + "step": 3042 + }, + { + "epoch": 0.9512347608627696, + "grad_norm": 0.244140625, + "learning_rate": 0.0001734935145558947, + "loss": 1.4633, + "step": 3043 + }, + { + "epoch": 0.9515473585495468, + "grad_norm": 0.251953125, + "learning_rate": 0.0001734768501174465, + "loss": 1.5549, + "step": 3044 + }, + { + "epoch": 0.9518599562363238, + "grad_norm": 0.24609375, + "learning_rate": 0.00017346018124310723, + "loss": 1.6942, + "step": 3045 + }, + { + "epoch": 0.952172553923101, + "grad_norm": 0.232421875, + "learning_rate": 0.0001734435079338832, + "loss": 1.8094, + "step": 3046 + }, + { + "epoch": 0.9524851516098781, + "grad_norm": 0.244140625, + "learning_rate": 0.00017342683019078102, + "loss": 1.6422, + "step": 3047 + }, + { + "epoch": 0.9527977492966552, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017341014801480748, + "loss": 1.4798, + "step": 3048 + }, + { + "epoch": 0.9531103469834323, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001733934614069698, + "loss": 1.6282, + "step": 3049 + }, + { + "epoch": 0.9534229446702095, + "grad_norm": 0.23828125, + "learning_rate": 0.00017337677036827534, + "loss": 1.5165, + "step": 3050 + }, + { + "epoch": 0.9537355423569865, + "grad_norm": 0.248046875, + "learning_rate": 0.00017336007489973171, + "loss": 1.6635, + "step": 3051 + }, + { + "epoch": 0.9540481400437637, + "grad_norm": 0.2578125, + "learning_rate": 0.00017334337500234687, + "loss": 1.7504, + "step": 3052 + }, + { + "epoch": 0.9543607377305408, + "grad_norm": 0.25, + "learning_rate": 0.00017332667067712905, + "loss": 1.8412, + "step": 3053 + }, + { + "epoch": 0.9546733354173179, + "grad_norm": 0.2421875, + "learning_rate": 0.0001733099619250867, + "loss": 1.616, + "step": 3054 + }, + { + "epoch": 0.9549859331040951, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017329324874722847, + "loss": 1.7954, + "step": 3055 + }, + { + "epoch": 0.9552985307908721, + "grad_norm": 0.2421875, + "learning_rate": 0.00017327653114456343, + "loss": 1.6591, + "step": 3056 + }, + { + "epoch": 0.9556111284776493, + "grad_norm": 0.240234375, + "learning_rate": 0.00017325980911810085, + "loss": 1.6327, + "step": 3057 + }, + { + "epoch": 0.9559237261644263, + "grad_norm": 0.24609375, + "learning_rate": 0.00017324308266885026, + "loss": 1.5621, + "step": 3058 + }, + { + "epoch": 0.9562363238512035, + "grad_norm": 0.2578125, + "learning_rate": 0.00017322635179782138, + "loss": 2.0408, + "step": 3059 + }, + { + "epoch": 0.9565489215379807, + "grad_norm": 0.23828125, + "learning_rate": 0.00017320961650602436, + "loss": 1.5293, + "step": 3060 + }, + { + "epoch": 0.9568615192247577, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017319287679446949, + "loss": 1.5787, + "step": 3061 + }, + { + "epoch": 0.9571741169115349, + "grad_norm": 0.23828125, + "learning_rate": 0.0001731761326641674, + "loss": 1.6182, + "step": 3062 + }, + { + "epoch": 0.957486714598312, + "grad_norm": 0.244140625, + "learning_rate": 0.0001731593841161289, + "loss": 1.6671, + "step": 3063 + }, + { + "epoch": 0.9577993122850891, + "grad_norm": 0.23828125, + "learning_rate": 0.00017314263115136516, + "loss": 1.6618, + "step": 3064 + }, + { + "epoch": 0.9581119099718662, + "grad_norm": 0.25, + "learning_rate": 0.00017312587377088756, + "loss": 1.6887, + "step": 3065 + }, + { + "epoch": 0.9584245076586433, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017310911197570777, + "loss": 1.6217, + "step": 3066 + }, + { + "epoch": 0.9587371053454204, + "grad_norm": 0.240234375, + "learning_rate": 0.00017309234576683778, + "loss": 1.7303, + "step": 3067 + }, + { + "epoch": 0.9590497030321976, + "grad_norm": 0.25, + "learning_rate": 0.0001730755751452897, + "loss": 1.6497, + "step": 3068 + }, + { + "epoch": 0.9593623007189747, + "grad_norm": 0.228515625, + "learning_rate": 0.000173058800112076, + "loss": 1.8203, + "step": 3069 + }, + { + "epoch": 0.9596748984057518, + "grad_norm": 0.255859375, + "learning_rate": 0.00017304202066820948, + "loss": 2.1236, + "step": 3070 + }, + { + "epoch": 0.9599874960925289, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001730252368147031, + "loss": 1.7534, + "step": 3071 + }, + { + "epoch": 0.960300093779306, + "grad_norm": 0.25390625, + "learning_rate": 0.00017300844855257008, + "loss": 1.6816, + "step": 3072 + }, + { + "epoch": 0.9606126914660832, + "grad_norm": 0.2373046875, + "learning_rate": 0.000172991655882824, + "loss": 1.5992, + "step": 3073 + }, + { + "epoch": 0.9609252891528602, + "grad_norm": 0.24609375, + "learning_rate": 0.00017297485880647862, + "loss": 1.8889, + "step": 3074 + }, + { + "epoch": 0.9612378868396374, + "grad_norm": 0.240234375, + "learning_rate": 0.00017295805732454804, + "loss": 1.6511, + "step": 3075 + }, + { + "epoch": 0.9615504845264145, + "grad_norm": 0.265625, + "learning_rate": 0.00017294125143804657, + "loss": 1.7686, + "step": 3076 + }, + { + "epoch": 0.9618630822131916, + "grad_norm": 0.26953125, + "learning_rate": 0.0001729244411479888, + "loss": 1.7564, + "step": 3077 + }, + { + "epoch": 0.9621756798999688, + "grad_norm": 0.23046875, + "learning_rate": 0.0001729076264553896, + "loss": 1.6458, + "step": 3078 + }, + { + "epoch": 0.9624882775867458, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017289080736126409, + "loss": 1.698, + "step": 3079 + }, + { + "epoch": 0.962800875273523, + "grad_norm": 0.25, + "learning_rate": 0.00017287398386662764, + "loss": 1.684, + "step": 3080 + }, + { + "epoch": 0.9631134729603001, + "grad_norm": 0.234375, + "learning_rate": 0.0001728571559724959, + "loss": 1.7003, + "step": 3081 + }, + { + "epoch": 0.9634260706470772, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017284032367988482, + "loss": 1.5827, + "step": 3082 + }, + { + "epoch": 0.9637386683338544, + "grad_norm": 0.240234375, + "learning_rate": 0.0001728234869898106, + "loss": 1.7952, + "step": 3083 + }, + { + "epoch": 0.9640512660206314, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017280664590328966, + "loss": 1.5528, + "step": 3084 + }, + { + "epoch": 0.9643638637074086, + "grad_norm": 0.25, + "learning_rate": 0.0001727898004213387, + "loss": 1.8732, + "step": 3085 + }, + { + "epoch": 0.9646764613941857, + "grad_norm": 0.234375, + "learning_rate": 0.00017277295054497478, + "loss": 1.5453, + "step": 3086 + }, + { + "epoch": 0.9649890590809628, + "grad_norm": 0.25, + "learning_rate": 0.00017275609627521508, + "loss": 1.8652, + "step": 3087 + }, + { + "epoch": 0.9653016567677399, + "grad_norm": 0.23828125, + "learning_rate": 0.00017273923761307712, + "loss": 1.5761, + "step": 3088 + }, + { + "epoch": 0.965614254454517, + "grad_norm": 0.232421875, + "learning_rate": 0.00017272237455957868, + "loss": 1.3679, + "step": 3089 + }, + { + "epoch": 0.9659268521412941, + "grad_norm": 0.365234375, + "learning_rate": 0.00017270550711573788, + "loss": 2.1864, + "step": 3090 + }, + { + "epoch": 0.9662394498280713, + "grad_norm": 0.25390625, + "learning_rate": 0.0001726886352825729, + "loss": 1.8203, + "step": 3091 + }, + { + "epoch": 0.9665520475148484, + "grad_norm": 0.236328125, + "learning_rate": 0.0001726717590611024, + "loss": 1.6397, + "step": 3092 + }, + { + "epoch": 0.9668646452016255, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017265487845234524, + "loss": 1.7298, + "step": 3093 + }, + { + "epoch": 0.9671772428884027, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017263799345732043, + "loss": 1.4412, + "step": 3094 + }, + { + "epoch": 0.9674898405751797, + "grad_norm": 0.2578125, + "learning_rate": 0.0001726211040770474, + "loss": 1.6235, + "step": 3095 + }, + { + "epoch": 0.9678024382619569, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001726042103125458, + "loss": 1.4866, + "step": 3096 + }, + { + "epoch": 0.9681150359487339, + "grad_norm": 0.25, + "learning_rate": 0.0001725873121648355, + "loss": 1.8129, + "step": 3097 + }, + { + "epoch": 0.9684276336355111, + "grad_norm": 0.244140625, + "learning_rate": 0.00017257040963493663, + "loss": 1.7193, + "step": 3098 + }, + { + "epoch": 0.9687402313222883, + "grad_norm": 0.2275390625, + "learning_rate": 0.00017255350272386968, + "loss": 1.6863, + "step": 3099 + }, + { + "epoch": 0.9690528290090653, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017253659143265534, + "loss": 1.5868, + "step": 3100 + }, + { + "epoch": 0.9693654266958425, + "grad_norm": 0.26171875, + "learning_rate": 0.00017251967576231448, + "loss": 1.9038, + "step": 3101 + }, + { + "epoch": 0.9696780243826195, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001725027557138684, + "loss": 1.5963, + "step": 3102 + }, + { + "epoch": 0.9699906220693967, + "grad_norm": 0.25390625, + "learning_rate": 0.0001724858312883386, + "loss": 1.9158, + "step": 3103 + }, + { + "epoch": 0.9703032197561738, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001724689024867468, + "loss": 1.7879, + "step": 3104 + }, + { + "epoch": 0.9706158174429509, + "grad_norm": 0.349609375, + "learning_rate": 0.00017245196931011495, + "loss": 2.2104, + "step": 3105 + }, + { + "epoch": 0.970928415129728, + "grad_norm": 0.25390625, + "learning_rate": 0.00017243503175946542, + "loss": 1.3733, + "step": 3106 + }, + { + "epoch": 0.9712410128165052, + "grad_norm": 0.26171875, + "learning_rate": 0.0001724180898358207, + "loss": 1.8072, + "step": 3107 + }, + { + "epoch": 0.9715536105032823, + "grad_norm": 0.25, + "learning_rate": 0.00017240114354020368, + "loss": 1.6554, + "step": 3108 + }, + { + "epoch": 0.9718662081900594, + "grad_norm": 0.232421875, + "learning_rate": 0.0001723841928736373, + "loss": 1.8434, + "step": 3109 + }, + { + "epoch": 0.9721788058768365, + "grad_norm": 0.251953125, + "learning_rate": 0.00017236723783714496, + "loss": 1.7078, + "step": 3110 + }, + { + "epoch": 0.9724914035636136, + "grad_norm": 0.248046875, + "learning_rate": 0.00017235027843175027, + "loss": 1.4973, + "step": 3111 + }, + { + "epoch": 0.9728040012503908, + "grad_norm": 0.23828125, + "learning_rate": 0.00017233331465847705, + "loss": 2.0236, + "step": 3112 + }, + { + "epoch": 0.9731165989371678, + "grad_norm": 0.248046875, + "learning_rate": 0.00017231634651834946, + "loss": 1.55, + "step": 3113 + }, + { + "epoch": 0.973429196623945, + "grad_norm": 0.240234375, + "learning_rate": 0.00017229937401239188, + "loss": 1.5074, + "step": 3114 + }, + { + "epoch": 0.973741794310722, + "grad_norm": 0.232421875, + "learning_rate": 0.00017228239714162896, + "loss": 1.4308, + "step": 3115 + }, + { + "epoch": 0.9740543919974992, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017226541590708566, + "loss": 1.8249, + "step": 3116 + }, + { + "epoch": 0.9743669896842764, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017224843030978705, + "loss": 1.8337, + "step": 3117 + }, + { + "epoch": 0.9746795873710534, + "grad_norm": 0.26171875, + "learning_rate": 0.00017223144035075864, + "loss": 1.7211, + "step": 3118 + }, + { + "epoch": 0.9749921850578306, + "grad_norm": 0.25, + "learning_rate": 0.00017221444603102617, + "loss": 1.7391, + "step": 3119 + }, + { + "epoch": 0.9753047827446076, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017219744735161554, + "loss": 2.0078, + "step": 3120 + }, + { + "epoch": 0.9756173804313848, + "grad_norm": 0.2412109375, + "learning_rate": 0.000172180444313553, + "loss": 1.7833, + "step": 3121 + }, + { + "epoch": 0.975929978118162, + "grad_norm": 0.251953125, + "learning_rate": 0.00017216343691786509, + "loss": 1.508, + "step": 3122 + }, + { + "epoch": 0.976242575804939, + "grad_norm": 0.251953125, + "learning_rate": 0.0001721464251655785, + "loss": 2.0652, + "step": 3123 + }, + { + "epoch": 0.9765551734917162, + "grad_norm": 0.2421875, + "learning_rate": 0.0001721294090577203, + "loss": 1.5267, + "step": 3124 + }, + { + "epoch": 0.9768677711784933, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017211238859531774, + "loss": 1.838, + "step": 3125 + }, + { + "epoch": 0.9771803688652704, + "grad_norm": 0.234375, + "learning_rate": 0.00017209536377939846, + "loss": 1.7286, + "step": 3126 + }, + { + "epoch": 0.9774929665520475, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001720783346109901, + "loss": 1.8045, + "step": 3127 + }, + { + "epoch": 0.9778055642388246, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001720613010911209, + "loss": 1.712, + "step": 3128 + }, + { + "epoch": 0.9781181619256017, + "grad_norm": 0.234375, + "learning_rate": 0.0001720442632208191, + "loss": 1.5521, + "step": 3129 + }, + { + "epoch": 0.9784307596123789, + "grad_norm": 0.23828125, + "learning_rate": 0.0001720272210011133, + "loss": 1.7718, + "step": 3130 + }, + { + "epoch": 0.978743357299156, + "grad_norm": 0.2578125, + "learning_rate": 0.00017201017443303242, + "loss": 1.4686, + "step": 3131 + }, + { + "epoch": 0.9790559549859331, + "grad_norm": 0.251953125, + "learning_rate": 0.00017199312351760555, + "loss": 1.6478, + "step": 3132 + }, + { + "epoch": 0.9793685526727102, + "grad_norm": 0.228515625, + "learning_rate": 0.00017197606825586204, + "loss": 1.4012, + "step": 3133 + }, + { + "epoch": 0.9796811503594873, + "grad_norm": 0.24609375, + "learning_rate": 0.00017195900864883158, + "loss": 1.6166, + "step": 3134 + }, + { + "epoch": 0.9799937480462645, + "grad_norm": 0.255859375, + "learning_rate": 0.00017194194469754407, + "loss": 1.7632, + "step": 3135 + }, + { + "epoch": 0.9803063457330415, + "grad_norm": 0.248046875, + "learning_rate": 0.00017192487640302969, + "loss": 1.497, + "step": 3136 + }, + { + "epoch": 0.9806189434198187, + "grad_norm": 0.25, + "learning_rate": 0.00017190780376631886, + "loss": 1.756, + "step": 3137 + }, + { + "epoch": 0.9809315411065959, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001718907267884423, + "loss": 1.5489, + "step": 3138 + }, + { + "epoch": 0.9812441387933729, + "grad_norm": 0.26171875, + "learning_rate": 0.00017187364547043091, + "loss": 1.5929, + "step": 3139 + }, + { + "epoch": 0.9815567364801501, + "grad_norm": 0.244140625, + "learning_rate": 0.000171856559813316, + "loss": 1.7889, + "step": 3140 + }, + { + "epoch": 0.9818693341669271, + "grad_norm": 0.234375, + "learning_rate": 0.00017183946981812897, + "loss": 1.4263, + "step": 3141 + }, + { + "epoch": 0.9821819318537043, + "grad_norm": 0.259765625, + "learning_rate": 0.00017182237548590162, + "loss": 1.8588, + "step": 3142 + }, + { + "epoch": 0.9824945295404814, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017180527681766593, + "loss": 1.7062, + "step": 3143 + }, + { + "epoch": 0.9828071272272585, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017178817381445418, + "loss": 1.5145, + "step": 3144 + }, + { + "epoch": 0.9831197249140357, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001717710664772989, + "loss": 1.6806, + "step": 3145 + }, + { + "epoch": 0.9834323226008127, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017175395480723286, + "loss": 1.9361, + "step": 3146 + }, + { + "epoch": 0.9837449202875899, + "grad_norm": 0.23828125, + "learning_rate": 0.00017173683880528917, + "loss": 1.5781, + "step": 3147 + }, + { + "epoch": 0.984057517974367, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017171971847250106, + "loss": 1.5337, + "step": 3148 + }, + { + "epoch": 0.9843701156611441, + "grad_norm": 0.24609375, + "learning_rate": 0.00017170259380990216, + "loss": 1.8557, + "step": 3149 + }, + { + "epoch": 0.9846827133479212, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017168546481852634, + "loss": 1.735, + "step": 3150 + }, + { + "epoch": 0.9849953110346984, + "grad_norm": 0.234375, + "learning_rate": 0.00017166833149940763, + "loss": 1.6696, + "step": 3151 + }, + { + "epoch": 0.9853079087214754, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017165119385358045, + "loss": 1.5103, + "step": 3152 + }, + { + "epoch": 0.9856205064082526, + "grad_norm": 0.232421875, + "learning_rate": 0.00017163405188207932, + "loss": 1.3137, + "step": 3153 + }, + { + "epoch": 0.9859331040950297, + "grad_norm": 0.3125, + "learning_rate": 0.00017161690558593925, + "loss": 2.1945, + "step": 3154 + }, + { + "epoch": 0.9862457017818068, + "grad_norm": 0.251953125, + "learning_rate": 0.0001715997549661953, + "loss": 1.8129, + "step": 3155 + }, + { + "epoch": 0.986558299468584, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017158260002388294, + "loss": 1.7308, + "step": 3156 + }, + { + "epoch": 0.986870897155361, + "grad_norm": 0.240234375, + "learning_rate": 0.00017156544076003778, + "loss": 1.7969, + "step": 3157 + }, + { + "epoch": 0.9871834948421382, + "grad_norm": 0.25, + "learning_rate": 0.00017154827717569577, + "loss": 1.5541, + "step": 3158 + }, + { + "epoch": 0.9874960925289152, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017153110927189307, + "loss": 1.6279, + "step": 3159 + }, + { + "epoch": 0.9878086902156924, + "grad_norm": 0.2578125, + "learning_rate": 0.00017151393704966617, + "loss": 1.7777, + "step": 3160 + }, + { + "epoch": 0.9881212879024696, + "grad_norm": 0.240234375, + "learning_rate": 0.00017149676051005176, + "loss": 1.7864, + "step": 3161 + }, + { + "epoch": 0.9884338855892466, + "grad_norm": 0.263671875, + "learning_rate": 0.0001714795796540868, + "loss": 1.8507, + "step": 3162 + }, + { + "epoch": 0.9887464832760238, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017146239448280853, + "loss": 1.5787, + "step": 3163 + }, + { + "epoch": 0.9890590809628009, + "grad_norm": 0.25, + "learning_rate": 0.00017144520499725444, + "loss": 1.6532, + "step": 3164 + }, + { + "epoch": 0.989371678649578, + "grad_norm": 0.265625, + "learning_rate": 0.00017142801119846227, + "loss": 1.4543, + "step": 3165 + }, + { + "epoch": 0.9896842763363551, + "grad_norm": 0.240234375, + "learning_rate": 0.00017141081308747003, + "loss": 1.639, + "step": 3166 + }, + { + "epoch": 0.9899968740231322, + "grad_norm": 0.26171875, + "learning_rate": 0.00017139361066531605, + "loss": 1.6788, + "step": 3167 + }, + { + "epoch": 0.9903094717099094, + "grad_norm": 0.25390625, + "learning_rate": 0.00017137640393303878, + "loss": 1.5768, + "step": 3168 + }, + { + "epoch": 0.9906220693966865, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017135919289167707, + "loss": 1.6102, + "step": 3169 + }, + { + "epoch": 0.9909346670834636, + "grad_norm": 0.255859375, + "learning_rate": 0.00017134197754226996, + "loss": 1.5106, + "step": 3170 + }, + { + "epoch": 0.9912472647702407, + "grad_norm": 0.24609375, + "learning_rate": 0.00017132475788585674, + "loss": 1.4294, + "step": 3171 + }, + { + "epoch": 0.9915598624570178, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017130753392347698, + "loss": 1.552, + "step": 3172 + }, + { + "epoch": 0.9918724601437949, + "grad_norm": 0.2421875, + "learning_rate": 0.00017129030565617053, + "loss": 1.4553, + "step": 3173 + }, + { + "epoch": 0.9921850578305721, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017127307308497752, + "loss": 1.6594, + "step": 3174 + }, + { + "epoch": 0.9924976555173491, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001712558362109382, + "loss": 1.7315, + "step": 3175 + }, + { + "epoch": 0.9928102532041263, + "grad_norm": 0.248046875, + "learning_rate": 0.0001712385950350933, + "loss": 1.5794, + "step": 3176 + }, + { + "epoch": 0.9931228508909034, + "grad_norm": 0.240234375, + "learning_rate": 0.0001712213495584836, + "loss": 1.7619, + "step": 3177 + }, + { + "epoch": 0.9934354485776805, + "grad_norm": 0.228515625, + "learning_rate": 0.00017120409978215034, + "loss": 1.6773, + "step": 3178 + }, + { + "epoch": 0.9937480462644577, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017118684570713476, + "loss": 1.5635, + "step": 3179 + }, + { + "epoch": 0.9940606439512347, + "grad_norm": 0.25390625, + "learning_rate": 0.00017116958733447862, + "loss": 1.8061, + "step": 3180 + }, + { + "epoch": 0.9943732416380119, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017115232466522379, + "loss": 1.496, + "step": 3181 + }, + { + "epoch": 0.994685839324789, + "grad_norm": 0.283203125, + "learning_rate": 0.0001711350577004125, + "loss": 1.9932, + "step": 3182 + }, + { + "epoch": 0.9949984370115661, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017111778644108707, + "loss": 1.7719, + "step": 3183 + }, + { + "epoch": 0.9953110346983433, + "grad_norm": 0.234375, + "learning_rate": 0.00017110051088829023, + "loss": 1.9202, + "step": 3184 + }, + { + "epoch": 0.9956236323851203, + "grad_norm": 0.2392578125, + "learning_rate": 0.000171083231043065, + "loss": 1.7274, + "step": 3185 + }, + { + "epoch": 0.9959362300718975, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017106594690645454, + "loss": 1.6006, + "step": 3186 + }, + { + "epoch": 0.9962488277586746, + "grad_norm": 0.25, + "learning_rate": 0.00017104865847950224, + "loss": 1.8627, + "step": 3187 + }, + { + "epoch": 0.9965614254454517, + "grad_norm": 0.234375, + "learning_rate": 0.00017103136576325194, + "loss": 1.6147, + "step": 3188 + }, + { + "epoch": 0.9968740231322288, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017101406875874754, + "loss": 1.8255, + "step": 3189 + }, + { + "epoch": 0.9971866208190059, + "grad_norm": 0.25390625, + "learning_rate": 0.0001709967674670333, + "loss": 1.6937, + "step": 3190 + }, + { + "epoch": 0.997499218505783, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001709794618891538, + "loss": 1.7125, + "step": 3191 + }, + { + "epoch": 0.9978118161925602, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001709621520261537, + "loss": 1.7602, + "step": 3192 + }, + { + "epoch": 0.9981244138793373, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017094483787907804, + "loss": 1.8293, + "step": 3193 + }, + { + "epoch": 0.9984370115661144, + "grad_norm": 0.2314453125, + "learning_rate": 0.00017092751944897214, + "loss": 2.0362, + "step": 3194 + }, + { + "epoch": 0.9987496092528916, + "grad_norm": 0.25, + "learning_rate": 0.00017091019673688148, + "loss": 1.8003, + "step": 3195 + }, + { + "epoch": 0.9990622069396686, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001708928697438519, + "loss": 1.6969, + "step": 3196 + }, + { + "epoch": 0.9993748046264458, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017087553847092943, + "loss": 1.4631, + "step": 3197 + }, + { + "epoch": 0.9996874023132228, + "grad_norm": 0.248046875, + "learning_rate": 0.0001708582029191604, + "loss": 2.0063, + "step": 3198 + }, + { + "epoch": 1.0, + "grad_norm": 0.2255859375, + "learning_rate": 0.00017084086308959132, + "loss": 1.4657, + "step": 3199 + } + ], + "logging_steps": 1, + "max_steps": 12796, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 3199, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.538493341728768e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}