{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 2895, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017271157167530224, "grad_norm": 8.458369226243938, "learning_rate": 2.7586206896551726e-07, "loss": 1.4107, "step": 1 }, { "epoch": 0.0034542314335060447, "grad_norm": 8.522535282987155, "learning_rate": 5.517241379310345e-07, "loss": 1.4378, "step": 2 }, { "epoch": 0.0051813471502590676, "grad_norm": 8.516584874594743, "learning_rate": 8.275862068965518e-07, "loss": 1.4251, "step": 3 }, { "epoch": 0.0069084628670120895, "grad_norm": 8.476284635307973, "learning_rate": 1.103448275862069e-06, "loss": 1.4196, "step": 4 }, { "epoch": 0.008635578583765112, "grad_norm": 8.402891599798389, "learning_rate": 1.3793103448275862e-06, "loss": 1.4335, "step": 5 }, { "epoch": 0.010362694300518135, "grad_norm": 7.960113252882278, "learning_rate": 1.6551724137931037e-06, "loss": 1.409, "step": 6 }, { "epoch": 0.012089810017271158, "grad_norm": 6.583256652536399, "learning_rate": 1.9310344827586207e-06, "loss": 1.3701, "step": 7 }, { "epoch": 0.013816925734024179, "grad_norm": 6.159470841204725, "learning_rate": 2.206896551724138e-06, "loss": 1.3641, "step": 8 }, { "epoch": 0.015544041450777202, "grad_norm": 3.6334537225475323, "learning_rate": 2.4827586206896555e-06, "loss": 1.319, "step": 9 }, { "epoch": 0.017271157167530225, "grad_norm": 3.2521071991938175, "learning_rate": 2.7586206896551725e-06, "loss": 1.3176, "step": 10 }, { "epoch": 0.018998272884283247, "grad_norm": 2.6840757557992405, "learning_rate": 3.03448275862069e-06, "loss": 1.3049, "step": 11 }, { "epoch": 0.02072538860103627, "grad_norm": 2.1634367551296365, "learning_rate": 3.3103448275862073e-06, "loss": 1.3267, "step": 12 }, { "epoch": 0.022452504317789293, "grad_norm": 5.964658205305405, "learning_rate": 3.5862068965517243e-06, "loss": 1.3187, "step": 13 }, { "epoch": 0.024179620034542316, "grad_norm": 6.310361399236371, "learning_rate": 3.862068965517241e-06, "loss": 1.3357, "step": 14 }, { "epoch": 0.025906735751295335, "grad_norm": 6.156204467925499, "learning_rate": 4.137931034482759e-06, "loss": 1.3245, "step": 15 }, { "epoch": 0.027633851468048358, "grad_norm": 5.763549975760555, "learning_rate": 4.413793103448276e-06, "loss": 1.2667, "step": 16 }, { "epoch": 0.02936096718480138, "grad_norm": 4.445244278719363, "learning_rate": 4.689655172413793e-06, "loss": 1.246, "step": 17 }, { "epoch": 0.031088082901554404, "grad_norm": 3.887899842391064, "learning_rate": 4.965517241379311e-06, "loss": 1.2493, "step": 18 }, { "epoch": 0.03281519861830743, "grad_norm": 3.058291779991781, "learning_rate": 5.241379310344829e-06, "loss": 1.2501, "step": 19 }, { "epoch": 0.03454231433506045, "grad_norm": 2.0890319620020597, "learning_rate": 5.517241379310345e-06, "loss": 1.2266, "step": 20 }, { "epoch": 0.03626943005181347, "grad_norm": 1.4761575359235328, "learning_rate": 5.793103448275863e-06, "loss": 1.1743, "step": 21 }, { "epoch": 0.037996545768566495, "grad_norm": 1.3793316723776046, "learning_rate": 6.06896551724138e-06, "loss": 1.1806, "step": 22 }, { "epoch": 0.039723661485319514, "grad_norm": 1.4383444132656968, "learning_rate": 6.344827586206898e-06, "loss": 1.1874, "step": 23 }, { "epoch": 0.04145077720207254, "grad_norm": 1.6041249662855457, "learning_rate": 6.620689655172415e-06, "loss": 1.1936, "step": 24 }, { "epoch": 0.04317789291882556, "grad_norm": 1.4358413075514336, "learning_rate": 6.896551724137932e-06, "loss": 1.1689, "step": 25 }, { "epoch": 0.044905008635578586, "grad_norm": 1.088550510688887, "learning_rate": 7.172413793103449e-06, "loss": 1.1552, "step": 26 }, { "epoch": 0.046632124352331605, "grad_norm": 0.9068704916356982, "learning_rate": 7.4482758620689665e-06, "loss": 1.1361, "step": 27 }, { "epoch": 0.04835924006908463, "grad_norm": 1.028979361767933, "learning_rate": 7.724137931034483e-06, "loss": 1.1247, "step": 28 }, { "epoch": 0.05008635578583765, "grad_norm": 1.0162023996956782, "learning_rate": 8.000000000000001e-06, "loss": 1.1197, "step": 29 }, { "epoch": 0.05181347150259067, "grad_norm": 0.9181234270775043, "learning_rate": 8.275862068965518e-06, "loss": 1.1293, "step": 30 }, { "epoch": 0.0535405872193437, "grad_norm": 0.7725609722367764, "learning_rate": 8.551724137931035e-06, "loss": 1.1212, "step": 31 }, { "epoch": 0.055267702936096716, "grad_norm": 0.8231027688449307, "learning_rate": 8.827586206896552e-06, "loss": 1.0884, "step": 32 }, { "epoch": 0.05699481865284974, "grad_norm": 0.823118772695324, "learning_rate": 9.10344827586207e-06, "loss": 1.1006, "step": 33 }, { "epoch": 0.05872193436960276, "grad_norm": 0.7198667247008856, "learning_rate": 9.379310344827586e-06, "loss": 1.1072, "step": 34 }, { "epoch": 0.06044905008635579, "grad_norm": 0.6251480540966018, "learning_rate": 9.655172413793105e-06, "loss": 1.087, "step": 35 }, { "epoch": 0.06217616580310881, "grad_norm": 0.7454159233665608, "learning_rate": 9.931034482758622e-06, "loss": 1.0777, "step": 36 }, { "epoch": 0.06390328151986183, "grad_norm": 0.7063670303885172, "learning_rate": 1.0206896551724139e-05, "loss": 1.0849, "step": 37 }, { "epoch": 0.06563039723661486, "grad_norm": 0.576024327438757, "learning_rate": 1.0482758620689658e-05, "loss": 1.0893, "step": 38 }, { "epoch": 0.06735751295336788, "grad_norm": 0.6311406776102843, "learning_rate": 1.0758620689655173e-05, "loss": 1.0965, "step": 39 }, { "epoch": 0.0690846286701209, "grad_norm": 0.6267826642410674, "learning_rate": 1.103448275862069e-05, "loss": 1.0661, "step": 40 }, { "epoch": 0.07081174438687392, "grad_norm": 0.43831885468959675, "learning_rate": 1.1310344827586209e-05, "loss": 1.0918, "step": 41 }, { "epoch": 0.07253886010362694, "grad_norm": 0.550723070521231, "learning_rate": 1.1586206896551726e-05, "loss": 1.0547, "step": 42 }, { "epoch": 0.07426597582037997, "grad_norm": 0.5435309195265966, "learning_rate": 1.1862068965517241e-05, "loss": 1.0621, "step": 43 }, { "epoch": 0.07599309153713299, "grad_norm": 0.432885440987974, "learning_rate": 1.213793103448276e-05, "loss": 1.0446, "step": 44 }, { "epoch": 0.07772020725388601, "grad_norm": 0.4111812176511941, "learning_rate": 1.2413793103448277e-05, "loss": 1.0654, "step": 45 }, { "epoch": 0.07944732297063903, "grad_norm": 0.4395468411750326, "learning_rate": 1.2689655172413795e-05, "loss": 1.0531, "step": 46 }, { "epoch": 0.08117443868739206, "grad_norm": 0.46180423480813365, "learning_rate": 1.296551724137931e-05, "loss": 1.042, "step": 47 }, { "epoch": 0.08290155440414508, "grad_norm": 0.3684935897604485, "learning_rate": 1.324137931034483e-05, "loss": 1.0469, "step": 48 }, { "epoch": 0.0846286701208981, "grad_norm": 0.3844738823783207, "learning_rate": 1.3517241379310346e-05, "loss": 1.033, "step": 49 }, { "epoch": 0.08635578583765112, "grad_norm": 0.3999183226127656, "learning_rate": 1.3793103448275863e-05, "loss": 1.0762, "step": 50 }, { "epoch": 0.08808290155440414, "grad_norm": 0.41848172559299224, "learning_rate": 1.406896551724138e-05, "loss": 1.0492, "step": 51 }, { "epoch": 0.08981001727115717, "grad_norm": 0.5307882053128087, "learning_rate": 1.4344827586206897e-05, "loss": 1.0546, "step": 52 }, { "epoch": 0.09153713298791019, "grad_norm": 0.5856640647560636, "learning_rate": 1.4620689655172416e-05, "loss": 1.038, "step": 53 }, { "epoch": 0.09326424870466321, "grad_norm": 0.7327070810889413, "learning_rate": 1.4896551724137933e-05, "loss": 1.0289, "step": 54 }, { "epoch": 0.09499136442141623, "grad_norm": 0.9216775217117842, "learning_rate": 1.5172413793103448e-05, "loss": 1.0516, "step": 55 }, { "epoch": 0.09671848013816926, "grad_norm": 1.1301708747667296, "learning_rate": 1.5448275862068965e-05, "loss": 1.0364, "step": 56 }, { "epoch": 0.09844559585492228, "grad_norm": 0.8091864314145123, "learning_rate": 1.5724137931034484e-05, "loss": 1.0325, "step": 57 }, { "epoch": 0.1001727115716753, "grad_norm": 0.8970016514619292, "learning_rate": 1.6000000000000003e-05, "loss": 1.0228, "step": 58 }, { "epoch": 0.10189982728842832, "grad_norm": 1.075812430896929, "learning_rate": 1.6275862068965518e-05, "loss": 1.0411, "step": 59 }, { "epoch": 0.10362694300518134, "grad_norm": 0.7170111872178632, "learning_rate": 1.6551724137931037e-05, "loss": 1.0235, "step": 60 }, { "epoch": 0.10535405872193437, "grad_norm": 0.6545815770499425, "learning_rate": 1.6827586206896552e-05, "loss": 1.0317, "step": 61 }, { "epoch": 0.1070811744386874, "grad_norm": 0.988947643732204, "learning_rate": 1.710344827586207e-05, "loss": 1.0552, "step": 62 }, { "epoch": 0.10880829015544041, "grad_norm": 1.2030008315072644, "learning_rate": 1.7379310344827586e-05, "loss": 1.0112, "step": 63 }, { "epoch": 0.11053540587219343, "grad_norm": 0.9241749102172462, "learning_rate": 1.7655172413793105e-05, "loss": 1.0275, "step": 64 }, { "epoch": 0.11226252158894647, "grad_norm": 1.1972968552251808, "learning_rate": 1.7931034482758623e-05, "loss": 1.0286, "step": 65 }, { "epoch": 0.11398963730569948, "grad_norm": 0.6752592984913095, "learning_rate": 1.820689655172414e-05, "loss": 1.0107, "step": 66 }, { "epoch": 0.1157167530224525, "grad_norm": 0.8873301137793208, "learning_rate": 1.8482758620689657e-05, "loss": 1.0259, "step": 67 }, { "epoch": 0.11744386873920552, "grad_norm": 1.1483561088066652, "learning_rate": 1.8758620689655173e-05, "loss": 1.0242, "step": 68 }, { "epoch": 0.11917098445595854, "grad_norm": 0.7900773494708847, "learning_rate": 1.903448275862069e-05, "loss": 1.0221, "step": 69 }, { "epoch": 0.12089810017271158, "grad_norm": 1.0790118647535079, "learning_rate": 1.931034482758621e-05, "loss": 0.9927, "step": 70 }, { "epoch": 0.1226252158894646, "grad_norm": 0.9385031680478307, "learning_rate": 1.9586206896551725e-05, "loss": 1.0208, "step": 71 }, { "epoch": 0.12435233160621761, "grad_norm": 1.0030011282923386, "learning_rate": 1.9862068965517244e-05, "loss": 1.0252, "step": 72 }, { "epoch": 0.12607944732297063, "grad_norm": 1.168905685906972, "learning_rate": 2.013793103448276e-05, "loss": 1.0105, "step": 73 }, { "epoch": 0.12780656303972365, "grad_norm": 1.1773716169869006, "learning_rate": 2.0413793103448278e-05, "loss": 1.029, "step": 74 }, { "epoch": 0.12953367875647667, "grad_norm": 1.5284922829646774, "learning_rate": 2.0689655172413797e-05, "loss": 1.0258, "step": 75 }, { "epoch": 0.13126079447322972, "grad_norm": 0.7069638138841103, "learning_rate": 2.0965517241379315e-05, "loss": 1.0036, "step": 76 }, { "epoch": 0.13298791018998274, "grad_norm": 1.034074754921524, "learning_rate": 2.1241379310344827e-05, "loss": 1.0092, "step": 77 }, { "epoch": 0.13471502590673576, "grad_norm": 1.3925469649973654, "learning_rate": 2.1517241379310346e-05, "loss": 1.039, "step": 78 }, { "epoch": 0.13644214162348878, "grad_norm": 1.1002070410080642, "learning_rate": 2.1793103448275865e-05, "loss": 1.011, "step": 79 }, { "epoch": 0.1381692573402418, "grad_norm": 1.6569885358039762, "learning_rate": 2.206896551724138e-05, "loss": 1.0271, "step": 80 }, { "epoch": 0.13989637305699482, "grad_norm": 1.1111859692936894, "learning_rate": 2.23448275862069e-05, "loss": 1.0048, "step": 81 }, { "epoch": 0.14162348877374784, "grad_norm": 2.3173468957343077, "learning_rate": 2.2620689655172417e-05, "loss": 1.0079, "step": 82 }, { "epoch": 0.14335060449050085, "grad_norm": 1.8040966683934183, "learning_rate": 2.2896551724137933e-05, "loss": 1.0273, "step": 83 }, { "epoch": 0.14507772020725387, "grad_norm": 1.8050370071892026, "learning_rate": 2.317241379310345e-05, "loss": 0.9946, "step": 84 }, { "epoch": 0.14680483592400692, "grad_norm": 1.7915490815729496, "learning_rate": 2.3448275862068967e-05, "loss": 1.0226, "step": 85 }, { "epoch": 0.14853195164075994, "grad_norm": 1.8111001926740895, "learning_rate": 2.3724137931034482e-05, "loss": 1.0173, "step": 86 }, { "epoch": 0.15025906735751296, "grad_norm": 1.5565645569645281, "learning_rate": 2.4e-05, "loss": 1.0197, "step": 87 }, { "epoch": 0.15198618307426598, "grad_norm": 1.6386555312593383, "learning_rate": 2.427586206896552e-05, "loss": 1.0435, "step": 88 }, { "epoch": 0.153713298791019, "grad_norm": 1.0925797272193254, "learning_rate": 2.4551724137931038e-05, "loss": 1.0241, "step": 89 }, { "epoch": 0.15544041450777202, "grad_norm": 1.8511648335474673, "learning_rate": 2.4827586206896553e-05, "loss": 1.0313, "step": 90 }, { "epoch": 0.15716753022452504, "grad_norm": 1.2080971344098816, "learning_rate": 2.5103448275862072e-05, "loss": 0.9984, "step": 91 }, { "epoch": 0.15889464594127806, "grad_norm": 1.6777352314686234, "learning_rate": 2.537931034482759e-05, "loss": 0.9836, "step": 92 }, { "epoch": 0.16062176165803108, "grad_norm": 1.6507155598955343, "learning_rate": 2.5655172413793103e-05, "loss": 0.9949, "step": 93 }, { "epoch": 0.16234887737478412, "grad_norm": 1.3158707254905975, "learning_rate": 2.593103448275862e-05, "loss": 1.0116, "step": 94 }, { "epoch": 0.16407599309153714, "grad_norm": 1.487882147016264, "learning_rate": 2.620689655172414e-05, "loss": 0.998, "step": 95 }, { "epoch": 0.16580310880829016, "grad_norm": 1.1570628177903894, "learning_rate": 2.648275862068966e-05, "loss": 0.9764, "step": 96 }, { "epoch": 0.16753022452504318, "grad_norm": 1.7909911353046553, "learning_rate": 2.6758620689655174e-05, "loss": 0.9963, "step": 97 }, { "epoch": 0.1692573402417962, "grad_norm": 1.2103440403948094, "learning_rate": 2.7034482758620693e-05, "loss": 0.9885, "step": 98 }, { "epoch": 0.17098445595854922, "grad_norm": 1.749681351456062, "learning_rate": 2.731034482758621e-05, "loss": 1.0104, "step": 99 }, { "epoch": 0.17271157167530224, "grad_norm": 1.383962662601561, "learning_rate": 2.7586206896551727e-05, "loss": 1.0036, "step": 100 }, { "epoch": 0.17443868739205526, "grad_norm": 1.851319515446871, "learning_rate": 2.7862068965517242e-05, "loss": 1.0174, "step": 101 }, { "epoch": 0.17616580310880828, "grad_norm": 1.6171459292115506, "learning_rate": 2.813793103448276e-05, "loss": 1.0038, "step": 102 }, { "epoch": 0.17789291882556132, "grad_norm": 1.5418635384233517, "learning_rate": 2.8413793103448276e-05, "loss": 0.977, "step": 103 }, { "epoch": 0.17962003454231434, "grad_norm": 1.2679000474038675, "learning_rate": 2.8689655172413795e-05, "loss": 1.0002, "step": 104 }, { "epoch": 0.18134715025906736, "grad_norm": 1.51864525509961, "learning_rate": 2.8965517241379313e-05, "loss": 0.9877, "step": 105 }, { "epoch": 0.18307426597582038, "grad_norm": 1.1353233644847018, "learning_rate": 2.9241379310344832e-05, "loss": 1.0008, "step": 106 }, { "epoch": 0.1848013816925734, "grad_norm": 1.620740948402656, "learning_rate": 2.9517241379310347e-05, "loss": 0.9876, "step": 107 }, { "epoch": 0.18652849740932642, "grad_norm": 1.6033566891187319, "learning_rate": 2.9793103448275866e-05, "loss": 1.0155, "step": 108 }, { "epoch": 0.18825561312607944, "grad_norm": 1.4159711046989851, "learning_rate": 3.006896551724138e-05, "loss": 1.0045, "step": 109 }, { "epoch": 0.18998272884283246, "grad_norm": 2.3137593308039, "learning_rate": 3.0344827586206897e-05, "loss": 0.9789, "step": 110 }, { "epoch": 0.19170984455958548, "grad_norm": 1.2382987638564806, "learning_rate": 3.0620689655172415e-05, "loss": 0.9896, "step": 111 }, { "epoch": 0.19343696027633853, "grad_norm": 1.5102179896387888, "learning_rate": 3.089655172413793e-05, "loss": 0.9766, "step": 112 }, { "epoch": 0.19516407599309155, "grad_norm": 2.8172068757564324, "learning_rate": 3.117241379310345e-05, "loss": 1.0201, "step": 113 }, { "epoch": 0.19689119170984457, "grad_norm": 1.3589155302514218, "learning_rate": 3.144827586206897e-05, "loss": 0.9843, "step": 114 }, { "epoch": 0.19861830742659758, "grad_norm": 3.5793348778535123, "learning_rate": 3.172413793103448e-05, "loss": 1.015, "step": 115 }, { "epoch": 0.2003454231433506, "grad_norm": 2.722400011634995, "learning_rate": 3.2000000000000005e-05, "loss": 1.0143, "step": 116 }, { "epoch": 0.20207253886010362, "grad_norm": 2.933812674554113, "learning_rate": 3.227586206896552e-05, "loss": 1.0053, "step": 117 }, { "epoch": 0.20379965457685664, "grad_norm": 2.336826082823202, "learning_rate": 3.2551724137931036e-05, "loss": 1.0187, "step": 118 }, { "epoch": 0.20552677029360966, "grad_norm": 2.3225203948682833, "learning_rate": 3.282758620689655e-05, "loss": 0.9879, "step": 119 }, { "epoch": 0.20725388601036268, "grad_norm": 2.103801529707044, "learning_rate": 3.310344827586207e-05, "loss": 1.005, "step": 120 }, { "epoch": 0.20898100172711573, "grad_norm": 1.9241949909349405, "learning_rate": 3.337931034482759e-05, "loss": 0.9859, "step": 121 }, { "epoch": 0.21070811744386875, "grad_norm": 2.393925724162197, "learning_rate": 3.3655172413793104e-05, "loss": 1.0221, "step": 122 }, { "epoch": 0.21243523316062177, "grad_norm": 1.7858962938783012, "learning_rate": 3.3931034482758626e-05, "loss": 1.0139, "step": 123 }, { "epoch": 0.2141623488773748, "grad_norm": 2.2477907358724813, "learning_rate": 3.420689655172414e-05, "loss": 1.0124, "step": 124 }, { "epoch": 0.2158894645941278, "grad_norm": 1.857118181455895, "learning_rate": 3.4482758620689657e-05, "loss": 1.0112, "step": 125 }, { "epoch": 0.21761658031088082, "grad_norm": 1.5894328977323737, "learning_rate": 3.475862068965517e-05, "loss": 0.9994, "step": 126 }, { "epoch": 0.21934369602763384, "grad_norm": 2.3198350685990077, "learning_rate": 3.5034482758620694e-05, "loss": 0.9781, "step": 127 }, { "epoch": 0.22107081174438686, "grad_norm": 1.5304526013680362, "learning_rate": 3.531034482758621e-05, "loss": 0.9751, "step": 128 }, { "epoch": 0.22279792746113988, "grad_norm": 1.7461095735169017, "learning_rate": 3.5586206896551725e-05, "loss": 0.9918, "step": 129 }, { "epoch": 0.22452504317789293, "grad_norm": 1.9489019180999647, "learning_rate": 3.586206896551725e-05, "loss": 0.9885, "step": 130 }, { "epoch": 0.22625215889464595, "grad_norm": 1.4021914052685762, "learning_rate": 3.613793103448276e-05, "loss": 0.9944, "step": 131 }, { "epoch": 0.22797927461139897, "grad_norm": 1.4257191688534847, "learning_rate": 3.641379310344828e-05, "loss": 0.9971, "step": 132 }, { "epoch": 0.229706390328152, "grad_norm": 1.2976211114211536, "learning_rate": 3.668965517241379e-05, "loss": 0.9941, "step": 133 }, { "epoch": 0.231433506044905, "grad_norm": 1.40106762554563, "learning_rate": 3.6965517241379315e-05, "loss": 0.9928, "step": 134 }, { "epoch": 0.23316062176165803, "grad_norm": 1.8798336769287254, "learning_rate": 3.724137931034483e-05, "loss": 0.9931, "step": 135 }, { "epoch": 0.23488773747841105, "grad_norm": 1.5292803301690017, "learning_rate": 3.7517241379310345e-05, "loss": 0.9723, "step": 136 }, { "epoch": 0.23661485319516407, "grad_norm": 1.786644934905774, "learning_rate": 3.779310344827587e-05, "loss": 0.9804, "step": 137 }, { "epoch": 0.23834196891191708, "grad_norm": 1.536738341391576, "learning_rate": 3.806896551724138e-05, "loss": 1.0119, "step": 138 }, { "epoch": 0.24006908462867013, "grad_norm": 1.1412768559151718, "learning_rate": 3.83448275862069e-05, "loss": 0.9912, "step": 139 }, { "epoch": 0.24179620034542315, "grad_norm": 1.7075014620337063, "learning_rate": 3.862068965517242e-05, "loss": 0.995, "step": 140 }, { "epoch": 0.24352331606217617, "grad_norm": 1.3454250377181702, "learning_rate": 3.8896551724137935e-05, "loss": 0.9814, "step": 141 }, { "epoch": 0.2452504317789292, "grad_norm": 2.682646620392395, "learning_rate": 3.917241379310345e-05, "loss": 1.004, "step": 142 }, { "epoch": 0.2469775474956822, "grad_norm": 1.1849957179039325, "learning_rate": 3.9448275862068966e-05, "loss": 0.9788, "step": 143 }, { "epoch": 0.24870466321243523, "grad_norm": 2.52423846240828, "learning_rate": 3.972413793103449e-05, "loss": 0.9811, "step": 144 }, { "epoch": 0.2504317789291883, "grad_norm": 1.8962309006323252, "learning_rate": 4e-05, "loss": 0.9738, "step": 145 }, { "epoch": 0.25215889464594127, "grad_norm": 1.8975516545385798, "learning_rate": 4.027586206896552e-05, "loss": 0.9821, "step": 146 }, { "epoch": 0.2538860103626943, "grad_norm": 2.067287331739851, "learning_rate": 4.055172413793104e-05, "loss": 0.997, "step": 147 }, { "epoch": 0.2556131260794473, "grad_norm": 1.663669444998624, "learning_rate": 4.0827586206896556e-05, "loss": 0.9869, "step": 148 }, { "epoch": 0.25734024179620035, "grad_norm": 1.8964954985487665, "learning_rate": 4.110344827586207e-05, "loss": 0.9732, "step": 149 }, { "epoch": 0.25906735751295334, "grad_norm": 1.3385444418833135, "learning_rate": 4.137931034482759e-05, "loss": 1.0179, "step": 150 }, { "epoch": 0.2607944732297064, "grad_norm": 1.726017407019494, "learning_rate": 4.165517241379311e-05, "loss": 0.9731, "step": 151 }, { "epoch": 0.26252158894645944, "grad_norm": 1.2674010776466071, "learning_rate": 4.193103448275863e-05, "loss": 0.9896, "step": 152 }, { "epoch": 0.26424870466321243, "grad_norm": 2.103491063738079, "learning_rate": 4.2206896551724146e-05, "loss": 0.988, "step": 153 }, { "epoch": 0.2659758203799655, "grad_norm": 2.0728087666170967, "learning_rate": 4.2482758620689655e-05, "loss": 0.988, "step": 154 }, { "epoch": 0.26770293609671847, "grad_norm": 1.257068726874153, "learning_rate": 4.275862068965517e-05, "loss": 0.9983, "step": 155 }, { "epoch": 0.2694300518134715, "grad_norm": 2.0936950082934223, "learning_rate": 4.303448275862069e-05, "loss": 1.0092, "step": 156 }, { "epoch": 0.2711571675302245, "grad_norm": 1.7145858941224774, "learning_rate": 4.331034482758621e-05, "loss": 1.0033, "step": 157 }, { "epoch": 0.27288428324697755, "grad_norm": 1.2476261200436278, "learning_rate": 4.358620689655173e-05, "loss": 0.9816, "step": 158 }, { "epoch": 0.27461139896373055, "grad_norm": 2.700684868200322, "learning_rate": 4.3862068965517245e-05, "loss": 0.9913, "step": 159 }, { "epoch": 0.2763385146804836, "grad_norm": 1.5110538009479884, "learning_rate": 4.413793103448276e-05, "loss": 1.0126, "step": 160 }, { "epoch": 0.27806563039723664, "grad_norm": 2.4035822410504957, "learning_rate": 4.441379310344828e-05, "loss": 0.9672, "step": 161 }, { "epoch": 0.27979274611398963, "grad_norm": 1.9055844013155914, "learning_rate": 4.46896551724138e-05, "loss": 0.9846, "step": 162 }, { "epoch": 0.2815198618307427, "grad_norm": 2.619901197079302, "learning_rate": 4.496551724137931e-05, "loss": 0.9653, "step": 163 }, { "epoch": 0.28324697754749567, "grad_norm": 1.8417306797090445, "learning_rate": 4.5241379310344835e-05, "loss": 0.9646, "step": 164 }, { "epoch": 0.2849740932642487, "grad_norm": 2.558350046931099, "learning_rate": 4.551724137931035e-05, "loss": 1.0294, "step": 165 }, { "epoch": 0.2867012089810017, "grad_norm": 2.230353536455154, "learning_rate": 4.5793103448275865e-05, "loss": 0.9668, "step": 166 }, { "epoch": 0.28842832469775476, "grad_norm": 1.8945505334823218, "learning_rate": 4.606896551724139e-05, "loss": 0.9782, "step": 167 }, { "epoch": 0.29015544041450775, "grad_norm": 1.8937179502313362, "learning_rate": 4.63448275862069e-05, "loss": 0.986, "step": 168 }, { "epoch": 0.2918825561312608, "grad_norm": 2.0558895428286754, "learning_rate": 4.6620689655172425e-05, "loss": 0.9585, "step": 169 }, { "epoch": 0.29360967184801384, "grad_norm": 2.1320669105590815, "learning_rate": 4.689655172413793e-05, "loss": 0.9877, "step": 170 }, { "epoch": 0.29533678756476683, "grad_norm": 1.5217770095529934, "learning_rate": 4.717241379310345e-05, "loss": 0.9755, "step": 171 }, { "epoch": 0.2970639032815199, "grad_norm": 1.8809060562159547, "learning_rate": 4.7448275862068964e-05, "loss": 0.9813, "step": 172 }, { "epoch": 0.2987910189982729, "grad_norm": 1.4432283187612975, "learning_rate": 4.7724137931034486e-05, "loss": 0.9829, "step": 173 }, { "epoch": 0.3005181347150259, "grad_norm": 2.0598903011264116, "learning_rate": 4.8e-05, "loss": 0.9807, "step": 174 }, { "epoch": 0.3022452504317789, "grad_norm": 2.1444020303663063, "learning_rate": 4.827586206896552e-05, "loss": 1.0087, "step": 175 }, { "epoch": 0.30397236614853196, "grad_norm": 1.7262964481477305, "learning_rate": 4.855172413793104e-05, "loss": 0.9683, "step": 176 }, { "epoch": 0.30569948186528495, "grad_norm": 1.3107134444342654, "learning_rate": 4.8827586206896554e-05, "loss": 0.9713, "step": 177 }, { "epoch": 0.307426597582038, "grad_norm": 1.695387201613467, "learning_rate": 4.9103448275862076e-05, "loss": 0.9529, "step": 178 }, { "epoch": 0.30915371329879104, "grad_norm": 1.9355747113609216, "learning_rate": 4.937931034482759e-05, "loss": 0.9898, "step": 179 }, { "epoch": 0.31088082901554404, "grad_norm": 1.74717009823291, "learning_rate": 4.9655172413793107e-05, "loss": 0.9646, "step": 180 }, { "epoch": 0.3126079447322971, "grad_norm": 2.258289875998736, "learning_rate": 4.993103448275863e-05, "loss": 0.9836, "step": 181 }, { "epoch": 0.3143350604490501, "grad_norm": 1.9227042480802885, "learning_rate": 5.0206896551724144e-05, "loss": 0.9672, "step": 182 }, { "epoch": 0.3160621761658031, "grad_norm": 1.4203397487751226, "learning_rate": 5.048275862068966e-05, "loss": 0.9744, "step": 183 }, { "epoch": 0.3177892918825561, "grad_norm": 1.7024007095789255, "learning_rate": 5.075862068965518e-05, "loss": 0.9681, "step": 184 }, { "epoch": 0.31951640759930916, "grad_norm": 2.060610649676582, "learning_rate": 5.10344827586207e-05, "loss": 0.987, "step": 185 }, { "epoch": 0.32124352331606215, "grad_norm": 2.220047035046375, "learning_rate": 5.1310344827586205e-05, "loss": 0.986, "step": 186 }, { "epoch": 0.3229706390328152, "grad_norm": 1.8351080873385561, "learning_rate": 5.158620689655173e-05, "loss": 0.962, "step": 187 }, { "epoch": 0.32469775474956825, "grad_norm": 1.5838250999130967, "learning_rate": 5.186206896551724e-05, "loss": 0.9824, "step": 188 }, { "epoch": 0.32642487046632124, "grad_norm": 2.351877773797332, "learning_rate": 5.213793103448276e-05, "loss": 0.9712, "step": 189 }, { "epoch": 0.3281519861830743, "grad_norm": 1.4467755003503138, "learning_rate": 5.241379310344828e-05, "loss": 0.9744, "step": 190 }, { "epoch": 0.3298791018998273, "grad_norm": 2.153508120124668, "learning_rate": 5.2689655172413795e-05, "loss": 0.9494, "step": 191 }, { "epoch": 0.3316062176165803, "grad_norm": 2.214308501816819, "learning_rate": 5.296551724137932e-05, "loss": 0.9725, "step": 192 }, { "epoch": 0.3333333333333333, "grad_norm": 0.9773995047619738, "learning_rate": 5.324137931034483e-05, "loss": 0.965, "step": 193 }, { "epoch": 0.33506044905008636, "grad_norm": 3.9040049156313037, "learning_rate": 5.351724137931035e-05, "loss": 0.9815, "step": 194 }, { "epoch": 0.33678756476683935, "grad_norm": 1.4670295617658502, "learning_rate": 5.379310344827587e-05, "loss": 0.9715, "step": 195 }, { "epoch": 0.3385146804835924, "grad_norm": 3.793832876824532, "learning_rate": 5.4068965517241385e-05, "loss": 1.0245, "step": 196 }, { "epoch": 0.34024179620034545, "grad_norm": 2.7961854030335167, "learning_rate": 5.43448275862069e-05, "loss": 1.0296, "step": 197 }, { "epoch": 0.34196891191709844, "grad_norm": 2.9479243113253286, "learning_rate": 5.462068965517242e-05, "loss": 0.9921, "step": 198 }, { "epoch": 0.3436960276338515, "grad_norm": 2.779013406331845, "learning_rate": 5.489655172413794e-05, "loss": 0.9859, "step": 199 }, { "epoch": 0.3454231433506045, "grad_norm": 2.1218827417971378, "learning_rate": 5.517241379310345e-05, "loss": 1.0051, "step": 200 }, { "epoch": 0.3471502590673575, "grad_norm": 2.842542976007534, "learning_rate": 5.5448275862068975e-05, "loss": 1.0003, "step": 201 }, { "epoch": 0.3488773747841105, "grad_norm": 2.43048818004948, "learning_rate": 5.5724137931034484e-05, "loss": 0.9745, "step": 202 }, { "epoch": 0.35060449050086356, "grad_norm": 2.0516571616891137, "learning_rate": 5.6e-05, "loss": 0.9798, "step": 203 }, { "epoch": 0.35233160621761656, "grad_norm": 3.722941704595437, "learning_rate": 5.627586206896552e-05, "loss": 0.9727, "step": 204 }, { "epoch": 0.3540587219343696, "grad_norm": 2.22406534063779, "learning_rate": 5.6551724137931037e-05, "loss": 0.9998, "step": 205 }, { "epoch": 0.35578583765112265, "grad_norm": 4.626829519836735, "learning_rate": 5.682758620689655e-05, "loss": 0.9927, "step": 206 }, { "epoch": 0.35751295336787564, "grad_norm": 3.6293031471713086, "learning_rate": 5.7103448275862074e-05, "loss": 0.9943, "step": 207 }, { "epoch": 0.3592400690846287, "grad_norm": 4.005788385471665, "learning_rate": 5.737931034482759e-05, "loss": 0.9936, "step": 208 }, { "epoch": 0.3609671848013817, "grad_norm": 3.535084106446588, "learning_rate": 5.765517241379311e-05, "loss": 1.0097, "step": 209 }, { "epoch": 0.3626943005181347, "grad_norm": 3.4496556926877906, "learning_rate": 5.7931034482758627e-05, "loss": 0.9788, "step": 210 }, { "epoch": 0.3644214162348877, "grad_norm": 2.758435384630932, "learning_rate": 5.820689655172414e-05, "loss": 0.9806, "step": 211 }, { "epoch": 0.36614853195164077, "grad_norm": 3.938395086601924, "learning_rate": 5.8482758620689664e-05, "loss": 0.9875, "step": 212 }, { "epoch": 0.36787564766839376, "grad_norm": 3.1166001915507984, "learning_rate": 5.875862068965518e-05, "loss": 0.9835, "step": 213 }, { "epoch": 0.3696027633851468, "grad_norm": 4.011545581450208, "learning_rate": 5.9034482758620695e-05, "loss": 0.9858, "step": 214 }, { "epoch": 0.37132987910189985, "grad_norm": 3.9196131940026935, "learning_rate": 5.931034482758622e-05, "loss": 0.9707, "step": 215 }, { "epoch": 0.37305699481865284, "grad_norm": 2.533598007474565, "learning_rate": 5.958620689655173e-05, "loss": 0.9583, "step": 216 }, { "epoch": 0.3747841105354059, "grad_norm": 2.42476419967734, "learning_rate": 5.986206896551725e-05, "loss": 0.9736, "step": 217 }, { "epoch": 0.3765112262521589, "grad_norm": 3.4365242037263934, "learning_rate": 6.013793103448276e-05, "loss": 0.9726, "step": 218 }, { "epoch": 0.37823834196891193, "grad_norm": 2.517678780108284, "learning_rate": 6.041379310344828e-05, "loss": 0.9801, "step": 219 }, { "epoch": 0.3799654576856649, "grad_norm": 4.024186425238897, "learning_rate": 6.068965517241379e-05, "loss": 0.9535, "step": 220 }, { "epoch": 0.38169257340241797, "grad_norm": 3.402588319349314, "learning_rate": 6.0965517241379315e-05, "loss": 0.9671, "step": 221 }, { "epoch": 0.38341968911917096, "grad_norm": 3.4461959439035383, "learning_rate": 6.124137931034483e-05, "loss": 0.9949, "step": 222 }, { "epoch": 0.385146804835924, "grad_norm": 3.3009726988550687, "learning_rate": 6.151724137931035e-05, "loss": 0.9886, "step": 223 }, { "epoch": 0.38687392055267705, "grad_norm": 2.9571386040407788, "learning_rate": 6.179310344827586e-05, "loss": 0.9565, "step": 224 }, { "epoch": 0.38860103626943004, "grad_norm": 2.496401232183266, "learning_rate": 6.206896551724138e-05, "loss": 0.9749, "step": 225 }, { "epoch": 0.3903281519861831, "grad_norm": 3.558764338282437, "learning_rate": 6.23448275862069e-05, "loss": 0.9867, "step": 226 }, { "epoch": 0.3920552677029361, "grad_norm": 3.1276166203517266, "learning_rate": 6.262068965517241e-05, "loss": 0.998, "step": 227 }, { "epoch": 0.39378238341968913, "grad_norm": 3.402288207825404, "learning_rate": 6.289655172413794e-05, "loss": 0.9707, "step": 228 }, { "epoch": 0.3955094991364421, "grad_norm": 3.1187724338458342, "learning_rate": 6.317241379310346e-05, "loss": 0.9989, "step": 229 }, { "epoch": 0.39723661485319517, "grad_norm": 2.9306227779744436, "learning_rate": 6.344827586206897e-05, "loss": 0.9606, "step": 230 }, { "epoch": 0.39896373056994816, "grad_norm": 2.4035418725371005, "learning_rate": 6.372413793103449e-05, "loss": 0.9637, "step": 231 }, { "epoch": 0.4006908462867012, "grad_norm": 3.720887522836001, "learning_rate": 6.400000000000001e-05, "loss": 0.9686, "step": 232 }, { "epoch": 0.40241796200345425, "grad_norm": 3.253833245620748, "learning_rate": 6.427586206896553e-05, "loss": 0.9789, "step": 233 }, { "epoch": 0.40414507772020725, "grad_norm": 2.929467097033268, "learning_rate": 6.455172413793104e-05, "loss": 0.9748, "step": 234 }, { "epoch": 0.4058721934369603, "grad_norm": 2.6641704391598533, "learning_rate": 6.482758620689655e-05, "loss": 0.9644, "step": 235 }, { "epoch": 0.4075993091537133, "grad_norm": 3.244535594099209, "learning_rate": 6.510344827586207e-05, "loss": 0.9679, "step": 236 }, { "epoch": 0.40932642487046633, "grad_norm": 2.667133592355478, "learning_rate": 6.53793103448276e-05, "loss": 0.9765, "step": 237 }, { "epoch": 0.4110535405872193, "grad_norm": 3.319691465219357, "learning_rate": 6.56551724137931e-05, "loss": 0.9663, "step": 238 }, { "epoch": 0.41278065630397237, "grad_norm": 3.009065940516306, "learning_rate": 6.593103448275862e-05, "loss": 0.9646, "step": 239 }, { "epoch": 0.41450777202072536, "grad_norm": 2.870627120869508, "learning_rate": 6.620689655172415e-05, "loss": 0.957, "step": 240 }, { "epoch": 0.4162348877374784, "grad_norm": 2.3215101742811353, "learning_rate": 6.648275862068966e-05, "loss": 0.9569, "step": 241 }, { "epoch": 0.41796200345423146, "grad_norm": 3.317037380186973, "learning_rate": 6.675862068965518e-05, "loss": 0.9654, "step": 242 }, { "epoch": 0.41968911917098445, "grad_norm": 2.9245218786929796, "learning_rate": 6.70344827586207e-05, "loss": 0.9635, "step": 243 }, { "epoch": 0.4214162348877375, "grad_norm": 2.6349099414622694, "learning_rate": 6.731034482758621e-05, "loss": 0.9713, "step": 244 }, { "epoch": 0.4231433506044905, "grad_norm": 2.1652369287319226, "learning_rate": 6.758620689655173e-05, "loss": 0.9707, "step": 245 }, { "epoch": 0.42487046632124353, "grad_norm": 3.237941888486808, "learning_rate": 6.786206896551725e-05, "loss": 0.9725, "step": 246 }, { "epoch": 0.4265975820379965, "grad_norm": 2.901363199175894, "learning_rate": 6.813793103448276e-05, "loss": 0.9716, "step": 247 }, { "epoch": 0.4283246977547496, "grad_norm": 2.6022336562120763, "learning_rate": 6.841379310344828e-05, "loss": 0.981, "step": 248 }, { "epoch": 0.43005181347150256, "grad_norm": 2.2620309279706006, "learning_rate": 6.86896551724138e-05, "loss": 0.9662, "step": 249 }, { "epoch": 0.4317789291882556, "grad_norm": 2.857966181255635, "learning_rate": 6.896551724137931e-05, "loss": 0.9584, "step": 250 }, { "epoch": 0.43350604490500866, "grad_norm": 2.3185628806782295, "learning_rate": 6.924137931034484e-05, "loss": 0.9749, "step": 251 }, { "epoch": 0.43523316062176165, "grad_norm": 2.865001268981118, "learning_rate": 6.951724137931034e-05, "loss": 0.9498, "step": 252 }, { "epoch": 0.4369602763385147, "grad_norm": 2.327932256111525, "learning_rate": 6.979310344827587e-05, "loss": 0.9779, "step": 253 }, { "epoch": 0.4386873920552677, "grad_norm": 2.621023297216621, "learning_rate": 7.006896551724139e-05, "loss": 0.9787, "step": 254 }, { "epoch": 0.44041450777202074, "grad_norm": 2.0932729301529207, "learning_rate": 7.03448275862069e-05, "loss": 0.9778, "step": 255 }, { "epoch": 0.4421416234887737, "grad_norm": 2.7223149731682796, "learning_rate": 7.062068965517242e-05, "loss": 0.9623, "step": 256 }, { "epoch": 0.4438687392055268, "grad_norm": 2.179369299779228, "learning_rate": 7.089655172413794e-05, "loss": 0.9754, "step": 257 }, { "epoch": 0.44559585492227977, "grad_norm": 2.3895496142365333, "learning_rate": 7.117241379310345e-05, "loss": 0.9727, "step": 258 }, { "epoch": 0.4473229706390328, "grad_norm": 1.912553604196251, "learning_rate": 7.144827586206897e-05, "loss": 0.9712, "step": 259 }, { "epoch": 0.44905008635578586, "grad_norm": 2.503549509995192, "learning_rate": 7.17241379310345e-05, "loss": 0.9686, "step": 260 }, { "epoch": 0.45077720207253885, "grad_norm": 1.9960952509845362, "learning_rate": 7.2e-05, "loss": 0.9788, "step": 261 }, { "epoch": 0.4525043177892919, "grad_norm": 2.0235901221216133, "learning_rate": 7.227586206896552e-05, "loss": 0.9579, "step": 262 }, { "epoch": 0.4542314335060449, "grad_norm": 2.689457383091535, "learning_rate": 7.255172413793105e-05, "loss": 0.9895, "step": 263 }, { "epoch": 0.45595854922279794, "grad_norm": 1.2620338416479764, "learning_rate": 7.282758620689655e-05, "loss": 0.9716, "step": 264 }, { "epoch": 0.45768566493955093, "grad_norm": 2.6003109485504243, "learning_rate": 7.310344827586208e-05, "loss": 0.9825, "step": 265 }, { "epoch": 0.459412780656304, "grad_norm": 2.1699546939060714, "learning_rate": 7.337931034482759e-05, "loss": 0.9773, "step": 266 }, { "epoch": 0.46113989637305697, "grad_norm": 2.10846895888519, "learning_rate": 7.365517241379311e-05, "loss": 0.9492, "step": 267 }, { "epoch": 0.46286701208981, "grad_norm": 3.0497662262240803, "learning_rate": 7.393103448275863e-05, "loss": 0.9741, "step": 268 }, { "epoch": 0.46459412780656306, "grad_norm": 2.2473914787216804, "learning_rate": 7.420689655172414e-05, "loss": 0.989, "step": 269 }, { "epoch": 0.46632124352331605, "grad_norm": 2.3553073462851164, "learning_rate": 7.448275862068966e-05, "loss": 0.9664, "step": 270 }, { "epoch": 0.4680483592400691, "grad_norm": 1.9699715080389066, "learning_rate": 7.475862068965518e-05, "loss": 0.9927, "step": 271 }, { "epoch": 0.4697754749568221, "grad_norm": 1.748861552447629, "learning_rate": 7.503448275862069e-05, "loss": 0.978, "step": 272 }, { "epoch": 0.47150259067357514, "grad_norm": 2.0443997456133918, "learning_rate": 7.531034482758621e-05, "loss": 0.9853, "step": 273 }, { "epoch": 0.47322970639032813, "grad_norm": 1.8943671891609803, "learning_rate": 7.558620689655173e-05, "loss": 0.9626, "step": 274 }, { "epoch": 0.4749568221070812, "grad_norm": 2.007946567979536, "learning_rate": 7.586206896551724e-05, "loss": 0.9629, "step": 275 }, { "epoch": 0.47668393782383417, "grad_norm": 1.4079511789087813, "learning_rate": 7.613793103448277e-05, "loss": 0.9698, "step": 276 }, { "epoch": 0.4784110535405872, "grad_norm": 2.157056884050364, "learning_rate": 7.641379310344829e-05, "loss": 0.9765, "step": 277 }, { "epoch": 0.48013816925734026, "grad_norm": 1.9905757611202703, "learning_rate": 7.66896551724138e-05, "loss": 0.97, "step": 278 }, { "epoch": 0.48186528497409326, "grad_norm": 1.8495506134986126, "learning_rate": 7.696551724137932e-05, "loss": 0.9667, "step": 279 }, { "epoch": 0.4835924006908463, "grad_norm": 1.3086331316766306, "learning_rate": 7.724137931034484e-05, "loss": 0.9734, "step": 280 }, { "epoch": 0.4853195164075993, "grad_norm": 2.4711467813866457, "learning_rate": 7.751724137931035e-05, "loss": 1.0094, "step": 281 }, { "epoch": 0.48704663212435234, "grad_norm": 2.2962389574445723, "learning_rate": 7.779310344827587e-05, "loss": 0.9812, "step": 282 }, { "epoch": 0.48877374784110533, "grad_norm": 1.6054477532806601, "learning_rate": 7.806896551724138e-05, "loss": 0.9936, "step": 283 }, { "epoch": 0.4905008635578584, "grad_norm": 3.184990092289826, "learning_rate": 7.83448275862069e-05, "loss": 0.9946, "step": 284 }, { "epoch": 0.49222797927461137, "grad_norm": 2.5078995187044817, "learning_rate": 7.862068965517242e-05, "loss": 1.0035, "step": 285 }, { "epoch": 0.4939550949913644, "grad_norm": 2.79424929913559, "learning_rate": 7.889655172413793e-05, "loss": 0.9748, "step": 286 }, { "epoch": 0.49568221070811747, "grad_norm": 2.518367655911046, "learning_rate": 7.917241379310345e-05, "loss": 0.9903, "step": 287 }, { "epoch": 0.49740932642487046, "grad_norm": 2.251756131611213, "learning_rate": 7.944827586206898e-05, "loss": 0.977, "step": 288 }, { "epoch": 0.4991364421416235, "grad_norm": 2.064349094708489, "learning_rate": 7.972413793103448e-05, "loss": 0.9781, "step": 289 }, { "epoch": 0.5008635578583766, "grad_norm": 1.8656724271352325, "learning_rate": 8e-05, "loss": 0.9692, "step": 290 }, { "epoch": 0.5025906735751295, "grad_norm": 2.0447863538659283, "learning_rate": 7.999997091197512e-05, "loss": 0.9654, "step": 291 }, { "epoch": 0.5043177892918825, "grad_norm": 1.7660950754287998, "learning_rate": 7.999988364794276e-05, "loss": 0.9953, "step": 292 }, { "epoch": 0.5060449050086355, "grad_norm": 1.7608867943608972, "learning_rate": 7.999973820802984e-05, "loss": 0.9664, "step": 293 }, { "epoch": 0.5077720207253886, "grad_norm": 1.5274638158822054, "learning_rate": 7.99995345924479e-05, "loss": 0.971, "step": 294 }, { "epoch": 0.5094991364421416, "grad_norm": 2.4064866195668637, "learning_rate": 7.999927280149307e-05, "loss": 0.9769, "step": 295 }, { "epoch": 0.5112262521588946, "grad_norm": 1.6258625123277073, "learning_rate": 7.999895283554609e-05, "loss": 0.9786, "step": 296 }, { "epoch": 0.5129533678756477, "grad_norm": 1.912160434710529, "learning_rate": 7.999857469507234e-05, "loss": 0.9689, "step": 297 }, { "epoch": 0.5146804835924007, "grad_norm": 1.7908027787058003, "learning_rate": 7.999813838062177e-05, "loss": 0.9704, "step": 298 }, { "epoch": 0.5164075993091537, "grad_norm": 2.2368345182538816, "learning_rate": 7.999764389282896e-05, "loss": 0.9425, "step": 299 }, { "epoch": 0.5181347150259067, "grad_norm": 1.4529201245194163, "learning_rate": 7.99970912324131e-05, "loss": 0.9455, "step": 300 }, { "epoch": 0.5198618307426598, "grad_norm": 2.32073432583536, "learning_rate": 7.999648040017798e-05, "loss": 0.9698, "step": 301 }, { "epoch": 0.5215889464594128, "grad_norm": 2.025396727399755, "learning_rate": 7.9995811397012e-05, "loss": 0.9802, "step": 302 }, { "epoch": 0.5233160621761658, "grad_norm": 1.3403481386847484, "learning_rate": 7.999508422388815e-05, "loss": 0.9621, "step": 303 }, { "epoch": 0.5250431778929189, "grad_norm": 1.708087294029539, "learning_rate": 7.999429888186402e-05, "loss": 0.973, "step": 304 }, { "epoch": 0.5267702936096719, "grad_norm": 1.3027391915839108, "learning_rate": 7.999345537208183e-05, "loss": 0.9665, "step": 305 }, { "epoch": 0.5284974093264249, "grad_norm": 2.7803878321629725, "learning_rate": 7.999255369576838e-05, "loss": 0.9577, "step": 306 }, { "epoch": 0.5302245250431779, "grad_norm": 1.758414320684093, "learning_rate": 7.999159385423507e-05, "loss": 0.9598, "step": 307 }, { "epoch": 0.531951640759931, "grad_norm": 2.4567202487700497, "learning_rate": 7.999057584887788e-05, "loss": 0.9582, "step": 308 }, { "epoch": 0.533678756476684, "grad_norm": 1.6999356771483132, "learning_rate": 7.998949968117741e-05, "loss": 0.9677, "step": 309 }, { "epoch": 0.5354058721934369, "grad_norm": 2.7200084442071617, "learning_rate": 7.998836535269885e-05, "loss": 0.9708, "step": 310 }, { "epoch": 0.5371329879101899, "grad_norm": 1.8540328466143723, "learning_rate": 7.998717286509194e-05, "loss": 0.9783, "step": 311 }, { "epoch": 0.538860103626943, "grad_norm": 2.3610479811974345, "learning_rate": 7.998592222009108e-05, "loss": 0.9851, "step": 312 }, { "epoch": 0.540587219343696, "grad_norm": 2.169657686890793, "learning_rate": 7.998461341951516e-05, "loss": 0.9475, "step": 313 }, { "epoch": 0.542314335060449, "grad_norm": 1.4223751743031756, "learning_rate": 7.998324646526772e-05, "loss": 0.9609, "step": 314 }, { "epoch": 0.5440414507772021, "grad_norm": 1.9614427661721223, "learning_rate": 7.998182135933687e-05, "loss": 0.9649, "step": 315 }, { "epoch": 0.5457685664939551, "grad_norm": 1.9549559285570357, "learning_rate": 7.998033810379529e-05, "loss": 0.9729, "step": 316 }, { "epoch": 0.5474956822107081, "grad_norm": 1.660889440414169, "learning_rate": 7.997879670080022e-05, "loss": 0.9474, "step": 317 }, { "epoch": 0.5492227979274611, "grad_norm": 1.7716042916594266, "learning_rate": 7.997719715259346e-05, "loss": 0.9726, "step": 318 }, { "epoch": 0.5509499136442142, "grad_norm": 2.582503402243517, "learning_rate": 7.997553946150142e-05, "loss": 0.9856, "step": 319 }, { "epoch": 0.5526770293609672, "grad_norm": 1.3075395209730745, "learning_rate": 7.997382362993505e-05, "loss": 0.9565, "step": 320 }, { "epoch": 0.5544041450777202, "grad_norm": 2.147674879714373, "learning_rate": 7.997204966038983e-05, "loss": 0.9695, "step": 321 }, { "epoch": 0.5561312607944733, "grad_norm": 2.3815813493952014, "learning_rate": 7.997021755544583e-05, "loss": 0.9532, "step": 322 }, { "epoch": 0.5578583765112263, "grad_norm": 1.4700731529316198, "learning_rate": 7.996832731776772e-05, "loss": 0.9642, "step": 323 }, { "epoch": 0.5595854922279793, "grad_norm": 2.357194604055875, "learning_rate": 7.996637895010457e-05, "loss": 0.9698, "step": 324 }, { "epoch": 0.5613126079447323, "grad_norm": 1.3496163869138698, "learning_rate": 7.996437245529017e-05, "loss": 0.9657, "step": 325 }, { "epoch": 0.5630397236614854, "grad_norm": 2.144384149895314, "learning_rate": 7.996230783624272e-05, "loss": 0.9657, "step": 326 }, { "epoch": 0.5647668393782384, "grad_norm": 1.4876524421870254, "learning_rate": 7.996018509596503e-05, "loss": 0.9622, "step": 327 }, { "epoch": 0.5664939550949913, "grad_norm": 1.888861337614147, "learning_rate": 7.99580042375444e-05, "loss": 0.961, "step": 328 }, { "epoch": 0.5682210708117443, "grad_norm": 1.919869823135174, "learning_rate": 7.995576526415271e-05, "loss": 0.9563, "step": 329 }, { "epoch": 0.5699481865284974, "grad_norm": 1.989523873910073, "learning_rate": 7.995346817904627e-05, "loss": 0.9653, "step": 330 }, { "epoch": 0.5716753022452504, "grad_norm": 1.3504136077915698, "learning_rate": 7.9951112985566e-05, "loss": 0.9576, "step": 331 }, { "epoch": 0.5734024179620034, "grad_norm": 2.042933860376093, "learning_rate": 7.994869968713728e-05, "loss": 0.9682, "step": 332 }, { "epoch": 0.5751295336787565, "grad_norm": 2.119411349830954, "learning_rate": 7.994622828727002e-05, "loss": 0.9558, "step": 333 }, { "epoch": 0.5768566493955095, "grad_norm": 1.4051336383592772, "learning_rate": 7.994369878955864e-05, "loss": 0.958, "step": 334 }, { "epoch": 0.5785837651122625, "grad_norm": 1.3281153498626426, "learning_rate": 7.994111119768202e-05, "loss": 0.9439, "step": 335 }, { "epoch": 0.5803108808290155, "grad_norm": 2.033661927291712, "learning_rate": 7.993846551540356e-05, "loss": 0.9376, "step": 336 }, { "epoch": 0.5820379965457686, "grad_norm": 2.699041496971939, "learning_rate": 7.993576174657118e-05, "loss": 0.9788, "step": 337 }, { "epoch": 0.5837651122625216, "grad_norm": 1.2017107200456254, "learning_rate": 7.99329998951172e-05, "loss": 0.9438, "step": 338 }, { "epoch": 0.5854922279792746, "grad_norm": 3.757640474630509, "learning_rate": 7.993017996505847e-05, "loss": 0.9655, "step": 339 }, { "epoch": 0.5872193436960277, "grad_norm": 2.62088224701973, "learning_rate": 7.992730196049632e-05, "loss": 0.9647, "step": 340 }, { "epoch": 0.5889464594127807, "grad_norm": 3.871431745781576, "learning_rate": 7.992436588561651e-05, "loss": 0.9921, "step": 341 }, { "epoch": 0.5906735751295337, "grad_norm": 3.1552338311689843, "learning_rate": 7.992137174468928e-05, "loss": 0.9796, "step": 342 }, { "epoch": 0.5924006908462867, "grad_norm": 3.0721700913538945, "learning_rate": 7.991831954206928e-05, "loss": 0.9571, "step": 343 }, { "epoch": 0.5941278065630398, "grad_norm": 2.7961495551757345, "learning_rate": 7.99152092821957e-05, "loss": 0.9698, "step": 344 }, { "epoch": 0.5958549222797928, "grad_norm": 2.432978315608702, "learning_rate": 7.991204096959203e-05, "loss": 0.9612, "step": 345 }, { "epoch": 0.5975820379965457, "grad_norm": 2.3083698256582843, "learning_rate": 7.990881460886633e-05, "loss": 0.9622, "step": 346 }, { "epoch": 0.5993091537132987, "grad_norm": 2.2253354570364583, "learning_rate": 7.990553020471097e-05, "loss": 0.9552, "step": 347 }, { "epoch": 0.6010362694300518, "grad_norm": 1.7945202157892455, "learning_rate": 7.990218776190285e-05, "loss": 0.9762, "step": 348 }, { "epoch": 0.6027633851468048, "grad_norm": 2.968156898167295, "learning_rate": 7.989878728530318e-05, "loss": 0.9589, "step": 349 }, { "epoch": 0.6044905008635578, "grad_norm": 2.140433744019964, "learning_rate": 7.989532877985763e-05, "loss": 0.9683, "step": 350 }, { "epoch": 0.6062176165803109, "grad_norm": 2.8025739945195007, "learning_rate": 7.989181225059625e-05, "loss": 0.9332, "step": 351 }, { "epoch": 0.6079447322970639, "grad_norm": 2.6159824589294836, "learning_rate": 7.98882377026335e-05, "loss": 0.9674, "step": 352 }, { "epoch": 0.6096718480138169, "grad_norm": 2.0164190488281526, "learning_rate": 7.98846051411682e-05, "loss": 0.9464, "step": 353 }, { "epoch": 0.6113989637305699, "grad_norm": 2.2861546994970143, "learning_rate": 7.988091457148353e-05, "loss": 0.9351, "step": 354 }, { "epoch": 0.613126079447323, "grad_norm": 1.8230351787779564, "learning_rate": 7.987716599894709e-05, "loss": 0.9627, "step": 355 }, { "epoch": 0.614853195164076, "grad_norm": 0.976296360989386, "learning_rate": 7.987335942901079e-05, "loss": 0.9385, "step": 356 }, { "epoch": 0.616580310880829, "grad_norm": 2.7595015393080846, "learning_rate": 7.986949486721092e-05, "loss": 0.9462, "step": 357 }, { "epoch": 0.6183074265975821, "grad_norm": 1.361471415662833, "learning_rate": 7.98655723191681e-05, "loss": 0.9451, "step": 358 }, { "epoch": 0.6200345423143351, "grad_norm": 2.6861494324104855, "learning_rate": 7.986159179058728e-05, "loss": 0.9637, "step": 359 }, { "epoch": 0.6217616580310881, "grad_norm": 1.8994462577318454, "learning_rate": 7.985755328725776e-05, "loss": 0.9689, "step": 360 }, { "epoch": 0.6234887737478411, "grad_norm": 2.1027898168899646, "learning_rate": 7.985345681505313e-05, "loss": 0.9532, "step": 361 }, { "epoch": 0.6252158894645942, "grad_norm": 1.6326499850932708, "learning_rate": 7.984930237993132e-05, "loss": 0.9584, "step": 362 }, { "epoch": 0.6269430051813472, "grad_norm": 1.3405434358375616, "learning_rate": 7.984508998793454e-05, "loss": 0.9519, "step": 363 }, { "epoch": 0.6286701208981001, "grad_norm": 2.480803958128163, "learning_rate": 7.98408196451893e-05, "loss": 0.9674, "step": 364 }, { "epoch": 0.6303972366148531, "grad_norm": 1.6297806897878742, "learning_rate": 7.983649135790637e-05, "loss": 0.9835, "step": 365 }, { "epoch": 0.6321243523316062, "grad_norm": 1.656187295687212, "learning_rate": 7.983210513238085e-05, "loss": 0.9523, "step": 366 }, { "epoch": 0.6338514680483592, "grad_norm": 1.165786759932975, "learning_rate": 7.982766097499206e-05, "loss": 0.9652, "step": 367 }, { "epoch": 0.6355785837651122, "grad_norm": 2.812254238702045, "learning_rate": 7.982315889220356e-05, "loss": 0.9709, "step": 368 }, { "epoch": 0.6373056994818653, "grad_norm": 1.8547137491504977, "learning_rate": 7.981859889056324e-05, "loss": 0.9781, "step": 369 }, { "epoch": 0.6390328151986183, "grad_norm": 1.940346517211301, "learning_rate": 7.981398097670312e-05, "loss": 0.9714, "step": 370 }, { "epoch": 0.6407599309153713, "grad_norm": 2.015365289510412, "learning_rate": 7.980930515733952e-05, "loss": 0.9772, "step": 371 }, { "epoch": 0.6424870466321243, "grad_norm": 1.4589500098587829, "learning_rate": 7.980457143927297e-05, "loss": 0.9716, "step": 372 }, { "epoch": 0.6442141623488774, "grad_norm": 1.957998432999204, "learning_rate": 7.979977982938818e-05, "loss": 0.9654, "step": 373 }, { "epoch": 0.6459412780656304, "grad_norm": 1.6841414159944708, "learning_rate": 7.979493033465408e-05, "loss": 0.9609, "step": 374 }, { "epoch": 0.6476683937823834, "grad_norm": 1.441993049937988, "learning_rate": 7.979002296212379e-05, "loss": 0.9887, "step": 375 }, { "epoch": 0.6493955094991365, "grad_norm": 1.4482827538173317, "learning_rate": 7.978505771893457e-05, "loss": 0.9579, "step": 376 }, { "epoch": 0.6511226252158895, "grad_norm": 2.465587692687215, "learning_rate": 7.978003461230789e-05, "loss": 0.9486, "step": 377 }, { "epoch": 0.6528497409326425, "grad_norm": 1.4468209858888794, "learning_rate": 7.977495364954937e-05, "loss": 0.9655, "step": 378 }, { "epoch": 0.6545768566493955, "grad_norm": 2.4780976730191404, "learning_rate": 7.976981483804876e-05, "loss": 0.9693, "step": 379 }, { "epoch": 0.6563039723661486, "grad_norm": 1.7544474279427458, "learning_rate": 7.976461818527996e-05, "loss": 0.9684, "step": 380 }, { "epoch": 0.6580310880829016, "grad_norm": 2.104557961398843, "learning_rate": 7.9759363698801e-05, "loss": 0.9546, "step": 381 }, { "epoch": 0.6597582037996546, "grad_norm": 1.6437435533172073, "learning_rate": 7.975405138625399e-05, "loss": 0.9473, "step": 382 }, { "epoch": 0.6614853195164075, "grad_norm": 1.8217991923911958, "learning_rate": 7.974868125536516e-05, "loss": 0.9561, "step": 383 }, { "epoch": 0.6632124352331606, "grad_norm": 1.4065214347310642, "learning_rate": 7.974325331394486e-05, "loss": 0.9482, "step": 384 }, { "epoch": 0.6649395509499136, "grad_norm": 2.1294647644597053, "learning_rate": 7.973776756988746e-05, "loss": 0.9854, "step": 385 }, { "epoch": 0.6666666666666666, "grad_norm": 1.7683373171826446, "learning_rate": 7.973222403117149e-05, "loss": 0.9635, "step": 386 }, { "epoch": 0.6683937823834197, "grad_norm": 1.6957664503032732, "learning_rate": 7.972662270585941e-05, "loss": 0.9572, "step": 387 }, { "epoch": 0.6701208981001727, "grad_norm": 1.6140583833625228, "learning_rate": 7.972096360209784e-05, "loss": 0.9679, "step": 388 }, { "epoch": 0.6718480138169257, "grad_norm": 1.8258688298905508, "learning_rate": 7.97152467281174e-05, "loss": 0.9458, "step": 389 }, { "epoch": 0.6735751295336787, "grad_norm": 1.4747296387103537, "learning_rate": 7.970947209223268e-05, "loss": 0.9324, "step": 390 }, { "epoch": 0.6753022452504318, "grad_norm": 1.7064907753905676, "learning_rate": 7.970363970284233e-05, "loss": 0.9472, "step": 391 }, { "epoch": 0.6770293609671848, "grad_norm": 1.6417721454689531, "learning_rate": 7.969774956842898e-05, "loss": 0.944, "step": 392 }, { "epoch": 0.6787564766839378, "grad_norm": 2.2196443599310536, "learning_rate": 7.969180169755926e-05, "loss": 0.9551, "step": 393 }, { "epoch": 0.6804835924006909, "grad_norm": 2.007714025566053, "learning_rate": 7.968579609888375e-05, "loss": 0.9695, "step": 394 }, { "epoch": 0.6822107081174439, "grad_norm": 0.9733043354244745, "learning_rate": 7.967973278113702e-05, "loss": 0.9577, "step": 395 }, { "epoch": 0.6839378238341969, "grad_norm": 2.825203450168701, "learning_rate": 7.967361175313753e-05, "loss": 0.9728, "step": 396 }, { "epoch": 0.6856649395509499, "grad_norm": 1.9508027437306406, "learning_rate": 7.966743302378776e-05, "loss": 0.9648, "step": 397 }, { "epoch": 0.687392055267703, "grad_norm": 1.927330882847649, "learning_rate": 7.966119660207403e-05, "loss": 0.9469, "step": 398 }, { "epoch": 0.689119170984456, "grad_norm": 1.8227112048088396, "learning_rate": 7.96549024970666e-05, "loss": 0.9409, "step": 399 }, { "epoch": 0.690846286701209, "grad_norm": 1.5369260378478058, "learning_rate": 7.964855071791964e-05, "loss": 0.9426, "step": 400 }, { "epoch": 0.6925734024179621, "grad_norm": 1.9218938840812865, "learning_rate": 7.964214127387116e-05, "loss": 0.959, "step": 401 }, { "epoch": 0.694300518134715, "grad_norm": 1.688270887142604, "learning_rate": 7.963567417424309e-05, "loss": 0.9732, "step": 402 }, { "epoch": 0.696027633851468, "grad_norm": 1.7700239719753486, "learning_rate": 7.962914942844117e-05, "loss": 0.9602, "step": 403 }, { "epoch": 0.697754749568221, "grad_norm": 1.4002444884468643, "learning_rate": 7.962256704595501e-05, "loss": 0.9647, "step": 404 }, { "epoch": 0.6994818652849741, "grad_norm": 2.1983078607070814, "learning_rate": 7.961592703635803e-05, "loss": 0.9361, "step": 405 }, { "epoch": 0.7012089810017271, "grad_norm": 1.2300561736972677, "learning_rate": 7.960922940930746e-05, "loss": 0.9572, "step": 406 }, { "epoch": 0.7029360967184801, "grad_norm": 2.322066914062218, "learning_rate": 7.960247417454436e-05, "loss": 0.9503, "step": 407 }, { "epoch": 0.7046632124352331, "grad_norm": 1.5786391986907202, "learning_rate": 7.959566134189352e-05, "loss": 0.97, "step": 408 }, { "epoch": 0.7063903281519862, "grad_norm": 1.5790708047414046, "learning_rate": 7.958879092126355e-05, "loss": 0.9646, "step": 409 }, { "epoch": 0.7081174438687392, "grad_norm": 1.6820544929707515, "learning_rate": 7.95818629226468e-05, "loss": 0.9591, "step": 410 }, { "epoch": 0.7098445595854922, "grad_norm": 2.514885221609998, "learning_rate": 7.957487735611936e-05, "loss": 0.9464, "step": 411 }, { "epoch": 0.7115716753022453, "grad_norm": 1.4905600145507574, "learning_rate": 7.956783423184106e-05, "loss": 0.9335, "step": 412 }, { "epoch": 0.7132987910189983, "grad_norm": 2.129778072508937, "learning_rate": 7.95607335600554e-05, "loss": 0.9626, "step": 413 }, { "epoch": 0.7150259067357513, "grad_norm": 1.631968566008413, "learning_rate": 7.955357535108962e-05, "loss": 0.9526, "step": 414 }, { "epoch": 0.7167530224525043, "grad_norm": 2.339186619087506, "learning_rate": 7.954635961535461e-05, "loss": 0.9445, "step": 415 }, { "epoch": 0.7184801381692574, "grad_norm": 1.9702495571941914, "learning_rate": 7.953908636334498e-05, "loss": 0.9433, "step": 416 }, { "epoch": 0.7202072538860104, "grad_norm": 1.2800861455802754, "learning_rate": 7.953175560563894e-05, "loss": 0.9421, "step": 417 }, { "epoch": 0.7219343696027634, "grad_norm": 2.6384278961918275, "learning_rate": 7.952436735289834e-05, "loss": 0.9285, "step": 418 }, { "epoch": 0.7236614853195165, "grad_norm": 1.557338087511295, "learning_rate": 7.951692161586868e-05, "loss": 0.9685, "step": 419 }, { "epoch": 0.7253886010362695, "grad_norm": 1.4912405518605727, "learning_rate": 7.950941840537906e-05, "loss": 0.959, "step": 420 }, { "epoch": 0.7271157167530224, "grad_norm": 1.7181703563395743, "learning_rate": 7.950185773234213e-05, "loss": 0.9459, "step": 421 }, { "epoch": 0.7288428324697754, "grad_norm": 1.901728710543773, "learning_rate": 7.949423960775415e-05, "loss": 0.9595, "step": 422 }, { "epoch": 0.7305699481865285, "grad_norm": 2.0436910677141165, "learning_rate": 7.948656404269495e-05, "loss": 0.9438, "step": 423 }, { "epoch": 0.7322970639032815, "grad_norm": 1.374301248589808, "learning_rate": 7.947883104832785e-05, "loss": 0.9421, "step": 424 }, { "epoch": 0.7340241796200345, "grad_norm": 2.1536610154101066, "learning_rate": 7.947104063589975e-05, "loss": 0.9493, "step": 425 }, { "epoch": 0.7357512953367875, "grad_norm": 1.6915645762436184, "learning_rate": 7.946319281674104e-05, "loss": 0.9433, "step": 426 }, { "epoch": 0.7374784110535406, "grad_norm": 1.8479000896092064, "learning_rate": 7.945528760226557e-05, "loss": 0.9456, "step": 427 }, { "epoch": 0.7392055267702936, "grad_norm": 1.5864479100260893, "learning_rate": 7.944732500397072e-05, "loss": 0.9541, "step": 428 }, { "epoch": 0.7409326424870466, "grad_norm": 1.4698946555743528, "learning_rate": 7.94393050334373e-05, "loss": 0.984, "step": 429 }, { "epoch": 0.7426597582037997, "grad_norm": 2.5314434109084427, "learning_rate": 7.943122770232956e-05, "loss": 0.9461, "step": 430 }, { "epoch": 0.7443868739205527, "grad_norm": 1.4509435587759134, "learning_rate": 7.942309302239517e-05, "loss": 0.9531, "step": 431 }, { "epoch": 0.7461139896373057, "grad_norm": 1.6422137703471584, "learning_rate": 7.941490100546523e-05, "loss": 0.949, "step": 432 }, { "epoch": 0.7478411053540587, "grad_norm": 2.763404417096794, "learning_rate": 7.940665166345422e-05, "loss": 0.971, "step": 433 }, { "epoch": 0.7495682210708118, "grad_norm": 1.6348577995920932, "learning_rate": 7.939834500835999e-05, "loss": 0.9611, "step": 434 }, { "epoch": 0.7512953367875648, "grad_norm": 2.7266898340705636, "learning_rate": 7.938998105226374e-05, "loss": 0.9677, "step": 435 }, { "epoch": 0.7530224525043178, "grad_norm": 1.8166551442062389, "learning_rate": 7.938155980733005e-05, "loss": 0.9445, "step": 436 }, { "epoch": 0.7547495682210709, "grad_norm": 2.3432267477165003, "learning_rate": 7.937308128580675e-05, "loss": 0.9414, "step": 437 }, { "epoch": 0.7564766839378239, "grad_norm": 1.3998545266867657, "learning_rate": 7.936454550002504e-05, "loss": 0.9437, "step": 438 }, { "epoch": 0.7582037996545768, "grad_norm": 2.3087534034042796, "learning_rate": 7.935595246239936e-05, "loss": 0.9696, "step": 439 }, { "epoch": 0.7599309153713298, "grad_norm": 1.9766303313203333, "learning_rate": 7.934730218542745e-05, "loss": 0.9512, "step": 440 }, { "epoch": 0.7616580310880829, "grad_norm": 1.8464214491219417, "learning_rate": 7.933859468169026e-05, "loss": 0.9609, "step": 441 }, { "epoch": 0.7633851468048359, "grad_norm": 2.449265244686902, "learning_rate": 7.932982996385204e-05, "loss": 0.9328, "step": 442 }, { "epoch": 0.7651122625215889, "grad_norm": 1.7483970319557836, "learning_rate": 7.932100804466015e-05, "loss": 0.9445, "step": 443 }, { "epoch": 0.7668393782383419, "grad_norm": 1.7634017867805376, "learning_rate": 7.931212893694523e-05, "loss": 0.9431, "step": 444 }, { "epoch": 0.768566493955095, "grad_norm": 1.7175731949573552, "learning_rate": 7.930319265362107e-05, "loss": 0.95, "step": 445 }, { "epoch": 0.770293609671848, "grad_norm": 1.7123102675944613, "learning_rate": 7.92941992076846e-05, "loss": 0.9579, "step": 446 }, { "epoch": 0.772020725388601, "grad_norm": 2.002500312350649, "learning_rate": 7.92851486122159e-05, "loss": 0.9527, "step": 447 }, { "epoch": 0.7737478411053541, "grad_norm": 1.5659878405169405, "learning_rate": 7.927604088037818e-05, "loss": 0.9408, "step": 448 }, { "epoch": 0.7754749568221071, "grad_norm": 1.0883868116378206, "learning_rate": 7.926687602541772e-05, "loss": 0.9284, "step": 449 }, { "epoch": 0.7772020725388601, "grad_norm": 2.1470864279880173, "learning_rate": 7.92576540606639e-05, "loss": 0.9417, "step": 450 }, { "epoch": 0.7789291882556131, "grad_norm": 1.4654720360271047, "learning_rate": 7.924837499952915e-05, "loss": 0.9488, "step": 451 }, { "epoch": 0.7806563039723662, "grad_norm": 2.4370269372209536, "learning_rate": 7.923903885550897e-05, "loss": 0.947, "step": 452 }, { "epoch": 0.7823834196891192, "grad_norm": 1.4617348594906747, "learning_rate": 7.922964564218184e-05, "loss": 0.9493, "step": 453 }, { "epoch": 0.7841105354058722, "grad_norm": 2.412788555404472, "learning_rate": 7.922019537320929e-05, "loss": 0.9523, "step": 454 }, { "epoch": 0.7858376511226253, "grad_norm": 1.9961354144745855, "learning_rate": 7.921068806233577e-05, "loss": 0.9514, "step": 455 }, { "epoch": 0.7875647668393783, "grad_norm": 2.10288513806563, "learning_rate": 7.920112372338873e-05, "loss": 0.9302, "step": 456 }, { "epoch": 0.7892918825561313, "grad_norm": 1.7060031593669256, "learning_rate": 7.919150237027856e-05, "loss": 0.9471, "step": 457 }, { "epoch": 0.7910189982728842, "grad_norm": 1.7868610053874667, "learning_rate": 7.918182401699858e-05, "loss": 0.927, "step": 458 }, { "epoch": 0.7927461139896373, "grad_norm": 1.8389902801920006, "learning_rate": 7.917208867762497e-05, "loss": 0.9416, "step": 459 }, { "epoch": 0.7944732297063903, "grad_norm": 1.5111707658784796, "learning_rate": 7.916229636631686e-05, "loss": 0.9638, "step": 460 }, { "epoch": 0.7962003454231433, "grad_norm": 1.38827710290977, "learning_rate": 7.915244709731618e-05, "loss": 0.9392, "step": 461 }, { "epoch": 0.7979274611398963, "grad_norm": 2.3088774983199496, "learning_rate": 7.91425408849477e-05, "loss": 0.9309, "step": 462 }, { "epoch": 0.7996545768566494, "grad_norm": 1.0155616140208144, "learning_rate": 7.913257774361907e-05, "loss": 0.9643, "step": 463 }, { "epoch": 0.8013816925734024, "grad_norm": 2.325302379777068, "learning_rate": 7.912255768782067e-05, "loss": 0.9485, "step": 464 }, { "epoch": 0.8031088082901554, "grad_norm": 1.6259369921577451, "learning_rate": 7.911248073212565e-05, "loss": 0.9383, "step": 465 }, { "epoch": 0.8048359240069085, "grad_norm": 2.195338057944829, "learning_rate": 7.910234689119001e-05, "loss": 0.965, "step": 466 }, { "epoch": 0.8065630397236615, "grad_norm": 1.9070188000745818, "learning_rate": 7.90921561797524e-05, "loss": 0.9493, "step": 467 }, { "epoch": 0.8082901554404145, "grad_norm": 1.6974881120440466, "learning_rate": 7.908190861263416e-05, "loss": 0.9482, "step": 468 }, { "epoch": 0.8100172711571675, "grad_norm": 1.6917911573725228, "learning_rate": 7.907160420473942e-05, "loss": 0.9571, "step": 469 }, { "epoch": 0.8117443868739206, "grad_norm": 1.4338526221646386, "learning_rate": 7.906124297105489e-05, "loss": 0.9666, "step": 470 }, { "epoch": 0.8134715025906736, "grad_norm": 2.0519816544714287, "learning_rate": 7.905082492664999e-05, "loss": 0.9689, "step": 471 }, { "epoch": 0.8151986183074266, "grad_norm": 1.3653769152987032, "learning_rate": 7.90403500866767e-05, "loss": 0.9653, "step": 472 }, { "epoch": 0.8169257340241797, "grad_norm": 1.918013346349042, "learning_rate": 7.902981846636968e-05, "loss": 0.9604, "step": 473 }, { "epoch": 0.8186528497409327, "grad_norm": 1.4924751249710988, "learning_rate": 7.901923008104609e-05, "loss": 0.9209, "step": 474 }, { "epoch": 0.8203799654576857, "grad_norm": 2.0787486933079773, "learning_rate": 7.900858494610573e-05, "loss": 0.9274, "step": 475 }, { "epoch": 0.8221070811744386, "grad_norm": 1.0890288566156414, "learning_rate": 7.899788307703088e-05, "loss": 0.945, "step": 476 }, { "epoch": 0.8238341968911918, "grad_norm": 1.7295482340798187, "learning_rate": 7.898712448938634e-05, "loss": 0.9401, "step": 477 }, { "epoch": 0.8255613126079447, "grad_norm": 1.633351938169091, "learning_rate": 7.897630919881943e-05, "loss": 0.9413, "step": 478 }, { "epoch": 0.8272884283246977, "grad_norm": 1.5618198955343818, "learning_rate": 7.896543722105991e-05, "loss": 0.9601, "step": 479 }, { "epoch": 0.8290155440414507, "grad_norm": 1.779365112947071, "learning_rate": 7.895450857192001e-05, "loss": 0.9471, "step": 480 }, { "epoch": 0.8307426597582038, "grad_norm": 1.7306509433460506, "learning_rate": 7.894352326729437e-05, "loss": 0.8996, "step": 481 }, { "epoch": 0.8324697754749568, "grad_norm": 1.2498113581833115, "learning_rate": 7.893248132316002e-05, "loss": 0.9495, "step": 482 }, { "epoch": 0.8341968911917098, "grad_norm": 1.223560042848609, "learning_rate": 7.892138275557639e-05, "loss": 0.9437, "step": 483 }, { "epoch": 0.8359240069084629, "grad_norm": 2.9621560902936577, "learning_rate": 7.891022758068526e-05, "loss": 0.9484, "step": 484 }, { "epoch": 0.8376511226252159, "grad_norm": 1.2600523928196043, "learning_rate": 7.889901581471068e-05, "loss": 0.9324, "step": 485 }, { "epoch": 0.8393782383419689, "grad_norm": 4.048219738979023, "learning_rate": 7.88877474739591e-05, "loss": 0.9425, "step": 486 }, { "epoch": 0.8411053540587219, "grad_norm": 2.847331879405994, "learning_rate": 7.887642257481922e-05, "loss": 0.9785, "step": 487 }, { "epoch": 0.842832469775475, "grad_norm": 3.834196648167804, "learning_rate": 7.886504113376196e-05, "loss": 0.9574, "step": 488 }, { "epoch": 0.844559585492228, "grad_norm": 2.6503101422283297, "learning_rate": 7.88536031673405e-05, "loss": 0.9527, "step": 489 }, { "epoch": 0.846286701208981, "grad_norm": 3.5940496365123966, "learning_rate": 7.884210869219026e-05, "loss": 0.9683, "step": 490 }, { "epoch": 0.8480138169257341, "grad_norm": 3.482513235450149, "learning_rate": 7.883055772502877e-05, "loss": 0.9373, "step": 491 }, { "epoch": 0.8497409326424871, "grad_norm": 2.2043096594469302, "learning_rate": 7.881895028265583e-05, "loss": 0.9607, "step": 492 }, { "epoch": 0.8514680483592401, "grad_norm": 2.499873782995305, "learning_rate": 7.880728638195327e-05, "loss": 0.9623, "step": 493 }, { "epoch": 0.853195164075993, "grad_norm": 1.7828362796312271, "learning_rate": 7.879556603988512e-05, "loss": 0.9637, "step": 494 }, { "epoch": 0.8549222797927462, "grad_norm": 2.3275581557620333, "learning_rate": 7.878378927349744e-05, "loss": 0.9677, "step": 495 }, { "epoch": 0.8566493955094991, "grad_norm": 1.5405219604522895, "learning_rate": 7.877195609991836e-05, "loss": 0.9609, "step": 496 }, { "epoch": 0.8583765112262521, "grad_norm": 2.5993757831817663, "learning_rate": 7.87600665363581e-05, "loss": 0.9671, "step": 497 }, { "epoch": 0.8601036269430051, "grad_norm": 1.9869126706957867, "learning_rate": 7.874812060010882e-05, "loss": 0.9471, "step": 498 }, { "epoch": 0.8618307426597582, "grad_norm": 2.495766884764808, "learning_rate": 7.873611830854472e-05, "loss": 0.9338, "step": 499 }, { "epoch": 0.8635578583765112, "grad_norm": 2.483640191896434, "learning_rate": 7.872405967912196e-05, "loss": 0.9502, "step": 500 }, { "epoch": 0.8652849740932642, "grad_norm": 1.8627866650466258, "learning_rate": 7.871194472937859e-05, "loss": 0.9217, "step": 501 }, { "epoch": 0.8670120898100173, "grad_norm": 2.536778746677032, "learning_rate": 7.869977347693464e-05, "loss": 0.9307, "step": 502 }, { "epoch": 0.8687392055267703, "grad_norm": 1.3429929836920727, "learning_rate": 7.868754593949199e-05, "loss": 0.9208, "step": 503 }, { "epoch": 0.8704663212435233, "grad_norm": 3.221556457412879, "learning_rate": 7.867526213483438e-05, "loss": 0.9396, "step": 504 }, { "epoch": 0.8721934369602763, "grad_norm": 2.2675079340852515, "learning_rate": 7.866292208082738e-05, "loss": 0.953, "step": 505 }, { "epoch": 0.8739205526770294, "grad_norm": 2.548608315768669, "learning_rate": 7.865052579541841e-05, "loss": 0.9495, "step": 506 }, { "epoch": 0.8756476683937824, "grad_norm": 1.6275108959338453, "learning_rate": 7.863807329663662e-05, "loss": 0.9514, "step": 507 }, { "epoch": 0.8773747841105354, "grad_norm": 1.6594120775569807, "learning_rate": 7.862556460259294e-05, "loss": 0.9437, "step": 508 }, { "epoch": 0.8791018998272885, "grad_norm": 2.361576299015049, "learning_rate": 7.861299973148003e-05, "loss": 0.9441, "step": 509 }, { "epoch": 0.8808290155440415, "grad_norm": 1.5874600063586255, "learning_rate": 7.860037870157226e-05, "loss": 0.9307, "step": 510 }, { "epoch": 0.8825561312607945, "grad_norm": 2.5577308924627187, "learning_rate": 7.858770153122567e-05, "loss": 0.951, "step": 511 }, { "epoch": 0.8842832469775475, "grad_norm": 1.6838812633137203, "learning_rate": 7.857496823887796e-05, "loss": 0.9291, "step": 512 }, { "epoch": 0.8860103626943006, "grad_norm": 2.189348912175924, "learning_rate": 7.856217884304843e-05, "loss": 0.9435, "step": 513 }, { "epoch": 0.8877374784110535, "grad_norm": 1.7083942121445175, "learning_rate": 7.854933336233802e-05, "loss": 0.9185, "step": 514 }, { "epoch": 0.8894645941278065, "grad_norm": 2.069160479544157, "learning_rate": 7.853643181542917e-05, "loss": 0.9392, "step": 515 }, { "epoch": 0.8911917098445595, "grad_norm": 1.5439631570119998, "learning_rate": 7.852347422108594e-05, "loss": 0.959, "step": 516 }, { "epoch": 0.8929188255613126, "grad_norm": 2.4187433270499583, "learning_rate": 7.851046059815386e-05, "loss": 0.9439, "step": 517 }, { "epoch": 0.8946459412780656, "grad_norm": 1.3609503825737594, "learning_rate": 7.849739096555997e-05, "loss": 0.9605, "step": 518 }, { "epoch": 0.8963730569948186, "grad_norm": 2.852613332354843, "learning_rate": 7.848426534231275e-05, "loss": 0.9475, "step": 519 }, { "epoch": 0.8981001727115717, "grad_norm": 1.9724176286727546, "learning_rate": 7.847108374750211e-05, "loss": 0.9562, "step": 520 }, { "epoch": 0.8998272884283247, "grad_norm": 2.6213268043658444, "learning_rate": 7.84578462002994e-05, "loss": 0.9608, "step": 521 }, { "epoch": 0.9015544041450777, "grad_norm": 1.8439425543257437, "learning_rate": 7.844455271995734e-05, "loss": 0.9466, "step": 522 }, { "epoch": 0.9032815198618307, "grad_norm": 1.6949636835658672, "learning_rate": 7.843120332580993e-05, "loss": 0.9299, "step": 523 }, { "epoch": 0.9050086355785838, "grad_norm": 1.9552520441334837, "learning_rate": 7.841779803727258e-05, "loss": 0.9328, "step": 524 }, { "epoch": 0.9067357512953368, "grad_norm": 1.470536948714905, "learning_rate": 7.840433687384198e-05, "loss": 0.9323, "step": 525 }, { "epoch": 0.9084628670120898, "grad_norm": 1.5309215267793024, "learning_rate": 7.8390819855096e-05, "loss": 0.9464, "step": 526 }, { "epoch": 0.9101899827288429, "grad_norm": 1.1240142119752967, "learning_rate": 7.837724700069388e-05, "loss": 0.933, "step": 527 }, { "epoch": 0.9119170984455959, "grad_norm": 1.296951199197395, "learning_rate": 7.836361833037595e-05, "loss": 0.926, "step": 528 }, { "epoch": 0.9136442141623489, "grad_norm": 1.276078595152236, "learning_rate": 7.834993386396378e-05, "loss": 0.9474, "step": 529 }, { "epoch": 0.9153713298791019, "grad_norm": 1.0993352035253852, "learning_rate": 7.833619362136008e-05, "loss": 0.9514, "step": 530 }, { "epoch": 0.917098445595855, "grad_norm": 1.364003127739658, "learning_rate": 7.832239762254867e-05, "loss": 0.9428, "step": 531 }, { "epoch": 0.918825561312608, "grad_norm": 2.236650957379953, "learning_rate": 7.830854588759446e-05, "loss": 0.928, "step": 532 }, { "epoch": 0.9205526770293609, "grad_norm": 1.163348006493071, "learning_rate": 7.829463843664343e-05, "loss": 0.9592, "step": 533 }, { "epoch": 0.9222797927461139, "grad_norm": 1.75782145554767, "learning_rate": 7.828067528992262e-05, "loss": 0.9539, "step": 534 }, { "epoch": 0.924006908462867, "grad_norm": 1.6377729681018331, "learning_rate": 7.826665646774002e-05, "loss": 0.9473, "step": 535 }, { "epoch": 0.92573402417962, "grad_norm": 1.51367430588839, "learning_rate": 7.825258199048464e-05, "loss": 0.9441, "step": 536 }, { "epoch": 0.927461139896373, "grad_norm": 1.8450737803603978, "learning_rate": 7.82384518786264e-05, "loss": 0.9387, "step": 537 }, { "epoch": 0.9291882556131261, "grad_norm": 1.808628318280589, "learning_rate": 7.822426615271616e-05, "loss": 0.9511, "step": 538 }, { "epoch": 0.9309153713298791, "grad_norm": 1.3310202660667732, "learning_rate": 7.821002483338566e-05, "loss": 0.9339, "step": 539 }, { "epoch": 0.9326424870466321, "grad_norm": 1.8170730543171505, "learning_rate": 7.81957279413475e-05, "loss": 0.9443, "step": 540 }, { "epoch": 0.9343696027633851, "grad_norm": 1.1527235807989746, "learning_rate": 7.818137549739509e-05, "loss": 0.9223, "step": 541 }, { "epoch": 0.9360967184801382, "grad_norm": 2.1267296577006096, "learning_rate": 7.816696752240265e-05, "loss": 0.9456, "step": 542 }, { "epoch": 0.9378238341968912, "grad_norm": 1.6316329089652346, "learning_rate": 7.815250403732512e-05, "loss": 0.9315, "step": 543 }, { "epoch": 0.9395509499136442, "grad_norm": 1.8521710041746549, "learning_rate": 7.813798506319825e-05, "loss": 0.9478, "step": 544 }, { "epoch": 0.9412780656303973, "grad_norm": 1.6131738877605657, "learning_rate": 7.812341062113845e-05, "loss": 0.9383, "step": 545 }, { "epoch": 0.9430051813471503, "grad_norm": 1.5661460290530884, "learning_rate": 7.81087807323428e-05, "loss": 0.9025, "step": 546 }, { "epoch": 0.9447322970639033, "grad_norm": 1.424702144977362, "learning_rate": 7.809409541808903e-05, "loss": 0.9312, "step": 547 }, { "epoch": 0.9464594127806563, "grad_norm": 1.7174764563580742, "learning_rate": 7.807935469973547e-05, "loss": 0.936, "step": 548 }, { "epoch": 0.9481865284974094, "grad_norm": 1.5083641918388944, "learning_rate": 7.806455859872105e-05, "loss": 0.9599, "step": 549 }, { "epoch": 0.9499136442141624, "grad_norm": 1.4619943114858933, "learning_rate": 7.804970713656523e-05, "loss": 0.9354, "step": 550 }, { "epoch": 0.9516407599309153, "grad_norm": 1.4894454118647082, "learning_rate": 7.8034800334868e-05, "loss": 0.9258, "step": 551 }, { "epoch": 0.9533678756476683, "grad_norm": 2.1401410552402322, "learning_rate": 7.801983821530984e-05, "loss": 0.9696, "step": 552 }, { "epoch": 0.9550949913644214, "grad_norm": 1.242223688422029, "learning_rate": 7.800482079965166e-05, "loss": 0.9475, "step": 553 }, { "epoch": 0.9568221070811744, "grad_norm": 0.9385259497252199, "learning_rate": 7.79897481097348e-05, "loss": 0.9404, "step": 554 }, { "epoch": 0.9585492227979274, "grad_norm": 1.3198328094274132, "learning_rate": 7.797462016748103e-05, "loss": 0.9455, "step": 555 }, { "epoch": 0.9602763385146805, "grad_norm": 1.9973611356228822, "learning_rate": 7.795943699489244e-05, "loss": 0.939, "step": 556 }, { "epoch": 0.9620034542314335, "grad_norm": 1.0046449602310108, "learning_rate": 7.794419861405143e-05, "loss": 0.9345, "step": 557 }, { "epoch": 0.9637305699481865, "grad_norm": 3.417678711965351, "learning_rate": 7.792890504712073e-05, "loss": 0.965, "step": 558 }, { "epoch": 0.9654576856649395, "grad_norm": 2.493567049591001, "learning_rate": 7.791355631634334e-05, "loss": 0.9572, "step": 559 }, { "epoch": 0.9671848013816926, "grad_norm": 2.985144505194277, "learning_rate": 7.789815244404246e-05, "loss": 0.9351, "step": 560 }, { "epoch": 0.9689119170984456, "grad_norm": 2.8160128773374176, "learning_rate": 7.788269345262151e-05, "loss": 0.9333, "step": 561 }, { "epoch": 0.9706390328151986, "grad_norm": 2.3500651206665606, "learning_rate": 7.786717936456405e-05, "loss": 0.9455, "step": 562 }, { "epoch": 0.9723661485319517, "grad_norm": 2.4369130097154024, "learning_rate": 7.78516102024338e-05, "loss": 0.9304, "step": 563 }, { "epoch": 0.9740932642487047, "grad_norm": 1.7652015186578967, "learning_rate": 7.783598598887456e-05, "loss": 0.9379, "step": 564 }, { "epoch": 0.9758203799654577, "grad_norm": 2.586525606501815, "learning_rate": 7.782030674661022e-05, "loss": 0.9672, "step": 565 }, { "epoch": 0.9775474956822107, "grad_norm": 1.8415154491201504, "learning_rate": 7.780457249844469e-05, "loss": 0.9476, "step": 566 }, { "epoch": 0.9792746113989638, "grad_norm": 2.1889220878051256, "learning_rate": 7.778878326726186e-05, "loss": 0.9241, "step": 567 }, { "epoch": 0.9810017271157168, "grad_norm": 2.0631280221370765, "learning_rate": 7.777293907602563e-05, "loss": 0.9298, "step": 568 }, { "epoch": 0.9827288428324698, "grad_norm": 1.2176586419597468, "learning_rate": 7.775703994777979e-05, "loss": 0.9496, "step": 569 }, { "epoch": 0.9844559585492227, "grad_norm": 3.0917461465915665, "learning_rate": 7.774108590564806e-05, "loss": 0.9504, "step": 570 }, { "epoch": 0.9861830742659758, "grad_norm": 2.1515764752394713, "learning_rate": 7.772507697283404e-05, "loss": 0.9538, "step": 571 }, { "epoch": 0.9879101899827288, "grad_norm": 2.6545331822952285, "learning_rate": 7.770901317262111e-05, "loss": 0.9418, "step": 572 }, { "epoch": 0.9896373056994818, "grad_norm": 1.589867152246178, "learning_rate": 7.769289452837249e-05, "loss": 0.9084, "step": 573 }, { "epoch": 0.9913644214162349, "grad_norm": 2.717889267929853, "learning_rate": 7.767672106353118e-05, "loss": 0.9258, "step": 574 }, { "epoch": 0.9930915371329879, "grad_norm": 1.6849003905655984, "learning_rate": 7.766049280161985e-05, "loss": 0.9498, "step": 575 }, { "epoch": 0.9948186528497409, "grad_norm": 2.0795000824608154, "learning_rate": 7.764420976624093e-05, "loss": 0.9392, "step": 576 }, { "epoch": 0.9965457685664939, "grad_norm": 2.017351056455212, "learning_rate": 7.762787198107648e-05, "loss": 0.9604, "step": 577 }, { "epoch": 0.998272884283247, "grad_norm": 1.1229534839282005, "learning_rate": 7.76114794698882e-05, "loss": 0.9425, "step": 578 }, { "epoch": 1.0, "grad_norm": 1.7680616667570948, "learning_rate": 7.759503225651737e-05, "loss": 0.9376, "step": 579 }, { "epoch": 1.001727115716753, "grad_norm": 1.9651348353940268, "learning_rate": 7.757853036488483e-05, "loss": 0.9074, "step": 580 }, { "epoch": 1.003454231433506, "grad_norm": 0.8194464258641793, "learning_rate": 7.756197381899097e-05, "loss": 0.9051, "step": 581 }, { "epoch": 1.005181347150259, "grad_norm": 1.6923623656504332, "learning_rate": 7.754536264291565e-05, "loss": 0.9302, "step": 582 }, { "epoch": 1.0069084628670122, "grad_norm": 3.140135542373117, "learning_rate": 7.752869686081816e-05, "loss": 0.9124, "step": 583 }, { "epoch": 1.008635578583765, "grad_norm": 2.1578810197529497, "learning_rate": 7.751197649693728e-05, "loss": 0.9174, "step": 584 }, { "epoch": 1.0103626943005182, "grad_norm": 30.600683114912147, "learning_rate": 7.749520157559109e-05, "loss": 0.9431, "step": 585 }, { "epoch": 1.012089810017271, "grad_norm": 5.037848733319132, "learning_rate": 7.747837212117706e-05, "loss": 0.94, "step": 586 }, { "epoch": 1.0138169257340242, "grad_norm": 3.0467737010504434, "learning_rate": 7.746148815817198e-05, "loss": 0.9386, "step": 587 }, { "epoch": 1.0155440414507773, "grad_norm": 808.9861342582481, "learning_rate": 7.744454971113189e-05, "loss": 3.235, "step": 588 }, { "epoch": 1.0172711571675301, "grad_norm": 7.3860173966219325, "learning_rate": 7.742755680469213e-05, "loss": 0.9922, "step": 589 }, { "epoch": 1.0189982728842832, "grad_norm": 6.92034058443626, "learning_rate": 7.741050946356716e-05, "loss": 0.9949, "step": 590 }, { "epoch": 1.0207253886010363, "grad_norm": 27.785016854382793, "learning_rate": 7.739340771255067e-05, "loss": 1.0338, "step": 591 }, { "epoch": 1.0224525043177892, "grad_norm": 2.8397039436850773, "learning_rate": 7.737625157651546e-05, "loss": 0.9637, "step": 592 }, { "epoch": 1.0241796200345423, "grad_norm": 4.31321230573454, "learning_rate": 7.735904108041347e-05, "loss": 0.9772, "step": 593 }, { "epoch": 1.0259067357512954, "grad_norm": 14.749102625289659, "learning_rate": 7.734177624927562e-05, "loss": 1.731, "step": 594 }, { "epoch": 1.0276338514680483, "grad_norm": 57.25897588764931, "learning_rate": 7.732445710821194e-05, "loss": 1.3387, "step": 595 }, { "epoch": 1.0293609671848014, "grad_norm": 15.854994018879246, "learning_rate": 7.730708368241138e-05, "loss": 1.1781, "step": 596 }, { "epoch": 1.0310880829015545, "grad_norm": 5.595552064240207, "learning_rate": 7.728965599714189e-05, "loss": 1.1462, "step": 597 }, { "epoch": 1.0328151986183074, "grad_norm": 10.222929063675506, "learning_rate": 7.727217407775032e-05, "loss": 1.0575, "step": 598 }, { "epoch": 1.0345423143350605, "grad_norm": 5.713531429605335, "learning_rate": 7.725463794966236e-05, "loss": 1.0302, "step": 599 }, { "epoch": 1.0362694300518134, "grad_norm": 1.928728043818377, "learning_rate": 7.723704763838264e-05, "loss": 1.0271, "step": 600 }, { "epoch": 1.0379965457685665, "grad_norm": 1.8553596718547616, "learning_rate": 7.721940316949447e-05, "loss": 0.9751, "step": 601 }, { "epoch": 1.0397236614853196, "grad_norm": 1.4856951797265059, "learning_rate": 7.720170456866003e-05, "loss": 0.9722, "step": 602 }, { "epoch": 1.0414507772020725, "grad_norm": 2.0526901063893987, "learning_rate": 7.718395186162016e-05, "loss": 0.9813, "step": 603 }, { "epoch": 1.0431778929188256, "grad_norm": 1.6240214186470012, "learning_rate": 7.716614507419442e-05, "loss": 0.9789, "step": 604 }, { "epoch": 1.0449050086355787, "grad_norm": 3.614882093919851, "learning_rate": 7.714828423228105e-05, "loss": 0.972, "step": 605 }, { "epoch": 1.0466321243523315, "grad_norm": 2.177429138241506, "learning_rate": 7.713036936185684e-05, "loss": 0.9716, "step": 606 }, { "epoch": 1.0483592400690847, "grad_norm": 2.702786426321352, "learning_rate": 7.711240048897724e-05, "loss": 0.9519, "step": 607 }, { "epoch": 1.0500863557858378, "grad_norm": 1.5267396113916192, "learning_rate": 7.709437763977618e-05, "loss": 0.9737, "step": 608 }, { "epoch": 1.0518134715025906, "grad_norm": 1.5154474261654216, "learning_rate": 7.707630084046612e-05, "loss": 0.9404, "step": 609 }, { "epoch": 1.0535405872193437, "grad_norm": 2.528436049345461, "learning_rate": 7.705817011733799e-05, "loss": 0.9502, "step": 610 }, { "epoch": 1.0552677029360966, "grad_norm": 1.1554987951813012, "learning_rate": 7.703998549676112e-05, "loss": 0.9629, "step": 611 }, { "epoch": 1.0569948186528497, "grad_norm": 2.736473207797906, "learning_rate": 7.702174700518324e-05, "loss": 0.9655, "step": 612 }, { "epoch": 1.0587219343696028, "grad_norm": 1.9160708511179625, "learning_rate": 7.700345466913044e-05, "loss": 0.9559, "step": 613 }, { "epoch": 1.0604490500863557, "grad_norm": 2.1133764828882438, "learning_rate": 7.698510851520712e-05, "loss": 0.9663, "step": 614 }, { "epoch": 1.0621761658031088, "grad_norm": 1.8959402418985136, "learning_rate": 7.696670857009596e-05, "loss": 0.9366, "step": 615 }, { "epoch": 1.063903281519862, "grad_norm": 1.340591322423617, "learning_rate": 7.694825486055785e-05, "loss": 0.9385, "step": 616 }, { "epoch": 1.0656303972366148, "grad_norm": 1.4506280023415594, "learning_rate": 7.69297474134319e-05, "loss": 0.9547, "step": 617 }, { "epoch": 1.067357512953368, "grad_norm": 1.2301648936836027, "learning_rate": 7.691118625563534e-05, "loss": 0.9569, "step": 618 }, { "epoch": 1.069084628670121, "grad_norm": 1.8303100541669226, "learning_rate": 7.689257141416354e-05, "loss": 0.9333, "step": 619 }, { "epoch": 1.0708117443868739, "grad_norm": 1.7723317266458187, "learning_rate": 7.687390291608999e-05, "loss": 0.9272, "step": 620 }, { "epoch": 1.072538860103627, "grad_norm": 1.331659465926775, "learning_rate": 7.685518078856615e-05, "loss": 0.9225, "step": 621 }, { "epoch": 1.07426597582038, "grad_norm": 1.9840574988937736, "learning_rate": 7.68364050588215e-05, "loss": 0.9398, "step": 622 }, { "epoch": 1.075993091537133, "grad_norm": 2.032264986199088, "learning_rate": 7.681757575416348e-05, "loss": 0.9204, "step": 623 }, { "epoch": 1.077720207253886, "grad_norm": 0.9675479113992906, "learning_rate": 7.679869290197747e-05, "loss": 0.9457, "step": 624 }, { "epoch": 1.079447322970639, "grad_norm": 2.1696882571633056, "learning_rate": 7.677975652972673e-05, "loss": 0.9434, "step": 625 }, { "epoch": 1.081174438687392, "grad_norm": 1.5622724905196237, "learning_rate": 7.67607666649523e-05, "loss": 0.9288, "step": 626 }, { "epoch": 1.0829015544041452, "grad_norm": 1.2915979247299394, "learning_rate": 7.67417233352731e-05, "loss": 0.9433, "step": 627 }, { "epoch": 1.084628670120898, "grad_norm": 2.3144228526780233, "learning_rate": 7.672262656838575e-05, "loss": 0.9478, "step": 628 }, { "epoch": 1.0863557858376511, "grad_norm": 1.6653776212994544, "learning_rate": 7.670347639206462e-05, "loss": 0.92, "step": 629 }, { "epoch": 1.0880829015544042, "grad_norm": 1.933912669840748, "learning_rate": 7.668427283416176e-05, "loss": 0.9205, "step": 630 }, { "epoch": 1.0898100172711571, "grad_norm": 1.9088430712945446, "learning_rate": 7.666501592260682e-05, "loss": 0.9471, "step": 631 }, { "epoch": 1.0915371329879102, "grad_norm": 1.4046023877934681, "learning_rate": 7.664570568540709e-05, "loss": 0.913, "step": 632 }, { "epoch": 1.093264248704663, "grad_norm": 1.356230559690035, "learning_rate": 7.662634215064742e-05, "loss": 0.9234, "step": 633 }, { "epoch": 1.0949913644214162, "grad_norm": 1.717393852087629, "learning_rate": 7.660692534649015e-05, "loss": 0.9284, "step": 634 }, { "epoch": 1.0967184801381693, "grad_norm": 1.5874365903165724, "learning_rate": 7.65874553011751e-05, "loss": 0.933, "step": 635 }, { "epoch": 1.0984455958549222, "grad_norm": 0.9524415883796115, "learning_rate": 7.656793204301952e-05, "loss": 0.9313, "step": 636 }, { "epoch": 1.1001727115716753, "grad_norm": 1.594195816408818, "learning_rate": 7.654835560041808e-05, "loss": 0.9281, "step": 637 }, { "epoch": 1.1018998272884284, "grad_norm": 1.3185395320230098, "learning_rate": 7.652872600184276e-05, "loss": 0.9359, "step": 638 }, { "epoch": 1.1036269430051813, "grad_norm": 1.7893509261233955, "learning_rate": 7.650904327584289e-05, "loss": 0.9197, "step": 639 }, { "epoch": 1.1053540587219344, "grad_norm": 1.3239760779090408, "learning_rate": 7.648930745104504e-05, "loss": 0.9197, "step": 640 }, { "epoch": 1.1070811744386875, "grad_norm": 1.2661379653554168, "learning_rate": 7.646951855615306e-05, "loss": 0.9328, "step": 641 }, { "epoch": 1.1088082901554404, "grad_norm": 1.1655519507134213, "learning_rate": 7.644967661994787e-05, "loss": 0.9414, "step": 642 }, { "epoch": 1.1105354058721935, "grad_norm": 1.4920110807997902, "learning_rate": 7.642978167128764e-05, "loss": 0.9178, "step": 643 }, { "epoch": 1.1122625215889466, "grad_norm": 1.9093545954188524, "learning_rate": 7.640983373910763e-05, "loss": 0.9203, "step": 644 }, { "epoch": 1.1139896373056994, "grad_norm": 1.007960727000885, "learning_rate": 7.638983285242012e-05, "loss": 0.9207, "step": 645 }, { "epoch": 1.1157167530224525, "grad_norm": 1.545662232036679, "learning_rate": 7.636977904031441e-05, "loss": 0.9249, "step": 646 }, { "epoch": 1.1174438687392054, "grad_norm": 0.8936055806401864, "learning_rate": 7.634967233195682e-05, "loss": 0.9169, "step": 647 }, { "epoch": 1.1191709844559585, "grad_norm": 1.6430163234643238, "learning_rate": 7.632951275659054e-05, "loss": 0.9149, "step": 648 }, { "epoch": 1.1208981001727116, "grad_norm": 1.5359017237620431, "learning_rate": 7.63093003435357e-05, "loss": 0.9053, "step": 649 }, { "epoch": 1.1226252158894645, "grad_norm": 1.7293853433417323, "learning_rate": 7.628903512218927e-05, "loss": 0.9194, "step": 650 }, { "epoch": 1.1243523316062176, "grad_norm": 1.2468897349619261, "learning_rate": 7.626871712202498e-05, "loss": 0.918, "step": 651 }, { "epoch": 1.1260794473229707, "grad_norm": 1.3864796371247545, "learning_rate": 7.624834637259337e-05, "loss": 0.9463, "step": 652 }, { "epoch": 1.1278065630397236, "grad_norm": 1.083937657058968, "learning_rate": 7.62279229035217e-05, "loss": 0.916, "step": 653 }, { "epoch": 1.1295336787564767, "grad_norm": 2.197688904003741, "learning_rate": 7.620744674451387e-05, "loss": 0.9199, "step": 654 }, { "epoch": 1.1312607944732298, "grad_norm": 1.392390951412386, "learning_rate": 7.618691792535044e-05, "loss": 0.939, "step": 655 }, { "epoch": 1.1329879101899827, "grad_norm": 1.1926368683239852, "learning_rate": 7.616633647588855e-05, "loss": 0.9129, "step": 656 }, { "epoch": 1.1347150259067358, "grad_norm": 1.478245450968466, "learning_rate": 7.614570242606187e-05, "loss": 0.9131, "step": 657 }, { "epoch": 1.1364421416234887, "grad_norm": 2.286230538985635, "learning_rate": 7.61250158058806e-05, "loss": 0.9325, "step": 658 }, { "epoch": 1.1381692573402418, "grad_norm": 1.1841784787700667, "learning_rate": 7.610427664543139e-05, "loss": 0.9317, "step": 659 }, { "epoch": 1.1398963730569949, "grad_norm": 2.5861527061450076, "learning_rate": 7.60834849748773e-05, "loss": 0.9127, "step": 660 }, { "epoch": 1.1416234887737478, "grad_norm": 1.8958079731299562, "learning_rate": 7.606264082445775e-05, "loss": 0.9362, "step": 661 }, { "epoch": 1.1433506044905009, "grad_norm": 2.0549355802404814, "learning_rate": 7.604174422448853e-05, "loss": 0.9224, "step": 662 }, { "epoch": 1.145077720207254, "grad_norm": 1.9824981886150115, "learning_rate": 7.602079520536164e-05, "loss": 0.938, "step": 663 }, { "epoch": 1.1468048359240068, "grad_norm": 1.1678607099173095, "learning_rate": 7.599979379754537e-05, "loss": 0.9195, "step": 664 }, { "epoch": 1.14853195164076, "grad_norm": 2.472739591064189, "learning_rate": 7.597874003158421e-05, "loss": 0.9391, "step": 665 }, { "epoch": 1.150259067357513, "grad_norm": 1.7954665349561711, "learning_rate": 7.595763393809877e-05, "loss": 0.933, "step": 666 }, { "epoch": 1.151986183074266, "grad_norm": 1.9360550797767306, "learning_rate": 7.593647554778579e-05, "loss": 0.9357, "step": 667 }, { "epoch": 1.153713298791019, "grad_norm": 2.0119401177604153, "learning_rate": 7.591526489141804e-05, "loss": 0.9297, "step": 668 }, { "epoch": 1.1554404145077721, "grad_norm": 1.650299629894317, "learning_rate": 7.589400199984434e-05, "loss": 0.9229, "step": 669 }, { "epoch": 1.157167530224525, "grad_norm": 1.627288805872058, "learning_rate": 7.587268690398948e-05, "loss": 0.9062, "step": 670 }, { "epoch": 1.1588946459412781, "grad_norm": 2.406544788983173, "learning_rate": 7.585131963485414e-05, "loss": 0.912, "step": 671 }, { "epoch": 1.160621761658031, "grad_norm": 1.3986335393176006, "learning_rate": 7.58299002235149e-05, "loss": 0.9034, "step": 672 }, { "epoch": 1.162348877374784, "grad_norm": 3.669794521632918, "learning_rate": 7.580842870112416e-05, "loss": 0.932, "step": 673 }, { "epoch": 1.1640759930915372, "grad_norm": 3.082080937763114, "learning_rate": 7.57869050989102e-05, "loss": 0.915, "step": 674 }, { "epoch": 1.16580310880829, "grad_norm": 2.7818043954889418, "learning_rate": 7.576532944817691e-05, "loss": 0.9413, "step": 675 }, { "epoch": 1.1675302245250432, "grad_norm": 2.4382629844166996, "learning_rate": 7.574370178030396e-05, "loss": 0.9285, "step": 676 }, { "epoch": 1.1692573402417963, "grad_norm": 2.7027918507629054, "learning_rate": 7.572202212674667e-05, "loss": 0.9361, "step": 677 }, { "epoch": 1.1709844559585492, "grad_norm": 2.4748064278638084, "learning_rate": 7.570029051903594e-05, "loss": 0.9357, "step": 678 }, { "epoch": 1.1727115716753023, "grad_norm": 2.3313030232984513, "learning_rate": 7.567850698877823e-05, "loss": 0.9255, "step": 679 }, { "epoch": 1.1744386873920551, "grad_norm": 1.6427836218671459, "learning_rate": 7.56566715676556e-05, "loss": 0.9308, "step": 680 }, { "epoch": 1.1761658031088082, "grad_norm": 3.29304809897759, "learning_rate": 7.563478428742542e-05, "loss": 0.9166, "step": 681 }, { "epoch": 1.1778929188255614, "grad_norm": 3.054086753994092, "learning_rate": 7.561284517992066e-05, "loss": 0.9113, "step": 682 }, { "epoch": 1.1796200345423142, "grad_norm": 1.7564482959444405, "learning_rate": 7.559085427704953e-05, "loss": 0.9207, "step": 683 }, { "epoch": 1.1813471502590673, "grad_norm": 1.5479227542038265, "learning_rate": 7.556881161079567e-05, "loss": 0.901, "step": 684 }, { "epoch": 1.1830742659758204, "grad_norm": 2.8335006923748662, "learning_rate": 7.554671721321793e-05, "loss": 0.8992, "step": 685 }, { "epoch": 1.1848013816925733, "grad_norm": 2.1531603915251742, "learning_rate": 7.552457111645043e-05, "loss": 0.9511, "step": 686 }, { "epoch": 1.1865284974093264, "grad_norm": 3.067880871215394, "learning_rate": 7.550237335270251e-05, "loss": 0.9693, "step": 687 }, { "epoch": 1.1882556131260795, "grad_norm": 2.9550412327971562, "learning_rate": 7.548012395425858e-05, "loss": 0.9295, "step": 688 }, { "epoch": 1.1899827288428324, "grad_norm": 1.6171127054728218, "learning_rate": 7.545782295347824e-05, "loss": 0.927, "step": 689 }, { "epoch": 1.1917098445595855, "grad_norm": 1.6166610025597379, "learning_rate": 7.543547038279607e-05, "loss": 0.9249, "step": 690 }, { "epoch": 1.1934369602763386, "grad_norm": 2.5165909989764352, "learning_rate": 7.541306627472165e-05, "loss": 0.9235, "step": 691 }, { "epoch": 1.1951640759930915, "grad_norm": 1.8018040126180381, "learning_rate": 7.539061066183958e-05, "loss": 0.8897, "step": 692 }, { "epoch": 1.1968911917098446, "grad_norm": 2.839499602678277, "learning_rate": 7.536810357680933e-05, "loss": 0.937, "step": 693 }, { "epoch": 1.1986183074265977, "grad_norm": 2.5961811699230783, "learning_rate": 7.534554505236524e-05, "loss": 0.9189, "step": 694 }, { "epoch": 1.2003454231433506, "grad_norm": 1.9543491491260547, "learning_rate": 7.532293512131642e-05, "loss": 0.9087, "step": 695 }, { "epoch": 1.2020725388601037, "grad_norm": 1.6383848907972782, "learning_rate": 7.53002738165468e-05, "loss": 0.915, "step": 696 }, { "epoch": 1.2037996545768566, "grad_norm": 2.6565505872777138, "learning_rate": 7.527756117101502e-05, "loss": 0.93, "step": 697 }, { "epoch": 1.2055267702936097, "grad_norm": 2.1924565921297017, "learning_rate": 7.525479721775437e-05, "loss": 0.9137, "step": 698 }, { "epoch": 1.2072538860103628, "grad_norm": 2.3006494907122392, "learning_rate": 7.523198198987277e-05, "loss": 0.9044, "step": 699 }, { "epoch": 1.2089810017271156, "grad_norm": 2.025039090673993, "learning_rate": 7.520911552055272e-05, "loss": 0.926, "step": 700 }, { "epoch": 1.2107081174438687, "grad_norm": 2.1110996231190753, "learning_rate": 7.518619784305124e-05, "loss": 0.9042, "step": 701 }, { "epoch": 1.2124352331606219, "grad_norm": 1.9791072239095817, "learning_rate": 7.516322899069984e-05, "loss": 0.9194, "step": 702 }, { "epoch": 1.2141623488773747, "grad_norm": 2.394425724095647, "learning_rate": 7.514020899690442e-05, "loss": 0.9387, "step": 703 }, { "epoch": 1.2158894645941278, "grad_norm": 2.0662787126510263, "learning_rate": 7.511713789514531e-05, "loss": 0.9265, "step": 704 }, { "epoch": 1.2176165803108807, "grad_norm": 2.0701375645783138, "learning_rate": 7.509401571897716e-05, "loss": 0.9043, "step": 705 }, { "epoch": 1.2193436960276338, "grad_norm": 1.8111252904983766, "learning_rate": 7.507084250202886e-05, "loss": 0.9346, "step": 706 }, { "epoch": 1.221070811744387, "grad_norm": 2.347685126782399, "learning_rate": 7.504761827800358e-05, "loss": 0.9158, "step": 707 }, { "epoch": 1.2227979274611398, "grad_norm": 2.039053929428014, "learning_rate": 7.502434308067866e-05, "loss": 0.9203, "step": 708 }, { "epoch": 1.224525043177893, "grad_norm": 2.077097112769339, "learning_rate": 7.500101694390558e-05, "loss": 0.9318, "step": 709 }, { "epoch": 1.226252158894646, "grad_norm": 1.7669563523500704, "learning_rate": 7.497763990160991e-05, "loss": 0.9359, "step": 710 }, { "epoch": 1.2279792746113989, "grad_norm": 2.3134978988681936, "learning_rate": 7.495421198779123e-05, "loss": 0.9185, "step": 711 }, { "epoch": 1.229706390328152, "grad_norm": 2.0433964733530314, "learning_rate": 7.493073323652314e-05, "loss": 0.93, "step": 712 }, { "epoch": 1.231433506044905, "grad_norm": 2.0151322104669496, "learning_rate": 7.490720368195316e-05, "loss": 0.9008, "step": 713 }, { "epoch": 1.233160621761658, "grad_norm": 1.7670031223970308, "learning_rate": 7.48836233583027e-05, "loss": 0.9451, "step": 714 }, { "epoch": 1.234887737478411, "grad_norm": 2.1431924173598906, "learning_rate": 7.485999229986701e-05, "loss": 0.9068, "step": 715 }, { "epoch": 1.2366148531951642, "grad_norm": 1.8322119571699433, "learning_rate": 7.483631054101516e-05, "loss": 0.9162, "step": 716 }, { "epoch": 1.238341968911917, "grad_norm": 2.1884817051264376, "learning_rate": 7.481257811618989e-05, "loss": 0.9189, "step": 717 }, { "epoch": 1.2400690846286702, "grad_norm": 1.672507624648457, "learning_rate": 7.47887950599077e-05, "loss": 0.8932, "step": 718 }, { "epoch": 1.2417962003454233, "grad_norm": 2.1514005445932476, "learning_rate": 7.476496140675865e-05, "loss": 0.9014, "step": 719 }, { "epoch": 1.2435233160621761, "grad_norm": 1.8023346734874324, "learning_rate": 7.47410771914065e-05, "loss": 0.9238, "step": 720 }, { "epoch": 1.2452504317789292, "grad_norm": 2.13973234848276, "learning_rate": 7.471714244858843e-05, "loss": 0.9152, "step": 721 }, { "epoch": 1.2469775474956821, "grad_norm": 1.6697995874847198, "learning_rate": 7.469315721311519e-05, "loss": 0.9055, "step": 722 }, { "epoch": 1.2487046632124352, "grad_norm": 2.0181058114417625, "learning_rate": 7.466912151987092e-05, "loss": 0.9286, "step": 723 }, { "epoch": 1.2504317789291883, "grad_norm": 1.724771696149722, "learning_rate": 7.464503540381316e-05, "loss": 0.9069, "step": 724 }, { "epoch": 1.2521588946459412, "grad_norm": 2.0699253500274133, "learning_rate": 7.462089889997281e-05, "loss": 0.9056, "step": 725 }, { "epoch": 1.2538860103626943, "grad_norm": 1.5938564237814428, "learning_rate": 7.4596712043454e-05, "loss": 0.9239, "step": 726 }, { "epoch": 1.2556131260794472, "grad_norm": 1.8509236311760713, "learning_rate": 7.457247486943414e-05, "loss": 0.9135, "step": 727 }, { "epoch": 1.2573402417962003, "grad_norm": 1.574110638661317, "learning_rate": 7.454818741316382e-05, "loss": 0.9307, "step": 728 }, { "epoch": 1.2590673575129534, "grad_norm": 2.1238595319124416, "learning_rate": 7.452384970996674e-05, "loss": 0.929, "step": 729 }, { "epoch": 1.2607944732297063, "grad_norm": 1.5224641649713784, "learning_rate": 7.449946179523965e-05, "loss": 0.913, "step": 730 }, { "epoch": 1.2625215889464594, "grad_norm": 1.845593940263051, "learning_rate": 7.447502370445242e-05, "loss": 0.9343, "step": 731 }, { "epoch": 1.2642487046632125, "grad_norm": 1.4748963633588534, "learning_rate": 7.445053547314779e-05, "loss": 0.9427, "step": 732 }, { "epoch": 1.2659758203799654, "grad_norm": 1.9726680993747507, "learning_rate": 7.44259971369415e-05, "loss": 0.914, "step": 733 }, { "epoch": 1.2677029360967185, "grad_norm": 1.5082820572759812, "learning_rate": 7.440140873152216e-05, "loss": 0.9356, "step": 734 }, { "epoch": 1.2694300518134716, "grad_norm": 1.6137471660138305, "learning_rate": 7.437677029265112e-05, "loss": 0.9122, "step": 735 }, { "epoch": 1.2711571675302245, "grad_norm": 1.3613064030029536, "learning_rate": 7.435208185616259e-05, "loss": 0.9292, "step": 736 }, { "epoch": 1.2728842832469776, "grad_norm": 1.7326910497021542, "learning_rate": 7.432734345796347e-05, "loss": 0.9409, "step": 737 }, { "epoch": 1.2746113989637307, "grad_norm": 1.3224361745446245, "learning_rate": 7.430255513403329e-05, "loss": 0.9292, "step": 738 }, { "epoch": 1.2763385146804835, "grad_norm": 1.5454837432332384, "learning_rate": 7.427771692042424e-05, "loss": 0.9275, "step": 739 }, { "epoch": 1.2780656303972366, "grad_norm": 1.3950135814439004, "learning_rate": 7.425282885326106e-05, "loss": 0.9453, "step": 740 }, { "epoch": 1.2797927461139897, "grad_norm": 1.2868827182525742, "learning_rate": 7.422789096874094e-05, "loss": 0.9099, "step": 741 }, { "epoch": 1.2815198618307426, "grad_norm": 1.123903764163658, "learning_rate": 7.420290330313363e-05, "loss": 0.9, "step": 742 }, { "epoch": 1.2832469775474957, "grad_norm": 1.177355503606385, "learning_rate": 7.417786589278117e-05, "loss": 0.9325, "step": 743 }, { "epoch": 1.2849740932642488, "grad_norm": 1.1166134836435304, "learning_rate": 7.415277877409804e-05, "loss": 0.912, "step": 744 }, { "epoch": 1.2867012089810017, "grad_norm": 1.0469543800585828, "learning_rate": 7.412764198357095e-05, "loss": 0.9111, "step": 745 }, { "epoch": 1.2884283246977548, "grad_norm": 1.2274759161220934, "learning_rate": 7.410245555775889e-05, "loss": 0.9193, "step": 746 }, { "epoch": 1.2901554404145077, "grad_norm": 2.072157915023452, "learning_rate": 7.407721953329302e-05, "loss": 0.9021, "step": 747 }, { "epoch": 1.2918825561312608, "grad_norm": 0.9698431641064525, "learning_rate": 7.405193394687666e-05, "loss": 0.9396, "step": 748 }, { "epoch": 1.293609671848014, "grad_norm": 2.4072761182912767, "learning_rate": 7.40265988352852e-05, "loss": 0.9122, "step": 749 }, { "epoch": 1.2953367875647668, "grad_norm": 1.7017850293880692, "learning_rate": 7.400121423536605e-05, "loss": 0.9334, "step": 750 }, { "epoch": 1.2970639032815199, "grad_norm": 2.1306996324818064, "learning_rate": 7.397578018403858e-05, "loss": 0.9041, "step": 751 }, { "epoch": 1.2987910189982728, "grad_norm": 1.7329392067027696, "learning_rate": 7.395029671829415e-05, "loss": 0.9176, "step": 752 }, { "epoch": 1.3005181347150259, "grad_norm": 1.7024063862517287, "learning_rate": 7.392476387519592e-05, "loss": 0.9189, "step": 753 }, { "epoch": 1.302245250431779, "grad_norm": 1.6769465545700946, "learning_rate": 7.389918169187889e-05, "loss": 0.909, "step": 754 }, { "epoch": 1.3039723661485318, "grad_norm": 1.7296601687765405, "learning_rate": 7.387355020554982e-05, "loss": 0.8979, "step": 755 }, { "epoch": 1.305699481865285, "grad_norm": 1.3820600864722183, "learning_rate": 7.384786945348718e-05, "loss": 0.9275, "step": 756 }, { "epoch": 1.307426597582038, "grad_norm": 2.4146269512849092, "learning_rate": 7.38221394730411e-05, "loss": 0.9221, "step": 757 }, { "epoch": 1.309153713298791, "grad_norm": 2.52066409146413, "learning_rate": 7.379636030163325e-05, "loss": 0.9358, "step": 758 }, { "epoch": 1.310880829015544, "grad_norm": 1.6631928918117176, "learning_rate": 7.377053197675694e-05, "loss": 0.9194, "step": 759 }, { "epoch": 1.3126079447322971, "grad_norm": 0.9763134284676742, "learning_rate": 7.374465453597688e-05, "loss": 0.9139, "step": 760 }, { "epoch": 1.31433506044905, "grad_norm": 1.789584287872814, "learning_rate": 7.371872801692928e-05, "loss": 0.9216, "step": 761 }, { "epoch": 1.3160621761658031, "grad_norm": 1.337908838334418, "learning_rate": 7.36927524573217e-05, "loss": 0.9355, "step": 762 }, { "epoch": 1.3177892918825562, "grad_norm": 1.9442084651994755, "learning_rate": 7.366672789493299e-05, "loss": 0.9109, "step": 763 }, { "epoch": 1.319516407599309, "grad_norm": 1.098786492283857, "learning_rate": 7.364065436761335e-05, "loss": 0.9237, "step": 764 }, { "epoch": 1.3212435233160622, "grad_norm": 2.151115560651471, "learning_rate": 7.361453191328415e-05, "loss": 0.9047, "step": 765 }, { "epoch": 1.3229706390328153, "grad_norm": 1.7797309781971755, "learning_rate": 7.358836056993788e-05, "loss": 0.9078, "step": 766 }, { "epoch": 1.3246977547495682, "grad_norm": 1.9728041222867387, "learning_rate": 7.356214037563821e-05, "loss": 0.9246, "step": 767 }, { "epoch": 1.3264248704663213, "grad_norm": 1.8849062816766096, "learning_rate": 7.35358713685198e-05, "loss": 0.9227, "step": 768 }, { "epoch": 1.3281519861830744, "grad_norm": 1.6508313763349869, "learning_rate": 7.350955358678834e-05, "loss": 0.9184, "step": 769 }, { "epoch": 1.3298791018998273, "grad_norm": 1.6569645062973057, "learning_rate": 7.348318706872045e-05, "loss": 0.9469, "step": 770 }, { "epoch": 1.3316062176165804, "grad_norm": 1.7538250896653371, "learning_rate": 7.345677185266361e-05, "loss": 0.9339, "step": 771 }, { "epoch": 1.3333333333333333, "grad_norm": 1.2429786154681337, "learning_rate": 7.343030797703616e-05, "loss": 0.941, "step": 772 }, { "epoch": 1.3350604490500864, "grad_norm": 2.102689986090864, "learning_rate": 7.340379548032719e-05, "loss": 0.9231, "step": 773 }, { "epoch": 1.3367875647668392, "grad_norm": 1.8722490253240915, "learning_rate": 7.33772344010965e-05, "loss": 0.9225, "step": 774 }, { "epoch": 1.3385146804835923, "grad_norm": 1.3970109592915003, "learning_rate": 7.335062477797455e-05, "loss": 0.9465, "step": 775 }, { "epoch": 1.3402417962003454, "grad_norm": 1.0416587186298776, "learning_rate": 7.332396664966243e-05, "loss": 0.9198, "step": 776 }, { "epoch": 1.3419689119170983, "grad_norm": 1.9062225602995226, "learning_rate": 7.329726005493173e-05, "loss": 0.9359, "step": 777 }, { "epoch": 1.3436960276338514, "grad_norm": 1.7644112970049466, "learning_rate": 7.32705050326246e-05, "loss": 0.9186, "step": 778 }, { "epoch": 1.3454231433506045, "grad_norm": 1.4303481873039272, "learning_rate": 7.324370162165352e-05, "loss": 0.9211, "step": 779 }, { "epoch": 1.3471502590673574, "grad_norm": 1.7670966912336843, "learning_rate": 7.321684986100144e-05, "loss": 0.9256, "step": 780 }, { "epoch": 1.3488773747841105, "grad_norm": 1.5870311252752842, "learning_rate": 7.318994978972158e-05, "loss": 0.9211, "step": 781 }, { "epoch": 1.3506044905008636, "grad_norm": 0.8930747653803177, "learning_rate": 7.316300144693745e-05, "loss": 0.9284, "step": 782 }, { "epoch": 1.3523316062176165, "grad_norm": 1.7720605729750123, "learning_rate": 7.313600487184274e-05, "loss": 0.9143, "step": 783 }, { "epoch": 1.3540587219343696, "grad_norm": 1.0826893486929883, "learning_rate": 7.310896010370131e-05, "loss": 0.9107, "step": 784 }, { "epoch": 1.3557858376511227, "grad_norm": 2.1065842229633214, "learning_rate": 7.308186718184711e-05, "loss": 0.9207, "step": 785 }, { "epoch": 1.3575129533678756, "grad_norm": 2.2081019320123354, "learning_rate": 7.30547261456841e-05, "loss": 0.9259, "step": 786 }, { "epoch": 1.3592400690846287, "grad_norm": 0.8877463559183291, "learning_rate": 7.302753703468626e-05, "loss": 0.8987, "step": 787 }, { "epoch": 1.3609671848013818, "grad_norm": 2.0463623410411795, "learning_rate": 7.300029988839744e-05, "loss": 0.938, "step": 788 }, { "epoch": 1.3626943005181347, "grad_norm": 1.1249033899205376, "learning_rate": 7.29730147464314e-05, "loss": 0.9377, "step": 789 }, { "epoch": 1.3644214162348878, "grad_norm": 1.496005016605809, "learning_rate": 7.294568164847169e-05, "loss": 0.9314, "step": 790 }, { "epoch": 1.3661485319516409, "grad_norm": 1.392436199248161, "learning_rate": 7.291830063427158e-05, "loss": 0.916, "step": 791 }, { "epoch": 1.3678756476683938, "grad_norm": 1.2511932540862842, "learning_rate": 7.289087174365408e-05, "loss": 0.9165, "step": 792 }, { "epoch": 1.3696027633851469, "grad_norm": 2.0103847088788394, "learning_rate": 7.286339501651177e-05, "loss": 0.9779, "step": 793 }, { "epoch": 1.3713298791019, "grad_norm": 1.072440053294198, "learning_rate": 7.283587049280685e-05, "loss": 0.9179, "step": 794 }, { "epoch": 1.3730569948186528, "grad_norm": 1.5010116335779338, "learning_rate": 7.280829821257102e-05, "loss": 0.9255, "step": 795 }, { "epoch": 1.374784110535406, "grad_norm": 2.1442909868865625, "learning_rate": 7.278067821590545e-05, "loss": 0.9152, "step": 796 }, { "epoch": 1.3765112262521588, "grad_norm": 1.0228251176144236, "learning_rate": 7.27530105429807e-05, "loss": 0.932, "step": 797 }, { "epoch": 1.378238341968912, "grad_norm": 2.86402392723867, "learning_rate": 7.272529523403663e-05, "loss": 0.941, "step": 798 }, { "epoch": 1.3799654576856648, "grad_norm": 1.9869669866930684, "learning_rate": 7.269753232938246e-05, "loss": 0.9668, "step": 799 }, { "epoch": 1.381692573402418, "grad_norm": 2.6683951785363496, "learning_rate": 7.266972186939658e-05, "loss": 0.9349, "step": 800 }, { "epoch": 1.383419689119171, "grad_norm": 1.4034321184898524, "learning_rate": 7.264186389452656e-05, "loss": 0.9187, "step": 801 }, { "epoch": 1.385146804835924, "grad_norm": 2.8571129557812385, "learning_rate": 7.261395844528907e-05, "loss": 0.9625, "step": 802 }, { "epoch": 1.386873920552677, "grad_norm": 1.8514028996694571, "learning_rate": 7.258600556226981e-05, "loss": 0.9329, "step": 803 }, { "epoch": 1.38860103626943, "grad_norm": 2.9121959646381463, "learning_rate": 7.255800528612353e-05, "loss": 0.9478, "step": 804 }, { "epoch": 1.390328151986183, "grad_norm": 2.516446309103585, "learning_rate": 7.252995765757384e-05, "loss": 0.9339, "step": 805 }, { "epoch": 1.392055267702936, "grad_norm": 2.5822820310532992, "learning_rate": 7.250186271741326e-05, "loss": 0.9304, "step": 806 }, { "epoch": 1.3937823834196892, "grad_norm": 2.2721610995661976, "learning_rate": 7.247372050650307e-05, "loss": 0.9481, "step": 807 }, { "epoch": 1.395509499136442, "grad_norm": 1.716878405816628, "learning_rate": 7.244553106577339e-05, "loss": 0.9245, "step": 808 }, { "epoch": 1.3972366148531952, "grad_norm": 0.9846153829327052, "learning_rate": 7.241729443622294e-05, "loss": 0.9377, "step": 809 }, { "epoch": 1.3989637305699483, "grad_norm": 2.674713304952946, "learning_rate": 7.238901065891913e-05, "loss": 0.9346, "step": 810 }, { "epoch": 1.4006908462867012, "grad_norm": 1.7215156027295522, "learning_rate": 7.236067977499791e-05, "loss": 0.9277, "step": 811 }, { "epoch": 1.4024179620034543, "grad_norm": 2.467953157005053, "learning_rate": 7.233230182566374e-05, "loss": 0.9469, "step": 812 }, { "epoch": 1.4041450777202074, "grad_norm": 2.032646304664004, "learning_rate": 7.230387685218956e-05, "loss": 0.9361, "step": 813 }, { "epoch": 1.4058721934369602, "grad_norm": 2.053302735740798, "learning_rate": 7.22754048959167e-05, "loss": 0.9447, "step": 814 }, { "epoch": 1.4075993091537133, "grad_norm": 1.5397702536119615, "learning_rate": 7.224688599825478e-05, "loss": 0.9174, "step": 815 }, { "epoch": 1.4093264248704664, "grad_norm": 1.9662262415527936, "learning_rate": 7.221832020068174e-05, "loss": 0.9477, "step": 816 }, { "epoch": 1.4110535405872193, "grad_norm": 1.326310752242633, "learning_rate": 7.21897075447437e-05, "loss": 0.9296, "step": 817 }, { "epoch": 1.4127806563039724, "grad_norm": 1.9853020654553353, "learning_rate": 7.216104807205497e-05, "loss": 0.9356, "step": 818 }, { "epoch": 1.4145077720207253, "grad_norm": 1.5852797320138994, "learning_rate": 7.213234182429788e-05, "loss": 0.922, "step": 819 }, { "epoch": 1.4162348877374784, "grad_norm": 1.1489687026364048, "learning_rate": 7.210358884322285e-05, "loss": 0.9194, "step": 820 }, { "epoch": 1.4179620034542315, "grad_norm": 1.7393755753399223, "learning_rate": 7.207478917064826e-05, "loss": 0.9092, "step": 821 }, { "epoch": 1.4196891191709844, "grad_norm": 0.9937967928354019, "learning_rate": 7.20459428484604e-05, "loss": 0.9143, "step": 822 }, { "epoch": 1.4214162348877375, "grad_norm": 1.353264342485409, "learning_rate": 7.201704991861338e-05, "loss": 0.9412, "step": 823 }, { "epoch": 1.4231433506044904, "grad_norm": 1.2040142211598472, "learning_rate": 7.19881104231291e-05, "loss": 0.9289, "step": 824 }, { "epoch": 1.4248704663212435, "grad_norm": 1.1906511577649028, "learning_rate": 7.195912440409722e-05, "loss": 0.9326, "step": 825 }, { "epoch": 1.4265975820379966, "grad_norm": 1.0793084762810825, "learning_rate": 7.193009190367502e-05, "loss": 0.9256, "step": 826 }, { "epoch": 1.4283246977547495, "grad_norm": 1.4657902142789818, "learning_rate": 7.190101296408743e-05, "loss": 0.8993, "step": 827 }, { "epoch": 1.4300518134715026, "grad_norm": 1.073970648022281, "learning_rate": 7.18718876276269e-05, "loss": 0.9222, "step": 828 }, { "epoch": 1.4317789291882557, "grad_norm": 1.5802063140949836, "learning_rate": 7.184271593665333e-05, "loss": 0.9154, "step": 829 }, { "epoch": 1.4335060449050085, "grad_norm": 1.4198893032285345, "learning_rate": 7.181349793359407e-05, "loss": 0.916, "step": 830 }, { "epoch": 1.4352331606217616, "grad_norm": 0.7015441380034355, "learning_rate": 7.178423366094383e-05, "loss": 0.918, "step": 831 }, { "epoch": 1.4369602763385148, "grad_norm": 1.5228277939908226, "learning_rate": 7.17549231612646e-05, "loss": 0.9275, "step": 832 }, { "epoch": 1.4386873920552676, "grad_norm": 1.4965775084629815, "learning_rate": 7.17255664771856e-05, "loss": 0.9291, "step": 833 }, { "epoch": 1.4404145077720207, "grad_norm": 1.1582905143124376, "learning_rate": 7.169616365140325e-05, "loss": 0.9112, "step": 834 }, { "epoch": 1.4421416234887738, "grad_norm": 1.0234269739913482, "learning_rate": 7.166671472668103e-05, "loss": 0.9265, "step": 835 }, { "epoch": 1.4438687392055267, "grad_norm": 1.0086692798532582, "learning_rate": 7.16372197458495e-05, "loss": 0.9151, "step": 836 }, { "epoch": 1.4455958549222798, "grad_norm": 1.0595472893813829, "learning_rate": 7.16076787518062e-05, "loss": 0.921, "step": 837 }, { "epoch": 1.447322970639033, "grad_norm": 1.3536116150777178, "learning_rate": 7.15780917875156e-05, "loss": 0.9263, "step": 838 }, { "epoch": 1.4490500863557858, "grad_norm": 1.7092087760555383, "learning_rate": 7.1548458896009e-05, "loss": 0.9287, "step": 839 }, { "epoch": 1.450777202072539, "grad_norm": 0.7660241527064209, "learning_rate": 7.151878012038453e-05, "loss": 0.9281, "step": 840 }, { "epoch": 1.452504317789292, "grad_norm": 1.0552836499833722, "learning_rate": 7.148905550380701e-05, "loss": 0.9058, "step": 841 }, { "epoch": 1.454231433506045, "grad_norm": 2.2184076871038254, "learning_rate": 7.1459285089508e-05, "loss": 0.9336, "step": 842 }, { "epoch": 1.455958549222798, "grad_norm": 1.140213940875552, "learning_rate": 7.14294689207856e-05, "loss": 0.9408, "step": 843 }, { "epoch": 1.4576856649395509, "grad_norm": 2.5960639332639786, "learning_rate": 7.139960704100448e-05, "loss": 0.949, "step": 844 }, { "epoch": 1.459412780656304, "grad_norm": 2.2056612136972316, "learning_rate": 7.13696994935958e-05, "loss": 0.9256, "step": 845 }, { "epoch": 1.4611398963730569, "grad_norm": 2.071711516511868, "learning_rate": 7.133974632205715e-05, "loss": 0.9287, "step": 846 }, { "epoch": 1.46286701208981, "grad_norm": 2.2317338594759177, "learning_rate": 7.130974756995245e-05, "loss": 0.9394, "step": 847 }, { "epoch": 1.464594127806563, "grad_norm": 1.6715116620826158, "learning_rate": 7.127970328091192e-05, "loss": 0.9305, "step": 848 }, { "epoch": 1.466321243523316, "grad_norm": 1.9203329817148593, "learning_rate": 7.1249613498632e-05, "loss": 0.9274, "step": 849 }, { "epoch": 1.468048359240069, "grad_norm": 1.5965435584923655, "learning_rate": 7.121947826687533e-05, "loss": 0.9162, "step": 850 }, { "epoch": 1.4697754749568221, "grad_norm": 1.6977049730395901, "learning_rate": 7.118929762947062e-05, "loss": 0.9218, "step": 851 }, { "epoch": 1.471502590673575, "grad_norm": 1.2206102932593534, "learning_rate": 7.115907163031262e-05, "loss": 0.9236, "step": 852 }, { "epoch": 1.4732297063903281, "grad_norm": 1.3655168995324123, "learning_rate": 7.112880031336206e-05, "loss": 0.9329, "step": 853 }, { "epoch": 1.4749568221070812, "grad_norm": 0.9694610704510233, "learning_rate": 7.10984837226456e-05, "loss": 0.9363, "step": 854 }, { "epoch": 1.4766839378238341, "grad_norm": 1.647492331823345, "learning_rate": 7.106812190225571e-05, "loss": 0.9048, "step": 855 }, { "epoch": 1.4784110535405872, "grad_norm": 1.3014715485039499, "learning_rate": 7.103771489635065e-05, "loss": 0.9129, "step": 856 }, { "epoch": 1.4801381692573403, "grad_norm": 1.2176606954042277, "learning_rate": 7.100726274915442e-05, "loss": 0.9092, "step": 857 }, { "epoch": 1.4818652849740932, "grad_norm": 2.079206897162132, "learning_rate": 7.097676550495667e-05, "loss": 0.9117, "step": 858 }, { "epoch": 1.4835924006908463, "grad_norm": 1.0329560230669634, "learning_rate": 7.094622320811262e-05, "loss": 0.9201, "step": 859 }, { "epoch": 1.4853195164075994, "grad_norm": 2.4209707376678513, "learning_rate": 7.091563590304301e-05, "loss": 0.931, "step": 860 }, { "epoch": 1.4870466321243523, "grad_norm": 1.7880127495286515, "learning_rate": 7.088500363423409e-05, "loss": 0.9299, "step": 861 }, { "epoch": 1.4887737478411054, "grad_norm": 2.2275907681380303, "learning_rate": 7.085432644623744e-05, "loss": 0.9186, "step": 862 }, { "epoch": 1.4905008635578585, "grad_norm": 1.8230910121921058, "learning_rate": 7.082360438367002e-05, "loss": 0.9199, "step": 863 }, { "epoch": 1.4922279792746114, "grad_norm": 1.8948209474363076, "learning_rate": 7.0792837491214e-05, "loss": 0.9205, "step": 864 }, { "epoch": 1.4939550949913645, "grad_norm": 1.6180983345882056, "learning_rate": 7.076202581361683e-05, "loss": 0.9246, "step": 865 }, { "epoch": 1.4956822107081176, "grad_norm": 1.5917654218070394, "learning_rate": 7.073116939569105e-05, "loss": 0.9138, "step": 866 }, { "epoch": 1.4974093264248705, "grad_norm": 1.7889340849204394, "learning_rate": 7.070026828231424e-05, "loss": 0.9472, "step": 867 }, { "epoch": 1.4991364421416236, "grad_norm": 1.2409251113989586, "learning_rate": 7.066932251842903e-05, "loss": 0.9223, "step": 868 }, { "epoch": 1.5008635578583767, "grad_norm": 1.6746692848717006, "learning_rate": 7.0638332149043e-05, "loss": 0.906, "step": 869 }, { "epoch": 1.5025906735751295, "grad_norm": 1.20268241157853, "learning_rate": 7.060729721922854e-05, "loss": 0.9515, "step": 870 }, { "epoch": 1.5043177892918824, "grad_norm": 1.3621870202522959, "learning_rate": 7.057621777412292e-05, "loss": 0.9391, "step": 871 }, { "epoch": 1.5060449050086355, "grad_norm": 0.9546076174474968, "learning_rate": 7.054509385892814e-05, "loss": 0.8977, "step": 872 }, { "epoch": 1.5077720207253886, "grad_norm": 1.5765181336179277, "learning_rate": 7.051392551891083e-05, "loss": 0.9314, "step": 873 }, { "epoch": 1.5094991364421415, "grad_norm": 1.1493044930977827, "learning_rate": 7.048271279940226e-05, "loss": 0.8992, "step": 874 }, { "epoch": 1.5112262521588946, "grad_norm": 1.0874297004594777, "learning_rate": 7.045145574579827e-05, "loss": 0.9144, "step": 875 }, { "epoch": 1.5129533678756477, "grad_norm": 0.9517158169756857, "learning_rate": 7.042015440355915e-05, "loss": 0.9315, "step": 876 }, { "epoch": 1.5146804835924006, "grad_norm": 1.4464272808707894, "learning_rate": 7.038880881820961e-05, "loss": 0.9286, "step": 877 }, { "epoch": 1.5164075993091537, "grad_norm": 1.03694424645714, "learning_rate": 7.03574190353387e-05, "loss": 0.8971, "step": 878 }, { "epoch": 1.5181347150259068, "grad_norm": 1.457907655231974, "learning_rate": 7.032598510059977e-05, "loss": 0.9106, "step": 879 }, { "epoch": 1.5198618307426597, "grad_norm": 1.1951054291534196, "learning_rate": 7.029450705971037e-05, "loss": 0.9091, "step": 880 }, { "epoch": 1.5215889464594128, "grad_norm": 1.2684614600395372, "learning_rate": 7.02629849584522e-05, "loss": 0.9009, "step": 881 }, { "epoch": 1.5233160621761659, "grad_norm": 0.8649374468511898, "learning_rate": 7.023141884267105e-05, "loss": 0.9341, "step": 882 }, { "epoch": 1.5250431778929188, "grad_norm": 1.122644835320047, "learning_rate": 7.019980875827671e-05, "loss": 0.9194, "step": 883 }, { "epoch": 1.5267702936096719, "grad_norm": 1.3569662608802513, "learning_rate": 7.016815475124293e-05, "loss": 0.9269, "step": 884 }, { "epoch": 1.528497409326425, "grad_norm": 1.2304751241582257, "learning_rate": 7.013645686760735e-05, "loss": 0.9245, "step": 885 }, { "epoch": 1.5302245250431779, "grad_norm": 1.1063573463164924, "learning_rate": 7.010471515347137e-05, "loss": 0.926, "step": 886 }, { "epoch": 1.531951640759931, "grad_norm": 1.3304127765528568, "learning_rate": 7.007292965500023e-05, "loss": 0.9038, "step": 887 }, { "epoch": 1.533678756476684, "grad_norm": 1.3311175297214348, "learning_rate": 7.004110041842277e-05, "loss": 0.9008, "step": 888 }, { "epoch": 1.535405872193437, "grad_norm": 0.8869186488945034, "learning_rate": 7.000922749003148e-05, "loss": 0.8911, "step": 889 }, { "epoch": 1.5371329879101898, "grad_norm": 1.081234219902256, "learning_rate": 6.997731091618236e-05, "loss": 0.9052, "step": 890 }, { "epoch": 1.5388601036269431, "grad_norm": 1.4310209296792904, "learning_rate": 6.994535074329497e-05, "loss": 0.9255, "step": 891 }, { "epoch": 1.540587219343696, "grad_norm": 1.223526450847189, "learning_rate": 6.991334701785219e-05, "loss": 0.8919, "step": 892 }, { "epoch": 1.542314335060449, "grad_norm": 1.372357236255315, "learning_rate": 6.988129978640027e-05, "loss": 0.894, "step": 893 }, { "epoch": 1.5440414507772022, "grad_norm": 1.0261925851319387, "learning_rate": 6.984920909554875e-05, "loss": 0.9129, "step": 894 }, { "epoch": 1.545768566493955, "grad_norm": 1.1295905735691474, "learning_rate": 6.981707499197038e-05, "loss": 0.915, "step": 895 }, { "epoch": 1.547495682210708, "grad_norm": 0.9536065505144254, "learning_rate": 6.978489752240105e-05, "loss": 0.926, "step": 896 }, { "epoch": 1.549222797927461, "grad_norm": 1.5831925665894764, "learning_rate": 6.975267673363969e-05, "loss": 0.9255, "step": 897 }, { "epoch": 1.5509499136442142, "grad_norm": 1.1054736305598811, "learning_rate": 6.972041267254825e-05, "loss": 0.9144, "step": 898 }, { "epoch": 1.552677029360967, "grad_norm": 1.111384259973724, "learning_rate": 6.968810538605164e-05, "loss": 0.934, "step": 899 }, { "epoch": 1.5544041450777202, "grad_norm": 1.5953988399310532, "learning_rate": 6.965575492113762e-05, "loss": 0.9094, "step": 900 }, { "epoch": 1.5561312607944733, "grad_norm": 1.0710712195289724, "learning_rate": 6.962336132485672e-05, "loss": 0.9371, "step": 901 }, { "epoch": 1.5578583765112262, "grad_norm": 1.6574995435817126, "learning_rate": 6.959092464432225e-05, "loss": 0.9278, "step": 902 }, { "epoch": 1.5595854922279793, "grad_norm": 0.7788561617464567, "learning_rate": 6.955844492671017e-05, "loss": 0.9026, "step": 903 }, { "epoch": 1.5613126079447324, "grad_norm": 0.7920406533041474, "learning_rate": 6.952592221925899e-05, "loss": 0.9123, "step": 904 }, { "epoch": 1.5630397236614852, "grad_norm": 0.9903772475407172, "learning_rate": 6.949335656926979e-05, "loss": 0.9181, "step": 905 }, { "epoch": 1.5647668393782384, "grad_norm": 1.5212364764617643, "learning_rate": 6.946074802410609e-05, "loss": 0.9173, "step": 906 }, { "epoch": 1.5664939550949915, "grad_norm": 1.2644536887656028, "learning_rate": 6.942809663119381e-05, "loss": 0.9213, "step": 907 }, { "epoch": 1.5682210708117443, "grad_norm": 1.06986836625323, "learning_rate": 6.939540243802116e-05, "loss": 0.9125, "step": 908 }, { "epoch": 1.5699481865284974, "grad_norm": 1.5898274329170747, "learning_rate": 6.936266549213863e-05, "loss": 0.9161, "step": 909 }, { "epoch": 1.5716753022452505, "grad_norm": 1.265092899086832, "learning_rate": 6.932988584115886e-05, "loss": 0.9146, "step": 910 }, { "epoch": 1.5734024179620034, "grad_norm": 0.8920394383923178, "learning_rate": 6.929706353275664e-05, "loss": 0.9077, "step": 911 }, { "epoch": 1.5751295336787565, "grad_norm": 1.0365232500741453, "learning_rate": 6.926419861466874e-05, "loss": 0.9069, "step": 912 }, { "epoch": 1.5768566493955096, "grad_norm": 1.3168326241693777, "learning_rate": 6.923129113469396e-05, "loss": 0.9221, "step": 913 }, { "epoch": 1.5785837651122625, "grad_norm": 1.2827227515979676, "learning_rate": 6.919834114069299e-05, "loss": 0.9262, "step": 914 }, { "epoch": 1.5803108808290154, "grad_norm": 1.5624736802931602, "learning_rate": 6.916534868058831e-05, "loss": 0.932, "step": 915 }, { "epoch": 1.5820379965457687, "grad_norm": 0.6903816628484963, "learning_rate": 6.913231380236424e-05, "loss": 0.9146, "step": 916 }, { "epoch": 1.5837651122625216, "grad_norm": 0.8325408846985621, "learning_rate": 6.90992365540667e-05, "loss": 0.9036, "step": 917 }, { "epoch": 1.5854922279792745, "grad_norm": 1.0024070443782058, "learning_rate": 6.90661169838033e-05, "loss": 0.9134, "step": 918 }, { "epoch": 1.5872193436960278, "grad_norm": 1.003306278926298, "learning_rate": 6.90329551397432e-05, "loss": 0.9047, "step": 919 }, { "epoch": 1.5889464594127807, "grad_norm": 1.8084449596178716, "learning_rate": 6.899975107011701e-05, "loss": 0.9208, "step": 920 }, { "epoch": 1.5906735751295336, "grad_norm": 0.9588357051146446, "learning_rate": 6.896650482321677e-05, "loss": 0.9157, "step": 921 }, { "epoch": 1.5924006908462867, "grad_norm": 2.1259490626306623, "learning_rate": 6.893321644739587e-05, "loss": 0.9043, "step": 922 }, { "epoch": 1.5941278065630398, "grad_norm": 1.3980635138462183, "learning_rate": 6.889988599106897e-05, "loss": 0.9115, "step": 923 }, { "epoch": 1.5958549222797926, "grad_norm": 2.2618368315528192, "learning_rate": 6.886651350271192e-05, "loss": 0.9036, "step": 924 }, { "epoch": 1.5975820379965457, "grad_norm": 1.6187943240219544, "learning_rate": 6.88330990308617e-05, "loss": 0.9194, "step": 925 }, { "epoch": 1.5993091537132988, "grad_norm": 2.2524779643712614, "learning_rate": 6.879964262411637e-05, "loss": 0.9228, "step": 926 }, { "epoch": 1.6010362694300517, "grad_norm": 1.933794402654093, "learning_rate": 6.876614433113497e-05, "loss": 0.9561, "step": 927 }, { "epoch": 1.6027633851468048, "grad_norm": 1.64888829004232, "learning_rate": 6.873260420063746e-05, "loss": 0.9295, "step": 928 }, { "epoch": 1.604490500863558, "grad_norm": 1.6857738069806067, "learning_rate": 6.869902228140464e-05, "loss": 0.9225, "step": 929 }, { "epoch": 1.6062176165803108, "grad_norm": 1.7497354876836644, "learning_rate": 6.86653986222781e-05, "loss": 0.9081, "step": 930 }, { "epoch": 1.607944732297064, "grad_norm": 1.327985949808922, "learning_rate": 6.863173327216013e-05, "loss": 0.9293, "step": 931 }, { "epoch": 1.609671848013817, "grad_norm": 1.5141200257155651, "learning_rate": 6.859802628001366e-05, "loss": 0.8937, "step": 932 }, { "epoch": 1.61139896373057, "grad_norm": 1.3102123068210683, "learning_rate": 6.856427769486217e-05, "loss": 0.9179, "step": 933 }, { "epoch": 1.613126079447323, "grad_norm": 1.0100057195730154, "learning_rate": 6.853048756578967e-05, "loss": 0.8857, "step": 934 }, { "epoch": 1.614853195164076, "grad_norm": 1.9326088887095947, "learning_rate": 6.849665594194055e-05, "loss": 0.9166, "step": 935 }, { "epoch": 1.616580310880829, "grad_norm": 1.1185782870898515, "learning_rate": 6.846278287251955e-05, "loss": 0.9186, "step": 936 }, { "epoch": 1.618307426597582, "grad_norm": 2.235542768403597, "learning_rate": 6.842886840679174e-05, "loss": 0.9205, "step": 937 }, { "epoch": 1.6200345423143352, "grad_norm": 1.6831139693692898, "learning_rate": 6.839491259408232e-05, "loss": 0.9035, "step": 938 }, { "epoch": 1.621761658031088, "grad_norm": 1.506691469989788, "learning_rate": 6.83609154837767e-05, "loss": 0.8999, "step": 939 }, { "epoch": 1.623488773747841, "grad_norm": 1.5011379971084755, "learning_rate": 6.832687712532029e-05, "loss": 0.9326, "step": 940 }, { "epoch": 1.6252158894645943, "grad_norm": 1.4158779629330536, "learning_rate": 6.829279756821856e-05, "loss": 0.9082, "step": 941 }, { "epoch": 1.6269430051813472, "grad_norm": 1.3360954892836199, "learning_rate": 6.825867686203682e-05, "loss": 0.9187, "step": 942 }, { "epoch": 1.6286701208981, "grad_norm": 1.1439636108308557, "learning_rate": 6.82245150564003e-05, "loss": 0.916, "step": 943 }, { "epoch": 1.6303972366148531, "grad_norm": 2.0453851867648276, "learning_rate": 6.819031220099395e-05, "loss": 0.9015, "step": 944 }, { "epoch": 1.6321243523316062, "grad_norm": 1.1666874731427341, "learning_rate": 6.815606834556243e-05, "loss": 0.8952, "step": 945 }, { "epoch": 1.6338514680483591, "grad_norm": 2.436014752107182, "learning_rate": 6.812178353991011e-05, "loss": 0.9311, "step": 946 }, { "epoch": 1.6355785837651122, "grad_norm": 1.8127900110242812, "learning_rate": 6.80874578339008e-05, "loss": 0.9192, "step": 947 }, { "epoch": 1.6373056994818653, "grad_norm": 1.996151967956713, "learning_rate": 6.805309127745785e-05, "loss": 0.9245, "step": 948 }, { "epoch": 1.6390328151986182, "grad_norm": 1.6437441353662172, "learning_rate": 6.801868392056405e-05, "loss": 0.9042, "step": 949 }, { "epoch": 1.6407599309153713, "grad_norm": 1.5276443701609, "learning_rate": 6.798423581326149e-05, "loss": 0.927, "step": 950 }, { "epoch": 1.6424870466321244, "grad_norm": 1.8067966097007544, "learning_rate": 6.794974700565152e-05, "loss": 0.9204, "step": 951 }, { "epoch": 1.6442141623488773, "grad_norm": 1.2989337023117873, "learning_rate": 6.791521754789473e-05, "loss": 0.9141, "step": 952 }, { "epoch": 1.6459412780656304, "grad_norm": 1.6242076715928653, "learning_rate": 6.78806474902108e-05, "loss": 0.9101, "step": 953 }, { "epoch": 1.6476683937823835, "grad_norm": 1.6262566456440049, "learning_rate": 6.784603688287847e-05, "loss": 0.9216, "step": 954 }, { "epoch": 1.6493955094991364, "grad_norm": 1.015206136645676, "learning_rate": 6.781138577623544e-05, "loss": 0.894, "step": 955 }, { "epoch": 1.6511226252158895, "grad_norm": 1.7945652967682, "learning_rate": 6.777669422067831e-05, "loss": 0.8961, "step": 956 }, { "epoch": 1.6528497409326426, "grad_norm": 1.112752888992224, "learning_rate": 6.774196226666257e-05, "loss": 0.9148, "step": 957 }, { "epoch": 1.6545768566493955, "grad_norm": 1.9283526920339595, "learning_rate": 6.770718996470237e-05, "loss": 0.8966, "step": 958 }, { "epoch": 1.6563039723661486, "grad_norm": 1.2670389603932752, "learning_rate": 6.767237736537061e-05, "loss": 0.9262, "step": 959 }, { "epoch": 1.6580310880829017, "grad_norm": 2.2306583638188733, "learning_rate": 6.763752451929878e-05, "loss": 0.8898, "step": 960 }, { "epoch": 1.6597582037996546, "grad_norm": 1.745590643785188, "learning_rate": 6.760263147717687e-05, "loss": 0.9354, "step": 961 }, { "epoch": 1.6614853195164074, "grad_norm": 2.26783368226166, "learning_rate": 6.756769828975341e-05, "loss": 0.9063, "step": 962 }, { "epoch": 1.6632124352331608, "grad_norm": 1.8356084594951008, "learning_rate": 6.753272500783525e-05, "loss": 0.9046, "step": 963 }, { "epoch": 1.6649395509499136, "grad_norm": 1.9539612292965518, "learning_rate": 6.749771168228756e-05, "loss": 0.9208, "step": 964 }, { "epoch": 1.6666666666666665, "grad_norm": 1.9105267142696198, "learning_rate": 6.74626583640338e-05, "loss": 0.9228, "step": 965 }, { "epoch": 1.6683937823834198, "grad_norm": 1.7254417456519098, "learning_rate": 6.742756510405553e-05, "loss": 0.9173, "step": 966 }, { "epoch": 1.6701208981001727, "grad_norm": 1.6314528661043406, "learning_rate": 6.739243195339243e-05, "loss": 0.912, "step": 967 }, { "epoch": 1.6718480138169256, "grad_norm": 1.4343223192872256, "learning_rate": 6.735725896314222e-05, "loss": 0.9314, "step": 968 }, { "epoch": 1.6735751295336787, "grad_norm": 1.1275208104108985, "learning_rate": 6.732204618446053e-05, "loss": 0.9146, "step": 969 }, { "epoch": 1.6753022452504318, "grad_norm": 1.7613525549975153, "learning_rate": 6.728679366856086e-05, "loss": 0.895, "step": 970 }, { "epoch": 1.6770293609671847, "grad_norm": 1.1140321219669778, "learning_rate": 6.725150146671453e-05, "loss": 0.8982, "step": 971 }, { "epoch": 1.6787564766839378, "grad_norm": 1.5841586795483955, "learning_rate": 6.721616963025055e-05, "loss": 0.9097, "step": 972 }, { "epoch": 1.680483592400691, "grad_norm": 1.14803702341358, "learning_rate": 6.71807982105556e-05, "loss": 0.9183, "step": 973 }, { "epoch": 1.6822107081174438, "grad_norm": 1.8773566628432847, "learning_rate": 6.71453872590739e-05, "loss": 0.9052, "step": 974 }, { "epoch": 1.6839378238341969, "grad_norm": 1.429933495687212, "learning_rate": 6.710993682730717e-05, "loss": 0.9379, "step": 975 }, { "epoch": 1.68566493955095, "grad_norm": 1.712611792860529, "learning_rate": 6.707444696681461e-05, "loss": 0.917, "step": 976 }, { "epoch": 1.6873920552677029, "grad_norm": 1.6044251486521213, "learning_rate": 6.703891772921267e-05, "loss": 0.9393, "step": 977 }, { "epoch": 1.689119170984456, "grad_norm": 1.344713405611433, "learning_rate": 6.700334916617515e-05, "loss": 0.9223, "step": 978 }, { "epoch": 1.690846286701209, "grad_norm": 1.2614482290326272, "learning_rate": 6.696774132943299e-05, "loss": 0.9184, "step": 979 }, { "epoch": 1.692573402417962, "grad_norm": 1.8799918463688279, "learning_rate": 6.693209427077428e-05, "loss": 0.9253, "step": 980 }, { "epoch": 1.694300518134715, "grad_norm": 1.1159857593783402, "learning_rate": 6.689640804204415e-05, "loss": 0.918, "step": 981 }, { "epoch": 1.6960276338514682, "grad_norm": 1.8414648715599857, "learning_rate": 6.686068269514469e-05, "loss": 0.9218, "step": 982 }, { "epoch": 1.697754749568221, "grad_norm": 1.828480170607571, "learning_rate": 6.68249182820349e-05, "loss": 0.933, "step": 983 }, { "epoch": 1.6994818652849741, "grad_norm": 0.993872949153196, "learning_rate": 6.678911485473056e-05, "loss": 0.9268, "step": 984 }, { "epoch": 1.7012089810017272, "grad_norm": 2.5650083265899424, "learning_rate": 6.675327246530425e-05, "loss": 0.916, "step": 985 }, { "epoch": 1.7029360967184801, "grad_norm": 1.6086867336987112, "learning_rate": 6.671739116588518e-05, "loss": 0.9224, "step": 986 }, { "epoch": 1.704663212435233, "grad_norm": 2.549026942598164, "learning_rate": 6.668147100865913e-05, "loss": 0.9223, "step": 987 }, { "epoch": 1.7063903281519863, "grad_norm": 2.019424145441625, "learning_rate": 6.664551204586846e-05, "loss": 0.9198, "step": 988 }, { "epoch": 1.7081174438687392, "grad_norm": 2.3723356121969195, "learning_rate": 6.66095143298119e-05, "loss": 0.9085, "step": 989 }, { "epoch": 1.709844559585492, "grad_norm": 1.4227507398286634, "learning_rate": 6.657347791284458e-05, "loss": 0.9114, "step": 990 }, { "epoch": 1.7115716753022454, "grad_norm": 2.8122310700937283, "learning_rate": 6.653740284737794e-05, "loss": 0.925, "step": 991 }, { "epoch": 1.7132987910189983, "grad_norm": 2.181144640905276, "learning_rate": 6.650128918587955e-05, "loss": 0.9361, "step": 992 }, { "epoch": 1.7150259067357512, "grad_norm": 2.573288026802726, "learning_rate": 6.64651369808732e-05, "loss": 0.9333, "step": 993 }, { "epoch": 1.7167530224525043, "grad_norm": 2.557700939417764, "learning_rate": 6.642894628493868e-05, "loss": 0.9187, "step": 994 }, { "epoch": 1.7184801381692574, "grad_norm": 1.5125675649753867, "learning_rate": 6.63927171507118e-05, "loss": 0.9011, "step": 995 }, { "epoch": 1.7202072538860103, "grad_norm": 1.3321279479179653, "learning_rate": 6.635644963088424e-05, "loss": 0.9085, "step": 996 }, { "epoch": 1.7219343696027634, "grad_norm": 2.323195392740303, "learning_rate": 6.632014377820356e-05, "loss": 0.9293, "step": 997 }, { "epoch": 1.7236614853195165, "grad_norm": 1.6967243834071086, "learning_rate": 6.6283799645473e-05, "loss": 0.9238, "step": 998 }, { "epoch": 1.7253886010362693, "grad_norm": 2.59431226635551, "learning_rate": 6.624741728555154e-05, "loss": 0.9207, "step": 999 }, { "epoch": 1.7271157167530224, "grad_norm": 2.3312789981335467, "learning_rate": 6.62109967513537e-05, "loss": 0.9128, "step": 1000 }, { "epoch": 1.7288428324697755, "grad_norm": 1.8126688178543549, "learning_rate": 6.617453809584957e-05, "loss": 0.9136, "step": 1001 }, { "epoch": 1.7305699481865284, "grad_norm": 1.6444239376524987, "learning_rate": 6.613804137206466e-05, "loss": 0.9154, "step": 1002 }, { "epoch": 1.7322970639032815, "grad_norm": 2.3770519024009467, "learning_rate": 6.610150663307986e-05, "loss": 0.9099, "step": 1003 }, { "epoch": 1.7340241796200346, "grad_norm": 2.039677780085216, "learning_rate": 6.606493393203133e-05, "loss": 0.9211, "step": 1004 }, { "epoch": 1.7357512953367875, "grad_norm": 2.093360728666251, "learning_rate": 6.602832332211044e-05, "loss": 0.9243, "step": 1005 }, { "epoch": 1.7374784110535406, "grad_norm": 2.0260092490447987, "learning_rate": 6.599167485656372e-05, "loss": 0.9428, "step": 1006 }, { "epoch": 1.7392055267702937, "grad_norm": 2.026466135183706, "learning_rate": 6.595498858869276e-05, "loss": 0.9422, "step": 1007 }, { "epoch": 1.7409326424870466, "grad_norm": 1.743354713337449, "learning_rate": 6.591826457185408e-05, "loss": 0.9164, "step": 1008 }, { "epoch": 1.7426597582037997, "grad_norm": 2.1376515687034217, "learning_rate": 6.588150285945917e-05, "loss": 0.9307, "step": 1009 }, { "epoch": 1.7443868739205528, "grad_norm": 1.713118036217165, "learning_rate": 6.584470350497426e-05, "loss": 0.9183, "step": 1010 }, { "epoch": 1.7461139896373057, "grad_norm": 2.3188225292724542, "learning_rate": 6.580786656192044e-05, "loss": 0.909, "step": 1011 }, { "epoch": 1.7478411053540586, "grad_norm": 1.9634421230503631, "learning_rate": 6.577099208387337e-05, "loss": 0.9115, "step": 1012 }, { "epoch": 1.749568221070812, "grad_norm": 1.997617681687678, "learning_rate": 6.573408012446334e-05, "loss": 0.9001, "step": 1013 }, { "epoch": 1.7512953367875648, "grad_norm": 1.7565734058304245, "learning_rate": 6.569713073737514e-05, "loss": 0.9205, "step": 1014 }, { "epoch": 1.7530224525043177, "grad_norm": 2.115205661163874, "learning_rate": 6.566014397634803e-05, "loss": 0.8893, "step": 1015 }, { "epoch": 1.754749568221071, "grad_norm": 1.7977382126042607, "learning_rate": 6.56231198951756e-05, "loss": 0.9107, "step": 1016 }, { "epoch": 1.7564766839378239, "grad_norm": 2.1874115017773375, "learning_rate": 6.55860585477057e-05, "loss": 0.9365, "step": 1017 }, { "epoch": 1.7582037996545767, "grad_norm": 1.88584364107021, "learning_rate": 6.554895998784039e-05, "loss": 0.9117, "step": 1018 }, { "epoch": 1.7599309153713298, "grad_norm": 1.9761703089192695, "learning_rate": 6.551182426953589e-05, "loss": 0.9064, "step": 1019 }, { "epoch": 1.761658031088083, "grad_norm": 1.7239937540148547, "learning_rate": 6.547465144680243e-05, "loss": 0.9138, "step": 1020 }, { "epoch": 1.7633851468048358, "grad_norm": 2.229569987746711, "learning_rate": 6.54374415737042e-05, "loss": 0.9004, "step": 1021 }, { "epoch": 1.765112262521589, "grad_norm": 2.048492624304516, "learning_rate": 6.540019470435927e-05, "loss": 0.9169, "step": 1022 }, { "epoch": 1.766839378238342, "grad_norm": 1.773591776164927, "learning_rate": 6.536291089293958e-05, "loss": 0.9172, "step": 1023 }, { "epoch": 1.768566493955095, "grad_norm": 1.5705988617585533, "learning_rate": 6.532559019367073e-05, "loss": 0.9052, "step": 1024 }, { "epoch": 1.770293609671848, "grad_norm": 2.19506917431025, "learning_rate": 6.528823266083195e-05, "loss": 0.9337, "step": 1025 }, { "epoch": 1.7720207253886011, "grad_norm": 1.9096286981973467, "learning_rate": 6.525083834875615e-05, "loss": 0.9099, "step": 1026 }, { "epoch": 1.773747841105354, "grad_norm": 1.8395086871133013, "learning_rate": 6.521340731182962e-05, "loss": 0.9351, "step": 1027 }, { "epoch": 1.775474956822107, "grad_norm": 1.6089156274421865, "learning_rate": 6.517593960449212e-05, "loss": 0.9153, "step": 1028 }, { "epoch": 1.7772020725388602, "grad_norm": 2.1411873821209757, "learning_rate": 6.513843528123673e-05, "loss": 0.9017, "step": 1029 }, { "epoch": 1.778929188255613, "grad_norm": 1.8109599266206073, "learning_rate": 6.510089439660978e-05, "loss": 0.8978, "step": 1030 }, { "epoch": 1.7806563039723662, "grad_norm": 1.916383525732133, "learning_rate": 6.506331700521079e-05, "loss": 0.9025, "step": 1031 }, { "epoch": 1.7823834196891193, "grad_norm": 1.7245006084504457, "learning_rate": 6.502570316169236e-05, "loss": 0.9088, "step": 1032 }, { "epoch": 1.7841105354058722, "grad_norm": 2.007663705948644, "learning_rate": 6.49880529207601e-05, "loss": 0.9061, "step": 1033 }, { "epoch": 1.7858376511226253, "grad_norm": 1.672968557976074, "learning_rate": 6.495036633717257e-05, "loss": 0.8948, "step": 1034 }, { "epoch": 1.7875647668393784, "grad_norm": 1.9524776028909738, "learning_rate": 6.49126434657412e-05, "loss": 0.9067, "step": 1035 }, { "epoch": 1.7892918825561313, "grad_norm": 1.7878463217076677, "learning_rate": 6.487488436133015e-05, "loss": 0.9127, "step": 1036 }, { "epoch": 1.7910189982728841, "grad_norm": 1.939180802574076, "learning_rate": 6.483708907885635e-05, "loss": 0.9107, "step": 1037 }, { "epoch": 1.7927461139896375, "grad_norm": 1.5444419336008592, "learning_rate": 6.479925767328928e-05, "loss": 0.9152, "step": 1038 }, { "epoch": 1.7944732297063903, "grad_norm": 1.8973342177783676, "learning_rate": 6.4761390199651e-05, "loss": 0.908, "step": 1039 }, { "epoch": 1.7962003454231432, "grad_norm": 1.6323930061658558, "learning_rate": 6.472348671301598e-05, "loss": 0.9137, "step": 1040 }, { "epoch": 1.7979274611398963, "grad_norm": 2.0490774469276785, "learning_rate": 6.468554726851113e-05, "loss": 0.9002, "step": 1041 }, { "epoch": 1.7996545768566494, "grad_norm": 1.7713420163049203, "learning_rate": 6.464757192131561e-05, "loss": 0.9058, "step": 1042 }, { "epoch": 1.8013816925734023, "grad_norm": 1.7280775267024129, "learning_rate": 6.460956072666081e-05, "loss": 0.8927, "step": 1043 }, { "epoch": 1.8031088082901554, "grad_norm": 1.4843208776074366, "learning_rate": 6.457151373983027e-05, "loss": 0.9073, "step": 1044 }, { "epoch": 1.8048359240069085, "grad_norm": 2.008487762658215, "learning_rate": 6.453343101615956e-05, "loss": 0.9136, "step": 1045 }, { "epoch": 1.8065630397236614, "grad_norm": 1.5085180106148939, "learning_rate": 6.449531261103626e-05, "loss": 0.9054, "step": 1046 }, { "epoch": 1.8082901554404145, "grad_norm": 1.9423670713699317, "learning_rate": 6.445715857989982e-05, "loss": 0.8911, "step": 1047 }, { "epoch": 1.8100172711571676, "grad_norm": 1.7121570906035162, "learning_rate": 6.441896897824148e-05, "loss": 0.8956, "step": 1048 }, { "epoch": 1.8117443868739205, "grad_norm": 1.8866148898022443, "learning_rate": 6.438074386160429e-05, "loss": 0.9315, "step": 1049 }, { "epoch": 1.8134715025906736, "grad_norm": 1.4747472109140833, "learning_rate": 6.434248328558289e-05, "loss": 0.9081, "step": 1050 }, { "epoch": 1.8151986183074267, "grad_norm": 1.7649129746600276, "learning_rate": 6.430418730582349e-05, "loss": 0.91, "step": 1051 }, { "epoch": 1.8169257340241796, "grad_norm": 1.447045790119867, "learning_rate": 6.426585597802385e-05, "loss": 0.9011, "step": 1052 }, { "epoch": 1.8186528497409327, "grad_norm": 2.0523134374242193, "learning_rate": 6.422748935793305e-05, "loss": 0.9175, "step": 1053 }, { "epoch": 1.8203799654576858, "grad_norm": 1.5682420574439448, "learning_rate": 6.41890875013516e-05, "loss": 0.9183, "step": 1054 }, { "epoch": 1.8221070811744386, "grad_norm": 1.852758088946045, "learning_rate": 6.415065046413118e-05, "loss": 0.9184, "step": 1055 }, { "epoch": 1.8238341968911918, "grad_norm": 1.57222522326425, "learning_rate": 6.411217830217466e-05, "loss": 0.915, "step": 1056 }, { "epoch": 1.8255613126079449, "grad_norm": 1.8543579457966604, "learning_rate": 6.407367107143603e-05, "loss": 0.9154, "step": 1057 }, { "epoch": 1.8272884283246977, "grad_norm": 1.5104517447375645, "learning_rate": 6.403512882792022e-05, "loss": 0.9258, "step": 1058 }, { "epoch": 1.8290155440414506, "grad_norm": 1.7172451420597168, "learning_rate": 6.399655162768314e-05, "loss": 0.909, "step": 1059 }, { "epoch": 1.830742659758204, "grad_norm": 1.4272559323026242, "learning_rate": 6.39579395268315e-05, "loss": 0.9193, "step": 1060 }, { "epoch": 1.8324697754749568, "grad_norm": 1.8918974884824669, "learning_rate": 6.39192925815228e-05, "loss": 0.9088, "step": 1061 }, { "epoch": 1.8341968911917097, "grad_norm": 1.525468336363271, "learning_rate": 6.38806108479652e-05, "loss": 0.9167, "step": 1062 }, { "epoch": 1.835924006908463, "grad_norm": 1.8186560822784792, "learning_rate": 6.384189438241748e-05, "loss": 0.9239, "step": 1063 }, { "epoch": 1.837651122625216, "grad_norm": 1.5929724237161644, "learning_rate": 6.380314324118889e-05, "loss": 0.9135, "step": 1064 }, { "epoch": 1.8393782383419688, "grad_norm": 1.7178210116915047, "learning_rate": 6.376435748063916e-05, "loss": 0.9208, "step": 1065 }, { "epoch": 1.8411053540587219, "grad_norm": 1.342825809447464, "learning_rate": 6.372553715717832e-05, "loss": 0.9015, "step": 1066 }, { "epoch": 1.842832469775475, "grad_norm": 1.7851215071283069, "learning_rate": 6.368668232726672e-05, "loss": 0.8941, "step": 1067 }, { "epoch": 1.8445595854922279, "grad_norm": 5.298181868350034, "learning_rate": 6.364779304741487e-05, "loss": 0.9224, "step": 1068 }, { "epoch": 1.846286701208981, "grad_norm": 211.75683122788405, "learning_rate": 6.360886937418338e-05, "loss": 8.1666, "step": 1069 }, { "epoch": 1.848013816925734, "grad_norm": 122.37900044818441, "learning_rate": 6.356991136418289e-05, "loss": 8.6457, "step": 1070 }, { "epoch": 1.849740932642487, "grad_norm": 59.39103776292803, "learning_rate": 6.353091907407396e-05, "loss": 7.947, "step": 1071 }, { "epoch": 1.85146804835924, "grad_norm": 20.736639888515533, "learning_rate": 6.349189256056707e-05, "loss": 7.4058, "step": 1072 }, { "epoch": 1.8531951640759932, "grad_norm": 15.729079486788306, "learning_rate": 6.345283188042242e-05, "loss": 6.9735, "step": 1073 }, { "epoch": 1.854922279792746, "grad_norm": 46.90018084120614, "learning_rate": 6.341373709044987e-05, "loss": 6.9557, "step": 1074 }, { "epoch": 1.8566493955094991, "grad_norm": 28.469670289420982, "learning_rate": 6.337460824750898e-05, "loss": 7.1966, "step": 1075 }, { "epoch": 1.8583765112262522, "grad_norm": 61.07657238199919, "learning_rate": 6.333544540850874e-05, "loss": 7.4539, "step": 1076 }, { "epoch": 1.8601036269430051, "grad_norm": 27.816406016488198, "learning_rate": 6.329624863040767e-05, "loss": 6.9617, "step": 1077 }, { "epoch": 1.8618307426597582, "grad_norm": 199.3657102066223, "learning_rate": 6.325701797021362e-05, "loss": 9.5767, "step": 1078 }, { "epoch": 1.8635578583765113, "grad_norm": 16.978362426779267, "learning_rate": 6.321775348498366e-05, "loss": 6.813, "step": 1079 }, { "epoch": 1.8652849740932642, "grad_norm": 10.327555284373343, "learning_rate": 6.317845523182416e-05, "loss": 6.4705, "step": 1080 }, { "epoch": 1.8670120898100173, "grad_norm": 31.66265661858456, "learning_rate": 6.313912326789052e-05, "loss": 6.6674, "step": 1081 }, { "epoch": 1.8687392055267704, "grad_norm": 12.45719658188136, "learning_rate": 6.30997576503872e-05, "loss": 6.4178, "step": 1082 }, { "epoch": 1.8704663212435233, "grad_norm": 13.501055216905542, "learning_rate": 6.306035843656761e-05, "loss": 6.5215, "step": 1083 }, { "epoch": 1.8721934369602762, "grad_norm": 9.02304058400973, "learning_rate": 6.302092568373401e-05, "loss": 6.4257, "step": 1084 }, { "epoch": 1.8739205526770295, "grad_norm": 34.60134522108773, "learning_rate": 6.298145944923744e-05, "loss": 6.5632, "step": 1085 }, { "epoch": 1.8756476683937824, "grad_norm": 38.20815229284678, "learning_rate": 6.294195979047766e-05, "loss": 6.7271, "step": 1086 }, { "epoch": 1.8773747841105353, "grad_norm": 10.885442619873794, "learning_rate": 6.290242676490301e-05, "loss": 6.3826, "step": 1087 }, { "epoch": 1.8791018998272886, "grad_norm": 11.817345739052305, "learning_rate": 6.286286043001037e-05, "loss": 6.4145, "step": 1088 }, { "epoch": 1.8808290155440415, "grad_norm": 13.046924929017933, "learning_rate": 6.282326084334507e-05, "loss": 6.4593, "step": 1089 }, { "epoch": 1.8825561312607944, "grad_norm": 6.338331190019349, "learning_rate": 6.27836280625008e-05, "loss": 6.3069, "step": 1090 }, { "epoch": 1.8842832469775475, "grad_norm": 16.35950157368765, "learning_rate": 6.274396214511951e-05, "loss": 6.3972, "step": 1091 }, { "epoch": 1.8860103626943006, "grad_norm": 13.390106240562476, "learning_rate": 6.270426314889138e-05, "loss": 6.3632, "step": 1092 }, { "epoch": 1.8877374784110534, "grad_norm": 6.4627195733410625, "learning_rate": 6.266453113155468e-05, "loss": 6.2984, "step": 1093 }, { "epoch": 1.8894645941278065, "grad_norm": 9.843826777935146, "learning_rate": 6.262476615089568e-05, "loss": 6.322, "step": 1094 }, { "epoch": 1.8911917098445596, "grad_norm": 9.81060384268441, "learning_rate": 6.258496826474865e-05, "loss": 6.3375, "step": 1095 }, { "epoch": 1.8929188255613125, "grad_norm": 13.171462801655053, "learning_rate": 6.254513753099567e-05, "loss": 6.316, "step": 1096 }, { "epoch": 1.8946459412780656, "grad_norm": 12.880294251429234, "learning_rate": 6.25052740075666e-05, "loss": 6.2558, "step": 1097 }, { "epoch": 1.8963730569948187, "grad_norm": 6.294026660110246, "learning_rate": 6.2465377752439e-05, "loss": 6.2474, "step": 1098 }, { "epoch": 1.8981001727115716, "grad_norm": 3.4733932556895515, "learning_rate": 6.242544882363804e-05, "loss": 6.2011, "step": 1099 }, { "epoch": 1.8998272884283247, "grad_norm": 8.908567381816141, "learning_rate": 6.238548727923642e-05, "loss": 6.1774, "step": 1100 }, { "epoch": 1.9015544041450778, "grad_norm": 15.26841991198491, "learning_rate": 6.234549317735423e-05, "loss": 6.4009, "step": 1101 }, { "epoch": 1.9032815198618307, "grad_norm": 5.669629098017729, "learning_rate": 6.230546657615897e-05, "loss": 6.2288, "step": 1102 }, { "epoch": 1.9050086355785838, "grad_norm": 7.94707643546723, "learning_rate": 6.226540753386535e-05, "loss": 6.0978, "step": 1103 }, { "epoch": 1.906735751295337, "grad_norm": 9.19948790213138, "learning_rate": 6.22253161087353e-05, "loss": 5.9008, "step": 1104 }, { "epoch": 1.9084628670120898, "grad_norm": 32.276211090348376, "learning_rate": 6.218519235907786e-05, "loss": 6.234, "step": 1105 }, { "epoch": 1.9101899827288429, "grad_norm": 5.612304208398534, "learning_rate": 6.214503634324904e-05, "loss": 5.8257, "step": 1106 }, { "epoch": 1.911917098445596, "grad_norm": 10.813399041229719, "learning_rate": 6.210484811965179e-05, "loss": 5.7711, "step": 1107 }, { "epoch": 1.9136442141623489, "grad_norm": 9.32179671210114, "learning_rate": 6.206462774673595e-05, "loss": 5.628, "step": 1108 }, { "epoch": 1.9153713298791017, "grad_norm": 9.061468226785466, "learning_rate": 6.202437528299804e-05, "loss": 5.5411, "step": 1109 }, { "epoch": 1.917098445595855, "grad_norm": 7.639338921239152, "learning_rate": 6.198409078698131e-05, "loss": 5.2594, "step": 1110 }, { "epoch": 1.918825561312608, "grad_norm": 7.0458559460050365, "learning_rate": 6.194377431727558e-05, "loss": 4.9591, "step": 1111 }, { "epoch": 1.9205526770293608, "grad_norm": 29.96096071918638, "learning_rate": 6.190342593251718e-05, "loss": 5.561, "step": 1112 }, { "epoch": 1.922279792746114, "grad_norm": 30.409007983688728, "learning_rate": 6.186304569138885e-05, "loss": 3.6032, "step": 1113 }, { "epoch": 1.924006908462867, "grad_norm": 203.70497772366468, "learning_rate": 6.182263365261967e-05, "loss": 5.7653, "step": 1114 }, { "epoch": 1.92573402417962, "grad_norm": 45.28986470635639, "learning_rate": 6.178218987498492e-05, "loss": 5.6128, "step": 1115 }, { "epoch": 1.927461139896373, "grad_norm": 11.353734859628759, "learning_rate": 6.174171441730612e-05, "loss": 2.2957, "step": 1116 }, { "epoch": 1.9291882556131261, "grad_norm": 8.936055049774392, "learning_rate": 6.170120733845082e-05, "loss": 1.6211, "step": 1117 }, { "epoch": 1.930915371329879, "grad_norm": 92.52181571867068, "learning_rate": 6.166066869733255e-05, "loss": 3.3184, "step": 1118 }, { "epoch": 1.932642487046632, "grad_norm": 4.611220536401277, "learning_rate": 6.162009855291078e-05, "loss": 1.3867, "step": 1119 }, { "epoch": 1.9343696027633852, "grad_norm": 5.362093903386963, "learning_rate": 6.157949696419076e-05, "loss": 1.3871, "step": 1120 }, { "epoch": 1.936096718480138, "grad_norm": 2.570479357229482, "learning_rate": 6.153886399022351e-05, "loss": 1.1035, "step": 1121 }, { "epoch": 1.9378238341968912, "grad_norm": 2.1817855288868944, "learning_rate": 6.149819969010568e-05, "loss": 1.0599, "step": 1122 }, { "epoch": 1.9395509499136443, "grad_norm": 1.7107597087662632, "learning_rate": 6.145750412297944e-05, "loss": 1.0172, "step": 1123 }, { "epoch": 1.9412780656303972, "grad_norm": 2.242358355897742, "learning_rate": 6.141677734803251e-05, "loss": 1.023, "step": 1124 }, { "epoch": 1.9430051813471503, "grad_norm": 0.9779438178679295, "learning_rate": 6.137601942449796e-05, "loss": 0.9851, "step": 1125 }, { "epoch": 1.9447322970639034, "grad_norm": 3.1174481677817916, "learning_rate": 6.133523041165416e-05, "loss": 1.0107, "step": 1126 }, { "epoch": 1.9464594127806563, "grad_norm": 2.300435259228025, "learning_rate": 6.12944103688247e-05, "loss": 1.0004, "step": 1127 }, { "epoch": 1.9481865284974094, "grad_norm": 1.8742026527706472, "learning_rate": 6.125355935537828e-05, "loss": 0.9691, "step": 1128 }, { "epoch": 1.9499136442141625, "grad_norm": 1.492615443577009, "learning_rate": 6.121267743072871e-05, "loss": 0.9895, "step": 1129 }, { "epoch": 1.9516407599309153, "grad_norm": 1.985813881361239, "learning_rate": 6.117176465433467e-05, "loss": 0.9597, "step": 1130 }, { "epoch": 1.9533678756476682, "grad_norm": 1.275813672191863, "learning_rate": 6.113082108569976e-05, "loss": 0.9612, "step": 1131 }, { "epoch": 1.9550949913644216, "grad_norm": 2.245540103791426, "learning_rate": 6.108984678437238e-05, "loss": 0.9888, "step": 1132 }, { "epoch": 1.9568221070811744, "grad_norm": 1.773545618907051, "learning_rate": 6.10488418099456e-05, "loss": 0.9653, "step": 1133 }, { "epoch": 1.9585492227979273, "grad_norm": 2.0120977939838705, "learning_rate": 6.100780622205709e-05, "loss": 0.9825, "step": 1134 }, { "epoch": 1.9602763385146806, "grad_norm": 1.7077847878395676, "learning_rate": 6.096674008038907e-05, "loss": 0.9479, "step": 1135 }, { "epoch": 1.9620034542314335, "grad_norm": 2.0793159484326917, "learning_rate": 6.0925643444668176e-05, "loss": 0.9588, "step": 1136 }, { "epoch": 1.9637305699481864, "grad_norm": 1.491435281609856, "learning_rate": 6.088451637466542e-05, "loss": 0.94, "step": 1137 }, { "epoch": 1.9654576856649395, "grad_norm": 1.9934160928342486, "learning_rate": 6.0843358930196064e-05, "loss": 0.9578, "step": 1138 }, { "epoch": 1.9671848013816926, "grad_norm": 1.9013132898456753, "learning_rate": 6.080217117111954e-05, "loss": 0.9374, "step": 1139 }, { "epoch": 1.9689119170984455, "grad_norm": 1.5339640762914935, "learning_rate": 6.0760953157339366e-05, "loss": 0.9568, "step": 1140 }, { "epoch": 1.9706390328151986, "grad_norm": 1.2311082896102425, "learning_rate": 6.07197049488031e-05, "loss": 0.9223, "step": 1141 }, { "epoch": 1.9723661485319517, "grad_norm": 1.642175722302105, "learning_rate": 6.067842660550216e-05, "loss": 0.9516, "step": 1142 }, { "epoch": 1.9740932642487046, "grad_norm": 1.18842688990186, "learning_rate": 6.063711818747183e-05, "loss": 0.9315, "step": 1143 }, { "epoch": 1.9758203799654577, "grad_norm": 1.6701394535903382, "learning_rate": 6.0595779754791137e-05, "loss": 0.9537, "step": 1144 }, { "epoch": 1.9775474956822108, "grad_norm": 1.2870993406570725, "learning_rate": 6.055441136758273e-05, "loss": 0.9435, "step": 1145 }, { "epoch": 1.9792746113989637, "grad_norm": 1.5540055028898736, "learning_rate": 6.051301308601285e-05, "loss": 0.9264, "step": 1146 }, { "epoch": 1.9810017271157168, "grad_norm": 1.2939791615121499, "learning_rate": 6.047158497029122e-05, "loss": 0.9328, "step": 1147 }, { "epoch": 1.9827288428324699, "grad_norm": 1.4021069035167129, "learning_rate": 6.0430127080670926e-05, "loss": 0.9432, "step": 1148 }, { "epoch": 1.9844559585492227, "grad_norm": 1.137038108932786, "learning_rate": 6.038863947744839e-05, "loss": 0.9227, "step": 1149 }, { "epoch": 1.9861830742659758, "grad_norm": 1.5629160017262265, "learning_rate": 6.034712222096321e-05, "loss": 0.9419, "step": 1150 }, { "epoch": 1.987910189982729, "grad_norm": 1.2606917724185387, "learning_rate": 6.030557537159816e-05, "loss": 0.9065, "step": 1151 }, { "epoch": 1.9896373056994818, "grad_norm": 1.38339406479116, "learning_rate": 6.026399898977902e-05, "loss": 0.9221, "step": 1152 }, { "epoch": 1.991364421416235, "grad_norm": 1.162910664543591, "learning_rate": 6.022239313597453e-05, "loss": 0.9202, "step": 1153 }, { "epoch": 1.993091537132988, "grad_norm": 1.2111555088685724, "learning_rate": 6.01807578706963e-05, "loss": 0.8998, "step": 1154 }, { "epoch": 1.994818652849741, "grad_norm": 0.956763662748825, "learning_rate": 6.01390932544987e-05, "loss": 0.9333, "step": 1155 }, { "epoch": 1.9965457685664938, "grad_norm": 1.4891512122736972, "learning_rate": 6.009739934797882e-05, "loss": 0.9148, "step": 1156 }, { "epoch": 1.9982728842832471, "grad_norm": 1.1273063945089474, "learning_rate": 6.005567621177632e-05, "loss": 0.9204, "step": 1157 }, { "epoch": 2.0, "grad_norm": 1.3522353048589386, "learning_rate": 6.001392390657336e-05, "loss": 0.9029, "step": 1158 }, { "epoch": 2.001727115716753, "grad_norm": 1.0049570462230286, "learning_rate": 5.997214249309458e-05, "loss": 0.8952, "step": 1159 }, { "epoch": 2.003454231433506, "grad_norm": 1.1825960978626104, "learning_rate": 5.9930332032106904e-05, "loss": 0.903, "step": 1160 }, { "epoch": 2.005181347150259, "grad_norm": 1.217788252599348, "learning_rate": 5.988849258441951e-05, "loss": 0.9134, "step": 1161 }, { "epoch": 2.006908462867012, "grad_norm": 1.331449077365896, "learning_rate": 5.9846624210883756e-05, "loss": 0.8878, "step": 1162 }, { "epoch": 2.0086355785837653, "grad_norm": 0.9794050709951746, "learning_rate": 5.980472697239305e-05, "loss": 0.9001, "step": 1163 }, { "epoch": 2.010362694300518, "grad_norm": 1.4866641120833683, "learning_rate": 5.976280092988279e-05, "loss": 0.9072, "step": 1164 }, { "epoch": 2.012089810017271, "grad_norm": 1.1703331369904586, "learning_rate": 5.9720846144330265e-05, "loss": 0.9002, "step": 1165 }, { "epoch": 2.0138169257340244, "grad_norm": 1.2693151805503857, "learning_rate": 5.967886267675456e-05, "loss": 0.9157, "step": 1166 }, { "epoch": 2.0155440414507773, "grad_norm": 0.9162067916438668, "learning_rate": 5.9636850588216495e-05, "loss": 0.8934, "step": 1167 }, { "epoch": 2.01727115716753, "grad_norm": 1.0992257615202221, "learning_rate": 5.9594809939818493e-05, "loss": 0.898, "step": 1168 }, { "epoch": 2.0189982728842835, "grad_norm": 0.9547730478879806, "learning_rate": 5.955274079270453e-05, "loss": 0.8953, "step": 1169 }, { "epoch": 2.0207253886010363, "grad_norm": 1.3550421522884735, "learning_rate": 5.9510643208060033e-05, "loss": 0.9118, "step": 1170 }, { "epoch": 2.0224525043177892, "grad_norm": 0.8812594349994677, "learning_rate": 5.946851724711177e-05, "loss": 0.8985, "step": 1171 }, { "epoch": 2.024179620034542, "grad_norm": 0.8829568305237805, "learning_rate": 5.942636297112779e-05, "loss": 0.881, "step": 1172 }, { "epoch": 2.0259067357512954, "grad_norm": 0.8113423735282355, "learning_rate": 5.938418044141733e-05, "loss": 0.9091, "step": 1173 }, { "epoch": 2.0276338514680483, "grad_norm": 0.767188035818685, "learning_rate": 5.934196971933071e-05, "loss": 0.9231, "step": 1174 }, { "epoch": 2.029360967184801, "grad_norm": 0.805035492939026, "learning_rate": 5.929973086625928e-05, "loss": 0.904, "step": 1175 }, { "epoch": 2.0310880829015545, "grad_norm": 0.9149229559834411, "learning_rate": 5.925746394363524e-05, "loss": 0.9115, "step": 1176 }, { "epoch": 2.0328151986183074, "grad_norm": 0.9443889454170518, "learning_rate": 5.921516901293169e-05, "loss": 0.9201, "step": 1177 }, { "epoch": 2.0345423143350603, "grad_norm": 1.1135408388295867, "learning_rate": 5.9172846135662406e-05, "loss": 0.8771, "step": 1178 }, { "epoch": 2.0362694300518136, "grad_norm": 1.1865339746414074, "learning_rate": 5.913049537338184e-05, "loss": 0.9026, "step": 1179 }, { "epoch": 2.0379965457685665, "grad_norm": 0.8432723007002008, "learning_rate": 5.9088116787685006e-05, "loss": 0.8927, "step": 1180 }, { "epoch": 2.0397236614853194, "grad_norm": 0.7563451950682866, "learning_rate": 5.904571044020735e-05, "loss": 0.8717, "step": 1181 }, { "epoch": 2.0414507772020727, "grad_norm": 0.6309379620378125, "learning_rate": 5.9003276392624734e-05, "loss": 0.8898, "step": 1182 }, { "epoch": 2.0431778929188256, "grad_norm": 0.7598185327490725, "learning_rate": 5.896081470665328e-05, "loss": 0.9007, "step": 1183 }, { "epoch": 2.0449050086355784, "grad_norm": 0.6751831522068205, "learning_rate": 5.8918325444049335e-05, "loss": 0.8754, "step": 1184 }, { "epoch": 2.0466321243523318, "grad_norm": 0.8322686453737627, "learning_rate": 5.887580866660931e-05, "loss": 0.9137, "step": 1185 }, { "epoch": 2.0483592400690847, "grad_norm": 0.6539854603178477, "learning_rate": 5.883326443616967e-05, "loss": 0.8964, "step": 1186 }, { "epoch": 2.0500863557858375, "grad_norm": 0.5950525780240083, "learning_rate": 5.87906928146068e-05, "loss": 0.8964, "step": 1187 }, { "epoch": 2.051813471502591, "grad_norm": 0.622584751772281, "learning_rate": 5.874809386383691e-05, "loss": 0.8916, "step": 1188 }, { "epoch": 2.0535405872193437, "grad_norm": 0.782819372477996, "learning_rate": 5.870546764581598e-05, "loss": 0.8991, "step": 1189 }, { "epoch": 2.0552677029360966, "grad_norm": 0.6607775254354489, "learning_rate": 5.8662814222539626e-05, "loss": 0.91, "step": 1190 }, { "epoch": 2.05699481865285, "grad_norm": 1.0171168776212367, "learning_rate": 5.862013365604304e-05, "loss": 0.8939, "step": 1191 }, { "epoch": 2.058721934369603, "grad_norm": 1.5040760920173983, "learning_rate": 5.8577426008400904e-05, "loss": 0.8884, "step": 1192 }, { "epoch": 2.0604490500863557, "grad_norm": 0.49014512179203623, "learning_rate": 5.8534691341727246e-05, "loss": 0.8909, "step": 1193 }, { "epoch": 2.062176165803109, "grad_norm": 0.7641555084106332, "learning_rate": 5.849192971817544e-05, "loss": 0.8939, "step": 1194 }, { "epoch": 2.063903281519862, "grad_norm": 1.6667701163661883, "learning_rate": 5.844914119993805e-05, "loss": 0.9197, "step": 1195 }, { "epoch": 2.065630397236615, "grad_norm": 0.6803426705923122, "learning_rate": 5.8406325849246724e-05, "loss": 0.8941, "step": 1196 }, { "epoch": 2.0673575129533677, "grad_norm": 0.9114166798796041, "learning_rate": 5.836348372837219e-05, "loss": 0.8691, "step": 1197 }, { "epoch": 2.069084628670121, "grad_norm": 1.9633505540373426, "learning_rate": 5.8320614899624054e-05, "loss": 0.9011, "step": 1198 }, { "epoch": 2.070811744386874, "grad_norm": 0.8689194792097363, "learning_rate": 5.827771942535082e-05, "loss": 0.8979, "step": 1199 }, { "epoch": 2.0725388601036268, "grad_norm": 3.06794420010953, "learning_rate": 5.823479736793971e-05, "loss": 0.8923, "step": 1200 }, { "epoch": 2.07426597582038, "grad_norm": 2.624015954185402, "learning_rate": 5.819184878981661e-05, "loss": 0.9063, "step": 1201 }, { "epoch": 2.075993091537133, "grad_norm": 1.6312158506190466, "learning_rate": 5.814887375344599e-05, "loss": 0.9176, "step": 1202 }, { "epoch": 2.077720207253886, "grad_norm": 1.5027986840242251, "learning_rate": 5.8105872321330804e-05, "loss": 0.8791, "step": 1203 }, { "epoch": 2.079447322970639, "grad_norm": 1.5454670281912886, "learning_rate": 5.806284455601238e-05, "loss": 0.8852, "step": 1204 }, { "epoch": 2.081174438687392, "grad_norm": 1.27227961945723, "learning_rate": 5.801979052007035e-05, "loss": 0.8838, "step": 1205 }, { "epoch": 2.082901554404145, "grad_norm": 1.1782549709601444, "learning_rate": 5.7976710276122574e-05, "loss": 0.9047, "step": 1206 }, { "epoch": 2.0846286701208983, "grad_norm": 1.838995009752971, "learning_rate": 5.793360388682498e-05, "loss": 0.8919, "step": 1207 }, { "epoch": 2.086355785837651, "grad_norm": 1.167930180809214, "learning_rate": 5.7890471414871606e-05, "loss": 0.8979, "step": 1208 }, { "epoch": 2.088082901554404, "grad_norm": 2.670019578345723, "learning_rate": 5.7847312922994324e-05, "loss": 0.8919, "step": 1209 }, { "epoch": 2.0898100172711573, "grad_norm": 2.4535938745351875, "learning_rate": 5.780412847396292e-05, "loss": 0.9052, "step": 1210 }, { "epoch": 2.09153713298791, "grad_norm": 1.6520145332439882, "learning_rate": 5.7760918130584895e-05, "loss": 0.9018, "step": 1211 }, { "epoch": 2.093264248704663, "grad_norm": 1.542785443990216, "learning_rate": 5.771768195570545e-05, "loss": 0.8923, "step": 1212 }, { "epoch": 2.0949913644214164, "grad_norm": 1.7091558303488987, "learning_rate": 5.767442001220732e-05, "loss": 0.8759, "step": 1213 }, { "epoch": 2.0967184801381693, "grad_norm": 1.1192982563145495, "learning_rate": 5.763113236301072e-05, "loss": 0.8926, "step": 1214 }, { "epoch": 2.098445595854922, "grad_norm": 2.424266192030652, "learning_rate": 5.758781907107329e-05, "loss": 0.9136, "step": 1215 }, { "epoch": 2.1001727115716755, "grad_norm": 1.9342912941228736, "learning_rate": 5.75444801993899e-05, "loss": 0.9244, "step": 1216 }, { "epoch": 2.1018998272884284, "grad_norm": 1.832938790758654, "learning_rate": 5.7501115810992676e-05, "loss": 0.8831, "step": 1217 }, { "epoch": 2.1036269430051813, "grad_norm": 1.6219025617526985, "learning_rate": 5.7457725968950843e-05, "loss": 0.873, "step": 1218 }, { "epoch": 2.105354058721934, "grad_norm": 1.8719774748830924, "learning_rate": 5.7414310736370625e-05, "loss": 0.9049, "step": 1219 }, { "epoch": 2.1070811744386875, "grad_norm": 1.560356879693053, "learning_rate": 5.7370870176395216e-05, "loss": 0.9106, "step": 1220 }, { "epoch": 2.1088082901554404, "grad_norm": 2.031127836340723, "learning_rate": 5.732740435220459e-05, "loss": 0.9064, "step": 1221 }, { "epoch": 2.1105354058721932, "grad_norm": 1.8089294302707364, "learning_rate": 5.7283913327015526e-05, "loss": 0.9045, "step": 1222 }, { "epoch": 2.1122625215889466, "grad_norm": 1.5744395519107228, "learning_rate": 5.724039716408139e-05, "loss": 0.9015, "step": 1223 }, { "epoch": 2.1139896373056994, "grad_norm": 1.3776643741434778, "learning_rate": 5.7196855926692186e-05, "loss": 0.8978, "step": 1224 }, { "epoch": 2.1157167530224523, "grad_norm": 1.9391966312407842, "learning_rate": 5.7153289678174304e-05, "loss": 0.8957, "step": 1225 }, { "epoch": 2.1174438687392056, "grad_norm": 1.570472112929359, "learning_rate": 5.710969848189058e-05, "loss": 0.8974, "step": 1226 }, { "epoch": 2.1191709844559585, "grad_norm": 1.9616576169126057, "learning_rate": 5.7066082401240086e-05, "loss": 0.873, "step": 1227 }, { "epoch": 2.1208981001727114, "grad_norm": 1.8499202244254813, "learning_rate": 5.7022441499658105e-05, "loss": 0.8952, "step": 1228 }, { "epoch": 2.1226252158894647, "grad_norm": 1.5173590603641003, "learning_rate": 5.6978775840616024e-05, "loss": 0.8906, "step": 1229 }, { "epoch": 2.1243523316062176, "grad_norm": 1.3340540468628683, "learning_rate": 5.693508548762124e-05, "loss": 0.903, "step": 1230 }, { "epoch": 2.1260794473229705, "grad_norm": 1.6457967510021936, "learning_rate": 5.689137050421704e-05, "loss": 0.8971, "step": 1231 }, { "epoch": 2.127806563039724, "grad_norm": 1.3198556197677382, "learning_rate": 5.684763095398256e-05, "loss": 0.919, "step": 1232 }, { "epoch": 2.1295336787564767, "grad_norm": 1.9647974203665666, "learning_rate": 5.680386690053266e-05, "loss": 0.8822, "step": 1233 }, { "epoch": 2.1312607944732296, "grad_norm": 1.7037604583488621, "learning_rate": 5.6760078407517816e-05, "loss": 0.8975, "step": 1234 }, { "epoch": 2.132987910189983, "grad_norm": 1.5965999555159363, "learning_rate": 5.6716265538624086e-05, "loss": 0.8977, "step": 1235 }, { "epoch": 2.134715025906736, "grad_norm": 1.4272450746636147, "learning_rate": 5.6672428357572966e-05, "loss": 0.89, "step": 1236 }, { "epoch": 2.1364421416234887, "grad_norm": 1.5951116492290847, "learning_rate": 5.662856692812128e-05, "loss": 0.8778, "step": 1237 }, { "epoch": 2.138169257340242, "grad_norm": 1.2595523077420594, "learning_rate": 5.658468131406117e-05, "loss": 0.896, "step": 1238 }, { "epoch": 2.139896373056995, "grad_norm": 1.7821581317393707, "learning_rate": 5.6540771579219906e-05, "loss": 0.9034, "step": 1239 }, { "epoch": 2.1416234887737478, "grad_norm": 1.4430494168837535, "learning_rate": 5.649683778745988e-05, "loss": 0.9079, "step": 1240 }, { "epoch": 2.143350604490501, "grad_norm": 2.2182602350599905, "learning_rate": 5.645288000267845e-05, "loss": 0.9142, "step": 1241 }, { "epoch": 2.145077720207254, "grad_norm": 2.2594056346971056, "learning_rate": 5.640889828880786e-05, "loss": 0.9386, "step": 1242 }, { "epoch": 2.146804835924007, "grad_norm": 0.7689910822729052, "learning_rate": 5.6364892709815186e-05, "loss": 0.9099, "step": 1243 }, { "epoch": 2.14853195164076, "grad_norm": 1.9497774034312745, "learning_rate": 5.6320863329702184e-05, "loss": 0.9207, "step": 1244 }, { "epoch": 2.150259067357513, "grad_norm": 1.4945437945888622, "learning_rate": 5.627681021250524e-05, "loss": 0.9027, "step": 1245 }, { "epoch": 2.151986183074266, "grad_norm": 1.587542836541107, "learning_rate": 5.623273342229529e-05, "loss": 0.898, "step": 1246 }, { "epoch": 2.153713298791019, "grad_norm": 1.597300154407086, "learning_rate": 5.6188633023177637e-05, "loss": 0.8942, "step": 1247 }, { "epoch": 2.155440414507772, "grad_norm": 1.076314249140763, "learning_rate": 5.614450907929195e-05, "loss": 0.8962, "step": 1248 }, { "epoch": 2.157167530224525, "grad_norm": 1.0700183378458576, "learning_rate": 5.610036165481219e-05, "loss": 0.8914, "step": 1249 }, { "epoch": 2.158894645941278, "grad_norm": 1.1086092470778206, "learning_rate": 5.60561908139464e-05, "loss": 0.9109, "step": 1250 }, { "epoch": 2.160621761658031, "grad_norm": 0.9603331195148542, "learning_rate": 5.601199662093671e-05, "loss": 0.9122, "step": 1251 }, { "epoch": 2.162348877374784, "grad_norm": 0.9703017005277597, "learning_rate": 5.59677791400592e-05, "loss": 0.8909, "step": 1252 }, { "epoch": 2.164075993091537, "grad_norm": 0.7941276539095923, "learning_rate": 5.592353843562384e-05, "loss": 0.8974, "step": 1253 }, { "epoch": 2.1658031088082903, "grad_norm": 0.8736802367418115, "learning_rate": 5.587927457197437e-05, "loss": 0.9131, "step": 1254 }, { "epoch": 2.167530224525043, "grad_norm": 0.6889098892716318, "learning_rate": 5.58349876134882e-05, "loss": 0.906, "step": 1255 }, { "epoch": 2.169257340241796, "grad_norm": 0.8284335666876645, "learning_rate": 5.579067762457634e-05, "loss": 0.8844, "step": 1256 }, { "epoch": 2.1709844559585494, "grad_norm": 0.6265772310434512, "learning_rate": 5.5746344669683275e-05, "loss": 0.9038, "step": 1257 }, { "epoch": 2.1727115716753023, "grad_norm": 0.7857876991993661, "learning_rate": 5.5701988813286935e-05, "loss": 0.913, "step": 1258 }, { "epoch": 2.174438687392055, "grad_norm": 0.6661065417853896, "learning_rate": 5.565761011989853e-05, "loss": 0.899, "step": 1259 }, { "epoch": 2.1761658031088085, "grad_norm": 0.5381595950904419, "learning_rate": 5.5613208654062484e-05, "loss": 0.9025, "step": 1260 }, { "epoch": 2.1778929188255614, "grad_norm": 0.7579841378181967, "learning_rate": 5.556878448035634e-05, "loss": 0.8951, "step": 1261 }, { "epoch": 2.1796200345423142, "grad_norm": 0.5058789447974622, "learning_rate": 5.552433766339067e-05, "loss": 0.9038, "step": 1262 }, { "epoch": 2.1813471502590676, "grad_norm": 0.6967608309717245, "learning_rate": 5.5479868267808985e-05, "loss": 0.8945, "step": 1263 }, { "epoch": 2.1830742659758204, "grad_norm": 0.6578513277593679, "learning_rate": 5.5435376358287634e-05, "loss": 0.8845, "step": 1264 }, { "epoch": 2.1848013816925733, "grad_norm": 0.6350616873448576, "learning_rate": 5.539086199953568e-05, "loss": 0.8778, "step": 1265 }, { "epoch": 2.186528497409326, "grad_norm": 0.5814676434498084, "learning_rate": 5.53463252562949e-05, "loss": 0.8971, "step": 1266 }, { "epoch": 2.1882556131260795, "grad_norm": 0.5177643461451024, "learning_rate": 5.530176619333956e-05, "loss": 0.8863, "step": 1267 }, { "epoch": 2.1899827288428324, "grad_norm": 0.5189225069189292, "learning_rate": 5.525718487547642e-05, "loss": 0.887, "step": 1268 }, { "epoch": 2.1917098445595853, "grad_norm": 0.46382837439851465, "learning_rate": 5.521258136754462e-05, "loss": 0.8974, "step": 1269 }, { "epoch": 2.1934369602763386, "grad_norm": 0.5085436872862359, "learning_rate": 5.516795573441554e-05, "loss": 0.891, "step": 1270 }, { "epoch": 2.1951640759930915, "grad_norm": 0.5997629179112026, "learning_rate": 5.5123308040992766e-05, "loss": 0.9192, "step": 1271 }, { "epoch": 2.1968911917098444, "grad_norm": 0.6809111036457908, "learning_rate": 5.507863835221197e-05, "loss": 0.8893, "step": 1272 }, { "epoch": 2.1986183074265977, "grad_norm": 0.815514226183851, "learning_rate": 5.503394673304078e-05, "loss": 0.8892, "step": 1273 }, { "epoch": 2.2003454231433506, "grad_norm": 0.8312864499668938, "learning_rate": 5.498923324847876e-05, "loss": 0.9102, "step": 1274 }, { "epoch": 2.2020725388601035, "grad_norm": 1.036969686895358, "learning_rate": 5.494449796355724e-05, "loss": 0.8767, "step": 1275 }, { "epoch": 2.203799654576857, "grad_norm": 0.9280163869057828, "learning_rate": 5.489974094333928e-05, "loss": 0.8616, "step": 1276 }, { "epoch": 2.2055267702936097, "grad_norm": 0.7083552918409979, "learning_rate": 5.485496225291956e-05, "loss": 0.8729, "step": 1277 }, { "epoch": 2.2072538860103625, "grad_norm": 0.5418075775883058, "learning_rate": 5.481016195742425e-05, "loss": 0.8871, "step": 1278 }, { "epoch": 2.208981001727116, "grad_norm": 0.4440929323378759, "learning_rate": 5.476534012201095e-05, "loss": 0.9029, "step": 1279 }, { "epoch": 2.2107081174438687, "grad_norm": 0.5255303691450817, "learning_rate": 5.472049681186862e-05, "loss": 0.912, "step": 1280 }, { "epoch": 2.2124352331606216, "grad_norm": 0.7004437516674827, "learning_rate": 5.4675632092217395e-05, "loss": 0.9202, "step": 1281 }, { "epoch": 2.214162348877375, "grad_norm": 0.9328341210685445, "learning_rate": 5.463074602830859e-05, "loss": 0.9056, "step": 1282 }, { "epoch": 2.215889464594128, "grad_norm": 1.188894604402772, "learning_rate": 5.458583868542456e-05, "loss": 0.8795, "step": 1283 }, { "epoch": 2.2176165803108807, "grad_norm": 0.7732466300564551, "learning_rate": 5.454091012887859e-05, "loss": 0.9061, "step": 1284 }, { "epoch": 2.219343696027634, "grad_norm": 0.4206897733497837, "learning_rate": 5.449596042401483e-05, "loss": 0.8907, "step": 1285 }, { "epoch": 2.221070811744387, "grad_norm": 0.46347137152525664, "learning_rate": 5.4450989636208196e-05, "loss": 0.8819, "step": 1286 }, { "epoch": 2.22279792746114, "grad_norm": 0.8086859581809959, "learning_rate": 5.440599783086426e-05, "loss": 0.891, "step": 1287 }, { "epoch": 2.224525043177893, "grad_norm": 1.0619829465514334, "learning_rate": 5.4360985073419145e-05, "loss": 0.8862, "step": 1288 }, { "epoch": 2.226252158894646, "grad_norm": 1.1248264539680048, "learning_rate": 5.4315951429339465e-05, "loss": 0.8872, "step": 1289 }, { "epoch": 2.227979274611399, "grad_norm": 0.8152817118782381, "learning_rate": 5.427089696412221e-05, "loss": 0.895, "step": 1290 }, { "epoch": 2.229706390328152, "grad_norm": 0.4996130654000678, "learning_rate": 5.4225821743294656e-05, "loss": 0.9091, "step": 1291 }, { "epoch": 2.231433506044905, "grad_norm": 0.535065472473171, "learning_rate": 5.418072583241425e-05, "loss": 0.8704, "step": 1292 }, { "epoch": 2.233160621761658, "grad_norm": 1.0340962849127904, "learning_rate": 5.4135609297068544e-05, "loss": 0.8919, "step": 1293 }, { "epoch": 2.234887737478411, "grad_norm": 1.3221673352250336, "learning_rate": 5.4090472202875094e-05, "loss": 0.8954, "step": 1294 }, { "epoch": 2.236614853195164, "grad_norm": 0.4633046407815879, "learning_rate": 5.404531461548133e-05, "loss": 0.8831, "step": 1295 }, { "epoch": 2.238341968911917, "grad_norm": 0.6021414044558655, "learning_rate": 5.40001366005645e-05, "loss": 0.8696, "step": 1296 }, { "epoch": 2.24006908462867, "grad_norm": 0.9855957222822156, "learning_rate": 5.3954938223831605e-05, "loss": 0.9019, "step": 1297 }, { "epoch": 2.2417962003454233, "grad_norm": 1.3108997189119003, "learning_rate": 5.3909719551019166e-05, "loss": 0.9135, "step": 1298 }, { "epoch": 2.243523316062176, "grad_norm": 0.5369617446214313, "learning_rate": 5.386448064789331e-05, "loss": 0.8819, "step": 1299 }, { "epoch": 2.245250431778929, "grad_norm": 0.6660594533836443, "learning_rate": 5.3819221580249545e-05, "loss": 0.9012, "step": 1300 }, { "epoch": 2.2469775474956823, "grad_norm": 1.342258138170124, "learning_rate": 5.377394241391272e-05, "loss": 0.8809, "step": 1301 }, { "epoch": 2.2487046632124352, "grad_norm": 0.6998925785343229, "learning_rate": 5.372864321473691e-05, "loss": 0.8871, "step": 1302 }, { "epoch": 2.250431778929188, "grad_norm": 0.8476869224246562, "learning_rate": 5.368332404860532e-05, "loss": 0.8848, "step": 1303 }, { "epoch": 2.2521588946459414, "grad_norm": 1.2713734136292694, "learning_rate": 5.3637984981430195e-05, "loss": 0.8848, "step": 1304 }, { "epoch": 2.2538860103626943, "grad_norm": 0.6779569168500268, "learning_rate": 5.359262607915275e-05, "loss": 0.8921, "step": 1305 }, { "epoch": 2.255613126079447, "grad_norm": 0.9302857224534884, "learning_rate": 5.354724740774302e-05, "loss": 0.916, "step": 1306 }, { "epoch": 2.2573402417962005, "grad_norm": 1.0593241812815748, "learning_rate": 5.3501849033199806e-05, "loss": 0.9133, "step": 1307 }, { "epoch": 2.2590673575129534, "grad_norm": 0.6784835381613379, "learning_rate": 5.3456431021550555e-05, "loss": 0.8994, "step": 1308 }, { "epoch": 2.2607944732297063, "grad_norm": 0.513399645450354, "learning_rate": 5.341099343885127e-05, "loss": 0.8966, "step": 1309 }, { "epoch": 2.2625215889464596, "grad_norm": 0.6615461595452876, "learning_rate": 5.336553635118645e-05, "loss": 0.8919, "step": 1310 }, { "epoch": 2.2642487046632125, "grad_norm": 0.7284747145896842, "learning_rate": 5.332005982466892e-05, "loss": 0.8842, "step": 1311 }, { "epoch": 2.2659758203799654, "grad_norm": 0.5776624299114389, "learning_rate": 5.3274563925439804e-05, "loss": 0.9068, "step": 1312 }, { "epoch": 2.2677029360967182, "grad_norm": 0.43318815585962783, "learning_rate": 5.32290487196684e-05, "loss": 0.8955, "step": 1313 }, { "epoch": 2.2694300518134716, "grad_norm": 0.6473109484756772, "learning_rate": 5.318351427355208e-05, "loss": 0.8926, "step": 1314 }, { "epoch": 2.2711571675302245, "grad_norm": 0.72144683328685, "learning_rate": 5.313796065331619e-05, "loss": 0.9019, "step": 1315 }, { "epoch": 2.2728842832469773, "grad_norm": 0.5947906532532024, "learning_rate": 5.309238792521397e-05, "loss": 0.9024, "step": 1316 }, { "epoch": 2.2746113989637307, "grad_norm": 0.4807554142340778, "learning_rate": 5.304679615552647e-05, "loss": 0.8972, "step": 1317 }, { "epoch": 2.2763385146804835, "grad_norm": 0.5889131261162772, "learning_rate": 5.300118541056239e-05, "loss": 0.8972, "step": 1318 }, { "epoch": 2.2780656303972364, "grad_norm": 0.5548479892843673, "learning_rate": 5.295555575665807e-05, "loss": 0.8928, "step": 1319 }, { "epoch": 2.2797927461139897, "grad_norm": 0.39647487995859904, "learning_rate": 5.290990726017735e-05, "loss": 0.9, "step": 1320 }, { "epoch": 2.2815198618307426, "grad_norm": 0.4270599874506341, "learning_rate": 5.286423998751144e-05, "loss": 0.9043, "step": 1321 }, { "epoch": 2.2832469775474955, "grad_norm": 0.4952094446816467, "learning_rate": 5.281855400507887e-05, "loss": 0.9003, "step": 1322 }, { "epoch": 2.284974093264249, "grad_norm": 0.3895198638883216, "learning_rate": 5.277284937932543e-05, "loss": 0.8983, "step": 1323 }, { "epoch": 2.2867012089810017, "grad_norm": 0.5220372871318583, "learning_rate": 5.272712617672393e-05, "loss": 0.886, "step": 1324 }, { "epoch": 2.2884283246977546, "grad_norm": 0.49313492713628165, "learning_rate": 5.26813844637743e-05, "loss": 0.8733, "step": 1325 }, { "epoch": 2.290155440414508, "grad_norm": 0.6614934706001547, "learning_rate": 5.263562430700331e-05, "loss": 0.8999, "step": 1326 }, { "epoch": 2.291882556131261, "grad_norm": 0.7816319979695506, "learning_rate": 5.2589845772964604e-05, "loss": 0.9171, "step": 1327 }, { "epoch": 2.2936096718480137, "grad_norm": 0.993707202711101, "learning_rate": 5.254404892823855e-05, "loss": 0.9044, "step": 1328 }, { "epoch": 2.295336787564767, "grad_norm": 1.282461008555753, "learning_rate": 5.249823383943212e-05, "loss": 0.894, "step": 1329 }, { "epoch": 2.29706390328152, "grad_norm": 0.6967734645567908, "learning_rate": 5.245240057317884e-05, "loss": 0.8927, "step": 1330 }, { "epoch": 2.2987910189982728, "grad_norm": 0.4602702541376467, "learning_rate": 5.2406549196138666e-05, "loss": 0.8754, "step": 1331 }, { "epoch": 2.300518134715026, "grad_norm": 0.9235597337740481, "learning_rate": 5.23606797749979e-05, "loss": 0.9194, "step": 1332 }, { "epoch": 2.302245250431779, "grad_norm": 1.526151422655988, "learning_rate": 5.23147923764691e-05, "loss": 0.9063, "step": 1333 }, { "epoch": 2.303972366148532, "grad_norm": 0.545234401328189, "learning_rate": 5.2268887067290935e-05, "loss": 0.9182, "step": 1334 }, { "epoch": 2.305699481865285, "grad_norm": 1.67576767906797, "learning_rate": 5.222296391422815e-05, "loss": 0.8965, "step": 1335 }, { "epoch": 2.307426597582038, "grad_norm": 0.6981344743188987, "learning_rate": 5.217702298407144e-05, "loss": 0.8908, "step": 1336 }, { "epoch": 2.309153713298791, "grad_norm": 1.2420420927526734, "learning_rate": 5.213106434363734e-05, "loss": 0.8896, "step": 1337 }, { "epoch": 2.3108808290155443, "grad_norm": 1.4668710165203815, "learning_rate": 5.2085088059768164e-05, "loss": 0.9128, "step": 1338 }, { "epoch": 2.312607944732297, "grad_norm": 0.6560303712788188, "learning_rate": 5.203909419933188e-05, "loss": 0.8772, "step": 1339 }, { "epoch": 2.31433506044905, "grad_norm": 1.4604234309230653, "learning_rate": 5.1993082829222e-05, "loss": 0.8999, "step": 1340 }, { "epoch": 2.3160621761658033, "grad_norm": 0.9235199023596196, "learning_rate": 5.1947054016357545e-05, "loss": 0.9024, "step": 1341 }, { "epoch": 2.3177892918825562, "grad_norm": 1.2127266868781594, "learning_rate": 5.190100782768285e-05, "loss": 0.9042, "step": 1342 }, { "epoch": 2.319516407599309, "grad_norm": 0.7080773517222125, "learning_rate": 5.1854944330167565e-05, "loss": 0.8983, "step": 1343 }, { "epoch": 2.321243523316062, "grad_norm": 0.9751802292413352, "learning_rate": 5.180886359080649e-05, "loss": 0.8747, "step": 1344 }, { "epoch": 2.3229706390328153, "grad_norm": 1.3593221868676448, "learning_rate": 5.1762765676619516e-05, "loss": 0.8826, "step": 1345 }, { "epoch": 2.324697754749568, "grad_norm": 0.6395588355822046, "learning_rate": 5.17166506546515e-05, "loss": 0.9066, "step": 1346 }, { "epoch": 2.326424870466321, "grad_norm": 1.1466726221525498, "learning_rate": 5.167051859197219e-05, "loss": 0.8995, "step": 1347 }, { "epoch": 2.3281519861830744, "grad_norm": 0.969843087109157, "learning_rate": 5.1624369555676126e-05, "loss": 0.8809, "step": 1348 }, { "epoch": 2.3298791018998273, "grad_norm": 1.3668535779473672, "learning_rate": 5.15782036128825e-05, "loss": 0.8946, "step": 1349 }, { "epoch": 2.33160621761658, "grad_norm": 0.5849818469003083, "learning_rate": 5.1532020830735134e-05, "loss": 0.9141, "step": 1350 }, { "epoch": 2.3333333333333335, "grad_norm": 1.0479881837491836, "learning_rate": 5.148582127640233e-05, "loss": 0.8907, "step": 1351 }, { "epoch": 2.3350604490500864, "grad_norm": 1.5356178828364688, "learning_rate": 5.143960501707677e-05, "loss": 0.8998, "step": 1352 }, { "epoch": 2.3367875647668392, "grad_norm": 0.7719514646518337, "learning_rate": 5.1393372119975425e-05, "loss": 0.8796, "step": 1353 }, { "epoch": 2.3385146804835926, "grad_norm": 1.5735142858501334, "learning_rate": 5.134712265233949e-05, "loss": 0.8945, "step": 1354 }, { "epoch": 2.3402417962003454, "grad_norm": 1.1775961640677473, "learning_rate": 5.1300856681434254e-05, "loss": 0.9094, "step": 1355 }, { "epoch": 2.3419689119170983, "grad_norm": 1.1106791760511592, "learning_rate": 5.125457427454901e-05, "loss": 0.8684, "step": 1356 }, { "epoch": 2.3436960276338517, "grad_norm": 1.1899751403051697, "learning_rate": 5.1208275498996915e-05, "loss": 0.8811, "step": 1357 }, { "epoch": 2.3454231433506045, "grad_norm": 1.2499374943309873, "learning_rate": 5.1161960422115e-05, "loss": 0.9063, "step": 1358 }, { "epoch": 2.3471502590673574, "grad_norm": 0.6154904698271002, "learning_rate": 5.1115629111263946e-05, "loss": 0.904, "step": 1359 }, { "epoch": 2.3488773747841103, "grad_norm": 1.215520579379347, "learning_rate": 5.1069281633828084e-05, "loss": 0.9089, "step": 1360 }, { "epoch": 2.3506044905008636, "grad_norm": 0.7429016078567549, "learning_rate": 5.1022918057215244e-05, "loss": 0.908, "step": 1361 }, { "epoch": 2.3523316062176165, "grad_norm": 0.9112195120379906, "learning_rate": 5.097653844885665e-05, "loss": 0.8914, "step": 1362 }, { "epoch": 2.3540587219343694, "grad_norm": 0.6034223588414739, "learning_rate": 5.0930142876206893e-05, "loss": 0.8886, "step": 1363 }, { "epoch": 2.3557858376511227, "grad_norm": 0.6987164148294293, "learning_rate": 5.088373140674373e-05, "loss": 0.8865, "step": 1364 }, { "epoch": 2.3575129533678756, "grad_norm": 0.7083891080264018, "learning_rate": 5.083730410796805e-05, "loss": 0.8932, "step": 1365 }, { "epoch": 2.3592400690846285, "grad_norm": 0.8452589790064973, "learning_rate": 5.079086104740381e-05, "loss": 0.8728, "step": 1366 }, { "epoch": 2.360967184801382, "grad_norm": 0.7566928471387463, "learning_rate": 5.074440229259782e-05, "loss": 0.8972, "step": 1367 }, { "epoch": 2.3626943005181347, "grad_norm": 0.7447568989532469, "learning_rate": 5.069792791111978e-05, "loss": 0.9092, "step": 1368 }, { "epoch": 2.3644214162348876, "grad_norm": 0.694294016212242, "learning_rate": 5.065143797056205e-05, "loss": 0.8723, "step": 1369 }, { "epoch": 2.366148531951641, "grad_norm": 0.4930578099386284, "learning_rate": 5.060493253853969e-05, "loss": 0.8978, "step": 1370 }, { "epoch": 2.3678756476683938, "grad_norm": 0.6762186111834759, "learning_rate": 5.0558411682690257e-05, "loss": 0.9005, "step": 1371 }, { "epoch": 2.3696027633851466, "grad_norm": 0.585620908165173, "learning_rate": 5.0511875470673717e-05, "loss": 0.8912, "step": 1372 }, { "epoch": 2.3713298791019, "grad_norm": 0.5987229872014909, "learning_rate": 5.0465323970172424e-05, "loss": 0.8713, "step": 1373 }, { "epoch": 2.373056994818653, "grad_norm": 0.5449484141502042, "learning_rate": 5.041875724889092e-05, "loss": 0.8965, "step": 1374 }, { "epoch": 2.3747841105354057, "grad_norm": 0.4615434768497105, "learning_rate": 5.037217537455591e-05, "loss": 0.8921, "step": 1375 }, { "epoch": 2.376511226252159, "grad_norm": 0.6135411097174418, "learning_rate": 5.032557841491613e-05, "loss": 0.8933, "step": 1376 }, { "epoch": 2.378238341968912, "grad_norm": 0.5727939455689288, "learning_rate": 5.0278966437742254e-05, "loss": 0.891, "step": 1377 }, { "epoch": 2.379965457685665, "grad_norm": 0.727365967192097, "learning_rate": 5.023233951082679e-05, "loss": 0.8942, "step": 1378 }, { "epoch": 2.381692573402418, "grad_norm": 0.7750169619466123, "learning_rate": 5.0185697701984025e-05, "loss": 0.8821, "step": 1379 }, { "epoch": 2.383419689119171, "grad_norm": 0.47404884967279837, "learning_rate": 5.013904107904983e-05, "loss": 0.9005, "step": 1380 }, { "epoch": 2.385146804835924, "grad_norm": 0.6755383929969406, "learning_rate": 5.009236970988168e-05, "loss": 0.9363, "step": 1381 }, { "epoch": 2.386873920552677, "grad_norm": 0.5824233444453087, "learning_rate": 5.004568366235846e-05, "loss": 0.899, "step": 1382 }, { "epoch": 2.38860103626943, "grad_norm": 0.386160106355466, "learning_rate": 4.999898300438042e-05, "loss": 0.8969, "step": 1383 }, { "epoch": 2.390328151986183, "grad_norm": 0.6896742022406954, "learning_rate": 4.9952267803869066e-05, "loss": 0.8854, "step": 1384 }, { "epoch": 2.3920552677029363, "grad_norm": 0.7230217844411135, "learning_rate": 4.9905538128767015e-05, "loss": 0.8669, "step": 1385 }, { "epoch": 2.393782383419689, "grad_norm": 0.8606682123351437, "learning_rate": 4.9858794047038004e-05, "loss": 0.8771, "step": 1386 }, { "epoch": 2.395509499136442, "grad_norm": 0.6788783097321082, "learning_rate": 4.9812035626666646e-05, "loss": 0.8775, "step": 1387 }, { "epoch": 2.3972366148531954, "grad_norm": 0.38651994631652253, "learning_rate": 4.976526293565846e-05, "loss": 0.9108, "step": 1388 }, { "epoch": 2.3989637305699483, "grad_norm": 0.47226066111033305, "learning_rate": 4.971847604203972e-05, "loss": 0.8963, "step": 1389 }, { "epoch": 2.400690846286701, "grad_norm": 0.46272667693173614, "learning_rate": 4.9671675013857316e-05, "loss": 0.8844, "step": 1390 }, { "epoch": 2.4024179620034545, "grad_norm": 0.43735876488590136, "learning_rate": 4.962485991917874e-05, "loss": 0.88, "step": 1391 }, { "epoch": 2.4041450777202074, "grad_norm": 0.576418543623559, "learning_rate": 4.957803082609193e-05, "loss": 0.8849, "step": 1392 }, { "epoch": 2.4058721934369602, "grad_norm": 0.7696674374022533, "learning_rate": 4.953118780270516e-05, "loss": 0.9057, "step": 1393 }, { "epoch": 2.407599309153713, "grad_norm": 0.7837280635236451, "learning_rate": 4.948433091714699e-05, "loss": 0.8979, "step": 1394 }, { "epoch": 2.4093264248704664, "grad_norm": 0.7390781077211634, "learning_rate": 4.943746023756613e-05, "loss": 0.8951, "step": 1395 }, { "epoch": 2.4110535405872193, "grad_norm": 0.783800964704434, "learning_rate": 4.939057583213136e-05, "loss": 0.9049, "step": 1396 }, { "epoch": 2.412780656303972, "grad_norm": 0.7049341271511873, "learning_rate": 4.93436777690314e-05, "loss": 0.8867, "step": 1397 }, { "epoch": 2.4145077720207255, "grad_norm": 0.6254644259392039, "learning_rate": 4.9296766116474876e-05, "loss": 0.9231, "step": 1398 }, { "epoch": 2.4162348877374784, "grad_norm": 0.4906895238303636, "learning_rate": 4.924984094269015e-05, "loss": 0.8723, "step": 1399 }, { "epoch": 2.4179620034542313, "grad_norm": 0.37343626814938263, "learning_rate": 4.920290231592523e-05, "loss": 0.8784, "step": 1400 }, { "epoch": 2.4196891191709846, "grad_norm": 0.9892917422714251, "learning_rate": 4.915595030444774e-05, "loss": 0.9173, "step": 1401 }, { "epoch": 2.4214162348877375, "grad_norm": 0.45100691818206373, "learning_rate": 4.910898497654473e-05, "loss": 0.9072, "step": 1402 }, { "epoch": 2.4231433506044904, "grad_norm": 0.6555573820601529, "learning_rate": 4.9062006400522635e-05, "loss": 0.8982, "step": 1403 }, { "epoch": 2.4248704663212437, "grad_norm": 0.6717214563753442, "learning_rate": 4.9015014644707163e-05, "loss": 0.8875, "step": 1404 }, { "epoch": 2.4265975820379966, "grad_norm": 0.723414428435621, "learning_rate": 4.8968009777443154e-05, "loss": 0.9184, "step": 1405 }, { "epoch": 2.4283246977547495, "grad_norm": 0.7775899701060291, "learning_rate": 4.8920991867094574e-05, "loss": 0.8971, "step": 1406 }, { "epoch": 2.4300518134715023, "grad_norm": 0.6972087935853424, "learning_rate": 4.8873960982044324e-05, "loss": 0.8744, "step": 1407 }, { "epoch": 2.4317789291882557, "grad_norm": 0.753086718126588, "learning_rate": 4.882691719069417e-05, "loss": 0.8969, "step": 1408 }, { "epoch": 2.4335060449050085, "grad_norm": 0.6571516133891058, "learning_rate": 4.877986056146468e-05, "loss": 0.897, "step": 1409 }, { "epoch": 2.4352331606217614, "grad_norm": 0.6280540018083655, "learning_rate": 4.8732791162795054e-05, "loss": 0.9036, "step": 1410 }, { "epoch": 2.4369602763385148, "grad_norm": 0.5758305819120269, "learning_rate": 4.868570906314309e-05, "loss": 0.8943, "step": 1411 }, { "epoch": 2.4386873920552676, "grad_norm": 0.6092444412372904, "learning_rate": 4.8638614330985074e-05, "loss": 0.884, "step": 1412 }, { "epoch": 2.4404145077720205, "grad_norm": 0.48297175690821814, "learning_rate": 4.859150703481561e-05, "loss": 0.8737, "step": 1413 }, { "epoch": 2.442141623488774, "grad_norm": 0.4161185463520007, "learning_rate": 4.854438724314763e-05, "loss": 0.9092, "step": 1414 }, { "epoch": 2.4438687392055267, "grad_norm": 0.3843800155254997, "learning_rate": 4.849725502451221e-05, "loss": 0.8879, "step": 1415 }, { "epoch": 2.4455958549222796, "grad_norm": 0.46528565210069134, "learning_rate": 4.8450110447458496e-05, "loss": 0.9015, "step": 1416 }, { "epoch": 2.447322970639033, "grad_norm": 0.49676250129133387, "learning_rate": 4.8402953580553646e-05, "loss": 0.8981, "step": 1417 }, { "epoch": 2.449050086355786, "grad_norm": 0.8379207870617195, "learning_rate": 4.8355784492382636e-05, "loss": 0.9114, "step": 1418 }, { "epoch": 2.4507772020725387, "grad_norm": 1.148285213736391, "learning_rate": 4.8308603251548275e-05, "loss": 0.9018, "step": 1419 }, { "epoch": 2.452504317789292, "grad_norm": 0.6405218890200031, "learning_rate": 4.826140992667101e-05, "loss": 0.8993, "step": 1420 }, { "epoch": 2.454231433506045, "grad_norm": 0.7985190710187062, "learning_rate": 4.8214204586388855e-05, "loss": 0.8895, "step": 1421 }, { "epoch": 2.4559585492227978, "grad_norm": 0.5698478034877499, "learning_rate": 4.816698729935733e-05, "loss": 0.8705, "step": 1422 }, { "epoch": 2.457685664939551, "grad_norm": 0.4869790324085779, "learning_rate": 4.8119758134249306e-05, "loss": 0.9219, "step": 1423 }, { "epoch": 2.459412780656304, "grad_norm": 0.8034893654180771, "learning_rate": 4.807251715975495e-05, "loss": 0.8862, "step": 1424 }, { "epoch": 2.461139896373057, "grad_norm": 0.6778435832960842, "learning_rate": 4.8025264444581604e-05, "loss": 0.9002, "step": 1425 }, { "epoch": 2.46286701208981, "grad_norm": 0.7662697210752171, "learning_rate": 4.797800005745366e-05, "loss": 0.9023, "step": 1426 }, { "epoch": 2.464594127806563, "grad_norm": 1.04740425565489, "learning_rate": 4.7930724067112505e-05, "loss": 0.9424, "step": 1427 }, { "epoch": 2.466321243523316, "grad_norm": 0.8374573621533933, "learning_rate": 4.788343654231638e-05, "loss": 0.9104, "step": 1428 }, { "epoch": 2.4680483592400693, "grad_norm": 0.6895621998809534, "learning_rate": 4.783613755184035e-05, "loss": 0.8951, "step": 1429 }, { "epoch": 2.469775474956822, "grad_norm": 0.6186149863997923, "learning_rate": 4.778882716447611e-05, "loss": 0.892, "step": 1430 }, { "epoch": 2.471502590673575, "grad_norm": 0.3941316762682224, "learning_rate": 4.7741505449031956e-05, "loss": 0.892, "step": 1431 }, { "epoch": 2.4732297063903284, "grad_norm": 0.599959260495035, "learning_rate": 4.769417247433264e-05, "loss": 0.8966, "step": 1432 }, { "epoch": 2.4749568221070812, "grad_norm": 0.959159984097863, "learning_rate": 4.7646828309219294e-05, "loss": 0.8871, "step": 1433 }, { "epoch": 2.476683937823834, "grad_norm": 1.4806124360167745, "learning_rate": 4.7599473022549335e-05, "loss": 0.9224, "step": 1434 }, { "epoch": 2.4784110535405874, "grad_norm": 0.3301155556142342, "learning_rate": 4.755210668319637e-05, "loss": 0.8624, "step": 1435 }, { "epoch": 2.4801381692573403, "grad_norm": 1.2264697510018834, "learning_rate": 4.750472936005005e-05, "loss": 0.907, "step": 1436 }, { "epoch": 2.481865284974093, "grad_norm": 1.21960824457381, "learning_rate": 4.7457341122016e-05, "loss": 0.9046, "step": 1437 }, { "epoch": 2.4835924006908465, "grad_norm": 0.5841870297759015, "learning_rate": 4.7409942038015736e-05, "loss": 0.8735, "step": 1438 }, { "epoch": 2.4853195164075994, "grad_norm": 0.7022556016833509, "learning_rate": 4.736253217698656e-05, "loss": 0.8842, "step": 1439 }, { "epoch": 2.4870466321243523, "grad_norm": 0.7571151539676488, "learning_rate": 4.731511160788143e-05, "loss": 0.8901, "step": 1440 }, { "epoch": 2.488773747841105, "grad_norm": 0.9057559260108684, "learning_rate": 4.726768039966885e-05, "loss": 0.9003, "step": 1441 }, { "epoch": 2.4905008635578585, "grad_norm": 1.5591040724445204, "learning_rate": 4.7220238621332876e-05, "loss": 0.9215, "step": 1442 }, { "epoch": 2.4922279792746114, "grad_norm": 0.5289179431579273, "learning_rate": 4.717278634187286e-05, "loss": 0.8896, "step": 1443 }, { "epoch": 2.4939550949913643, "grad_norm": 1.5408226737076076, "learning_rate": 4.712532363030345e-05, "loss": 0.8784, "step": 1444 }, { "epoch": 2.4956822107081176, "grad_norm": 0.8919794705052352, "learning_rate": 4.7077850555654496e-05, "loss": 0.8958, "step": 1445 }, { "epoch": 2.4974093264248705, "grad_norm": 1.1022825134239205, "learning_rate": 4.703036718697089e-05, "loss": 0.9033, "step": 1446 }, { "epoch": 2.4991364421416233, "grad_norm": 1.0181672458588424, "learning_rate": 4.698287359331248e-05, "loss": 0.8812, "step": 1447 }, { "epoch": 2.5008635578583767, "grad_norm": 1.377959450334617, "learning_rate": 4.693536984375405e-05, "loss": 0.8783, "step": 1448 }, { "epoch": 2.5025906735751295, "grad_norm": 0.49032551545037933, "learning_rate": 4.688785600738507e-05, "loss": 0.8865, "step": 1449 }, { "epoch": 2.5043177892918824, "grad_norm": 1.3362996301969359, "learning_rate": 4.6840332153309745e-05, "loss": 0.8884, "step": 1450 }, { "epoch": 2.5060449050086353, "grad_norm": 1.1333030700071474, "learning_rate": 4.679279835064684e-05, "loss": 0.8947, "step": 1451 }, { "epoch": 2.5077720207253886, "grad_norm": 0.6890608761579453, "learning_rate": 4.674525466852954e-05, "loss": 0.8667, "step": 1452 }, { "epoch": 2.5094991364421415, "grad_norm": 1.007807106525557, "learning_rate": 4.6697701176105456e-05, "loss": 0.9417, "step": 1453 }, { "epoch": 2.5112262521588944, "grad_norm": 0.6309139565593291, "learning_rate": 4.6650137942536444e-05, "loss": 0.8968, "step": 1454 }, { "epoch": 2.5129533678756477, "grad_norm": 1.0100990390823252, "learning_rate": 4.660256503699856e-05, "loss": 0.8886, "step": 1455 }, { "epoch": 2.5146804835924006, "grad_norm": 1.044875583225104, "learning_rate": 4.655498252868184e-05, "loss": 0.8931, "step": 1456 }, { "epoch": 2.5164075993091535, "grad_norm": 0.877087002603305, "learning_rate": 4.6507390486790384e-05, "loss": 0.9199, "step": 1457 }, { "epoch": 2.518134715025907, "grad_norm": 0.6804472624111076, "learning_rate": 4.645978898054211e-05, "loss": 0.8848, "step": 1458 }, { "epoch": 2.5198618307426597, "grad_norm": 0.5081325620763198, "learning_rate": 4.64121780791687e-05, "loss": 0.8989, "step": 1459 }, { "epoch": 2.5215889464594126, "grad_norm": 0.8212679771960479, "learning_rate": 4.636455785191552e-05, "loss": 0.8672, "step": 1460 }, { "epoch": 2.523316062176166, "grad_norm": 0.8214144170981003, "learning_rate": 4.631692836804147e-05, "loss": 0.8903, "step": 1461 }, { "epoch": 2.5250431778929188, "grad_norm": 0.7596908513239587, "learning_rate": 4.6269289696818936e-05, "loss": 0.8819, "step": 1462 }, { "epoch": 2.5267702936096716, "grad_norm": 0.8769234280412251, "learning_rate": 4.622164190753368e-05, "loss": 0.8719, "step": 1463 }, { "epoch": 2.528497409326425, "grad_norm": 0.8790414668517641, "learning_rate": 4.6173985069484675e-05, "loss": 0.9174, "step": 1464 }, { "epoch": 2.530224525043178, "grad_norm": 0.8636998301623003, "learning_rate": 4.612631925198412e-05, "loss": 0.8944, "step": 1465 }, { "epoch": 2.5319516407599307, "grad_norm": 0.43567656013018957, "learning_rate": 4.6078644524357206e-05, "loss": 0.8895, "step": 1466 }, { "epoch": 2.533678756476684, "grad_norm": 0.4791130858914562, "learning_rate": 4.603096095594213e-05, "loss": 0.8936, "step": 1467 }, { "epoch": 2.535405872193437, "grad_norm": 0.4586031935753213, "learning_rate": 4.5983268616089946e-05, "loss": 0.9142, "step": 1468 }, { "epoch": 2.53713298791019, "grad_norm": 0.35670538592724343, "learning_rate": 4.593556757416443e-05, "loss": 0.886, "step": 1469 }, { "epoch": 2.538860103626943, "grad_norm": 0.5089243928979167, "learning_rate": 4.588785789954205e-05, "loss": 0.8912, "step": 1470 }, { "epoch": 2.540587219343696, "grad_norm": 0.5535338564637918, "learning_rate": 4.584013966161182e-05, "loss": 0.8845, "step": 1471 }, { "epoch": 2.542314335060449, "grad_norm": 0.558598175571938, "learning_rate": 4.5792412929775196e-05, "loss": 0.8762, "step": 1472 }, { "epoch": 2.5440414507772022, "grad_norm": 0.5201131302391749, "learning_rate": 4.5744677773446e-05, "loss": 0.8798, "step": 1473 }, { "epoch": 2.545768566493955, "grad_norm": 0.5022986477292436, "learning_rate": 4.5696934262050295e-05, "loss": 0.8965, "step": 1474 }, { "epoch": 2.547495682210708, "grad_norm": 0.38117889465070237, "learning_rate": 4.5649182465026306e-05, "loss": 0.9153, "step": 1475 }, { "epoch": 2.5492227979274613, "grad_norm": 0.7889234260527443, "learning_rate": 4.560142245182433e-05, "loss": 0.9154, "step": 1476 }, { "epoch": 2.550949913644214, "grad_norm": 0.732089657366807, "learning_rate": 4.555365429190655e-05, "loss": 0.8899, "step": 1477 }, { "epoch": 2.552677029360967, "grad_norm": 0.39292796543844943, "learning_rate": 4.550587805474707e-05, "loss": 0.8744, "step": 1478 }, { "epoch": 2.5544041450777204, "grad_norm": 0.499680750563524, "learning_rate": 4.5458093809831686e-05, "loss": 0.8668, "step": 1479 }, { "epoch": 2.5561312607944733, "grad_norm": 0.8153213026083254, "learning_rate": 4.541030162665787e-05, "loss": 0.8907, "step": 1480 }, { "epoch": 2.557858376511226, "grad_norm": 0.6364634771780231, "learning_rate": 4.536250157473465e-05, "loss": 0.8914, "step": 1481 }, { "epoch": 2.5595854922279795, "grad_norm": 0.4507199081967431, "learning_rate": 4.5314693723582445e-05, "loss": 0.9214, "step": 1482 }, { "epoch": 2.5613126079447324, "grad_norm": 0.5869008385713516, "learning_rate": 4.526687814273309e-05, "loss": 0.9181, "step": 1483 }, { "epoch": 2.5630397236614852, "grad_norm": 0.6489575889258198, "learning_rate": 4.521905490172961e-05, "loss": 0.8875, "step": 1484 }, { "epoch": 2.5647668393782386, "grad_norm": 0.8399389527319724, "learning_rate": 4.5171224070126174e-05, "loss": 0.9074, "step": 1485 }, { "epoch": 2.5664939550949915, "grad_norm": 0.6824406994306461, "learning_rate": 4.512338571748803e-05, "loss": 0.9082, "step": 1486 }, { "epoch": 2.5682210708117443, "grad_norm": 0.5316166189302975, "learning_rate": 4.5075539913391317e-05, "loss": 0.8961, "step": 1487 }, { "epoch": 2.5699481865284977, "grad_norm": 0.5434659087647928, "learning_rate": 4.5027686727423036e-05, "loss": 0.9002, "step": 1488 }, { "epoch": 2.5716753022452505, "grad_norm": 0.6734514994197223, "learning_rate": 4.497982622918092e-05, "loss": 0.9167, "step": 1489 }, { "epoch": 2.5734024179620034, "grad_norm": 0.8503914467104274, "learning_rate": 4.493195848827334e-05, "loss": 0.8977, "step": 1490 }, { "epoch": 2.5751295336787567, "grad_norm": 0.9996129435272021, "learning_rate": 4.4884083574319204e-05, "loss": 0.9047, "step": 1491 }, { "epoch": 2.5768566493955096, "grad_norm": 0.807607899090321, "learning_rate": 4.483620155694783e-05, "loss": 0.9014, "step": 1492 }, { "epoch": 2.5785837651122625, "grad_norm": 0.6787084764350136, "learning_rate": 4.47883125057989e-05, "loss": 0.8721, "step": 1493 }, { "epoch": 2.5803108808290154, "grad_norm": 0.6205632884270383, "learning_rate": 4.4740416490522295e-05, "loss": 0.9177, "step": 1494 }, { "epoch": 2.5820379965457687, "grad_norm": 0.6531075021405742, "learning_rate": 4.469251358077805e-05, "loss": 0.8928, "step": 1495 }, { "epoch": 2.5837651122625216, "grad_norm": 0.5125444087399698, "learning_rate": 4.464460384623622e-05, "loss": 0.8827, "step": 1496 }, { "epoch": 2.5854922279792745, "grad_norm": 0.3496110292424639, "learning_rate": 4.459668735657677e-05, "loss": 0.8837, "step": 1497 }, { "epoch": 2.587219343696028, "grad_norm": 0.4268273300682081, "learning_rate": 4.454876418148951e-05, "loss": 0.8929, "step": 1498 }, { "epoch": 2.5889464594127807, "grad_norm": 0.520444509432503, "learning_rate": 4.450083439067396e-05, "loss": 0.9084, "step": 1499 }, { "epoch": 2.5906735751295336, "grad_norm": 0.47770606338104693, "learning_rate": 4.4452898053839275e-05, "loss": 0.9036, "step": 1500 }, { "epoch": 2.5924006908462864, "grad_norm": 0.5411428380809802, "learning_rate": 4.440495524070413e-05, "loss": 0.8955, "step": 1501 }, { "epoch": 2.5941278065630398, "grad_norm": 0.5635431033333954, "learning_rate": 4.4357006020996583e-05, "loss": 0.8992, "step": 1502 }, { "epoch": 2.5958549222797926, "grad_norm": 0.6759423254275801, "learning_rate": 4.430905046445406e-05, "loss": 0.8891, "step": 1503 }, { "epoch": 2.5975820379965455, "grad_norm": 0.41148124367916317, "learning_rate": 4.426108864082319e-05, "loss": 0.8952, "step": 1504 }, { "epoch": 2.599309153713299, "grad_norm": 0.3811850569855404, "learning_rate": 4.421312061985969e-05, "loss": 0.9166, "step": 1505 }, { "epoch": 2.6010362694300517, "grad_norm": 0.4342165676551669, "learning_rate": 4.416514647132831e-05, "loss": 0.8848, "step": 1506 }, { "epoch": 2.6027633851468046, "grad_norm": 0.3912085592377241, "learning_rate": 4.411716626500273e-05, "loss": 0.8934, "step": 1507 }, { "epoch": 2.604490500863558, "grad_norm": 0.322632149954307, "learning_rate": 4.4069180070665404e-05, "loss": 0.8844, "step": 1508 }, { "epoch": 2.606217616580311, "grad_norm": 0.46684262832970735, "learning_rate": 4.4021187958107524e-05, "loss": 0.8966, "step": 1509 }, { "epoch": 2.6079447322970637, "grad_norm": 0.5445269862847049, "learning_rate": 4.397318999712887e-05, "loss": 0.8899, "step": 1510 }, { "epoch": 2.609671848013817, "grad_norm": 0.46999289791760057, "learning_rate": 4.3925186257537744e-05, "loss": 0.8845, "step": 1511 }, { "epoch": 2.61139896373057, "grad_norm": 0.5349444067081975, "learning_rate": 4.3877176809150844e-05, "loss": 0.8848, "step": 1512 }, { "epoch": 2.613126079447323, "grad_norm": 0.6142858173686077, "learning_rate": 4.382916172179315e-05, "loss": 0.9223, "step": 1513 }, { "epoch": 2.614853195164076, "grad_norm": 0.5564432791637571, "learning_rate": 4.3781141065297904e-05, "loss": 0.89, "step": 1514 }, { "epoch": 2.616580310880829, "grad_norm": 0.5044869817690132, "learning_rate": 4.373311490950637e-05, "loss": 0.9016, "step": 1515 }, { "epoch": 2.618307426597582, "grad_norm": 0.39083330595030374, "learning_rate": 4.368508332426788e-05, "loss": 0.8692, "step": 1516 }, { "epoch": 2.620034542314335, "grad_norm": 0.3318590097438312, "learning_rate": 4.363704637943961e-05, "loss": 0.8796, "step": 1517 }, { "epoch": 2.621761658031088, "grad_norm": 0.38179492927621234, "learning_rate": 4.3589004144886554e-05, "loss": 0.8944, "step": 1518 }, { "epoch": 2.623488773747841, "grad_norm": 0.6880264466862672, "learning_rate": 4.3540956690481426e-05, "loss": 0.8771, "step": 1519 }, { "epoch": 2.6252158894645943, "grad_norm": 0.9441234013695523, "learning_rate": 4.349290408610445e-05, "loss": 0.8779, "step": 1520 }, { "epoch": 2.626943005181347, "grad_norm": 1.0325533171453132, "learning_rate": 4.344484640164345e-05, "loss": 0.8772, "step": 1521 }, { "epoch": 2.6286701208981, "grad_norm": 1.0146322940905792, "learning_rate": 4.3396783706993543e-05, "loss": 0.8775, "step": 1522 }, { "epoch": 2.6303972366148534, "grad_norm": 0.8613693253516661, "learning_rate": 4.3348716072057185e-05, "loss": 0.8955, "step": 1523 }, { "epoch": 2.6321243523316062, "grad_norm": 0.5639806586808582, "learning_rate": 4.330064356674401e-05, "loss": 0.8873, "step": 1524 }, { "epoch": 2.633851468048359, "grad_norm": 0.39190279202467904, "learning_rate": 4.325256626097073e-05, "loss": 0.8944, "step": 1525 }, { "epoch": 2.6355785837651124, "grad_norm": 0.5321752238522922, "learning_rate": 4.3204484224661014e-05, "loss": 0.8881, "step": 1526 }, { "epoch": 2.6373056994818653, "grad_norm": 0.6752697215513123, "learning_rate": 4.315639752774547e-05, "loss": 0.8814, "step": 1527 }, { "epoch": 2.639032815198618, "grad_norm": 0.6635435490850171, "learning_rate": 4.3108306240161425e-05, "loss": 0.8989, "step": 1528 }, { "epoch": 2.6407599309153715, "grad_norm": 0.7076192696040523, "learning_rate": 4.306021043185292e-05, "loss": 0.8857, "step": 1529 }, { "epoch": 2.6424870466321244, "grad_norm": 0.616342874604553, "learning_rate": 4.301211017277055e-05, "loss": 0.897, "step": 1530 }, { "epoch": 2.6442141623488773, "grad_norm": 0.5712096262241598, "learning_rate": 4.29640055328714e-05, "loss": 0.8941, "step": 1531 }, { "epoch": 2.6459412780656306, "grad_norm": 0.4819126395562676, "learning_rate": 4.2915896582118917e-05, "loss": 0.8863, "step": 1532 }, { "epoch": 2.6476683937823835, "grad_norm": 0.3272746917321077, "learning_rate": 4.2867783390482815e-05, "loss": 0.9105, "step": 1533 }, { "epoch": 2.6493955094991364, "grad_norm": 0.3146508356656128, "learning_rate": 4.281966602793898e-05, "loss": 0.8859, "step": 1534 }, { "epoch": 2.6511226252158897, "grad_norm": 0.4009523396711022, "learning_rate": 4.277154456446937e-05, "loss": 0.8965, "step": 1535 }, { "epoch": 2.6528497409326426, "grad_norm": 0.4131455346805882, "learning_rate": 4.272341907006189e-05, "loss": 0.8954, "step": 1536 }, { "epoch": 2.6545768566493955, "grad_norm": 0.4503450739505316, "learning_rate": 4.267528961471033e-05, "loss": 0.8723, "step": 1537 }, { "epoch": 2.656303972366149, "grad_norm": 0.52812650624855, "learning_rate": 4.262715626841421e-05, "loss": 0.9022, "step": 1538 }, { "epoch": 2.6580310880829017, "grad_norm": 0.47025129942385197, "learning_rate": 4.257901910117876e-05, "loss": 0.9078, "step": 1539 }, { "epoch": 2.6597582037996546, "grad_norm": 0.37737865307541163, "learning_rate": 4.253087818301471e-05, "loss": 0.878, "step": 1540 }, { "epoch": 2.6614853195164074, "grad_norm": 0.3206187530417747, "learning_rate": 4.248273358393829e-05, "loss": 0.9149, "step": 1541 }, { "epoch": 2.6632124352331608, "grad_norm": 0.30470566318686354, "learning_rate": 4.243458537397105e-05, "loss": 0.8703, "step": 1542 }, { "epoch": 2.6649395509499136, "grad_norm": 0.3553844344539907, "learning_rate": 4.238643362313981e-05, "loss": 0.8794, "step": 1543 }, { "epoch": 2.6666666666666665, "grad_norm": 0.30031710636886544, "learning_rate": 4.233827840147655e-05, "loss": 0.8846, "step": 1544 }, { "epoch": 2.66839378238342, "grad_norm": 0.3021565492558505, "learning_rate": 4.2290119779018264e-05, "loss": 0.9035, "step": 1545 }, { "epoch": 2.6701208981001727, "grad_norm": 0.3819182984473307, "learning_rate": 4.224195782580692e-05, "loss": 0.8989, "step": 1546 }, { "epoch": 2.6718480138169256, "grad_norm": 0.31283443254656473, "learning_rate": 4.219379261188934e-05, "loss": 0.9131, "step": 1547 }, { "epoch": 2.6735751295336785, "grad_norm": 0.3675675678467023, "learning_rate": 4.214562420731704e-05, "loss": 0.8911, "step": 1548 }, { "epoch": 2.675302245250432, "grad_norm": 0.3857614248846757, "learning_rate": 4.2097452682146224e-05, "loss": 0.8894, "step": 1549 }, { "epoch": 2.6770293609671847, "grad_norm": 0.3434186166196959, "learning_rate": 4.204927810643762e-05, "loss": 0.8837, "step": 1550 }, { "epoch": 2.6787564766839376, "grad_norm": 0.36511849227058935, "learning_rate": 4.200110055025638e-05, "loss": 0.8768, "step": 1551 }, { "epoch": 2.680483592400691, "grad_norm": 0.4936502119739069, "learning_rate": 4.1952920083672026e-05, "loss": 0.8968, "step": 1552 }, { "epoch": 2.6822107081174438, "grad_norm": 0.37977490256867424, "learning_rate": 4.190473677675826e-05, "loss": 0.8921, "step": 1553 }, { "epoch": 2.6839378238341967, "grad_norm": 0.41465673822308935, "learning_rate": 4.185655069959295e-05, "loss": 0.8783, "step": 1554 }, { "epoch": 2.68566493955095, "grad_norm": 0.4171517561535128, "learning_rate": 4.1808361922258e-05, "loss": 0.8887, "step": 1555 }, { "epoch": 2.687392055267703, "grad_norm": 0.4124719236969017, "learning_rate": 4.176017051483922e-05, "loss": 0.8797, "step": 1556 }, { "epoch": 2.6891191709844557, "grad_norm": 0.38653675112973795, "learning_rate": 4.171197654742626e-05, "loss": 0.8828, "step": 1557 }, { "epoch": 2.690846286701209, "grad_norm": 0.3646968283060852, "learning_rate": 4.1663780090112465e-05, "loss": 0.8775, "step": 1558 }, { "epoch": 2.692573402417962, "grad_norm": 0.392878927466157, "learning_rate": 4.161558121299484e-05, "loss": 0.9068, "step": 1559 }, { "epoch": 2.694300518134715, "grad_norm": 0.35515883329376685, "learning_rate": 4.15673799861739e-05, "loss": 0.8919, "step": 1560 }, { "epoch": 2.696027633851468, "grad_norm": 0.4308614592946804, "learning_rate": 4.151917647975354e-05, "loss": 0.8696, "step": 1561 }, { "epoch": 2.697754749568221, "grad_norm": 0.3477076487874876, "learning_rate": 4.1470970763841025e-05, "loss": 0.8715, "step": 1562 }, { "epoch": 2.699481865284974, "grad_norm": 0.4102718042825442, "learning_rate": 4.14227629085468e-05, "loss": 0.8845, "step": 1563 }, { "epoch": 2.7012089810017272, "grad_norm": 0.3466999129188969, "learning_rate": 4.137455298398443e-05, "loss": 0.8745, "step": 1564 }, { "epoch": 2.70293609671848, "grad_norm": 0.33388887903961717, "learning_rate": 4.132634106027049e-05, "loss": 0.8796, "step": 1565 }, { "epoch": 2.704663212435233, "grad_norm": 0.3811500936396073, "learning_rate": 4.127812720752446e-05, "loss": 0.8838, "step": 1566 }, { "epoch": 2.7063903281519863, "grad_norm": 0.5108464268169567, "learning_rate": 4.122991149586863e-05, "loss": 0.9124, "step": 1567 }, { "epoch": 2.708117443868739, "grad_norm": 0.2982035695360644, "learning_rate": 4.118169399542799e-05, "loss": 0.8765, "step": 1568 }, { "epoch": 2.709844559585492, "grad_norm": 0.29560276375664696, "learning_rate": 4.113347477633013e-05, "loss": 0.8753, "step": 1569 }, { "epoch": 2.7115716753022454, "grad_norm": 0.41973237195112123, "learning_rate": 4.1085253908705154e-05, "loss": 0.8909, "step": 1570 }, { "epoch": 2.7132987910189983, "grad_norm": 0.5072669609982255, "learning_rate": 4.103703146268553e-05, "loss": 0.8932, "step": 1571 }, { "epoch": 2.715025906735751, "grad_norm": 0.6228666575056468, "learning_rate": 4.0988807508406064e-05, "loss": 0.8902, "step": 1572 }, { "epoch": 2.7167530224525045, "grad_norm": 0.7379213285676341, "learning_rate": 4.0940582116003733e-05, "loss": 0.8838, "step": 1573 }, { "epoch": 2.7184801381692574, "grad_norm": 0.7709456306315209, "learning_rate": 4.089235535561759e-05, "loss": 0.9093, "step": 1574 }, { "epoch": 2.7202072538860103, "grad_norm": 0.5667074820355589, "learning_rate": 4.084412729738872e-05, "loss": 0.888, "step": 1575 }, { "epoch": 2.7219343696027636, "grad_norm": 0.4564102397197639, "learning_rate": 4.0795898011460045e-05, "loss": 0.8876, "step": 1576 }, { "epoch": 2.7236614853195165, "grad_norm": 0.37375667265447476, "learning_rate": 4.0747667567976316e-05, "loss": 0.8797, "step": 1577 }, { "epoch": 2.7253886010362693, "grad_norm": 0.5085593344961906, "learning_rate": 4.069943603708395e-05, "loss": 0.8988, "step": 1578 }, { "epoch": 2.7271157167530227, "grad_norm": 0.6308486140937926, "learning_rate": 4.0651203488930934e-05, "loss": 0.8896, "step": 1579 }, { "epoch": 2.7288428324697755, "grad_norm": 0.7057768062652842, "learning_rate": 4.0602969993666764e-05, "loss": 0.8914, "step": 1580 }, { "epoch": 2.7305699481865284, "grad_norm": 0.8054509900197676, "learning_rate": 4.055473562144228e-05, "loss": 0.8764, "step": 1581 }, { "epoch": 2.7322970639032818, "grad_norm": 0.798973453301131, "learning_rate": 4.050650044240961e-05, "loss": 0.8855, "step": 1582 }, { "epoch": 2.7340241796200346, "grad_norm": 0.6513602884747159, "learning_rate": 4.045826452672208e-05, "loss": 0.8941, "step": 1583 }, { "epoch": 2.7357512953367875, "grad_norm": 0.4822701645528896, "learning_rate": 4.041002794453404e-05, "loss": 0.9059, "step": 1584 }, { "epoch": 2.737478411053541, "grad_norm": 0.4128615302784742, "learning_rate": 4.036179076600085e-05, "loss": 0.8726, "step": 1585 }, { "epoch": 2.7392055267702937, "grad_norm": 0.4833458267824031, "learning_rate": 4.0313553061278725e-05, "loss": 0.8948, "step": 1586 }, { "epoch": 2.7409326424870466, "grad_norm": 0.4324571081197527, "learning_rate": 4.026531490052462e-05, "loss": 0.8799, "step": 1587 }, { "epoch": 2.7426597582038, "grad_norm": 0.586590776446889, "learning_rate": 4.0217076353896226e-05, "loss": 0.9074, "step": 1588 }, { "epoch": 2.744386873920553, "grad_norm": 0.737574840438354, "learning_rate": 4.0168837491551687e-05, "loss": 0.8786, "step": 1589 }, { "epoch": 2.7461139896373057, "grad_norm": 0.8666774710723562, "learning_rate": 4.0120598383649694e-05, "loss": 0.891, "step": 1590 }, { "epoch": 2.7478411053540586, "grad_norm": 0.9361466897665471, "learning_rate": 4.007235910034927e-05, "loss": 0.8793, "step": 1591 }, { "epoch": 2.749568221070812, "grad_norm": 0.8091430298210771, "learning_rate": 4.002411971180967e-05, "loss": 0.8878, "step": 1592 }, { "epoch": 2.7512953367875648, "grad_norm": 0.5882493461660068, "learning_rate": 3.997588028819034e-05, "loss": 0.884, "step": 1593 }, { "epoch": 2.7530224525043177, "grad_norm": 0.4782564111496843, "learning_rate": 3.992764089965075e-05, "loss": 0.8741, "step": 1594 }, { "epoch": 2.754749568221071, "grad_norm": 0.4451884062368831, "learning_rate": 3.987940161635031e-05, "loss": 0.8859, "step": 1595 }, { "epoch": 2.756476683937824, "grad_norm": 0.551828026174058, "learning_rate": 3.983116250844833e-05, "loss": 0.9255, "step": 1596 }, { "epoch": 2.7582037996545767, "grad_norm": 0.2986669553395544, "learning_rate": 3.97829236461038e-05, "loss": 0.882, "step": 1597 }, { "epoch": 2.7599309153713296, "grad_norm": 0.4772981547219276, "learning_rate": 3.9734685099475376e-05, "loss": 0.9022, "step": 1598 }, { "epoch": 2.761658031088083, "grad_norm": 0.5643249878472132, "learning_rate": 3.968644693872129e-05, "loss": 0.8822, "step": 1599 }, { "epoch": 2.763385146804836, "grad_norm": 0.46690165365121145, "learning_rate": 3.963820923399917e-05, "loss": 0.8617, "step": 1600 }, { "epoch": 2.7651122625215887, "grad_norm": 0.974976979874295, "learning_rate": 3.958997205546597e-05, "loss": 0.8956, "step": 1601 }, { "epoch": 2.766839378238342, "grad_norm": 0.46436091592904083, "learning_rate": 3.9541735473277935e-05, "loss": 0.8856, "step": 1602 }, { "epoch": 2.768566493955095, "grad_norm": 0.3660566885012577, "learning_rate": 3.9493499557590406e-05, "loss": 0.9071, "step": 1603 }, { "epoch": 2.770293609671848, "grad_norm": 0.44657795580331455, "learning_rate": 3.944526437855774e-05, "loss": 0.879, "step": 1604 }, { "epoch": 2.772020725388601, "grad_norm": 0.4076685083206412, "learning_rate": 3.939703000633325e-05, "loss": 0.8846, "step": 1605 }, { "epoch": 2.773747841105354, "grad_norm": 0.39203722961503323, "learning_rate": 3.934879651106908e-05, "loss": 0.8934, "step": 1606 }, { "epoch": 2.775474956822107, "grad_norm": 0.3695867983159649, "learning_rate": 3.930056396291607e-05, "loss": 0.8951, "step": 1607 }, { "epoch": 2.77720207253886, "grad_norm": 0.4147486425386516, "learning_rate": 3.925233243202369e-05, "loss": 0.855, "step": 1608 }, { "epoch": 2.778929188255613, "grad_norm": 0.600360733198907, "learning_rate": 3.920410198853997e-05, "loss": 0.8814, "step": 1609 }, { "epoch": 2.780656303972366, "grad_norm": 0.7114351977148544, "learning_rate": 3.91558727026113e-05, "loss": 0.8934, "step": 1610 }, { "epoch": 2.7823834196891193, "grad_norm": 0.6525190452953762, "learning_rate": 3.9107644644382414e-05, "loss": 0.9017, "step": 1611 }, { "epoch": 2.784110535405872, "grad_norm": 0.6845846226522763, "learning_rate": 3.905941788399628e-05, "loss": 0.8843, "step": 1612 }, { "epoch": 2.785837651122625, "grad_norm": 0.5406247841324029, "learning_rate": 3.901119249159395e-05, "loss": 0.9131, "step": 1613 }, { "epoch": 2.7875647668393784, "grad_norm": 0.5055543105414455, "learning_rate": 3.896296853731447e-05, "loss": 0.8864, "step": 1614 }, { "epoch": 2.7892918825561313, "grad_norm": 0.49343413820546267, "learning_rate": 3.891474609129485e-05, "loss": 0.889, "step": 1615 }, { "epoch": 2.791018998272884, "grad_norm": 0.3641937787504106, "learning_rate": 3.886652522366988e-05, "loss": 0.9013, "step": 1616 }, { "epoch": 2.7927461139896375, "grad_norm": 0.36742421665477903, "learning_rate": 3.8818306004572017e-05, "loss": 0.8918, "step": 1617 }, { "epoch": 2.7944732297063903, "grad_norm": 0.4645611725328096, "learning_rate": 3.8770088504131375e-05, "loss": 0.8738, "step": 1618 }, { "epoch": 2.796200345423143, "grad_norm": 0.3950915578720256, "learning_rate": 3.872187279247555e-05, "loss": 0.8796, "step": 1619 }, { "epoch": 2.7979274611398965, "grad_norm": 0.3157614997647221, "learning_rate": 3.867365893972952e-05, "loss": 0.8832, "step": 1620 }, { "epoch": 2.7996545768566494, "grad_norm": 0.3553826053330369, "learning_rate": 3.8625447016015584e-05, "loss": 0.8851, "step": 1621 }, { "epoch": 2.8013816925734023, "grad_norm": 0.2878715980399106, "learning_rate": 3.857723709145321e-05, "loss": 0.8927, "step": 1622 }, { "epoch": 2.8031088082901556, "grad_norm": 0.3729885396929934, "learning_rate": 3.852902923615899e-05, "loss": 0.8899, "step": 1623 }, { "epoch": 2.8048359240069085, "grad_norm": 0.4073193939664775, "learning_rate": 3.848082352024647e-05, "loss": 0.8853, "step": 1624 }, { "epoch": 2.8065630397236614, "grad_norm": 0.32799979097442383, "learning_rate": 3.8432620013826115e-05, "loss": 0.8883, "step": 1625 }, { "epoch": 2.8082901554404147, "grad_norm": 0.32992454643459335, "learning_rate": 3.838441878700517e-05, "loss": 0.8959, "step": 1626 }, { "epoch": 2.8100172711571676, "grad_norm": 0.33722339548361807, "learning_rate": 3.8336219909887555e-05, "loss": 0.8818, "step": 1627 }, { "epoch": 2.8117443868739205, "grad_norm": 0.31906150556329876, "learning_rate": 3.8288023452573756e-05, "loss": 0.8788, "step": 1628 }, { "epoch": 2.813471502590674, "grad_norm": 0.2807154215364889, "learning_rate": 3.823982948516079e-05, "loss": 0.8909, "step": 1629 }, { "epoch": 2.8151986183074267, "grad_norm": 0.36109655884689285, "learning_rate": 3.819163807774202e-05, "loss": 0.8949, "step": 1630 }, { "epoch": 2.8169257340241796, "grad_norm": 0.3061851125227955, "learning_rate": 3.814344930040706e-05, "loss": 0.8833, "step": 1631 }, { "epoch": 2.818652849740933, "grad_norm": 0.3612727113461691, "learning_rate": 3.8095263223241754e-05, "loss": 0.8838, "step": 1632 }, { "epoch": 2.8203799654576858, "grad_norm": 0.4397557495154955, "learning_rate": 3.8047079916328e-05, "loss": 0.8887, "step": 1633 }, { "epoch": 2.8221070811744386, "grad_norm": 0.3600712365983518, "learning_rate": 3.799889944974362e-05, "loss": 0.8593, "step": 1634 }, { "epoch": 2.823834196891192, "grad_norm": 0.35022641885950706, "learning_rate": 3.795072189356239e-05, "loss": 0.878, "step": 1635 }, { "epoch": 2.825561312607945, "grad_norm": 0.431465174337035, "learning_rate": 3.7902547317853796e-05, "loss": 0.9079, "step": 1636 }, { "epoch": 2.8272884283246977, "grad_norm": 0.2988859616345342, "learning_rate": 3.785437579268297e-05, "loss": 0.8834, "step": 1637 }, { "epoch": 2.8290155440414506, "grad_norm": 0.5807328388378498, "learning_rate": 3.7806207388110676e-05, "loss": 0.8991, "step": 1638 }, { "epoch": 2.830742659758204, "grad_norm": 0.35804687522544826, "learning_rate": 3.7758042174193086e-05, "loss": 0.8848, "step": 1639 }, { "epoch": 2.832469775474957, "grad_norm": 0.34715774617329415, "learning_rate": 3.770988022098175e-05, "loss": 0.8808, "step": 1640 }, { "epoch": 2.8341968911917097, "grad_norm": 0.3458819778617539, "learning_rate": 3.766172159852346e-05, "loss": 0.9075, "step": 1641 }, { "epoch": 2.835924006908463, "grad_norm": 0.36025253398984614, "learning_rate": 3.7613566376860193e-05, "loss": 0.877, "step": 1642 }, { "epoch": 2.837651122625216, "grad_norm": 0.2608927073220777, "learning_rate": 3.756541462602897e-05, "loss": 0.8829, "step": 1643 }, { "epoch": 2.839378238341969, "grad_norm": 0.26141789500109675, "learning_rate": 3.751726641606171e-05, "loss": 0.8778, "step": 1644 }, { "epoch": 2.8411053540587217, "grad_norm": 0.32403668469486546, "learning_rate": 3.7469121816985295e-05, "loss": 0.8683, "step": 1645 }, { "epoch": 2.842832469775475, "grad_norm": 0.26966221539255586, "learning_rate": 3.742098089882126e-05, "loss": 0.8855, "step": 1646 }, { "epoch": 2.844559585492228, "grad_norm": 0.3406952124280666, "learning_rate": 3.7372843731585785e-05, "loss": 0.892, "step": 1647 }, { "epoch": 2.8462867012089808, "grad_norm": 0.38510263074762685, "learning_rate": 3.7324710385289686e-05, "loss": 0.8691, "step": 1648 }, { "epoch": 2.848013816925734, "grad_norm": 0.2992625781446102, "learning_rate": 3.727658092993813e-05, "loss": 0.8983, "step": 1649 }, { "epoch": 2.849740932642487, "grad_norm": 0.3328117407101398, "learning_rate": 3.7228455435530655e-05, "loss": 0.8903, "step": 1650 }, { "epoch": 2.85146804835924, "grad_norm": 0.30491719289441305, "learning_rate": 3.7180333972061025e-05, "loss": 0.8822, "step": 1651 }, { "epoch": 2.853195164075993, "grad_norm": 0.2808550973925694, "learning_rate": 3.71322166095172e-05, "loss": 0.8931, "step": 1652 }, { "epoch": 2.854922279792746, "grad_norm": 0.35619451265023566, "learning_rate": 3.70841034178811e-05, "loss": 0.8799, "step": 1653 }, { "epoch": 2.856649395509499, "grad_norm": 0.33736996610653036, "learning_rate": 3.70359944671286e-05, "loss": 0.8772, "step": 1654 }, { "epoch": 2.8583765112262522, "grad_norm": 0.26579345465910575, "learning_rate": 3.6987889827229454e-05, "loss": 0.889, "step": 1655 }, { "epoch": 2.860103626943005, "grad_norm": 0.3285347951266129, "learning_rate": 3.69397895681471e-05, "loss": 0.9018, "step": 1656 }, { "epoch": 2.861830742659758, "grad_norm": 0.28589299167739635, "learning_rate": 3.689169375983858e-05, "loss": 0.9006, "step": 1657 }, { "epoch": 2.8635578583765113, "grad_norm": 0.29682369591264707, "learning_rate": 3.684360247225454e-05, "loss": 0.8824, "step": 1658 }, { "epoch": 2.865284974093264, "grad_norm": 0.38261062965772064, "learning_rate": 3.6795515775339e-05, "loss": 0.8909, "step": 1659 }, { "epoch": 2.867012089810017, "grad_norm": 0.3268312214099066, "learning_rate": 3.674743373902928e-05, "loss": 0.8832, "step": 1660 }, { "epoch": 2.8687392055267704, "grad_norm": 0.27485271352829893, "learning_rate": 3.669935643325599e-05, "loss": 0.8838, "step": 1661 }, { "epoch": 2.8704663212435233, "grad_norm": 0.2934931343165851, "learning_rate": 3.665128392794282e-05, "loss": 0.8815, "step": 1662 }, { "epoch": 2.872193436960276, "grad_norm": 0.27046292919366005, "learning_rate": 3.6603216293006477e-05, "loss": 0.897, "step": 1663 }, { "epoch": 2.8739205526770295, "grad_norm": 0.28405981910741934, "learning_rate": 3.655515359835656e-05, "loss": 0.8651, "step": 1664 }, { "epoch": 2.8756476683937824, "grad_norm": 0.25670268244454525, "learning_rate": 3.6507095913895554e-05, "loss": 0.8934, "step": 1665 }, { "epoch": 2.8773747841105353, "grad_norm": 0.2795229989612953, "learning_rate": 3.64590433095186e-05, "loss": 0.887, "step": 1666 }, { "epoch": 2.8791018998272886, "grad_norm": 0.2849222613987033, "learning_rate": 3.6410995855113446e-05, "loss": 0.8812, "step": 1667 }, { "epoch": 2.8808290155440415, "grad_norm": 0.2941173635016795, "learning_rate": 3.6362953620560395e-05, "loss": 0.8799, "step": 1668 }, { "epoch": 2.8825561312607944, "grad_norm": 0.3355343873548098, "learning_rate": 3.631491667573213e-05, "loss": 0.8978, "step": 1669 }, { "epoch": 2.8842832469775477, "grad_norm": 0.3875026512882094, "learning_rate": 3.626688509049363e-05, "loss": 0.9103, "step": 1670 }, { "epoch": 2.8860103626943006, "grad_norm": 0.44022996518243035, "learning_rate": 3.621885893470211e-05, "loss": 0.8917, "step": 1671 }, { "epoch": 2.8877374784110534, "grad_norm": 0.5180837987060816, "learning_rate": 3.6170838278206853e-05, "loss": 0.9065, "step": 1672 }, { "epoch": 2.8894645941278068, "grad_norm": 0.4642674013989749, "learning_rate": 3.6122823190849176e-05, "loss": 0.8837, "step": 1673 }, { "epoch": 2.8911917098445596, "grad_norm": 0.3556304096836004, "learning_rate": 3.607481374246226e-05, "loss": 0.8802, "step": 1674 }, { "epoch": 2.8929188255613125, "grad_norm": 0.321277927046943, "learning_rate": 3.602681000287114e-05, "loss": 0.8938, "step": 1675 }, { "epoch": 2.894645941278066, "grad_norm": 0.27900481969526325, "learning_rate": 3.597881204189249e-05, "loss": 0.8975, "step": 1676 }, { "epoch": 2.8963730569948187, "grad_norm": 0.4152111348457489, "learning_rate": 3.59308199293346e-05, "loss": 0.902, "step": 1677 }, { "epoch": 2.8981001727115716, "grad_norm": 0.4766946059188518, "learning_rate": 3.588283373499727e-05, "loss": 0.9047, "step": 1678 }, { "epoch": 2.899827288428325, "grad_norm": 0.4414897487931988, "learning_rate": 3.58348535286717e-05, "loss": 0.8824, "step": 1679 }, { "epoch": 2.901554404145078, "grad_norm": 0.3506185736264103, "learning_rate": 3.578687938014033e-05, "loss": 0.8916, "step": 1680 }, { "epoch": 2.9032815198618307, "grad_norm": 0.3626575403930621, "learning_rate": 3.573891135917682e-05, "loss": 0.9099, "step": 1681 }, { "epoch": 2.905008635578584, "grad_norm": 0.35779032352893675, "learning_rate": 3.569094953554595e-05, "loss": 0.8824, "step": 1682 }, { "epoch": 2.906735751295337, "grad_norm": 0.27960763751166556, "learning_rate": 3.564299397900343e-05, "loss": 0.8845, "step": 1683 }, { "epoch": 2.90846286701209, "grad_norm": 0.38124854289345267, "learning_rate": 3.5595044759295886e-05, "loss": 0.8697, "step": 1684 }, { "epoch": 2.910189982728843, "grad_norm": 0.424629902531167, "learning_rate": 3.554710194616073e-05, "loss": 0.9002, "step": 1685 }, { "epoch": 2.911917098445596, "grad_norm": 0.40301313306944336, "learning_rate": 3.549916560932605e-05, "loss": 0.8781, "step": 1686 }, { "epoch": 2.913644214162349, "grad_norm": 0.34982901319162446, "learning_rate": 3.5451235818510495e-05, "loss": 0.8885, "step": 1687 }, { "epoch": 2.9153713298791017, "grad_norm": 0.29586410606120334, "learning_rate": 3.540331264342324e-05, "loss": 0.8916, "step": 1688 }, { "epoch": 2.917098445595855, "grad_norm": 0.2853132336539256, "learning_rate": 3.5355396153763794e-05, "loss": 0.8784, "step": 1689 }, { "epoch": 2.918825561312608, "grad_norm": 0.27314152840965167, "learning_rate": 3.530748641922195e-05, "loss": 0.8874, "step": 1690 }, { "epoch": 2.920552677029361, "grad_norm": 0.2861823985251561, "learning_rate": 3.525958350947771e-05, "loss": 0.8764, "step": 1691 }, { "epoch": 2.9222797927461137, "grad_norm": 0.2808395125138123, "learning_rate": 3.5211687494201114e-05, "loss": 0.8771, "step": 1692 }, { "epoch": 2.924006908462867, "grad_norm": 0.27576728469070755, "learning_rate": 3.5163798443052175e-05, "loss": 0.8936, "step": 1693 }, { "epoch": 2.92573402417962, "grad_norm": 0.31194441071465745, "learning_rate": 3.511591642568081e-05, "loss": 0.8819, "step": 1694 }, { "epoch": 2.927461139896373, "grad_norm": 0.4624720327726707, "learning_rate": 3.506804151172668e-05, "loss": 0.8896, "step": 1695 }, { "epoch": 2.929188255613126, "grad_norm": 0.3220841511498294, "learning_rate": 3.50201737708191e-05, "loss": 0.8902, "step": 1696 }, { "epoch": 2.930915371329879, "grad_norm": 0.305288593380298, "learning_rate": 3.497231327257698e-05, "loss": 0.8811, "step": 1697 }, { "epoch": 2.932642487046632, "grad_norm": 0.3055169889058269, "learning_rate": 3.49244600866087e-05, "loss": 0.8976, "step": 1698 }, { "epoch": 2.934369602763385, "grad_norm": 0.32810201207515177, "learning_rate": 3.487661428251199e-05, "loss": 0.8881, "step": 1699 }, { "epoch": 2.936096718480138, "grad_norm": 0.26137150553561905, "learning_rate": 3.4828775929873826e-05, "loss": 0.882, "step": 1700 }, { "epoch": 2.937823834196891, "grad_norm": 0.3360075357785436, "learning_rate": 3.4780945098270405e-05, "loss": 0.8822, "step": 1701 }, { "epoch": 2.9395509499136443, "grad_norm": 0.4339526487672443, "learning_rate": 3.473312185726693e-05, "loss": 0.8794, "step": 1702 }, { "epoch": 2.941278065630397, "grad_norm": 0.2827408979284592, "learning_rate": 3.4685306276417555e-05, "loss": 0.8864, "step": 1703 }, { "epoch": 2.94300518134715, "grad_norm": 0.36278038846657035, "learning_rate": 3.463749842526537e-05, "loss": 0.8872, "step": 1704 }, { "epoch": 2.9447322970639034, "grad_norm": 0.4164887670364328, "learning_rate": 3.4589698373342145e-05, "loss": 0.867, "step": 1705 }, { "epoch": 2.9464594127806563, "grad_norm": 0.35016191307326827, "learning_rate": 3.454190619016832e-05, "loss": 0.8812, "step": 1706 }, { "epoch": 2.948186528497409, "grad_norm": 0.4021548766523425, "learning_rate": 3.449412194525294e-05, "loss": 0.9034, "step": 1707 }, { "epoch": 2.9499136442141625, "grad_norm": 0.5174841580935156, "learning_rate": 3.444634570809346e-05, "loss": 0.9118, "step": 1708 }, { "epoch": 2.9516407599309153, "grad_norm": 0.5711903962708643, "learning_rate": 3.439857754817569e-05, "loss": 0.9062, "step": 1709 }, { "epoch": 2.9533678756476682, "grad_norm": 0.6072067870629012, "learning_rate": 3.435081753497369e-05, "loss": 0.894, "step": 1710 }, { "epoch": 2.9550949913644216, "grad_norm": 0.618453537024723, "learning_rate": 3.430306573794972e-05, "loss": 0.8812, "step": 1711 }, { "epoch": 2.9568221070811744, "grad_norm": 0.46136975202389696, "learning_rate": 3.425532222655402e-05, "loss": 0.9065, "step": 1712 }, { "epoch": 2.9585492227979273, "grad_norm": 0.39607905715874686, "learning_rate": 3.420758707022481e-05, "loss": 0.876, "step": 1713 }, { "epoch": 2.9602763385146806, "grad_norm": 0.2710128786045903, "learning_rate": 3.415986033838819e-05, "loss": 0.889, "step": 1714 }, { "epoch": 2.9620034542314335, "grad_norm": 0.2838742338621505, "learning_rate": 3.411214210045796e-05, "loss": 0.8827, "step": 1715 }, { "epoch": 2.9637305699481864, "grad_norm": 0.360448485406102, "learning_rate": 3.406443242583557e-05, "loss": 0.8893, "step": 1716 }, { "epoch": 2.9654576856649397, "grad_norm": 0.33616384104136476, "learning_rate": 3.401673138391007e-05, "loss": 0.8833, "step": 1717 }, { "epoch": 2.9671848013816926, "grad_norm": 0.33835111273905544, "learning_rate": 3.3969039044057876e-05, "loss": 0.8699, "step": 1718 }, { "epoch": 2.9689119170984455, "grad_norm": 0.323266301939846, "learning_rate": 3.392135547564282e-05, "loss": 0.8746, "step": 1719 }, { "epoch": 2.970639032815199, "grad_norm": 0.34757268585017403, "learning_rate": 3.387368074801589e-05, "loss": 0.8919, "step": 1720 }, { "epoch": 2.9723661485319517, "grad_norm": 0.2794820712924377, "learning_rate": 3.382601493051533e-05, "loss": 0.8889, "step": 1721 }, { "epoch": 2.9740932642487046, "grad_norm": 0.32198587875691986, "learning_rate": 3.3778358092466345e-05, "loss": 0.8979, "step": 1722 }, { "epoch": 2.975820379965458, "grad_norm": 0.28085599530420774, "learning_rate": 3.373071030318107e-05, "loss": 0.8842, "step": 1723 }, { "epoch": 2.9775474956822108, "grad_norm": 0.33565809795312207, "learning_rate": 3.3683071631958546e-05, "loss": 0.9122, "step": 1724 }, { "epoch": 2.9792746113989637, "grad_norm": 0.2790538582528287, "learning_rate": 3.363544214808451e-05, "loss": 0.8716, "step": 1725 }, { "epoch": 2.981001727115717, "grad_norm": 0.2922639555459221, "learning_rate": 3.3587821920831304e-05, "loss": 0.8881, "step": 1726 }, { "epoch": 2.98272884283247, "grad_norm": 0.30193686108266293, "learning_rate": 3.3540211019457896e-05, "loss": 0.8708, "step": 1727 }, { "epoch": 2.9844559585492227, "grad_norm": 0.26636134504754616, "learning_rate": 3.349260951320963e-05, "loss": 0.9043, "step": 1728 }, { "epoch": 2.986183074265976, "grad_norm": 0.3167870504516813, "learning_rate": 3.344501747131816e-05, "loss": 0.8929, "step": 1729 }, { "epoch": 2.987910189982729, "grad_norm": 0.32545277654921095, "learning_rate": 3.3397434963001454e-05, "loss": 0.876, "step": 1730 }, { "epoch": 2.989637305699482, "grad_norm": 0.3249669202995788, "learning_rate": 3.334986205746356e-05, "loss": 0.9002, "step": 1731 }, { "epoch": 2.991364421416235, "grad_norm": 0.33943389478237074, "learning_rate": 3.330229882389455e-05, "loss": 0.8925, "step": 1732 }, { "epoch": 2.993091537132988, "grad_norm": 0.2826130459417804, "learning_rate": 3.3254745331470476e-05, "loss": 0.8851, "step": 1733 }, { "epoch": 2.994818652849741, "grad_norm": 0.23780137324935582, "learning_rate": 3.3207201649353176e-05, "loss": 0.8902, "step": 1734 }, { "epoch": 2.996545768566494, "grad_norm": 0.31182737607496996, "learning_rate": 3.315966784669026e-05, "loss": 0.8802, "step": 1735 }, { "epoch": 2.998272884283247, "grad_norm": 0.2791255643361188, "learning_rate": 3.311214399261494e-05, "loss": 0.8734, "step": 1736 }, { "epoch": 3.0, "grad_norm": 0.33673902501150343, "learning_rate": 3.3064630156245965e-05, "loss": 0.8816, "step": 1737 }, { "epoch": 3.001727115716753, "grad_norm": 0.5629070135273784, "learning_rate": 3.3017126406687525e-05, "loss": 0.861, "step": 1738 }, { "epoch": 3.003454231433506, "grad_norm": 0.25225608242634634, "learning_rate": 3.2969632813029126e-05, "loss": 0.8729, "step": 1739 }, { "epoch": 3.005181347150259, "grad_norm": 0.4046718432207937, "learning_rate": 3.292214944434551e-05, "loss": 0.8934, "step": 1740 }, { "epoch": 3.006908462867012, "grad_norm": 0.39396360235788225, "learning_rate": 3.287467636969656e-05, "loss": 0.883, "step": 1741 }, { "epoch": 3.0086355785837653, "grad_norm": 0.3835503782697619, "learning_rate": 3.282721365812716e-05, "loss": 0.858, "step": 1742 }, { "epoch": 3.010362694300518, "grad_norm": 0.36331184641690695, "learning_rate": 3.277976137866714e-05, "loss": 0.863, "step": 1743 }, { "epoch": 3.012089810017271, "grad_norm": 0.26980093531023164, "learning_rate": 3.2732319600331156e-05, "loss": 0.8493, "step": 1744 }, { "epoch": 3.0138169257340244, "grad_norm": 0.316101766473071, "learning_rate": 3.26848883921186e-05, "loss": 0.8574, "step": 1745 }, { "epoch": 3.0155440414507773, "grad_norm": 0.3143876913220166, "learning_rate": 3.263746782301345e-05, "loss": 0.8673, "step": 1746 }, { "epoch": 3.01727115716753, "grad_norm": 0.31737314327921146, "learning_rate": 3.259005796198427e-05, "loss": 0.8771, "step": 1747 }, { "epoch": 3.0189982728842835, "grad_norm": 0.2869960242611334, "learning_rate": 3.254265887798402e-05, "loss": 0.8498, "step": 1748 }, { "epoch": 3.0207253886010363, "grad_norm": 0.35407062667011563, "learning_rate": 3.249527063994996e-05, "loss": 0.8951, "step": 1749 }, { "epoch": 3.0224525043177892, "grad_norm": 0.26794156444377637, "learning_rate": 3.2447893316803637e-05, "loss": 0.8565, "step": 1750 }, { "epoch": 3.024179620034542, "grad_norm": 0.33949869778082176, "learning_rate": 3.240052697745067e-05, "loss": 0.8532, "step": 1751 }, { "epoch": 3.0259067357512954, "grad_norm": 0.28459669548107563, "learning_rate": 3.2353171690780726e-05, "loss": 0.8666, "step": 1752 }, { "epoch": 3.0276338514680483, "grad_norm": 0.34802451072425655, "learning_rate": 3.230582752566737e-05, "loss": 0.8819, "step": 1753 }, { "epoch": 3.029360967184801, "grad_norm": 0.37352920159310116, "learning_rate": 3.225849455096806e-05, "loss": 0.8602, "step": 1754 }, { "epoch": 3.0310880829015545, "grad_norm": 0.2676742327038962, "learning_rate": 3.221117283552391e-05, "loss": 0.8524, "step": 1755 }, { "epoch": 3.0328151986183074, "grad_norm": 0.32083498526370385, "learning_rate": 3.216386244815965e-05, "loss": 0.8662, "step": 1756 }, { "epoch": 3.0345423143350603, "grad_norm": 0.29414587447394247, "learning_rate": 3.211656345768363e-05, "loss": 0.8539, "step": 1757 }, { "epoch": 3.0362694300518136, "grad_norm": 0.29468529963547274, "learning_rate": 3.206927593288752e-05, "loss": 0.8606, "step": 1758 }, { "epoch": 3.0379965457685665, "grad_norm": 0.2720918469371244, "learning_rate": 3.2021999942546344e-05, "loss": 0.8636, "step": 1759 }, { "epoch": 3.0397236614853194, "grad_norm": 0.2918861452206845, "learning_rate": 3.19747355554184e-05, "loss": 0.8716, "step": 1760 }, { "epoch": 3.0414507772020727, "grad_norm": 0.34831617567591433, "learning_rate": 3.1927482840245055e-05, "loss": 0.8596, "step": 1761 }, { "epoch": 3.0431778929188256, "grad_norm": 0.283638019113907, "learning_rate": 3.188024186575069e-05, "loss": 0.8391, "step": 1762 }, { "epoch": 3.0449050086355784, "grad_norm": 0.23782412564091926, "learning_rate": 3.183301270064268e-05, "loss": 0.8625, "step": 1763 }, { "epoch": 3.0466321243523318, "grad_norm": 0.24017842675992632, "learning_rate": 3.1785795413611165e-05, "loss": 0.8624, "step": 1764 }, { "epoch": 3.0483592400690847, "grad_norm": 0.24809874621551603, "learning_rate": 3.173859007332901e-05, "loss": 0.8561, "step": 1765 }, { "epoch": 3.0500863557858375, "grad_norm": 0.2551679232563637, "learning_rate": 3.169139674845173e-05, "loss": 0.8734, "step": 1766 }, { "epoch": 3.051813471502591, "grad_norm": 0.22336437447246332, "learning_rate": 3.164421550761737e-05, "loss": 0.8634, "step": 1767 }, { "epoch": 3.0535405872193437, "grad_norm": 0.3081256535076099, "learning_rate": 3.1597046419446374e-05, "loss": 0.8671, "step": 1768 }, { "epoch": 3.0552677029360966, "grad_norm": 0.28324953843969947, "learning_rate": 3.1549889552541503e-05, "loss": 0.8746, "step": 1769 }, { "epoch": 3.05699481865285, "grad_norm": 0.22081895231723558, "learning_rate": 3.1502744975487804e-05, "loss": 0.8497, "step": 1770 }, { "epoch": 3.058721934369603, "grad_norm": 0.3247835461455866, "learning_rate": 3.145561275685239e-05, "loss": 0.8645, "step": 1771 }, { "epoch": 3.0604490500863557, "grad_norm": 0.2753908323159257, "learning_rate": 3.1408492965184395e-05, "loss": 0.8685, "step": 1772 }, { "epoch": 3.062176165803109, "grad_norm": 0.24970549861187807, "learning_rate": 3.136138566901494e-05, "loss": 0.8603, "step": 1773 }, { "epoch": 3.063903281519862, "grad_norm": 0.28281860601152103, "learning_rate": 3.1314290936856914e-05, "loss": 0.856, "step": 1774 }, { "epoch": 3.065630397236615, "grad_norm": 0.31286631429854933, "learning_rate": 3.1267208837204966e-05, "loss": 0.8568, "step": 1775 }, { "epoch": 3.0673575129533677, "grad_norm": 0.2515666170762475, "learning_rate": 3.1220139438535326e-05, "loss": 0.8682, "step": 1776 }, { "epoch": 3.069084628670121, "grad_norm": 0.26223174104026253, "learning_rate": 3.117308280930584e-05, "loss": 0.8757, "step": 1777 }, { "epoch": 3.070811744386874, "grad_norm": 0.25666581060510035, "learning_rate": 3.1126039017955696e-05, "loss": 0.8514, "step": 1778 }, { "epoch": 3.0725388601036268, "grad_norm": 0.27799265061605044, "learning_rate": 3.107900813290543e-05, "loss": 0.8897, "step": 1779 }, { "epoch": 3.07426597582038, "grad_norm": 0.2804182576351185, "learning_rate": 3.103199022255685e-05, "loss": 0.8669, "step": 1780 }, { "epoch": 3.075993091537133, "grad_norm": 0.2780613009370811, "learning_rate": 3.0984985355292863e-05, "loss": 0.8792, "step": 1781 }, { "epoch": 3.077720207253886, "grad_norm": 0.25784840738893505, "learning_rate": 3.093799359947737e-05, "loss": 0.8463, "step": 1782 }, { "epoch": 3.079447322970639, "grad_norm": 0.2778477740518355, "learning_rate": 3.089101502345528e-05, "loss": 0.8823, "step": 1783 }, { "epoch": 3.081174438687392, "grad_norm": 0.24350400238432618, "learning_rate": 3.084404969555228e-05, "loss": 0.8726, "step": 1784 }, { "epoch": 3.082901554404145, "grad_norm": 0.22893288990158434, "learning_rate": 3.079709768407477e-05, "loss": 0.8633, "step": 1785 }, { "epoch": 3.0846286701208983, "grad_norm": 0.21834847785107883, "learning_rate": 3.075015905730987e-05, "loss": 0.8529, "step": 1786 }, { "epoch": 3.086355785837651, "grad_norm": 0.2536597321783113, "learning_rate": 3.070323388352514e-05, "loss": 0.8603, "step": 1787 }, { "epoch": 3.088082901554404, "grad_norm": 0.2697477274992826, "learning_rate": 3.0656322230968605e-05, "loss": 0.8626, "step": 1788 }, { "epoch": 3.0898100172711573, "grad_norm": 0.2305593835178526, "learning_rate": 3.060942416786865e-05, "loss": 0.8762, "step": 1789 }, { "epoch": 3.09153713298791, "grad_norm": 0.2316119528965227, "learning_rate": 3.0562539762433876e-05, "loss": 0.8482, "step": 1790 }, { "epoch": 3.093264248704663, "grad_norm": 0.27676552392608866, "learning_rate": 3.0515669082853024e-05, "loss": 0.8711, "step": 1791 }, { "epoch": 3.0949913644214164, "grad_norm": 0.3307281909065412, "learning_rate": 3.046881219729485e-05, "loss": 0.8624, "step": 1792 }, { "epoch": 3.0967184801381693, "grad_norm": 0.25888771935184435, "learning_rate": 3.0421969173908078e-05, "loss": 0.896, "step": 1793 }, { "epoch": 3.098445595854922, "grad_norm": 0.23287650151059727, "learning_rate": 3.037514008082127e-05, "loss": 0.8571, "step": 1794 }, { "epoch": 3.1001727115716755, "grad_norm": 0.2458987401804847, "learning_rate": 3.032832498614269e-05, "loss": 0.8752, "step": 1795 }, { "epoch": 3.1018998272884284, "grad_norm": 0.2904593442400198, "learning_rate": 3.0281523957960295e-05, "loss": 0.8545, "step": 1796 }, { "epoch": 3.1036269430051813, "grad_norm": 0.3842023366147268, "learning_rate": 3.023473706434155e-05, "loss": 0.8529, "step": 1797 }, { "epoch": 3.105354058721934, "grad_norm": 0.31594377763694365, "learning_rate": 3.0187964373333378e-05, "loss": 0.8684, "step": 1798 }, { "epoch": 3.1070811744386875, "grad_norm": 0.39167258765085383, "learning_rate": 3.0141205952962013e-05, "loss": 0.8492, "step": 1799 }, { "epoch": 3.1088082901554404, "grad_norm": 0.23248871868424573, "learning_rate": 3.0094461871232995e-05, "loss": 0.8618, "step": 1800 }, { "epoch": 3.1105354058721932, "grad_norm": 0.348681256596166, "learning_rate": 3.0047732196130958e-05, "loss": 0.8708, "step": 1801 }, { "epoch": 3.1122625215889466, "grad_norm": 0.27418206400956713, "learning_rate": 3.0001016995619582e-05, "loss": 0.8444, "step": 1802 }, { "epoch": 3.1139896373056994, "grad_norm": 0.3015191041150421, "learning_rate": 2.9954316337641552e-05, "loss": 0.8777, "step": 1803 }, { "epoch": 3.1157167530224523, "grad_norm": 0.24974851545221022, "learning_rate": 2.990763029011834e-05, "loss": 0.856, "step": 1804 }, { "epoch": 3.1174438687392056, "grad_norm": 0.3131255902680605, "learning_rate": 2.9860958920950175e-05, "loss": 0.8683, "step": 1805 }, { "epoch": 3.1191709844559585, "grad_norm": 0.35006882342995227, "learning_rate": 2.981430229801599e-05, "loss": 0.8427, "step": 1806 }, { "epoch": 3.1208981001727114, "grad_norm": 0.29433872063575345, "learning_rate": 2.9767660489173216e-05, "loss": 0.858, "step": 1807 }, { "epoch": 3.1226252158894647, "grad_norm": 0.22057670485606704, "learning_rate": 2.9721033562257752e-05, "loss": 0.8564, "step": 1808 }, { "epoch": 3.1243523316062176, "grad_norm": 0.26139494390100254, "learning_rate": 2.9674421585083874e-05, "loss": 0.8667, "step": 1809 }, { "epoch": 3.1260794473229705, "grad_norm": 0.26207002709303445, "learning_rate": 2.96278246254441e-05, "loss": 0.8599, "step": 1810 }, { "epoch": 3.127806563039724, "grad_norm": 0.29831920221776437, "learning_rate": 2.958124275110909e-05, "loss": 0.8554, "step": 1811 }, { "epoch": 3.1295336787564767, "grad_norm": 0.2603815735940368, "learning_rate": 2.953467602982758e-05, "loss": 0.8484, "step": 1812 }, { "epoch": 3.1312607944732296, "grad_norm": 0.27046585904277515, "learning_rate": 2.9488124529326287e-05, "loss": 0.8617, "step": 1813 }, { "epoch": 3.132987910189983, "grad_norm": 0.22142640305474245, "learning_rate": 2.9441588317309767e-05, "loss": 0.8743, "step": 1814 }, { "epoch": 3.134715025906736, "grad_norm": 0.3230540580811928, "learning_rate": 2.9395067461460313e-05, "loss": 0.8595, "step": 1815 }, { "epoch": 3.1364421416234887, "grad_norm": 0.25020507864718095, "learning_rate": 2.934856202943796e-05, "loss": 0.8696, "step": 1816 }, { "epoch": 3.138169257340242, "grad_norm": 0.265811282126277, "learning_rate": 2.9302072088880247e-05, "loss": 0.8621, "step": 1817 }, { "epoch": 3.139896373056995, "grad_norm": 0.2647374537536464, "learning_rate": 2.9255597707402185e-05, "loss": 0.8636, "step": 1818 }, { "epoch": 3.1416234887737478, "grad_norm": 0.3087567738046283, "learning_rate": 2.9209138952596202e-05, "loss": 0.8618, "step": 1819 }, { "epoch": 3.143350604490501, "grad_norm": 0.2696015980736788, "learning_rate": 2.916269589203196e-05, "loss": 0.8674, "step": 1820 }, { "epoch": 3.145077720207254, "grad_norm": 0.3774663284156344, "learning_rate": 2.9116268593256296e-05, "loss": 0.8821, "step": 1821 }, { "epoch": 3.146804835924007, "grad_norm": 0.23832791561780556, "learning_rate": 2.906985712379312e-05, "loss": 0.8476, "step": 1822 }, { "epoch": 3.14853195164076, "grad_norm": 0.32082516418503065, "learning_rate": 2.9023461551143354e-05, "loss": 0.8727, "step": 1823 }, { "epoch": 3.150259067357513, "grad_norm": 0.24180707471568177, "learning_rate": 2.8977081942784776e-05, "loss": 0.8722, "step": 1824 }, { "epoch": 3.151986183074266, "grad_norm": 0.3214627850417722, "learning_rate": 2.8930718366171916e-05, "loss": 0.8555, "step": 1825 }, { "epoch": 3.153713298791019, "grad_norm": 0.3265267926774761, "learning_rate": 2.888437088873606e-05, "loss": 0.8948, "step": 1826 }, { "epoch": 3.155440414507772, "grad_norm": 0.27166906002732255, "learning_rate": 2.883803957788502e-05, "loss": 0.856, "step": 1827 }, { "epoch": 3.157167530224525, "grad_norm": 0.24894345426092648, "learning_rate": 2.8791724501003088e-05, "loss": 0.8724, "step": 1828 }, { "epoch": 3.158894645941278, "grad_norm": 0.21198748426891387, "learning_rate": 2.8745425725451003e-05, "loss": 0.8807, "step": 1829 }, { "epoch": 3.160621761658031, "grad_norm": 0.2879791562176034, "learning_rate": 2.869914331856575e-05, "loss": 0.8519, "step": 1830 }, { "epoch": 3.162348877374784, "grad_norm": 0.2540392829873994, "learning_rate": 2.8652877347660504e-05, "loss": 0.88, "step": 1831 }, { "epoch": 3.164075993091537, "grad_norm": 0.2667529082650539, "learning_rate": 2.8606627880024578e-05, "loss": 0.8641, "step": 1832 }, { "epoch": 3.1658031088082903, "grad_norm": 0.2180900782380449, "learning_rate": 2.8560394982923248e-05, "loss": 0.8688, "step": 1833 }, { "epoch": 3.167530224525043, "grad_norm": 0.24367879610364857, "learning_rate": 2.8514178723597684e-05, "loss": 0.8691, "step": 1834 }, { "epoch": 3.169257340241796, "grad_norm": 0.2979272495056228, "learning_rate": 2.8467979169264866e-05, "loss": 0.8585, "step": 1835 }, { "epoch": 3.1709844559585494, "grad_norm": 0.3152073823946475, "learning_rate": 2.842179638711751e-05, "loss": 0.8585, "step": 1836 }, { "epoch": 3.1727115716753023, "grad_norm": 0.21002082210736592, "learning_rate": 2.8375630444323898e-05, "loss": 0.8598, "step": 1837 }, { "epoch": 3.174438687392055, "grad_norm": 0.3629409815772088, "learning_rate": 2.8329481408027814e-05, "loss": 0.8758, "step": 1838 }, { "epoch": 3.1761658031088085, "grad_norm": 0.39306398483019755, "learning_rate": 2.8283349345348514e-05, "loss": 0.8601, "step": 1839 }, { "epoch": 3.1778929188255614, "grad_norm": 0.4178716548906831, "learning_rate": 2.8237234323380504e-05, "loss": 0.8716, "step": 1840 }, { "epoch": 3.1796200345423142, "grad_norm": 0.30951634210078954, "learning_rate": 2.8191136409193513e-05, "loss": 0.8521, "step": 1841 }, { "epoch": 3.1813471502590676, "grad_norm": 0.2580829327849351, "learning_rate": 2.814505566983245e-05, "loss": 0.8545, "step": 1842 }, { "epoch": 3.1830742659758204, "grad_norm": 0.33122880316897113, "learning_rate": 2.8098992172317165e-05, "loss": 0.8876, "step": 1843 }, { "epoch": 3.1848013816925733, "grad_norm": 0.3184998720885938, "learning_rate": 2.8052945983642472e-05, "loss": 0.8529, "step": 1844 }, { "epoch": 3.186528497409326, "grad_norm": 0.33585242475202076, "learning_rate": 2.8006917170778005e-05, "loss": 0.8727, "step": 1845 }, { "epoch": 3.1882556131260795, "grad_norm": 0.2748959675504545, "learning_rate": 2.796090580066813e-05, "loss": 0.864, "step": 1846 }, { "epoch": 3.1899827288428324, "grad_norm": 0.31686082898633544, "learning_rate": 2.791491194023185e-05, "loss": 0.8574, "step": 1847 }, { "epoch": 3.1917098445595853, "grad_norm": 0.4124597459409555, "learning_rate": 2.7868935656362676e-05, "loss": 0.8672, "step": 1848 }, { "epoch": 3.1934369602763386, "grad_norm": 0.5616293379570743, "learning_rate": 2.7822977015928575e-05, "loss": 0.8543, "step": 1849 }, { "epoch": 3.1951640759930915, "grad_norm": 0.40359510889678757, "learning_rate": 2.7777036085771868e-05, "loss": 0.875, "step": 1850 }, { "epoch": 3.1968911917098444, "grad_norm": 0.4730955248481481, "learning_rate": 2.7731112932709082e-05, "loss": 0.876, "step": 1851 }, { "epoch": 3.1986183074265977, "grad_norm": 0.6850602088386744, "learning_rate": 2.7685207623530913e-05, "loss": 0.886, "step": 1852 }, { "epoch": 3.2003454231433506, "grad_norm": 0.5656351916123276, "learning_rate": 2.7639320225002108e-05, "loss": 0.863, "step": 1853 }, { "epoch": 3.2020725388601035, "grad_norm": 0.47832626505281967, "learning_rate": 2.759345080386134e-05, "loss": 0.8675, "step": 1854 }, { "epoch": 3.203799654576857, "grad_norm": 0.39736671178440225, "learning_rate": 2.7547599426821167e-05, "loss": 0.8791, "step": 1855 }, { "epoch": 3.2055267702936097, "grad_norm": 0.5936948540471869, "learning_rate": 2.7501766160567894e-05, "loss": 0.8537, "step": 1856 }, { "epoch": 3.2072538860103625, "grad_norm": 0.7584363870727038, "learning_rate": 2.7455951071761467e-05, "loss": 0.8716, "step": 1857 }, { "epoch": 3.208981001727116, "grad_norm": 0.6253606983400484, "learning_rate": 2.7410154227035396e-05, "loss": 0.854, "step": 1858 }, { "epoch": 3.2107081174438687, "grad_norm": 0.30548735370952634, "learning_rate": 2.73643756929967e-05, "loss": 0.8669, "step": 1859 }, { "epoch": 3.2124352331606216, "grad_norm": 0.5717096882101409, "learning_rate": 2.7318615536225724e-05, "loss": 0.8837, "step": 1860 }, { "epoch": 3.214162348877375, "grad_norm": 0.4043988924393432, "learning_rate": 2.7272873823276072e-05, "loss": 0.8806, "step": 1861 }, { "epoch": 3.215889464594128, "grad_norm": 0.3608162916409269, "learning_rate": 2.722715062067459e-05, "loss": 0.8498, "step": 1862 }, { "epoch": 3.2176165803108807, "grad_norm": 0.33944463741018416, "learning_rate": 2.7181445994921136e-05, "loss": 0.8628, "step": 1863 }, { "epoch": 3.219343696027634, "grad_norm": 0.5746563182630707, "learning_rate": 2.7135760012488565e-05, "loss": 0.8619, "step": 1864 }, { "epoch": 3.221070811744387, "grad_norm": 0.6765660443820559, "learning_rate": 2.709009273982266e-05, "loss": 0.8627, "step": 1865 }, { "epoch": 3.22279792746114, "grad_norm": 0.4707155574849479, "learning_rate": 2.7044444243341936e-05, "loss": 0.8539, "step": 1866 }, { "epoch": 3.224525043177893, "grad_norm": 0.33215993616693507, "learning_rate": 2.699881458943763e-05, "loss": 0.8551, "step": 1867 }, { "epoch": 3.226252158894646, "grad_norm": 0.5007749813733136, "learning_rate": 2.695320384447355e-05, "loss": 0.8678, "step": 1868 }, { "epoch": 3.227979274611399, "grad_norm": 0.6285162954292136, "learning_rate": 2.6907612074786046e-05, "loss": 0.8872, "step": 1869 }, { "epoch": 3.229706390328152, "grad_norm": 0.6647223535497819, "learning_rate": 2.6862039346683837e-05, "loss": 0.9111, "step": 1870 }, { "epoch": 3.231433506044905, "grad_norm": 0.3396975103511857, "learning_rate": 2.6816485726447933e-05, "loss": 0.8965, "step": 1871 }, { "epoch": 3.233160621761658, "grad_norm": 0.4284167381412009, "learning_rate": 2.6770951280331606e-05, "loss": 0.8739, "step": 1872 }, { "epoch": 3.234887737478411, "grad_norm": 0.30440279535430836, "learning_rate": 2.672543607456021e-05, "loss": 0.8621, "step": 1873 }, { "epoch": 3.236614853195164, "grad_norm": 0.3034384420811119, "learning_rate": 2.6679940175331088e-05, "loss": 0.8687, "step": 1874 }, { "epoch": 3.238341968911917, "grad_norm": 0.41024281403358914, "learning_rate": 2.6634463648813566e-05, "loss": 0.8817, "step": 1875 }, { "epoch": 3.24006908462867, "grad_norm": 0.35021337360417343, "learning_rate": 2.6589006561148742e-05, "loss": 0.8875, "step": 1876 }, { "epoch": 3.2417962003454233, "grad_norm": 0.22097412985041923, "learning_rate": 2.654356897844947e-05, "loss": 0.8884, "step": 1877 }, { "epoch": 3.243523316062176, "grad_norm": 0.33088321927880504, "learning_rate": 2.6498150966800204e-05, "loss": 0.8574, "step": 1878 }, { "epoch": 3.245250431778929, "grad_norm": 0.309531819870468, "learning_rate": 2.6452752592256987e-05, "loss": 0.8559, "step": 1879 }, { "epoch": 3.2469775474956823, "grad_norm": 0.22785593435805276, "learning_rate": 2.6407373920847264e-05, "loss": 0.8679, "step": 1880 }, { "epoch": 3.2487046632124352, "grad_norm": 0.26423678137104484, "learning_rate": 2.636201501856981e-05, "loss": 0.8655, "step": 1881 }, { "epoch": 3.250431778929188, "grad_norm": 0.3812029820641495, "learning_rate": 2.6316675951394698e-05, "loss": 0.8731, "step": 1882 }, { "epoch": 3.2521588946459414, "grad_norm": 0.34065184879313853, "learning_rate": 2.627135678526311e-05, "loss": 0.8707, "step": 1883 }, { "epoch": 3.2538860103626943, "grad_norm": 0.21066383137464653, "learning_rate": 2.6226057586087282e-05, "loss": 0.879, "step": 1884 }, { "epoch": 3.255613126079447, "grad_norm": 0.2994416124210764, "learning_rate": 2.618077841975046e-05, "loss": 0.8525, "step": 1885 }, { "epoch": 3.2573402417962005, "grad_norm": 0.35590036153182025, "learning_rate": 2.6135519352106703e-05, "loss": 0.8601, "step": 1886 }, { "epoch": 3.2590673575129534, "grad_norm": 0.24986015712555784, "learning_rate": 2.6090280448980837e-05, "loss": 0.8777, "step": 1887 }, { "epoch": 3.2607944732297063, "grad_norm": 0.22488304500448683, "learning_rate": 2.604506177616841e-05, "loss": 0.8722, "step": 1888 }, { "epoch": 3.2625215889464596, "grad_norm": 0.2924332802658415, "learning_rate": 2.5999863399435503e-05, "loss": 0.8716, "step": 1889 }, { "epoch": 3.2642487046632125, "grad_norm": 0.27963387785827026, "learning_rate": 2.595468538451869e-05, "loss": 0.8538, "step": 1890 }, { "epoch": 3.2659758203799654, "grad_norm": 0.2172624510295093, "learning_rate": 2.590952779712492e-05, "loss": 0.8656, "step": 1891 }, { "epoch": 3.2677029360967182, "grad_norm": 0.3012076533723449, "learning_rate": 2.586439070293147e-05, "loss": 0.8691, "step": 1892 }, { "epoch": 3.2694300518134716, "grad_norm": 0.27377604966197927, "learning_rate": 2.581927416758577e-05, "loss": 0.8784, "step": 1893 }, { "epoch": 3.2711571675302245, "grad_norm": 0.2950961240966675, "learning_rate": 2.5774178256705354e-05, "loss": 0.8753, "step": 1894 }, { "epoch": 3.2728842832469773, "grad_norm": 0.23072753808554514, "learning_rate": 2.5729103035877804e-05, "loss": 0.8565, "step": 1895 }, { "epoch": 3.2746113989637307, "grad_norm": 0.2708894315686548, "learning_rate": 2.5684048570660555e-05, "loss": 0.8672, "step": 1896 }, { "epoch": 3.2763385146804835, "grad_norm": 0.24309374278024776, "learning_rate": 2.5639014926580862e-05, "loss": 0.8485, "step": 1897 }, { "epoch": 3.2780656303972364, "grad_norm": 0.2499947282203489, "learning_rate": 2.559400216913575e-05, "loss": 0.8642, "step": 1898 }, { "epoch": 3.2797927461139897, "grad_norm": 0.2651906812025179, "learning_rate": 2.5549010363791807e-05, "loss": 0.8607, "step": 1899 }, { "epoch": 3.2815198618307426, "grad_norm": 0.2502703297015237, "learning_rate": 2.5504039575985174e-05, "loss": 0.8686, "step": 1900 }, { "epoch": 3.2832469775474955, "grad_norm": 0.2192677765874483, "learning_rate": 2.5459089871121417e-05, "loss": 0.8764, "step": 1901 }, { "epoch": 3.284974093264249, "grad_norm": 0.2725026356296654, "learning_rate": 2.5414161314575444e-05, "loss": 0.8529, "step": 1902 }, { "epoch": 3.2867012089810017, "grad_norm": 0.22135106802145496, "learning_rate": 2.5369253971691422e-05, "loss": 0.8803, "step": 1903 }, { "epoch": 3.2884283246977546, "grad_norm": 0.27932487474518114, "learning_rate": 2.532436790778262e-05, "loss": 0.8686, "step": 1904 }, { "epoch": 3.290155440414508, "grad_norm": 0.24106258545490192, "learning_rate": 2.5279503188131394e-05, "loss": 0.8614, "step": 1905 }, { "epoch": 3.291882556131261, "grad_norm": 0.28882263427833826, "learning_rate": 2.523465987798906e-05, "loss": 0.8777, "step": 1906 }, { "epoch": 3.2936096718480137, "grad_norm": 0.23416372308816583, "learning_rate": 2.5189838042575763e-05, "loss": 0.8643, "step": 1907 }, { "epoch": 3.295336787564767, "grad_norm": 0.25251673830362525, "learning_rate": 2.514503774708045e-05, "loss": 0.8701, "step": 1908 }, { "epoch": 3.29706390328152, "grad_norm": 0.2634215924656164, "learning_rate": 2.5100259056660733e-05, "loss": 0.8636, "step": 1909 }, { "epoch": 3.2987910189982728, "grad_norm": 0.28087047333220166, "learning_rate": 2.5055502036442776e-05, "loss": 0.8535, "step": 1910 }, { "epoch": 3.300518134715026, "grad_norm": 0.2521379641698137, "learning_rate": 2.501076675152126e-05, "loss": 0.8657, "step": 1911 }, { "epoch": 3.302245250431779, "grad_norm": 0.2932937905129332, "learning_rate": 2.4966053266959235e-05, "loss": 0.8723, "step": 1912 }, { "epoch": 3.303972366148532, "grad_norm": 0.320314086494674, "learning_rate": 2.492136164778805e-05, "loss": 0.8756, "step": 1913 }, { "epoch": 3.305699481865285, "grad_norm": 0.3260790435304739, "learning_rate": 2.4876691959007227e-05, "loss": 0.8736, "step": 1914 }, { "epoch": 3.307426597582038, "grad_norm": 0.33718061920939196, "learning_rate": 2.4832044265584462e-05, "loss": 0.863, "step": 1915 }, { "epoch": 3.309153713298791, "grad_norm": 0.32887009768595593, "learning_rate": 2.4787418632455397e-05, "loss": 0.8718, "step": 1916 }, { "epoch": 3.3108808290155443, "grad_norm": 0.44412796190366516, "learning_rate": 2.4742815124523585e-05, "loss": 0.8669, "step": 1917 }, { "epoch": 3.312607944732297, "grad_norm": 0.22395334690228655, "learning_rate": 2.4698233806660454e-05, "loss": 0.8801, "step": 1918 }, { "epoch": 3.31433506044905, "grad_norm": 0.3687199474198829, "learning_rate": 2.465367474370512e-05, "loss": 0.8783, "step": 1919 }, { "epoch": 3.3160621761658033, "grad_norm": 0.3450599781867232, "learning_rate": 2.460913800046432e-05, "loss": 0.8578, "step": 1920 }, { "epoch": 3.3177892918825562, "grad_norm": 0.31267106488351215, "learning_rate": 2.4564623641712383e-05, "loss": 0.8776, "step": 1921 }, { "epoch": 3.319516407599309, "grad_norm": 0.24244448713951727, "learning_rate": 2.4520131732191032e-05, "loss": 0.8599, "step": 1922 }, { "epoch": 3.321243523316062, "grad_norm": 0.30655514935371647, "learning_rate": 2.4475662336609348e-05, "loss": 0.8792, "step": 1923 }, { "epoch": 3.3229706390328153, "grad_norm": 0.35255889147749875, "learning_rate": 2.443121551964367e-05, "loss": 0.8853, "step": 1924 }, { "epoch": 3.324697754749568, "grad_norm": 0.2656342729732186, "learning_rate": 2.438679134593753e-05, "loss": 0.8592, "step": 1925 }, { "epoch": 3.326424870466321, "grad_norm": 0.33466604252685617, "learning_rate": 2.4342389880101482e-05, "loss": 0.8855, "step": 1926 }, { "epoch": 3.3281519861830744, "grad_norm": 0.2677974312781846, "learning_rate": 2.4298011186713065e-05, "loss": 0.87, "step": 1927 }, { "epoch": 3.3298791018998273, "grad_norm": 0.33057836620947856, "learning_rate": 2.4253655330316735e-05, "loss": 0.8721, "step": 1928 }, { "epoch": 3.33160621761658, "grad_norm": 0.20214413329761294, "learning_rate": 2.4209322375423687e-05, "loss": 0.8549, "step": 1929 }, { "epoch": 3.3333333333333335, "grad_norm": 0.29626147891072746, "learning_rate": 2.416501238651181e-05, "loss": 0.8597, "step": 1930 }, { "epoch": 3.3350604490500864, "grad_norm": 0.2846213549878557, "learning_rate": 2.4120725428025643e-05, "loss": 0.861, "step": 1931 }, { "epoch": 3.3367875647668392, "grad_norm": 0.25623867036629044, "learning_rate": 2.4076461564376175e-05, "loss": 0.878, "step": 1932 }, { "epoch": 3.3385146804835926, "grad_norm": 0.24034626082786045, "learning_rate": 2.40322208599408e-05, "loss": 0.8585, "step": 1933 }, { "epoch": 3.3402417962003454, "grad_norm": 0.27060624196938765, "learning_rate": 2.3988003379063303e-05, "loss": 0.9034, "step": 1934 }, { "epoch": 3.3419689119170983, "grad_norm": 0.2406741749841719, "learning_rate": 2.3943809186053613e-05, "loss": 0.8561, "step": 1935 }, { "epoch": 3.3436960276338517, "grad_norm": 0.2601150278039695, "learning_rate": 2.3899638345187825e-05, "loss": 0.8672, "step": 1936 }, { "epoch": 3.3454231433506045, "grad_norm": 0.29867105031783264, "learning_rate": 2.3855490920708047e-05, "loss": 0.8657, "step": 1937 }, { "epoch": 3.3471502590673574, "grad_norm": 0.2772600907429787, "learning_rate": 2.381136697682238e-05, "loss": 0.8611, "step": 1938 }, { "epoch": 3.3488773747841103, "grad_norm": 0.25181208092238083, "learning_rate": 2.376726657770473e-05, "loss": 0.8735, "step": 1939 }, { "epoch": 3.3506044905008636, "grad_norm": 0.26018500726893506, "learning_rate": 2.3723189787494754e-05, "loss": 0.8709, "step": 1940 }, { "epoch": 3.3523316062176165, "grad_norm": 0.3317845554461469, "learning_rate": 2.3679136670297826e-05, "loss": 0.8592, "step": 1941 }, { "epoch": 3.3540587219343694, "grad_norm": 0.23686267195265504, "learning_rate": 2.363510729018483e-05, "loss": 0.8462, "step": 1942 }, { "epoch": 3.3557858376511227, "grad_norm": 0.25396106003915475, "learning_rate": 2.3591101711192144e-05, "loss": 0.8832, "step": 1943 }, { "epoch": 3.3575129533678756, "grad_norm": 0.2739362847295367, "learning_rate": 2.3547119997321558e-05, "loss": 0.8591, "step": 1944 }, { "epoch": 3.3592400690846285, "grad_norm": 0.2745826341345275, "learning_rate": 2.3503162212540128e-05, "loss": 0.876, "step": 1945 }, { "epoch": 3.360967184801382, "grad_norm": 0.18764263058068453, "learning_rate": 2.345922842078011e-05, "loss": 0.8696, "step": 1946 }, { "epoch": 3.3626943005181347, "grad_norm": 0.28288559588818724, "learning_rate": 2.341531868593884e-05, "loss": 0.8745, "step": 1947 }, { "epoch": 3.3644214162348876, "grad_norm": 0.19171805254804752, "learning_rate": 2.337143307187873e-05, "loss": 0.8711, "step": 1948 }, { "epoch": 3.366148531951641, "grad_norm": 0.285096757450517, "learning_rate": 2.3327571642427044e-05, "loss": 0.8684, "step": 1949 }, { "epoch": 3.3678756476683938, "grad_norm": 0.31873025074128425, "learning_rate": 2.32837344613759e-05, "loss": 0.8637, "step": 1950 }, { "epoch": 3.3696027633851466, "grad_norm": 0.2615952823921696, "learning_rate": 2.323992159248218e-05, "loss": 0.8766, "step": 1951 }, { "epoch": 3.3713298791019, "grad_norm": 0.18430101588364708, "learning_rate": 2.3196133099467347e-05, "loss": 0.8717, "step": 1952 }, { "epoch": 3.373056994818653, "grad_norm": 0.2225925403215445, "learning_rate": 2.3152369046017434e-05, "loss": 0.8415, "step": 1953 }, { "epoch": 3.3747841105354057, "grad_norm": 0.25698982591153025, "learning_rate": 2.3108629495782963e-05, "loss": 0.8583, "step": 1954 }, { "epoch": 3.376511226252159, "grad_norm": 0.2197304110023439, "learning_rate": 2.306491451237877e-05, "loss": 0.8567, "step": 1955 }, { "epoch": 3.378238341968912, "grad_norm": 0.22560733910975692, "learning_rate": 2.3021224159383972e-05, "loss": 0.8791, "step": 1956 }, { "epoch": 3.379965457685665, "grad_norm": 0.23740532925871743, "learning_rate": 2.2977558500341898e-05, "loss": 0.8743, "step": 1957 }, { "epoch": 3.381692573402418, "grad_norm": 0.23787397006862135, "learning_rate": 2.2933917598759927e-05, "loss": 0.8703, "step": 1958 }, { "epoch": 3.383419689119171, "grad_norm": 0.21932539190679204, "learning_rate": 2.289030151810944e-05, "loss": 0.8534, "step": 1959 }, { "epoch": 3.385146804835924, "grad_norm": 0.272584520984078, "learning_rate": 2.28467103218257e-05, "loss": 0.8811, "step": 1960 }, { "epoch": 3.386873920552677, "grad_norm": 0.2894811829274617, "learning_rate": 2.2803144073307827e-05, "loss": 0.8525, "step": 1961 }, { "epoch": 3.38860103626943, "grad_norm": 0.21243640463883168, "learning_rate": 2.2759602835918618e-05, "loss": 0.8639, "step": 1962 }, { "epoch": 3.390328151986183, "grad_norm": 0.25210592527246134, "learning_rate": 2.2716086672984484e-05, "loss": 0.8781, "step": 1963 }, { "epoch": 3.3920552677029363, "grad_norm": 0.23272420820384124, "learning_rate": 2.2672595647795416e-05, "loss": 0.8629, "step": 1964 }, { "epoch": 3.393782383419689, "grad_norm": 0.20031109951236595, "learning_rate": 2.2629129823604805e-05, "loss": 0.8389, "step": 1965 }, { "epoch": 3.395509499136442, "grad_norm": 0.28065439125551767, "learning_rate": 2.2585689263629372e-05, "loss": 0.8649, "step": 1966 }, { "epoch": 3.3972366148531954, "grad_norm": 0.2298114967131551, "learning_rate": 2.254227403104917e-05, "loss": 0.8611, "step": 1967 }, { "epoch": 3.3989637305699483, "grad_norm": 0.2778550574803295, "learning_rate": 2.2498884189007337e-05, "loss": 0.8657, "step": 1968 }, { "epoch": 3.400690846286701, "grad_norm": 0.30216363114363853, "learning_rate": 2.2455519800610118e-05, "loss": 0.8713, "step": 1969 }, { "epoch": 3.4024179620034545, "grad_norm": 0.21971385966294366, "learning_rate": 2.2412180928926722e-05, "loss": 0.8736, "step": 1970 }, { "epoch": 3.4041450777202074, "grad_norm": 0.283251453613926, "learning_rate": 2.2368867636989283e-05, "loss": 0.8709, "step": 1971 }, { "epoch": 3.4058721934369602, "grad_norm": 0.25164111840730374, "learning_rate": 2.23255799877927e-05, "loss": 0.8596, "step": 1972 }, { "epoch": 3.407599309153713, "grad_norm": 0.23587403982640548, "learning_rate": 2.228231804429456e-05, "loss": 0.8715, "step": 1973 }, { "epoch": 3.4093264248704664, "grad_norm": 0.3485182690798112, "learning_rate": 2.2239081869415112e-05, "loss": 0.8699, "step": 1974 }, { "epoch": 3.4110535405872193, "grad_norm": 0.3784403464412606, "learning_rate": 2.21958715260371e-05, "loss": 0.8614, "step": 1975 }, { "epoch": 3.412780656303972, "grad_norm": 0.24807726679325745, "learning_rate": 2.2152687077005686e-05, "loss": 0.8754, "step": 1976 }, { "epoch": 3.4145077720207255, "grad_norm": 0.3091453976806629, "learning_rate": 2.2109528585128414e-05, "loss": 0.84, "step": 1977 }, { "epoch": 3.4162348877374784, "grad_norm": 0.2969218387148511, "learning_rate": 2.2066396113175025e-05, "loss": 0.8656, "step": 1978 }, { "epoch": 3.4179620034542313, "grad_norm": 2.4527514450205925, "learning_rate": 2.2023289723877436e-05, "loss": 0.8791, "step": 1979 }, { "epoch": 3.4196891191709846, "grad_norm": 0.38403061744180766, "learning_rate": 2.198020947992966e-05, "loss": 0.8875, "step": 1980 }, { "epoch": 3.4214162348877375, "grad_norm": 0.49556474471857564, "learning_rate": 2.1937155443987638e-05, "loss": 0.8739, "step": 1981 }, { "epoch": 3.4231433506044904, "grad_norm": 0.31193254836082335, "learning_rate": 2.1894127678669213e-05, "loss": 0.8893, "step": 1982 }, { "epoch": 3.4248704663212437, "grad_norm": 0.38715106028082963, "learning_rate": 2.1851126246554014e-05, "loss": 0.8755, "step": 1983 }, { "epoch": 3.4265975820379966, "grad_norm": 0.4469519564930913, "learning_rate": 2.1808151210183404e-05, "loss": 0.8759, "step": 1984 }, { "epoch": 3.4283246977547495, "grad_norm": 0.2590732424882825, "learning_rate": 2.176520263206031e-05, "loss": 0.8822, "step": 1985 }, { "epoch": 3.4300518134715023, "grad_norm": 0.39582335115185857, "learning_rate": 2.1722280574649183e-05, "loss": 0.8647, "step": 1986 }, { "epoch": 3.4317789291882557, "grad_norm": 0.33790799525159976, "learning_rate": 2.167938510037595e-05, "loss": 0.8715, "step": 1987 }, { "epoch": 3.4335060449050085, "grad_norm": 0.27657664523354686, "learning_rate": 2.163651627162783e-05, "loss": 0.865, "step": 1988 }, { "epoch": 3.4352331606217614, "grad_norm": 0.3008503743833561, "learning_rate": 2.1593674150753276e-05, "loss": 0.8639, "step": 1989 }, { "epoch": 3.4369602763385148, "grad_norm": 0.3496488873331267, "learning_rate": 2.1550858800061967e-05, "loss": 0.8699, "step": 1990 }, { "epoch": 3.4386873920552676, "grad_norm": 0.23608808248972493, "learning_rate": 2.1508070281824568e-05, "loss": 0.8794, "step": 1991 }, { "epoch": 3.4404145077720205, "grad_norm": 0.3581903245573637, "learning_rate": 2.146530865827277e-05, "loss": 0.8547, "step": 1992 }, { "epoch": 3.442141623488774, "grad_norm": 0.2734329406916248, "learning_rate": 2.1422573991599113e-05, "loss": 0.8706, "step": 1993 }, { "epoch": 3.4438687392055267, "grad_norm": 0.2804340227963005, "learning_rate": 2.1379866343956968e-05, "loss": 0.8547, "step": 1994 }, { "epoch": 3.4455958549222796, "grad_norm": 0.2589126145713421, "learning_rate": 2.1337185777460387e-05, "loss": 0.8628, "step": 1995 }, { "epoch": 3.447322970639033, "grad_norm": 0.24158238910075436, "learning_rate": 2.1294532354184027e-05, "loss": 0.8657, "step": 1996 }, { "epoch": 3.449050086355786, "grad_norm": 0.2502634262691051, "learning_rate": 2.12519061361631e-05, "loss": 0.8635, "step": 1997 }, { "epoch": 3.4507772020725387, "grad_norm": 0.20825605863394597, "learning_rate": 2.120930718539322e-05, "loss": 0.8666, "step": 1998 }, { "epoch": 3.452504317789292, "grad_norm": 0.27868661376585835, "learning_rate": 2.1166735563830335e-05, "loss": 0.8641, "step": 1999 }, { "epoch": 3.454231433506045, "grad_norm": 0.2178164264104592, "learning_rate": 2.1124191333390703e-05, "loss": 0.8674, "step": 2000 }, { "epoch": 3.4559585492227978, "grad_norm": 0.27148450712584776, "learning_rate": 2.1081674555950685e-05, "loss": 0.8632, "step": 2001 }, { "epoch": 3.457685664939551, "grad_norm": 0.2761246066147608, "learning_rate": 2.1039185293346716e-05, "loss": 0.8893, "step": 2002 }, { "epoch": 3.459412780656304, "grad_norm": 0.22861278015580522, "learning_rate": 2.0996723607375273e-05, "loss": 0.8793, "step": 2003 }, { "epoch": 3.461139896373057, "grad_norm": 0.20061633374217908, "learning_rate": 2.0954289559792664e-05, "loss": 0.861, "step": 2004 }, { "epoch": 3.46286701208981, "grad_norm": 0.22660452755323335, "learning_rate": 2.0911883212315018e-05, "loss": 0.858, "step": 2005 }, { "epoch": 3.464594127806563, "grad_norm": 0.21067139815035832, "learning_rate": 2.0869504626618164e-05, "loss": 0.8527, "step": 2006 }, { "epoch": 3.466321243523316, "grad_norm": 0.211961886883435, "learning_rate": 2.082715386433761e-05, "loss": 0.8562, "step": 2007 }, { "epoch": 3.4680483592400693, "grad_norm": 0.22669711727547226, "learning_rate": 2.0784830987068334e-05, "loss": 0.8609, "step": 2008 }, { "epoch": 3.469775474956822, "grad_norm": 0.26366325958473197, "learning_rate": 2.0742536056364762e-05, "loss": 0.8778, "step": 2009 }, { "epoch": 3.471502590673575, "grad_norm": 0.17869579696282414, "learning_rate": 2.0700269133740737e-05, "loss": 0.8546, "step": 2010 }, { "epoch": 3.4732297063903284, "grad_norm": 0.23438811245200378, "learning_rate": 2.0658030280669297e-05, "loss": 0.8751, "step": 2011 }, { "epoch": 3.4749568221070812, "grad_norm": 0.18875791309930007, "learning_rate": 2.0615819558582672e-05, "loss": 0.8636, "step": 2012 }, { "epoch": 3.476683937823834, "grad_norm": 0.1880725436436509, "learning_rate": 2.057363702887222e-05, "loss": 0.8519, "step": 2013 }, { "epoch": 3.4784110535405874, "grad_norm": 0.18350898392128792, "learning_rate": 2.053148275288825e-05, "loss": 0.8527, "step": 2014 }, { "epoch": 3.4801381692573403, "grad_norm": 0.18461193058249972, "learning_rate": 2.048935679193999e-05, "loss": 0.8617, "step": 2015 }, { "epoch": 3.481865284974093, "grad_norm": 0.18384777699774166, "learning_rate": 2.0447259207295475e-05, "loss": 0.86, "step": 2016 }, { "epoch": 3.4835924006908465, "grad_norm": 0.18311597963896695, "learning_rate": 2.040519006018152e-05, "loss": 0.8562, "step": 2017 }, { "epoch": 3.4853195164075994, "grad_norm": 0.1814222592700424, "learning_rate": 2.0363149411783528e-05, "loss": 0.861, "step": 2018 }, { "epoch": 3.4870466321243523, "grad_norm": 0.17586582644987744, "learning_rate": 2.032113732324545e-05, "loss": 0.8621, "step": 2019 }, { "epoch": 3.488773747841105, "grad_norm": 0.23515235820227973, "learning_rate": 2.027915385566975e-05, "loss": 0.8554, "step": 2020 }, { "epoch": 3.4905008635578585, "grad_norm": 0.18766356714189983, "learning_rate": 2.0237199070117232e-05, "loss": 0.8606, "step": 2021 }, { "epoch": 3.4922279792746114, "grad_norm": 0.20992254248575487, "learning_rate": 2.0195273027606958e-05, "loss": 0.8645, "step": 2022 }, { "epoch": 3.4939550949913643, "grad_norm": 0.22090725743662368, "learning_rate": 2.015337578911626e-05, "loss": 0.8591, "step": 2023 }, { "epoch": 3.4956822107081176, "grad_norm": 0.23820432871239328, "learning_rate": 2.0111507415580508e-05, "loss": 0.8577, "step": 2024 }, { "epoch": 3.4974093264248705, "grad_norm": 0.1871011783598217, "learning_rate": 2.0069667967893122e-05, "loss": 0.872, "step": 2025 }, { "epoch": 3.4991364421416233, "grad_norm": 0.2577636115900881, "learning_rate": 2.002785750690543e-05, "loss": 0.8812, "step": 2026 }, { "epoch": 3.5008635578583767, "grad_norm": 0.2404365854018223, "learning_rate": 1.9986076093426656e-05, "loss": 0.8605, "step": 2027 }, { "epoch": 3.5025906735751295, "grad_norm": 0.22207936020586685, "learning_rate": 1.994432378822371e-05, "loss": 0.845, "step": 2028 }, { "epoch": 3.5043177892918824, "grad_norm": 0.20370499433669575, "learning_rate": 1.9902600652021197e-05, "loss": 0.8448, "step": 2029 }, { "epoch": 3.5060449050086353, "grad_norm": 0.19018754029123058, "learning_rate": 1.9860906745501315e-05, "loss": 0.8688, "step": 2030 }, { "epoch": 3.5077720207253886, "grad_norm": 0.1831221626484794, "learning_rate": 1.9819242129303723e-05, "loss": 0.8583, "step": 2031 }, { "epoch": 3.5094991364421415, "grad_norm": 0.18680430777538662, "learning_rate": 1.9777606864025483e-05, "loss": 0.8855, "step": 2032 }, { "epoch": 3.5112262521588944, "grad_norm": 0.20331960906121002, "learning_rate": 1.9736001010221e-05, "loss": 0.8605, "step": 2033 }, { "epoch": 3.5129533678756477, "grad_norm": 0.22110326297848512, "learning_rate": 1.9694424628401863e-05, "loss": 0.8722, "step": 2034 }, { "epoch": 3.5146804835924006, "grad_norm": 0.17759381819728517, "learning_rate": 1.9652877779036798e-05, "loss": 0.88, "step": 2035 }, { "epoch": 3.5164075993091535, "grad_norm": 0.21776244129677427, "learning_rate": 1.961136052255163e-05, "loss": 0.8633, "step": 2036 }, { "epoch": 3.518134715025907, "grad_norm": 0.45742130576091844, "learning_rate": 1.9569872919329094e-05, "loss": 0.875, "step": 2037 }, { "epoch": 3.5198618307426597, "grad_norm": 0.2197492477952463, "learning_rate": 1.952841502970879e-05, "loss": 0.858, "step": 2038 }, { "epoch": 3.5215889464594126, "grad_norm": 0.17416424528761845, "learning_rate": 1.948698691398716e-05, "loss": 0.8745, "step": 2039 }, { "epoch": 3.523316062176166, "grad_norm": 0.48434894798551914, "learning_rate": 1.9445588632417288e-05, "loss": 0.8974, "step": 2040 }, { "epoch": 3.5250431778929188, "grad_norm": 0.20084990253883717, "learning_rate": 1.940422024520887e-05, "loss": 0.864, "step": 2041 }, { "epoch": 3.5267702936096716, "grad_norm": 0.2457920363427438, "learning_rate": 1.936288181252818e-05, "loss": 0.8682, "step": 2042 }, { "epoch": 3.528497409326425, "grad_norm": 0.2115807462297176, "learning_rate": 1.9321573394497858e-05, "loss": 0.8495, "step": 2043 }, { "epoch": 3.530224525043178, "grad_norm": 0.18126178728468081, "learning_rate": 1.928029505119691e-05, "loss": 0.8633, "step": 2044 }, { "epoch": 3.5319516407599307, "grad_norm": 0.2113531961104165, "learning_rate": 1.9239046842660644e-05, "loss": 0.8591, "step": 2045 }, { "epoch": 3.533678756476684, "grad_norm": 0.20958925107436474, "learning_rate": 1.9197828828880483e-05, "loss": 0.8653, "step": 2046 }, { "epoch": 3.535405872193437, "grad_norm": 0.20787033458713527, "learning_rate": 1.9156641069803946e-05, "loss": 0.8738, "step": 2047 }, { "epoch": 3.53713298791019, "grad_norm": 0.22611080575479028, "learning_rate": 1.9115483625334586e-05, "loss": 0.8486, "step": 2048 }, { "epoch": 3.538860103626943, "grad_norm": 0.20838352284074768, "learning_rate": 1.9074356555331834e-05, "loss": 0.8824, "step": 2049 }, { "epoch": 3.540587219343696, "grad_norm": 0.21561272434418277, "learning_rate": 1.9033259919610936e-05, "loss": 0.8606, "step": 2050 }, { "epoch": 3.542314335060449, "grad_norm": 0.2103432399334273, "learning_rate": 1.899219377794292e-05, "loss": 0.86, "step": 2051 }, { "epoch": 3.5440414507772022, "grad_norm": 0.23000873533468452, "learning_rate": 1.895115819005441e-05, "loss": 0.871, "step": 2052 }, { "epoch": 3.545768566493955, "grad_norm": 0.21746844262108705, "learning_rate": 1.8910153215627614e-05, "loss": 0.8702, "step": 2053 }, { "epoch": 3.547495682210708, "grad_norm": 0.22040264243200788, "learning_rate": 1.8869178914300237e-05, "loss": 0.8653, "step": 2054 }, { "epoch": 3.5492227979274613, "grad_norm": 0.20068113115626932, "learning_rate": 1.8828235345665333e-05, "loss": 0.8582, "step": 2055 }, { "epoch": 3.550949913644214, "grad_norm": 0.19764799510141556, "learning_rate": 1.8787322569271297e-05, "loss": 0.8573, "step": 2056 }, { "epoch": 3.552677029360967, "grad_norm": 0.20008779480485384, "learning_rate": 1.8746440644621722e-05, "loss": 0.8904, "step": 2057 }, { "epoch": 3.5544041450777204, "grad_norm": 0.24590739073754475, "learning_rate": 1.8705589631175304e-05, "loss": 0.8606, "step": 2058 }, { "epoch": 3.5561312607944733, "grad_norm": 0.1707740953124441, "learning_rate": 1.8664769588345843e-05, "loss": 0.8708, "step": 2059 }, { "epoch": 3.557858376511226, "grad_norm": 0.2527791665312126, "learning_rate": 1.862398057550204e-05, "loss": 0.8625, "step": 2060 }, { "epoch": 3.5595854922279795, "grad_norm": 0.19551259410941227, "learning_rate": 1.8583222651967496e-05, "loss": 0.8845, "step": 2061 }, { "epoch": 3.5613126079447324, "grad_norm": 0.23258439918086268, "learning_rate": 1.854249587702056e-05, "loss": 0.8854, "step": 2062 }, { "epoch": 3.5630397236614852, "grad_norm": 0.18176645893376336, "learning_rate": 1.850180030989434e-05, "loss": 0.8611, "step": 2063 }, { "epoch": 3.5647668393782386, "grad_norm": 0.22241530187518663, "learning_rate": 1.84611360097765e-05, "loss": 0.867, "step": 2064 }, { "epoch": 3.5664939550949915, "grad_norm": 0.19310617281142098, "learning_rate": 1.8420503035809235e-05, "loss": 0.851, "step": 2065 }, { "epoch": 3.5682210708117443, "grad_norm": 0.18865080839277648, "learning_rate": 1.8379901447089227e-05, "loss": 0.8613, "step": 2066 }, { "epoch": 3.5699481865284977, "grad_norm": 0.22852068548478588, "learning_rate": 1.833933130266746e-05, "loss": 0.8649, "step": 2067 }, { "epoch": 3.5716753022452505, "grad_norm": 0.17805678994938967, "learning_rate": 1.8298792661549188e-05, "loss": 0.8608, "step": 2068 }, { "epoch": 3.5734024179620034, "grad_norm": 0.2733083170642377, "learning_rate": 1.8258285582693887e-05, "loss": 0.8604, "step": 2069 }, { "epoch": 3.5751295336787567, "grad_norm": 0.20355230142391953, "learning_rate": 1.8217810125015095e-05, "loss": 0.8536, "step": 2070 }, { "epoch": 3.5768566493955096, "grad_norm": 0.23808486260875827, "learning_rate": 1.817736634738036e-05, "loss": 0.8608, "step": 2071 }, { "epoch": 3.5785837651122625, "grad_norm": 0.19198097167337846, "learning_rate": 1.8136954308611155e-05, "loss": 0.8688, "step": 2072 }, { "epoch": 3.5803108808290154, "grad_norm": 0.3009126047971047, "learning_rate": 1.8096574067482825e-05, "loss": 0.8581, "step": 2073 }, { "epoch": 3.5820379965457687, "grad_norm": 0.20544936020759744, "learning_rate": 1.8056225682724426e-05, "loss": 0.873, "step": 2074 }, { "epoch": 3.5837651122625216, "grad_norm": 0.26407144798715315, "learning_rate": 1.8015909213018692e-05, "loss": 0.8825, "step": 2075 }, { "epoch": 3.5854922279792745, "grad_norm": 0.2274067179468201, "learning_rate": 1.797562471700197e-05, "loss": 0.8728, "step": 2076 }, { "epoch": 3.587219343696028, "grad_norm": 0.24132647876321997, "learning_rate": 1.793537225326407e-05, "loss": 0.8609, "step": 2077 }, { "epoch": 3.5889464594127807, "grad_norm": 0.20925238113650702, "learning_rate": 1.7895151880348204e-05, "loss": 0.8683, "step": 2078 }, { "epoch": 3.5906735751295336, "grad_norm": 0.20583779626162044, "learning_rate": 1.7854963656750966e-05, "loss": 0.8543, "step": 2079 }, { "epoch": 3.5924006908462864, "grad_norm": 0.2084884216308528, "learning_rate": 1.7814807640922147e-05, "loss": 0.8635, "step": 2080 }, { "epoch": 3.5941278065630398, "grad_norm": 0.1902700029774157, "learning_rate": 1.7774683891264687e-05, "loss": 0.8739, "step": 2081 }, { "epoch": 3.5958549222797926, "grad_norm": 0.22209329988068008, "learning_rate": 1.773459246613465e-05, "loss": 0.8659, "step": 2082 }, { "epoch": 3.5975820379965455, "grad_norm": 0.22643753929134894, "learning_rate": 1.7694533423841038e-05, "loss": 0.8605, "step": 2083 }, { "epoch": 3.599309153713299, "grad_norm": 0.21273965564836317, "learning_rate": 1.7654506822645773e-05, "loss": 0.8685, "step": 2084 }, { "epoch": 3.6010362694300517, "grad_norm": 0.2134618155559346, "learning_rate": 1.7614512720763582e-05, "loss": 0.8602, "step": 2085 }, { "epoch": 3.6027633851468046, "grad_norm": 0.19338165069528016, "learning_rate": 1.757455117636196e-05, "loss": 0.8704, "step": 2086 }, { "epoch": 3.604490500863558, "grad_norm": 0.2008775987477039, "learning_rate": 1.7534622247561008e-05, "loss": 0.8674, "step": 2087 }, { "epoch": 3.606217616580311, "grad_norm": 0.22094576526435825, "learning_rate": 1.749472599243341e-05, "loss": 0.8602, "step": 2088 }, { "epoch": 3.6079447322970637, "grad_norm": 0.20807804852438522, "learning_rate": 1.7454862469004346e-05, "loss": 0.8295, "step": 2089 }, { "epoch": 3.609671848013817, "grad_norm": 0.21043500051805428, "learning_rate": 1.7415031735251364e-05, "loss": 0.8734, "step": 2090 }, { "epoch": 3.61139896373057, "grad_norm": 0.18134457113637337, "learning_rate": 1.7375233849104315e-05, "loss": 0.8325, "step": 2091 }, { "epoch": 3.613126079447323, "grad_norm": 0.2215574016793117, "learning_rate": 1.733546886844533e-05, "loss": 0.8587, "step": 2092 }, { "epoch": 3.614853195164076, "grad_norm": 0.228108813470185, "learning_rate": 1.7295736851108625e-05, "loss": 0.8537, "step": 2093 }, { "epoch": 3.616580310880829, "grad_norm": 0.1959246731982709, "learning_rate": 1.7256037854880504e-05, "loss": 0.8681, "step": 2094 }, { "epoch": 3.618307426597582, "grad_norm": 0.24707365314810495, "learning_rate": 1.721637193749921e-05, "loss": 0.8703, "step": 2095 }, { "epoch": 3.620034542314335, "grad_norm": 0.21855411559847512, "learning_rate": 1.7176739156654936e-05, "loss": 0.8692, "step": 2096 }, { "epoch": 3.621761658031088, "grad_norm": 0.21155281001757553, "learning_rate": 1.713713956998964e-05, "loss": 0.8564, "step": 2097 }, { "epoch": 3.623488773747841, "grad_norm": 0.2387526456584492, "learning_rate": 1.709757323509699e-05, "loss": 0.8615, "step": 2098 }, { "epoch": 3.6252158894645943, "grad_norm": 0.21450311401874625, "learning_rate": 1.7058040209522345e-05, "loss": 0.871, "step": 2099 }, { "epoch": 3.626943005181347, "grad_norm": 0.22350057858090594, "learning_rate": 1.701854055076256e-05, "loss": 0.847, "step": 2100 }, { "epoch": 3.6286701208981, "grad_norm": 0.2568539852329748, "learning_rate": 1.6979074316265992e-05, "loss": 0.8705, "step": 2101 }, { "epoch": 3.6303972366148534, "grad_norm": 0.18747264895661458, "learning_rate": 1.6939641563432396e-05, "loss": 0.8529, "step": 2102 }, { "epoch": 3.6321243523316062, "grad_norm": 0.2394917645160303, "learning_rate": 1.690024234961281e-05, "loss": 0.8869, "step": 2103 }, { "epoch": 3.633851468048359, "grad_norm": 0.22258268112083415, "learning_rate": 1.6860876732109486e-05, "loss": 0.8776, "step": 2104 }, { "epoch": 3.6355785837651124, "grad_norm": 0.18081466273793861, "learning_rate": 1.682154476817585e-05, "loss": 0.8533, "step": 2105 }, { "epoch": 3.6373056994818653, "grad_norm": 0.20717571231311696, "learning_rate": 1.6782246515016346e-05, "loss": 0.8825, "step": 2106 }, { "epoch": 3.639032815198618, "grad_norm": 0.1942088726512596, "learning_rate": 1.6742982029786404e-05, "loss": 0.8831, "step": 2107 }, { "epoch": 3.6407599309153715, "grad_norm": 0.19064826582555505, "learning_rate": 1.670375136959233e-05, "loss": 0.8653, "step": 2108 }, { "epoch": 3.6424870466321244, "grad_norm": 0.2192270718013021, "learning_rate": 1.6664554591491263e-05, "loss": 0.8836, "step": 2109 }, { "epoch": 3.6442141623488773, "grad_norm": 0.17656884720403562, "learning_rate": 1.662539175249104e-05, "loss": 0.8668, "step": 2110 }, { "epoch": 3.6459412780656306, "grad_norm": 0.2794675155116181, "learning_rate": 1.6586262909550132e-05, "loss": 0.877, "step": 2111 }, { "epoch": 3.6476683937823835, "grad_norm": 0.19328283209477198, "learning_rate": 1.6547168119577595e-05, "loss": 0.8688, "step": 2112 }, { "epoch": 3.6493955094991364, "grad_norm": 0.19030774573993423, "learning_rate": 1.6508107439432932e-05, "loss": 0.8511, "step": 2113 }, { "epoch": 3.6511226252158897, "grad_norm": 0.6720109872280315, "learning_rate": 1.646908092592603e-05, "loss": 0.8775, "step": 2114 }, { "epoch": 3.6528497409326426, "grad_norm": 0.1791091857975662, "learning_rate": 1.6430088635817124e-05, "loss": 0.8673, "step": 2115 }, { "epoch": 3.6545768566493955, "grad_norm": 0.2973057474461744, "learning_rate": 1.639113062581664e-05, "loss": 0.866, "step": 2116 }, { "epoch": 3.656303972366149, "grad_norm": 0.25487304141100176, "learning_rate": 1.6352206952585154e-05, "loss": 0.8609, "step": 2117 }, { "epoch": 3.6580310880829017, "grad_norm": 0.25352695011530557, "learning_rate": 1.6313317672733287e-05, "loss": 0.8557, "step": 2118 }, { "epoch": 3.6597582037996546, "grad_norm": 0.2266759529152252, "learning_rate": 1.6274462842821693e-05, "loss": 0.8431, "step": 2119 }, { "epoch": 3.6614853195164074, "grad_norm": 0.18675807153003188, "learning_rate": 1.6235642519360863e-05, "loss": 0.8618, "step": 2120 }, { "epoch": 3.6632124352331608, "grad_norm": 0.25633466617815454, "learning_rate": 1.6196856758811112e-05, "loss": 0.8687, "step": 2121 }, { "epoch": 3.6649395509499136, "grad_norm": 0.19816021800925218, "learning_rate": 1.615810561758253e-05, "loss": 0.8838, "step": 2122 }, { "epoch": 3.6666666666666665, "grad_norm": 0.2203977774000965, "learning_rate": 1.6119389152034804e-05, "loss": 0.8628, "step": 2123 }, { "epoch": 3.66839378238342, "grad_norm": 0.1959853403486548, "learning_rate": 1.6080707418477203e-05, "loss": 0.8703, "step": 2124 }, { "epoch": 3.6701208981001727, "grad_norm": 0.19449390633680602, "learning_rate": 1.604206047316851e-05, "loss": 0.8831, "step": 2125 }, { "epoch": 3.6718480138169256, "grad_norm": 0.1778790874219279, "learning_rate": 1.6003448372316884e-05, "loss": 0.8662, "step": 2126 }, { "epoch": 3.6735751295336785, "grad_norm": 0.1803104594376959, "learning_rate": 1.596487117207979e-05, "loss": 0.853, "step": 2127 }, { "epoch": 3.675302245250432, "grad_norm": 0.18989724716221423, "learning_rate": 1.592632892856399e-05, "loss": 0.8377, "step": 2128 }, { "epoch": 3.6770293609671847, "grad_norm": 0.18055564127789994, "learning_rate": 1.5887821697825354e-05, "loss": 0.8551, "step": 2129 }, { "epoch": 3.6787564766839376, "grad_norm": 0.930843390073152, "learning_rate": 1.5849349535868843e-05, "loss": 0.8868, "step": 2130 }, { "epoch": 3.680483592400691, "grad_norm": 0.22828009494236284, "learning_rate": 1.5810912498648412e-05, "loss": 0.9054, "step": 2131 }, { "epoch": 3.6822107081174438, "grad_norm": 0.19411392751242643, "learning_rate": 1.5772510642066962e-05, "loss": 0.8719, "step": 2132 }, { "epoch": 3.6839378238341967, "grad_norm": 0.19898814216763774, "learning_rate": 1.5734144021976178e-05, "loss": 0.8724, "step": 2133 }, { "epoch": 3.68566493955095, "grad_norm": 0.20953956174865168, "learning_rate": 1.5695812694176513e-05, "loss": 0.8618, "step": 2134 }, { "epoch": 3.687392055267703, "grad_norm": 4.661521760152715, "learning_rate": 1.5657516714417126e-05, "loss": 0.8669, "step": 2135 }, { "epoch": 3.6891191709844557, "grad_norm": 0.2542506012267877, "learning_rate": 1.5619256138395725e-05, "loss": 0.8522, "step": 2136 }, { "epoch": 3.690846286701209, "grad_norm": 0.8708398208421275, "learning_rate": 1.5581031021758525e-05, "loss": 0.8836, "step": 2137 }, { "epoch": 3.692573402417962, "grad_norm": 1.4486453844833642, "learning_rate": 1.5542841420100202e-05, "loss": 0.8731, "step": 2138 }, { "epoch": 3.694300518134715, "grad_norm": 0.9348953340227925, "learning_rate": 1.550468738896376e-05, "loss": 0.8611, "step": 2139 }, { "epoch": 3.696027633851468, "grad_norm": 0.46493485801968265, "learning_rate": 1.5466568983840455e-05, "loss": 0.872, "step": 2140 }, { "epoch": 3.697754749568221, "grad_norm": 0.7126950483142618, "learning_rate": 1.5428486260169742e-05, "loss": 0.8519, "step": 2141 }, { "epoch": 3.699481865284974, "grad_norm": 0.23252217134070283, "learning_rate": 1.5390439273339203e-05, "loss": 0.8684, "step": 2142 }, { "epoch": 3.7012089810017272, "grad_norm": 0.5223096769457872, "learning_rate": 1.5352428078684413e-05, "loss": 0.8647, "step": 2143 }, { "epoch": 3.70293609671848, "grad_norm": 0.5249537317693631, "learning_rate": 1.5314452731488884e-05, "loss": 0.8813, "step": 2144 }, { "epoch": 3.704663212435233, "grad_norm": 0.2229807814077658, "learning_rate": 1.5276513286984033e-05, "loss": 0.8362, "step": 2145 }, { "epoch": 3.7063903281519863, "grad_norm": 0.2767322638087843, "learning_rate": 1.5238609800349028e-05, "loss": 0.8591, "step": 2146 }, { "epoch": 3.708117443868739, "grad_norm": 0.21375984982167281, "learning_rate": 1.5200742326710725e-05, "loss": 0.8516, "step": 2147 }, { "epoch": 3.709844559585492, "grad_norm": 0.19325874468284962, "learning_rate": 1.5162910921143659e-05, "loss": 0.8735, "step": 2148 }, { "epoch": 3.7115716753022454, "grad_norm": 0.21651546655722076, "learning_rate": 1.512511563866986e-05, "loss": 0.8723, "step": 2149 }, { "epoch": 3.7132987910189983, "grad_norm": 0.33452500354434833, "learning_rate": 1.5087356534258817e-05, "loss": 0.8916, "step": 2150 }, { "epoch": 3.715025906735751, "grad_norm": 0.1961013283101255, "learning_rate": 1.5049633662827448e-05, "loss": 0.8684, "step": 2151 }, { "epoch": 3.7167530224525045, "grad_norm": 0.2109399034033331, "learning_rate": 1.5011947079239928e-05, "loss": 0.8664, "step": 2152 }, { "epoch": 3.7184801381692574, "grad_norm": 0.17446517308711032, "learning_rate": 1.4974296838307658e-05, "loss": 0.8717, "step": 2153 }, { "epoch": 3.7202072538860103, "grad_norm": 0.24329624129650676, "learning_rate": 1.4936682994789227e-05, "loss": 0.8542, "step": 2154 }, { "epoch": 3.7219343696027636, "grad_norm": 0.15157128061248393, "learning_rate": 1.4899105603390233e-05, "loss": 0.8706, "step": 2155 }, { "epoch": 3.7236614853195165, "grad_norm": 0.24788571706228918, "learning_rate": 1.4861564718763277e-05, "loss": 0.8598, "step": 2156 }, { "epoch": 3.7253886010362693, "grad_norm": 0.17558584901300586, "learning_rate": 1.482406039550789e-05, "loss": 0.8587, "step": 2157 }, { "epoch": 3.7271157167530227, "grad_norm": 0.181780328035432, "learning_rate": 1.4786592688170393e-05, "loss": 0.8702, "step": 2158 }, { "epoch": 3.7288428324697755, "grad_norm": 0.17733761665665188, "learning_rate": 1.4749161651243857e-05, "loss": 0.8659, "step": 2159 }, { "epoch": 3.7305699481865284, "grad_norm": 0.17954476983492826, "learning_rate": 1.471176733916805e-05, "loss": 0.8663, "step": 2160 }, { "epoch": 3.7322970639032818, "grad_norm": 0.19601239280088043, "learning_rate": 1.4674409806329295e-05, "loss": 0.8663, "step": 2161 }, { "epoch": 3.7340241796200346, "grad_norm": 0.16855078148157004, "learning_rate": 1.4637089107060422e-05, "loss": 0.8546, "step": 2162 }, { "epoch": 3.7357512953367875, "grad_norm": 0.19894004580350616, "learning_rate": 1.4599805295640725e-05, "loss": 0.8486, "step": 2163 }, { "epoch": 3.737478411053541, "grad_norm": 0.15887597584295957, "learning_rate": 1.456255842629582e-05, "loss": 0.8646, "step": 2164 }, { "epoch": 3.7392055267702937, "grad_norm": 0.18363945135786802, "learning_rate": 1.4525348553197578e-05, "loss": 0.8819, "step": 2165 }, { "epoch": 3.7409326424870466, "grad_norm": 0.17211218769317796, "learning_rate": 1.4488175730464118e-05, "loss": 0.862, "step": 2166 }, { "epoch": 3.7426597582038, "grad_norm": 0.1701656534268132, "learning_rate": 1.4451040012159609e-05, "loss": 0.8653, "step": 2167 }, { "epoch": 3.744386873920553, "grad_norm": 0.17074804499172364, "learning_rate": 1.4413941452294316e-05, "loss": 0.842, "step": 2168 }, { "epoch": 3.7461139896373057, "grad_norm": 0.151051115842816, "learning_rate": 1.4376880104824413e-05, "loss": 0.8752, "step": 2169 }, { "epoch": 3.7478411053540586, "grad_norm": 0.1861738106382227, "learning_rate": 1.433985602365196e-05, "loss": 0.8556, "step": 2170 }, { "epoch": 3.749568221070812, "grad_norm": 0.20275888576530332, "learning_rate": 1.4302869262624852e-05, "loss": 0.8767, "step": 2171 }, { "epoch": 3.7512953367875648, "grad_norm": 0.17830900632684923, "learning_rate": 1.4265919875536667e-05, "loss": 0.8617, "step": 2172 }, { "epoch": 3.7530224525043177, "grad_norm": 0.22696110612794554, "learning_rate": 1.4229007916126642e-05, "loss": 0.8802, "step": 2173 }, { "epoch": 3.754749568221071, "grad_norm": 0.1720835844068611, "learning_rate": 1.4192133438079561e-05, "loss": 0.8572, "step": 2174 }, { "epoch": 3.756476683937824, "grad_norm": 0.2062699895997437, "learning_rate": 1.4155296495025735e-05, "loss": 0.8898, "step": 2175 }, { "epoch": 3.7582037996545767, "grad_norm": 0.19635820231688159, "learning_rate": 1.411849714054085e-05, "loss": 0.881, "step": 2176 }, { "epoch": 3.7599309153713296, "grad_norm": 0.23694504642992414, "learning_rate": 1.4081735428145922e-05, "loss": 0.8695, "step": 2177 }, { "epoch": 3.761658031088083, "grad_norm": 0.20879839796763464, "learning_rate": 1.4045011411307253e-05, "loss": 0.8592, "step": 2178 }, { "epoch": 3.763385146804836, "grad_norm": 0.22474675931854995, "learning_rate": 1.4008325143436286e-05, "loss": 0.884, "step": 2179 }, { "epoch": 3.7651122625215887, "grad_norm": 0.22244108178410926, "learning_rate": 1.3971676677889563e-05, "loss": 0.8438, "step": 2180 }, { "epoch": 3.766839378238342, "grad_norm": 0.1824871915572591, "learning_rate": 1.3935066067968683e-05, "loss": 0.8615, "step": 2181 }, { "epoch": 3.768566493955095, "grad_norm": 0.5609977022746316, "learning_rate": 1.3898493366920151e-05, "loss": 0.912, "step": 2182 }, { "epoch": 3.770293609671848, "grad_norm": 0.2100294929077649, "learning_rate": 1.3861958627935335e-05, "loss": 0.8499, "step": 2183 }, { "epoch": 3.772020725388601, "grad_norm": 0.24393025824251474, "learning_rate": 1.3825461904150434e-05, "loss": 0.8698, "step": 2184 }, { "epoch": 3.773747841105354, "grad_norm": 0.20815543141735562, "learning_rate": 1.3789003248646311e-05, "loss": 0.8767, "step": 2185 }, { "epoch": 3.775474956822107, "grad_norm": 0.1810534527737032, "learning_rate": 1.3752582714448481e-05, "loss": 0.866, "step": 2186 }, { "epoch": 3.77720207253886, "grad_norm": 0.2061436317349607, "learning_rate": 1.3716200354527001e-05, "loss": 0.8535, "step": 2187 }, { "epoch": 3.778929188255613, "grad_norm": 0.17004553381583082, "learning_rate": 1.3679856221796449e-05, "loss": 0.8723, "step": 2188 }, { "epoch": 3.780656303972366, "grad_norm": 0.19886983551476684, "learning_rate": 1.364355036911576e-05, "loss": 0.8675, "step": 2189 }, { "epoch": 3.7823834196891193, "grad_norm": 2.11756649167769, "learning_rate": 1.3607282849288201e-05, "loss": 0.8806, "step": 2190 }, { "epoch": 3.784110535405872, "grad_norm": 0.19893858129860276, "learning_rate": 1.3571053715061324e-05, "loss": 0.8437, "step": 2191 }, { "epoch": 3.785837651122625, "grad_norm": 0.17829753252718708, "learning_rate": 1.3534863019126814e-05, "loss": 0.8757, "step": 2192 }, { "epoch": 3.7875647668393784, "grad_norm": 0.17295457955469207, "learning_rate": 1.3498710814120451e-05, "loss": 0.8668, "step": 2193 }, { "epoch": 3.7892918825561313, "grad_norm": 0.1571992253441763, "learning_rate": 1.3462597152622071e-05, "loss": 0.8565, "step": 2194 }, { "epoch": 3.791018998272884, "grad_norm": 0.1828848223774849, "learning_rate": 1.3426522087155416e-05, "loss": 0.8598, "step": 2195 }, { "epoch": 3.7927461139896375, "grad_norm": 0.1605277745618357, "learning_rate": 1.3390485670188111e-05, "loss": 0.8735, "step": 2196 }, { "epoch": 3.7944732297063903, "grad_norm": 0.18372508191926973, "learning_rate": 1.3354487954131546e-05, "loss": 0.8911, "step": 2197 }, { "epoch": 3.796200345423143, "grad_norm": 0.17463862171314484, "learning_rate": 1.3318528991340873e-05, "loss": 0.8582, "step": 2198 }, { "epoch": 3.7979274611398965, "grad_norm": 0.1673593565466499, "learning_rate": 1.3282608834114838e-05, "loss": 0.8824, "step": 2199 }, { "epoch": 3.7996545768566494, "grad_norm": 0.15244714047207414, "learning_rate": 1.3246727534695749e-05, "loss": 0.872, "step": 2200 }, { "epoch": 3.8013816925734023, "grad_norm": 0.19448533146753672, "learning_rate": 1.3210885145269444e-05, "loss": 0.8711, "step": 2201 }, { "epoch": 3.8031088082901556, "grad_norm": 0.17560904841337913, "learning_rate": 1.317508171796512e-05, "loss": 0.8776, "step": 2202 }, { "epoch": 3.8048359240069085, "grad_norm": 0.21553081994673318, "learning_rate": 1.313931730485531e-05, "loss": 0.8908, "step": 2203 }, { "epoch": 3.8065630397236614, "grad_norm": 0.1952392755772464, "learning_rate": 1.3103591957955857e-05, "loss": 0.8603, "step": 2204 }, { "epoch": 3.8082901554404147, "grad_norm": 0.19773975009416592, "learning_rate": 1.3067905729225729e-05, "loss": 0.8687, "step": 2205 }, { "epoch": 3.8100172711571676, "grad_norm": 0.22403215484096164, "learning_rate": 1.3032258670567015e-05, "loss": 0.8783, "step": 2206 }, { "epoch": 3.8117443868739205, "grad_norm": 0.17191139300718516, "learning_rate": 1.299665083382486e-05, "loss": 0.8637, "step": 2207 }, { "epoch": 3.813471502590674, "grad_norm": 0.21155326568897778, "learning_rate": 1.2961082270787335e-05, "loss": 0.8701, "step": 2208 }, { "epoch": 3.8151986183074267, "grad_norm": 0.2531554557225839, "learning_rate": 1.2925553033185408e-05, "loss": 0.8558, "step": 2209 }, { "epoch": 3.8169257340241796, "grad_norm": 0.1619384336040461, "learning_rate": 1.2890063172692831e-05, "loss": 0.8436, "step": 2210 }, { "epoch": 3.818652849740933, "grad_norm": 0.22470977800773353, "learning_rate": 1.285461274092612e-05, "loss": 0.8636, "step": 2211 }, { "epoch": 3.8203799654576858, "grad_norm": 0.14996043412461332, "learning_rate": 1.2819201789444424e-05, "loss": 0.8783, "step": 2212 }, { "epoch": 3.8221070811744386, "grad_norm": 0.18769853050259494, "learning_rate": 1.2783830369749456e-05, "loss": 0.8536, "step": 2213 }, { "epoch": 3.823834196891192, "grad_norm": 0.1771414792576515, "learning_rate": 1.274849853328548e-05, "loss": 0.852, "step": 2214 }, { "epoch": 3.825561312607945, "grad_norm": 0.1460259412853419, "learning_rate": 1.271320633143915e-05, "loss": 0.8671, "step": 2215 }, { "epoch": 3.8272884283246977, "grad_norm": 0.17636893528074982, "learning_rate": 1.2677953815539477e-05, "loss": 0.8731, "step": 2216 }, { "epoch": 3.8290155440414506, "grad_norm": 0.15967244504953723, "learning_rate": 1.264274103685779e-05, "loss": 0.8355, "step": 2217 }, { "epoch": 3.830742659758204, "grad_norm": 0.18068758882627647, "learning_rate": 1.2607568046607583e-05, "loss": 0.8761, "step": 2218 }, { "epoch": 3.832469775474957, "grad_norm": 0.1827953915050586, "learning_rate": 1.2572434895944494e-05, "loss": 0.8563, "step": 2219 }, { "epoch": 3.8341968911917097, "grad_norm": 0.18190944162337405, "learning_rate": 1.2537341635966218e-05, "loss": 0.8704, "step": 2220 }, { "epoch": 3.835924006908463, "grad_norm": 0.17192604208579312, "learning_rate": 1.2502288317712453e-05, "loss": 0.8702, "step": 2221 }, { "epoch": 3.837651122625216, "grad_norm": 0.18873363415000843, "learning_rate": 1.2467274992164776e-05, "loss": 0.8657, "step": 2222 }, { "epoch": 3.839378238341969, "grad_norm": 0.18688244932623194, "learning_rate": 1.2432301710246598e-05, "loss": 0.8559, "step": 2223 }, { "epoch": 3.8411053540587217, "grad_norm": 0.1938690507098947, "learning_rate": 1.2397368522823138e-05, "loss": 0.8741, "step": 2224 }, { "epoch": 3.842832469775475, "grad_norm": 0.1999957337687537, "learning_rate": 1.236247548070125e-05, "loss": 0.8754, "step": 2225 }, { "epoch": 3.844559585492228, "grad_norm": 0.15127013843975548, "learning_rate": 1.23276226346294e-05, "loss": 0.8764, "step": 2226 }, { "epoch": 3.8462867012089808, "grad_norm": 0.44213591042497025, "learning_rate": 1.229281003529764e-05, "loss": 0.8854, "step": 2227 }, { "epoch": 3.848013816925734, "grad_norm": 0.16659613613447083, "learning_rate": 1.2258037733337448e-05, "loss": 0.865, "step": 2228 }, { "epoch": 3.849740932642487, "grad_norm": 0.18798675180225818, "learning_rate": 1.2223305779321683e-05, "loss": 0.8615, "step": 2229 }, { "epoch": 3.85146804835924, "grad_norm": 0.21907974471707986, "learning_rate": 1.2188614223764574e-05, "loss": 0.8625, "step": 2230 }, { "epoch": 3.853195164075993, "grad_norm": 0.16797614033793554, "learning_rate": 1.215396311712155e-05, "loss": 0.8638, "step": 2231 }, { "epoch": 3.854922279792746, "grad_norm": 0.23061801767288093, "learning_rate": 1.2119352509789217e-05, "loss": 0.859, "step": 2232 }, { "epoch": 3.856649395509499, "grad_norm": 0.16135341329603728, "learning_rate": 1.208478245210528e-05, "loss": 0.874, "step": 2233 }, { "epoch": 3.8583765112262522, "grad_norm": 0.23376172941765336, "learning_rate": 1.20502529943485e-05, "loss": 0.8561, "step": 2234 }, { "epoch": 3.860103626943005, "grad_norm": 0.18552433375331767, "learning_rate": 1.2015764186738541e-05, "loss": 0.8801, "step": 2235 }, { "epoch": 3.861830742659758, "grad_norm": 0.1922258270944501, "learning_rate": 1.1981316079435965e-05, "loss": 0.8611, "step": 2236 }, { "epoch": 3.8635578583765113, "grad_norm": 0.19137096892301594, "learning_rate": 1.1946908722542161e-05, "loss": 0.8707, "step": 2237 }, { "epoch": 3.865284974093264, "grad_norm": 0.16998678789266333, "learning_rate": 1.1912542166099224e-05, "loss": 0.8718, "step": 2238 }, { "epoch": 3.867012089810017, "grad_norm": 0.17669448789359032, "learning_rate": 1.1878216460089899e-05, "loss": 0.8559, "step": 2239 }, { "epoch": 3.8687392055267704, "grad_norm": 0.20511798699510633, "learning_rate": 1.1843931654437565e-05, "loss": 0.8685, "step": 2240 }, { "epoch": 3.8704663212435233, "grad_norm": 0.18222075194568313, "learning_rate": 1.1809687799006073e-05, "loss": 0.8542, "step": 2241 }, { "epoch": 3.872193436960276, "grad_norm": 0.18554049222945881, "learning_rate": 1.1775484943599728e-05, "loss": 0.8625, "step": 2242 }, { "epoch": 3.8739205526770295, "grad_norm": 0.17397778529711816, "learning_rate": 1.174132313796319e-05, "loss": 0.8487, "step": 2243 }, { "epoch": 3.8756476683937824, "grad_norm": 0.1771088203890626, "learning_rate": 1.170720243178146e-05, "loss": 0.858, "step": 2244 }, { "epoch": 3.8773747841105353, "grad_norm": 0.1562430145597745, "learning_rate": 1.1673122874679722e-05, "loss": 0.8633, "step": 2245 }, { "epoch": 3.8791018998272886, "grad_norm": 0.17317448552535256, "learning_rate": 1.1639084516223318e-05, "loss": 0.863, "step": 2246 }, { "epoch": 3.8808290155440415, "grad_norm": 0.17206917074748262, "learning_rate": 1.1605087405917695e-05, "loss": 0.8494, "step": 2247 }, { "epoch": 3.8825561312607944, "grad_norm": 0.18788482808041965, "learning_rate": 1.1571131593208284e-05, "loss": 0.8604, "step": 2248 }, { "epoch": 3.8842832469775477, "grad_norm": 0.18389150929686163, "learning_rate": 1.1537217127480456e-05, "loss": 0.8685, "step": 2249 }, { "epoch": 3.8860103626943006, "grad_norm": 0.17381812479279277, "learning_rate": 1.1503344058059468e-05, "loss": 0.858, "step": 2250 }, { "epoch": 3.8877374784110534, "grad_norm": 0.16380148482192283, "learning_rate": 1.1469512434210341e-05, "loss": 0.8482, "step": 2251 }, { "epoch": 3.8894645941278068, "grad_norm": 0.1413695348730968, "learning_rate": 1.1435722305137827e-05, "loss": 0.8563, "step": 2252 }, { "epoch": 3.8911917098445596, "grad_norm": 0.16494653867961506, "learning_rate": 1.1401973719986348e-05, "loss": 0.8481, "step": 2253 }, { "epoch": 3.8929188255613125, "grad_norm": 0.18870049988223583, "learning_rate": 1.1368266727839883e-05, "loss": 0.8784, "step": 2254 }, { "epoch": 3.894645941278066, "grad_norm": 0.1564818589698664, "learning_rate": 1.133460137772192e-05, "loss": 0.8465, "step": 2255 }, { "epoch": 3.8963730569948187, "grad_norm": 0.18559201624291732, "learning_rate": 1.1300977718595369e-05, "loss": 0.8782, "step": 2256 }, { "epoch": 3.8981001727115716, "grad_norm": 0.16041845307217378, "learning_rate": 1.1267395799362553e-05, "loss": 0.8637, "step": 2257 }, { "epoch": 3.899827288428325, "grad_norm": 0.22738398178490812, "learning_rate": 1.1233855668865043e-05, "loss": 0.8802, "step": 2258 }, { "epoch": 3.901554404145078, "grad_norm": 0.20522061865662433, "learning_rate": 1.1200357375883635e-05, "loss": 0.8741, "step": 2259 }, { "epoch": 3.9032815198618307, "grad_norm": 0.2035598876488008, "learning_rate": 1.116690096913831e-05, "loss": 0.866, "step": 2260 }, { "epoch": 3.905008635578584, "grad_norm": 0.2608884514630692, "learning_rate": 1.1133486497288101e-05, "loss": 0.878, "step": 2261 }, { "epoch": 3.906735751295337, "grad_norm": 0.18806416020522052, "learning_rate": 1.1100114008931038e-05, "loss": 0.8599, "step": 2262 }, { "epoch": 3.90846286701209, "grad_norm": 0.2572456795373681, "learning_rate": 1.1066783552604136e-05, "loss": 0.8733, "step": 2263 }, { "epoch": 3.910189982728843, "grad_norm": 0.20381497626295508, "learning_rate": 1.1033495176783244e-05, "loss": 0.8771, "step": 2264 }, { "epoch": 3.911917098445596, "grad_norm": 0.18519952606413184, "learning_rate": 1.1000248929883001e-05, "loss": 0.8762, "step": 2265 }, { "epoch": 3.913644214162349, "grad_norm": 0.18095864367036835, "learning_rate": 1.0967044860256814e-05, "loss": 0.8566, "step": 2266 }, { "epoch": 3.9153713298791017, "grad_norm": 0.171783404182663, "learning_rate": 1.0933883016196716e-05, "loss": 0.8666, "step": 2267 }, { "epoch": 3.917098445595855, "grad_norm": 0.21647310823000274, "learning_rate": 1.0900763445933315e-05, "loss": 0.8835, "step": 2268 }, { "epoch": 3.918825561312608, "grad_norm": 0.1631096945377616, "learning_rate": 1.0867686197635785e-05, "loss": 0.86, "step": 2269 }, { "epoch": 3.920552677029361, "grad_norm": 0.2078892907275828, "learning_rate": 1.08346513194117e-05, "loss": 0.866, "step": 2270 }, { "epoch": 3.9222797927461137, "grad_norm": 0.16931670257694736, "learning_rate": 1.0801658859307023e-05, "loss": 0.8814, "step": 2271 }, { "epoch": 3.924006908462867, "grad_norm": 0.1663068178092915, "learning_rate": 1.0768708865306046e-05, "loss": 0.869, "step": 2272 }, { "epoch": 3.92573402417962, "grad_norm": 0.18906123582166393, "learning_rate": 1.0735801385331275e-05, "loss": 0.852, "step": 2273 }, { "epoch": 3.927461139896373, "grad_norm": 0.15115296379263066, "learning_rate": 1.0702936467243373e-05, "loss": 0.8463, "step": 2274 }, { "epoch": 3.929188255613126, "grad_norm": 0.16283762684697572, "learning_rate": 1.0670114158841142e-05, "loss": 0.8523, "step": 2275 }, { "epoch": 3.930915371329879, "grad_norm": 0.16896742059379244, "learning_rate": 1.0637334507861384e-05, "loss": 0.8894, "step": 2276 }, { "epoch": 3.932642487046632, "grad_norm": 0.14607850128293406, "learning_rate": 1.0604597561978842e-05, "loss": 0.8868, "step": 2277 }, { "epoch": 3.934369602763385, "grad_norm": 0.18771840551553307, "learning_rate": 1.0571903368806198e-05, "loss": 0.8481, "step": 2278 }, { "epoch": 3.936096718480138, "grad_norm": 0.1520325775998876, "learning_rate": 1.0539251975893908e-05, "loss": 0.8867, "step": 2279 }, { "epoch": 3.937823834196891, "grad_norm": 0.14947705121937901, "learning_rate": 1.0506643430730214e-05, "loss": 0.8651, "step": 2280 }, { "epoch": 3.9395509499136443, "grad_norm": 0.16659521247727618, "learning_rate": 1.0474077780741024e-05, "loss": 0.8781, "step": 2281 }, { "epoch": 3.941278065630397, "grad_norm": 0.15035942928296464, "learning_rate": 1.0441555073289833e-05, "loss": 0.8648, "step": 2282 }, { "epoch": 3.94300518134715, "grad_norm": 0.17253102105355916, "learning_rate": 1.0409075355677745e-05, "loss": 0.8523, "step": 2283 }, { "epoch": 3.9447322970639034, "grad_norm": 0.15344662090792272, "learning_rate": 1.0376638675143283e-05, "loss": 0.8678, "step": 2284 }, { "epoch": 3.9464594127806563, "grad_norm": 0.16908664671179346, "learning_rate": 1.0344245078862386e-05, "loss": 0.8705, "step": 2285 }, { "epoch": 3.948186528497409, "grad_norm": 0.13352715955749314, "learning_rate": 1.0311894613948361e-05, "loss": 0.8482, "step": 2286 }, { "epoch": 3.9499136442141625, "grad_norm": 1.0784515303027762, "learning_rate": 1.0279587327451757e-05, "loss": 0.895, "step": 2287 }, { "epoch": 3.9516407599309153, "grad_norm": 0.16298830883312815, "learning_rate": 1.0247323266360333e-05, "loss": 0.861, "step": 2288 }, { "epoch": 3.9533678756476682, "grad_norm": 0.1494873852201232, "learning_rate": 1.021510247759896e-05, "loss": 0.8828, "step": 2289 }, { "epoch": 3.9550949913644216, "grad_norm": 0.3961017478655708, "learning_rate": 1.018292500802962e-05, "loss": 0.864, "step": 2290 }, { "epoch": 3.9568221070811744, "grad_norm": 0.14350170902162837, "learning_rate": 1.0150790904451258e-05, "loss": 0.8614, "step": 2291 }, { "epoch": 3.9585492227979273, "grad_norm": 0.18406595663843536, "learning_rate": 1.0118700213599734e-05, "loss": 0.8707, "step": 2292 }, { "epoch": 3.9602763385146806, "grad_norm": 0.16526937944364348, "learning_rate": 1.008665298214782e-05, "loss": 0.8592, "step": 2293 }, { "epoch": 3.9620034542314335, "grad_norm": 0.1878431462077027, "learning_rate": 1.0054649256705033e-05, "loss": 0.8791, "step": 2294 }, { "epoch": 3.9637305699481864, "grad_norm": 0.18212211588691726, "learning_rate": 1.0022689083817623e-05, "loss": 0.8639, "step": 2295 }, { "epoch": 3.9654576856649397, "grad_norm": 0.17454310267687417, "learning_rate": 9.99077250996853e-06, "loss": 0.8412, "step": 2296 }, { "epoch": 3.9671848013816926, "grad_norm": 0.19724980951424523, "learning_rate": 9.958899581577236e-06, "loss": 0.8554, "step": 2297 }, { "epoch": 3.9689119170984455, "grad_norm": 0.16877056453443268, "learning_rate": 9.927070344999779e-06, "loss": 0.8596, "step": 2298 }, { "epoch": 3.970639032815199, "grad_norm": 0.18165344096566755, "learning_rate": 9.895284846528623e-06, "loss": 0.8587, "step": 2299 }, { "epoch": 3.9723661485319517, "grad_norm": 0.1876472843647405, "learning_rate": 9.863543132392665e-06, "loss": 0.8504, "step": 2300 }, { "epoch": 3.9740932642487046, "grad_norm": 0.1421874566943582, "learning_rate": 9.83184524875708e-06, "loss": 0.8637, "step": 2301 }, { "epoch": 3.975820379965458, "grad_norm": 0.19394588489732492, "learning_rate": 9.800191241723293e-06, "loss": 0.8483, "step": 2302 }, { "epoch": 3.9775474956822108, "grad_norm": 0.19086982203078703, "learning_rate": 9.768581157328958e-06, "loss": 0.8768, "step": 2303 }, { "epoch": 3.9792746113989637, "grad_norm": 0.15392281887240292, "learning_rate": 9.737015041547807e-06, "loss": 0.8635, "step": 2304 }, { "epoch": 3.981001727115717, "grad_norm": 0.2029681661024322, "learning_rate": 9.705492940289632e-06, "loss": 0.8807, "step": 2305 }, { "epoch": 3.98272884283247, "grad_norm": 0.1967861074486307, "learning_rate": 9.67401489940024e-06, "loss": 0.87, "step": 2306 }, { "epoch": 3.9844559585492227, "grad_norm": 0.160296461848377, "learning_rate": 9.642580964661312e-06, "loss": 0.8607, "step": 2307 }, { "epoch": 3.986183074265976, "grad_norm": 0.1854972120520951, "learning_rate": 9.611191181790397e-06, "loss": 0.8584, "step": 2308 }, { "epoch": 3.987910189982729, "grad_norm": 0.18202126844285782, "learning_rate": 9.579845596440856e-06, "loss": 0.8628, "step": 2309 }, { "epoch": 3.989637305699482, "grad_norm": 0.2000754684779986, "learning_rate": 9.548544254201739e-06, "loss": 0.8562, "step": 2310 }, { "epoch": 3.991364421416235, "grad_norm": 0.1493945265819573, "learning_rate": 9.517287200597752e-06, "loss": 0.8554, "step": 2311 }, { "epoch": 3.993091537132988, "grad_norm": 0.16199409773432472, "learning_rate": 9.48607448108918e-06, "loss": 0.8673, "step": 2312 }, { "epoch": 3.994818652849741, "grad_norm": 0.13676839403421848, "learning_rate": 9.454906141071873e-06, "loss": 0.8677, "step": 2313 }, { "epoch": 3.996545768566494, "grad_norm": 0.14873219074574623, "learning_rate": 9.42378222587708e-06, "loss": 0.87, "step": 2314 }, { "epoch": 3.998272884283247, "grad_norm": 0.15227855366863421, "learning_rate": 9.392702780771463e-06, "loss": 0.8496, "step": 2315 }, { "epoch": 4.0, "grad_norm": 0.14977168763748608, "learning_rate": 9.361667850957019e-06, "loss": 0.8543, "step": 2316 }, { "epoch": 4.001727115716753, "grad_norm": 0.17311323596800685, "learning_rate": 9.330677481570984e-06, "loss": 0.8557, "step": 2317 }, { "epoch": 4.003454231433506, "grad_norm": 0.1596200526691956, "learning_rate": 9.299731717685771e-06, "loss": 0.84, "step": 2318 }, { "epoch": 4.005181347150259, "grad_norm": 0.15731338813105097, "learning_rate": 9.268830604308965e-06, "loss": 0.835, "step": 2319 }, { "epoch": 4.006908462867012, "grad_norm": 0.1980192228181531, "learning_rate": 9.237974186383174e-06, "loss": 0.8516, "step": 2320 }, { "epoch": 4.008635578583765, "grad_norm": 0.17791280212725463, "learning_rate": 9.207162508786006e-06, "loss": 0.8386, "step": 2321 }, { "epoch": 4.010362694300518, "grad_norm": 0.1970259455344314, "learning_rate": 9.176395616329996e-06, "loss": 0.853, "step": 2322 }, { "epoch": 4.0120898100172715, "grad_norm": 0.1876556689787625, "learning_rate": 9.145673553762569e-06, "loss": 0.8525, "step": 2323 }, { "epoch": 4.013816925734024, "grad_norm": 0.17631638732922264, "learning_rate": 9.114996365765924e-06, "loss": 0.8507, "step": 2324 }, { "epoch": 4.015544041450777, "grad_norm": 0.15725623922437995, "learning_rate": 9.084364096956987e-06, "loss": 0.8547, "step": 2325 }, { "epoch": 4.017271157167531, "grad_norm": 0.16412693341179768, "learning_rate": 9.053776791887391e-06, "loss": 0.8527, "step": 2326 }, { "epoch": 4.018998272884283, "grad_norm": 0.15563053132258498, "learning_rate": 9.023234495043342e-06, "loss": 0.8634, "step": 2327 }, { "epoch": 4.020725388601036, "grad_norm": 0.13698592929371314, "learning_rate": 8.992737250845583e-06, "loss": 0.8368, "step": 2328 }, { "epoch": 4.02245250431779, "grad_norm": 0.17413070486483956, "learning_rate": 8.96228510364936e-06, "loss": 0.8487, "step": 2329 }, { "epoch": 4.024179620034542, "grad_norm": 0.13901561952684774, "learning_rate": 8.931878097744314e-06, "loss": 0.8334, "step": 2330 }, { "epoch": 4.025906735751295, "grad_norm": 0.1524946233145558, "learning_rate": 8.90151627735441e-06, "loss": 0.844, "step": 2331 }, { "epoch": 4.027633851468049, "grad_norm": 0.14859322819484097, "learning_rate": 8.871199686637944e-06, "loss": 0.8524, "step": 2332 }, { "epoch": 4.029360967184801, "grad_norm": 0.13575325038366462, "learning_rate": 8.840928369687391e-06, "loss": 0.8347, "step": 2333 }, { "epoch": 4.0310880829015545, "grad_norm": 0.15438872843721282, "learning_rate": 8.810702370529393e-06, "loss": 0.8566, "step": 2334 }, { "epoch": 4.032815198618308, "grad_norm": 0.15028853873253356, "learning_rate": 8.780521733124669e-06, "loss": 0.8372, "step": 2335 }, { "epoch": 4.03454231433506, "grad_norm": 0.15362568233300877, "learning_rate": 8.750386501368004e-06, "loss": 0.8603, "step": 2336 }, { "epoch": 4.036269430051814, "grad_norm": 0.15711917411432935, "learning_rate": 8.720296719088095e-06, "loss": 0.8306, "step": 2337 }, { "epoch": 4.037996545768567, "grad_norm": 0.14638380724786268, "learning_rate": 8.690252430047556e-06, "loss": 0.8311, "step": 2338 }, { "epoch": 4.039723661485319, "grad_norm": 0.1523901141653662, "learning_rate": 8.660253677942854e-06, "loss": 0.8529, "step": 2339 }, { "epoch": 4.041450777202073, "grad_norm": 0.14642715743764317, "learning_rate": 8.630300506404205e-06, "loss": 0.8617, "step": 2340 }, { "epoch": 4.043177892918825, "grad_norm": 0.18150602684115597, "learning_rate": 8.600392958995529e-06, "loss": 0.8363, "step": 2341 }, { "epoch": 4.0449050086355784, "grad_norm": 0.14420609328124023, "learning_rate": 8.570531079214416e-06, "loss": 0.8396, "step": 2342 }, { "epoch": 4.046632124352332, "grad_norm": 0.1593515595358198, "learning_rate": 8.540714910492016e-06, "loss": 0.8644, "step": 2343 }, { "epoch": 4.048359240069084, "grad_norm": 0.14363874952119354, "learning_rate": 8.510944496193004e-06, "loss": 0.8575, "step": 2344 }, { "epoch": 4.0500863557858375, "grad_norm": 0.14068344878420908, "learning_rate": 8.481219879615485e-06, "loss": 0.8616, "step": 2345 }, { "epoch": 4.051813471502591, "grad_norm": 0.1678499914232787, "learning_rate": 8.45154110399101e-06, "loss": 0.8534, "step": 2346 }, { "epoch": 4.053540587219343, "grad_norm": 0.15776681261639572, "learning_rate": 8.421908212484417e-06, "loss": 0.8668, "step": 2347 }, { "epoch": 4.055267702936097, "grad_norm": 0.1363764512302756, "learning_rate": 8.392321248193803e-06, "loss": 0.8493, "step": 2348 }, { "epoch": 4.05699481865285, "grad_norm": 0.14754557804268603, "learning_rate": 8.36278025415051e-06, "loss": 0.8303, "step": 2349 }, { "epoch": 4.058721934369602, "grad_norm": 0.165284686660264, "learning_rate": 8.33328527331899e-06, "loss": 0.8411, "step": 2350 }, { "epoch": 4.060449050086356, "grad_norm": 0.13969345991062154, "learning_rate": 8.303836348596764e-06, "loss": 0.8564, "step": 2351 }, { "epoch": 4.062176165803109, "grad_norm": 0.16254235288967364, "learning_rate": 8.274433522814407e-06, "loss": 0.8506, "step": 2352 }, { "epoch": 4.0639032815198615, "grad_norm": 0.13512907244053501, "learning_rate": 8.245076838735415e-06, "loss": 0.8486, "step": 2353 }, { "epoch": 4.065630397236615, "grad_norm": 0.13998671365893245, "learning_rate": 8.215766339056176e-06, "loss": 0.8555, "step": 2354 }, { "epoch": 4.067357512953368, "grad_norm": 0.14571487317155765, "learning_rate": 8.186502066405939e-06, "loss": 0.8495, "step": 2355 }, { "epoch": 4.0690846286701206, "grad_norm": 0.13451557657727056, "learning_rate": 8.157284063346687e-06, "loss": 0.8591, "step": 2356 }, { "epoch": 4.070811744386874, "grad_norm": 0.16505648008584112, "learning_rate": 8.128112372373121e-06, "loss": 0.8607, "step": 2357 }, { "epoch": 4.072538860103627, "grad_norm": 0.12410280475969321, "learning_rate": 8.098987035912573e-06, "loss": 0.8401, "step": 2358 }, { "epoch": 4.07426597582038, "grad_norm": 0.44207023468348616, "learning_rate": 8.069908096324987e-06, "loss": 0.8671, "step": 2359 }, { "epoch": 4.075993091537133, "grad_norm": 0.1293662049598758, "learning_rate": 8.040875595902804e-06, "loss": 0.8528, "step": 2360 }, { "epoch": 4.077720207253886, "grad_norm": 0.14945834475423844, "learning_rate": 8.011889576870913e-06, "loss": 0.8406, "step": 2361 }, { "epoch": 4.079447322970639, "grad_norm": 0.13740584212662277, "learning_rate": 7.98295008138664e-06, "loss": 0.838, "step": 2362 }, { "epoch": 4.081174438687392, "grad_norm": 0.14170745623706282, "learning_rate": 7.954057151539617e-06, "loss": 0.8579, "step": 2363 }, { "epoch": 4.082901554404145, "grad_norm": 0.13757335540221588, "learning_rate": 7.925210829351737e-06, "loss": 0.8361, "step": 2364 }, { "epoch": 4.084628670120898, "grad_norm": 0.1697674878578403, "learning_rate": 7.896411156777155e-06, "loss": 0.848, "step": 2365 }, { "epoch": 4.086355785837651, "grad_norm": 0.1471107390185156, "learning_rate": 7.867658175702137e-06, "loss": 0.8353, "step": 2366 }, { "epoch": 4.0880829015544045, "grad_norm": 0.1342706396873131, "learning_rate": 7.838951927945055e-06, "loss": 0.847, "step": 2367 }, { "epoch": 4.089810017271157, "grad_norm": 0.15082664563703513, "learning_rate": 7.810292455256303e-06, "loss": 0.8581, "step": 2368 }, { "epoch": 4.09153713298791, "grad_norm": 0.15248071497888746, "learning_rate": 7.781679799318271e-06, "loss": 0.8721, "step": 2369 }, { "epoch": 4.0932642487046635, "grad_norm": 0.15356022364283586, "learning_rate": 7.753114001745232e-06, "loss": 0.8387, "step": 2370 }, { "epoch": 4.094991364421416, "grad_norm": 0.16362622815331762, "learning_rate": 7.724595104083312e-06, "loss": 0.8553, "step": 2371 }, { "epoch": 4.096718480138169, "grad_norm": 0.15581264283876048, "learning_rate": 7.696123147810448e-06, "loss": 0.8404, "step": 2372 }, { "epoch": 4.098445595854923, "grad_norm": 0.1619946325352462, "learning_rate": 7.667698174336276e-06, "loss": 0.8537, "step": 2373 }, { "epoch": 4.100172711571675, "grad_norm": 0.34145492459183713, "learning_rate": 7.639320225002106e-06, "loss": 0.8659, "step": 2374 }, { "epoch": 4.101899827288428, "grad_norm": 0.15098270284239282, "learning_rate": 7.610989341080883e-06, "loss": 0.8404, "step": 2375 }, { "epoch": 4.103626943005182, "grad_norm": 0.12848984204943426, "learning_rate": 7.5827055637770666e-06, "loss": 0.8534, "step": 2376 }, { "epoch": 4.105354058721934, "grad_norm": 0.19492359848977414, "learning_rate": 7.554468934226613e-06, "loss": 0.8608, "step": 2377 }, { "epoch": 4.1070811744386875, "grad_norm": 0.13816900631596152, "learning_rate": 7.526279493496927e-06, "loss": 0.8602, "step": 2378 }, { "epoch": 4.108808290155441, "grad_norm": 0.12174310074734611, "learning_rate": 7.498137282586757e-06, "loss": 0.8293, "step": 2379 }, { "epoch": 4.110535405872193, "grad_norm": 0.12152274093795147, "learning_rate": 7.470042342426164e-06, "loss": 0.8613, "step": 2380 }, { "epoch": 4.112262521588947, "grad_norm": 0.13826165412161925, "learning_rate": 7.441994713876477e-06, "loss": 0.8621, "step": 2381 }, { "epoch": 4.1139896373057, "grad_norm": 0.12036917021273633, "learning_rate": 7.413994437730196e-06, "loss": 0.8573, "step": 2382 }, { "epoch": 4.115716753022452, "grad_norm": 0.1424588597797964, "learning_rate": 7.386041554710943e-06, "loss": 0.846, "step": 2383 }, { "epoch": 4.117443868739206, "grad_norm": 0.12546393805758316, "learning_rate": 7.358136105473455e-06, "loss": 0.8474, "step": 2384 }, { "epoch": 4.119170984455959, "grad_norm": 0.14006193947036238, "learning_rate": 7.330278130603434e-06, "loss": 0.8434, "step": 2385 }, { "epoch": 4.120898100172711, "grad_norm": 0.1270315726379182, "learning_rate": 7.3024676706175436e-06, "loss": 0.8552, "step": 2386 }, { "epoch": 4.122625215889465, "grad_norm": 0.1272424706366422, "learning_rate": 7.274704765963378e-06, "loss": 0.8611, "step": 2387 }, { "epoch": 4.124352331606218, "grad_norm": 0.13066118093909884, "learning_rate": 7.246989457019325e-06, "loss": 0.8656, "step": 2388 }, { "epoch": 4.1260794473229705, "grad_norm": 0.12267391755790695, "learning_rate": 7.219321784094551e-06, "loss": 0.8411, "step": 2389 }, { "epoch": 4.127806563039724, "grad_norm": 0.14105985729936463, "learning_rate": 7.19170178742898e-06, "loss": 0.8319, "step": 2390 }, { "epoch": 4.129533678756476, "grad_norm": 0.13510046287214433, "learning_rate": 7.164129507193149e-06, "loss": 0.8631, "step": 2391 }, { "epoch": 4.13126079447323, "grad_norm": 0.1432296117030573, "learning_rate": 7.136604983488235e-06, "loss": 0.8456, "step": 2392 }, { "epoch": 4.132987910189983, "grad_norm": 0.12944352716652122, "learning_rate": 7.109128256345932e-06, "loss": 0.8611, "step": 2393 }, { "epoch": 4.134715025906735, "grad_norm": 0.11507429403898589, "learning_rate": 7.081699365728414e-06, "loss": 0.8513, "step": 2394 }, { "epoch": 4.136442141623489, "grad_norm": 0.13329446909350437, "learning_rate": 7.054318351528313e-06, "loss": 0.8623, "step": 2395 }, { "epoch": 4.138169257340242, "grad_norm": 0.14491526071727348, "learning_rate": 7.026985253568597e-06, "loss": 0.8717, "step": 2396 }, { "epoch": 4.139896373056994, "grad_norm": 0.129477016853371, "learning_rate": 6.999700111602554e-06, "loss": 0.8542, "step": 2397 }, { "epoch": 4.141623488773748, "grad_norm": 0.1321416580160395, "learning_rate": 6.972462965313749e-06, "loss": 0.875, "step": 2398 }, { "epoch": 4.143350604490501, "grad_norm": 0.12426643949544422, "learning_rate": 6.945273854315906e-06, "loss": 0.8398, "step": 2399 }, { "epoch": 4.1450777202072535, "grad_norm": 0.12770331798667986, "learning_rate": 6.918132818152896e-06, "loss": 0.8552, "step": 2400 }, { "epoch": 4.146804835924007, "grad_norm": 0.3102799727218449, "learning_rate": 6.8910398962986906e-06, "loss": 0.8376, "step": 2401 }, { "epoch": 4.14853195164076, "grad_norm": 0.1275551043833013, "learning_rate": 6.863995128157266e-06, "loss": 0.8352, "step": 2402 }, { "epoch": 4.150259067357513, "grad_norm": 0.12419388810412675, "learning_rate": 6.836998553062559e-06, "loss": 0.8377, "step": 2403 }, { "epoch": 4.151986183074266, "grad_norm": 0.12981164808899512, "learning_rate": 6.81005021027842e-06, "loss": 0.8497, "step": 2404 }, { "epoch": 4.153713298791019, "grad_norm": 0.15268240753591888, "learning_rate": 6.783150138998569e-06, "loss": 0.8798, "step": 2405 }, { "epoch": 4.155440414507772, "grad_norm": 0.14598819442618388, "learning_rate": 6.7562983783464905e-06, "loss": 0.8354, "step": 2406 }, { "epoch": 4.157167530224525, "grad_norm": 0.1319573822423032, "learning_rate": 6.729494967375414e-06, "loss": 0.84, "step": 2407 }, { "epoch": 4.158894645941278, "grad_norm": 0.15171022772039316, "learning_rate": 6.702739945068266e-06, "loss": 0.8515, "step": 2408 }, { "epoch": 4.160621761658031, "grad_norm": 0.13783223831899657, "learning_rate": 6.676033350337583e-06, "loss": 0.8511, "step": 2409 }, { "epoch": 4.162348877374784, "grad_norm": 0.17998576811550035, "learning_rate": 6.649375222025454e-06, "loss": 0.8738, "step": 2410 }, { "epoch": 4.164075993091537, "grad_norm": 0.13569857303543661, "learning_rate": 6.622765598903513e-06, "loss": 0.8653, "step": 2411 }, { "epoch": 4.16580310880829, "grad_norm": 0.13578834195273407, "learning_rate": 6.596204519672822e-06, "loss": 0.854, "step": 2412 }, { "epoch": 4.167530224525043, "grad_norm": 0.1740422754431349, "learning_rate": 6.569692022963847e-06, "loss": 0.8597, "step": 2413 }, { "epoch": 4.1692573402417965, "grad_norm": 0.14976391562495792, "learning_rate": 6.543228147336389e-06, "loss": 0.8294, "step": 2414 }, { "epoch": 4.170984455958549, "grad_norm": 0.11967408365659375, "learning_rate": 6.516812931279557e-06, "loss": 0.8471, "step": 2415 }, { "epoch": 4.172711571675302, "grad_norm": 0.12417129405342334, "learning_rate": 6.490446413211669e-06, "loss": 0.8353, "step": 2416 }, { "epoch": 4.174438687392056, "grad_norm": 0.133272235735888, "learning_rate": 6.464128631480204e-06, "loss": 0.8393, "step": 2417 }, { "epoch": 4.176165803108808, "grad_norm": 0.14210208675967473, "learning_rate": 6.437859624361804e-06, "loss": 0.8506, "step": 2418 }, { "epoch": 4.177892918825561, "grad_norm": 0.133533417930662, "learning_rate": 6.411639430062133e-06, "loss": 0.8402, "step": 2419 }, { "epoch": 4.179620034542315, "grad_norm": 0.1427393116584703, "learning_rate": 6.38546808671586e-06, "loss": 0.8515, "step": 2420 }, { "epoch": 4.181347150259067, "grad_norm": 0.13350620494481638, "learning_rate": 6.359345632386648e-06, "loss": 0.8555, "step": 2421 }, { "epoch": 4.18307426597582, "grad_norm": 0.1278258217536834, "learning_rate": 6.3332721050670095e-06, "loss": 0.8514, "step": 2422 }, { "epoch": 4.184801381692574, "grad_norm": 0.1380514365906585, "learning_rate": 6.307247542678321e-06, "loss": 0.8436, "step": 2423 }, { "epoch": 4.186528497409326, "grad_norm": 0.13480888813369007, "learning_rate": 6.281271983070727e-06, "loss": 0.8619, "step": 2424 }, { "epoch": 4.1882556131260795, "grad_norm": 0.1320745015616146, "learning_rate": 6.255345464023128e-06, "loss": 0.8473, "step": 2425 }, { "epoch": 4.189982728842833, "grad_norm": 0.14270841102225287, "learning_rate": 6.2294680232430815e-06, "loss": 0.8603, "step": 2426 }, { "epoch": 4.191709844559585, "grad_norm": 0.14014596488745296, "learning_rate": 6.203639698366757e-06, "loss": 0.849, "step": 2427 }, { "epoch": 4.193436960276339, "grad_norm": 0.1559974516737239, "learning_rate": 6.177860526958923e-06, "loss": 0.8423, "step": 2428 }, { "epoch": 4.195164075993092, "grad_norm": 0.1696556547876031, "learning_rate": 6.152130546512829e-06, "loss": 0.8443, "step": 2429 }, { "epoch": 4.196891191709844, "grad_norm": 0.17362265664219306, "learning_rate": 6.12644979445018e-06, "loss": 0.8641, "step": 2430 }, { "epoch": 4.198618307426598, "grad_norm": 0.13991742552746966, "learning_rate": 6.100818308121117e-06, "loss": 0.8637, "step": 2431 }, { "epoch": 4.200345423143351, "grad_norm": 0.14863602905111487, "learning_rate": 6.075236124804087e-06, "loss": 0.8539, "step": 2432 }, { "epoch": 4.2020725388601035, "grad_norm": 0.1207618824289248, "learning_rate": 6.049703281705852e-06, "loss": 0.866, "step": 2433 }, { "epoch": 4.203799654576857, "grad_norm": 0.14870418564180746, "learning_rate": 6.024219815961418e-06, "loss": 0.8605, "step": 2434 }, { "epoch": 4.205526770293609, "grad_norm": 0.15363606827702686, "learning_rate": 5.998785764633966e-06, "loss": 0.8405, "step": 2435 }, { "epoch": 4.2072538860103625, "grad_norm": 0.166362337376057, "learning_rate": 5.973401164714809e-06, "loss": 0.8441, "step": 2436 }, { "epoch": 4.208981001727116, "grad_norm": 0.14991074102812696, "learning_rate": 5.9480660531233384e-06, "loss": 0.8525, "step": 2437 }, { "epoch": 4.210708117443868, "grad_norm": 0.19122438203890774, "learning_rate": 5.922780466706983e-06, "loss": 0.8592, "step": 2438 }, { "epoch": 4.212435233160622, "grad_norm": 0.1426874704542232, "learning_rate": 5.897544442241123e-06, "loss": 0.8504, "step": 2439 }, { "epoch": 4.214162348877375, "grad_norm": 0.21184052699958691, "learning_rate": 5.872358016429056e-06, "loss": 0.8416, "step": 2440 }, { "epoch": 4.215889464594127, "grad_norm": 0.1551194123237828, "learning_rate": 5.847221225901969e-06, "loss": 0.8296, "step": 2441 }, { "epoch": 4.217616580310881, "grad_norm": 0.16587799797819566, "learning_rate": 5.822134107218835e-06, "loss": 0.8406, "step": 2442 }, { "epoch": 4.219343696027634, "grad_norm": 0.129249868544617, "learning_rate": 5.79709669686638e-06, "loss": 0.8472, "step": 2443 }, { "epoch": 4.2210708117443865, "grad_norm": 0.14732102674062997, "learning_rate": 5.772109031259057e-06, "loss": 0.8452, "step": 2444 }, { "epoch": 4.22279792746114, "grad_norm": 0.13586612239505236, "learning_rate": 5.747171146738959e-06, "loss": 0.8383, "step": 2445 }, { "epoch": 4.224525043177893, "grad_norm": 0.13066850395419966, "learning_rate": 5.722283079575768e-06, "loss": 0.8421, "step": 2446 }, { "epoch": 4.226252158894646, "grad_norm": 0.20849621967232945, "learning_rate": 5.697444865966715e-06, "loss": 0.8518, "step": 2447 }, { "epoch": 4.227979274611399, "grad_norm": 0.13698251034333955, "learning_rate": 5.672656542036548e-06, "loss": 0.8682, "step": 2448 }, { "epoch": 4.229706390328152, "grad_norm": 0.15109680938138154, "learning_rate": 5.647918143837423e-06, "loss": 0.8099, "step": 2449 }, { "epoch": 4.231433506044905, "grad_norm": 0.16240988448838828, "learning_rate": 5.6232297073488896e-06, "loss": 0.8326, "step": 2450 }, { "epoch": 4.233160621761658, "grad_norm": 0.140293114998167, "learning_rate": 5.59859126847786e-06, "loss": 0.8545, "step": 2451 }, { "epoch": 4.234887737478411, "grad_norm": 0.13718301604591343, "learning_rate": 5.5740028630585006e-06, "loss": 0.8401, "step": 2452 }, { "epoch": 4.236614853195164, "grad_norm": 0.1388995443453662, "learning_rate": 5.549464526852211e-06, "loss": 0.8378, "step": 2453 }, { "epoch": 4.238341968911917, "grad_norm": 0.1735543610402784, "learning_rate": 5.524976295547593e-06, "loss": 0.84, "step": 2454 }, { "epoch": 4.24006908462867, "grad_norm": 0.11997685518857036, "learning_rate": 5.500538204760353e-06, "loss": 0.8446, "step": 2455 }, { "epoch": 4.241796200345423, "grad_norm": 0.12539015884777183, "learning_rate": 5.476150290033273e-06, "loss": 0.8642, "step": 2456 }, { "epoch": 4.243523316062176, "grad_norm": 0.15654778363992594, "learning_rate": 5.4518125868361805e-06, "loss": 0.8463, "step": 2457 }, { "epoch": 4.2452504317789295, "grad_norm": 0.13881577185229496, "learning_rate": 5.427525130565858e-06, "loss": 0.8424, "step": 2458 }, { "epoch": 4.246977547495682, "grad_norm": 0.1629583004484595, "learning_rate": 5.4032879565460105e-06, "loss": 0.8477, "step": 2459 }, { "epoch": 4.248704663212435, "grad_norm": 0.1489571830457967, "learning_rate": 5.3791011000272e-06, "loss": 0.8414, "step": 2460 }, { "epoch": 4.2504317789291886, "grad_norm": 0.12071930598842981, "learning_rate": 5.3549645961868464e-06, "loss": 0.8439, "step": 2461 }, { "epoch": 4.252158894645941, "grad_norm": 0.1322731287660645, "learning_rate": 5.330878480129098e-06, "loss": 0.8488, "step": 2462 }, { "epoch": 4.253886010362694, "grad_norm": 0.14928910527103653, "learning_rate": 5.3068427868848205e-06, "loss": 0.856, "step": 2463 }, { "epoch": 4.255613126079448, "grad_norm": 0.13892867374882498, "learning_rate": 5.28285755141158e-06, "loss": 0.8335, "step": 2464 }, { "epoch": 4.2573402417962, "grad_norm": 0.12214904516446211, "learning_rate": 5.2589228085935205e-06, "loss": 0.8352, "step": 2465 }, { "epoch": 4.259067357512953, "grad_norm": 0.19117663316711087, "learning_rate": 5.235038593241348e-06, "loss": 0.8413, "step": 2466 }, { "epoch": 4.260794473229707, "grad_norm": 0.11797904359603217, "learning_rate": 5.211204940092325e-06, "loss": 0.8339, "step": 2467 }, { "epoch": 4.262521588946459, "grad_norm": 0.16852109321641873, "learning_rate": 5.187421883810126e-06, "loss": 0.8651, "step": 2468 }, { "epoch": 4.2642487046632125, "grad_norm": 0.12935747281575, "learning_rate": 5.163689458984862e-06, "loss": 0.8472, "step": 2469 }, { "epoch": 4.265975820379966, "grad_norm": 0.11717680247779699, "learning_rate": 5.140007700132992e-06, "loss": 0.8248, "step": 2470 }, { "epoch": 4.267702936096718, "grad_norm": 0.12179228042038769, "learning_rate": 5.116376641697316e-06, "loss": 0.8666, "step": 2471 }, { "epoch": 4.269430051813472, "grad_norm": 0.11421893081934549, "learning_rate": 5.092796318046857e-06, "loss": 0.8382, "step": 2472 }, { "epoch": 4.271157167530225, "grad_norm": 0.1319229314956009, "learning_rate": 5.0692667634768725e-06, "loss": 0.8515, "step": 2473 }, { "epoch": 4.272884283246977, "grad_norm": 0.12052464085795843, "learning_rate": 5.045788012208781e-06, "loss": 0.8527, "step": 2474 }, { "epoch": 4.274611398963731, "grad_norm": 0.14867526774888654, "learning_rate": 5.022360098390105e-06, "loss": 0.8607, "step": 2475 }, { "epoch": 4.276338514680484, "grad_norm": 0.11390306489331456, "learning_rate": 4.998983056094422e-06, "loss": 0.8612, "step": 2476 }, { "epoch": 4.278065630397236, "grad_norm": 0.14139956682943913, "learning_rate": 4.975656919321345e-06, "loss": 0.8603, "step": 2477 }, { "epoch": 4.27979274611399, "grad_norm": 0.1281909862832109, "learning_rate": 4.952381721996431e-06, "loss": 0.8725, "step": 2478 }, { "epoch": 4.281519861830743, "grad_norm": 0.1394673833320002, "learning_rate": 4.929157497971151e-06, "loss": 0.8403, "step": 2479 }, { "epoch": 4.2832469775474955, "grad_norm": 0.12382214865357678, "learning_rate": 4.905984281022856e-06, "loss": 0.8458, "step": 2480 }, { "epoch": 4.284974093264249, "grad_norm": 0.1410552697727774, "learning_rate": 4.882862104854695e-06, "loss": 0.8716, "step": 2481 }, { "epoch": 4.286701208981002, "grad_norm": 0.1344881396294326, "learning_rate": 4.859791003095593e-06, "loss": 0.8581, "step": 2482 }, { "epoch": 4.288428324697755, "grad_norm": 0.11941540245809681, "learning_rate": 4.836771009300174e-06, "loss": 0.8514, "step": 2483 }, { "epoch": 4.290155440414508, "grad_norm": 0.1371639934431839, "learning_rate": 4.813802156948768e-06, "loss": 0.8713, "step": 2484 }, { "epoch": 4.291882556131261, "grad_norm": 0.11502583871007038, "learning_rate": 4.790884479447293e-06, "loss": 0.8502, "step": 2485 }, { "epoch": 4.293609671848014, "grad_norm": 0.11439778448714527, "learning_rate": 4.768018010127238e-06, "loss": 0.8528, "step": 2486 }, { "epoch": 4.295336787564767, "grad_norm": 0.13515652454499683, "learning_rate": 4.745202782245644e-06, "loss": 0.8425, "step": 2487 }, { "epoch": 4.29706390328152, "grad_norm": 0.13384561523831504, "learning_rate": 4.722438828984994e-06, "loss": 0.8422, "step": 2488 }, { "epoch": 4.298791018998273, "grad_norm": 0.13079630670536466, "learning_rate": 4.699726183453206e-06, "loss": 0.8489, "step": 2489 }, { "epoch": 4.300518134715026, "grad_norm": 0.1582289962458131, "learning_rate": 4.677064878683592e-06, "loss": 0.85, "step": 2490 }, { "epoch": 4.3022452504317785, "grad_norm": 0.10545708434352331, "learning_rate": 4.654454947634781e-06, "loss": 0.8548, "step": 2491 }, { "epoch": 4.303972366148532, "grad_norm": 0.10846043919901358, "learning_rate": 4.631896423190667e-06, "loss": 0.8526, "step": 2492 }, { "epoch": 4.305699481865285, "grad_norm": 0.1824917770672271, "learning_rate": 4.609389338160419e-06, "loss": 0.8605, "step": 2493 }, { "epoch": 4.307426597582038, "grad_norm": 0.11828153207964108, "learning_rate": 4.58693372527836e-06, "loss": 0.8664, "step": 2494 }, { "epoch": 4.309153713298791, "grad_norm": 0.11823555281888108, "learning_rate": 4.564529617203955e-06, "loss": 0.8489, "step": 2495 }, { "epoch": 4.310880829015544, "grad_norm": 0.28581745430843203, "learning_rate": 4.5421770465217785e-06, "loss": 0.8603, "step": 2496 }, { "epoch": 4.312607944732297, "grad_norm": 0.16769998041723627, "learning_rate": 4.519876045741432e-06, "loss": 0.8491, "step": 2497 }, { "epoch": 4.31433506044905, "grad_norm": 0.11436047054343214, "learning_rate": 4.4976266472975065e-06, "loss": 0.8229, "step": 2498 }, { "epoch": 4.316062176165803, "grad_norm": 0.11858376116211893, "learning_rate": 4.4754288835495755e-06, "loss": 0.8594, "step": 2499 }, { "epoch": 4.317789291882556, "grad_norm": 0.14651363400979753, "learning_rate": 4.453282786782085e-06, "loss": 0.8498, "step": 2500 }, { "epoch": 4.319516407599309, "grad_norm": 0.13146446967605444, "learning_rate": 4.431188389204342e-06, "loss": 0.8239, "step": 2501 }, { "epoch": 4.321243523316062, "grad_norm": 0.11734170217555799, "learning_rate": 4.409145722950472e-06, "loss": 0.831, "step": 2502 }, { "epoch": 4.322970639032815, "grad_norm": 0.11641123821010219, "learning_rate": 4.387154820079347e-06, "loss": 0.8674, "step": 2503 }, { "epoch": 4.324697754749568, "grad_norm": 0.1133151135403448, "learning_rate": 4.365215712574582e-06, "loss": 0.8474, "step": 2504 }, { "epoch": 4.3264248704663215, "grad_norm": 0.12021545754923249, "learning_rate": 4.343328432344427e-06, "loss": 0.8396, "step": 2505 }, { "epoch": 4.328151986183074, "grad_norm": 0.13403448232979784, "learning_rate": 4.3214930112217644e-06, "loss": 0.8602, "step": 2506 }, { "epoch": 4.329879101899827, "grad_norm": 0.11660521098663544, "learning_rate": 4.299709480964076e-06, "loss": 0.851, "step": 2507 }, { "epoch": 4.331606217616581, "grad_norm": 0.10254573848115506, "learning_rate": 4.2779778732533425e-06, "loss": 0.8377, "step": 2508 }, { "epoch": 4.333333333333333, "grad_norm": 0.12313734935516082, "learning_rate": 4.256298219696042e-06, "loss": 0.8528, "step": 2509 }, { "epoch": 4.335060449050086, "grad_norm": 0.11471123601172734, "learning_rate": 4.234670551823099e-06, "loss": 0.8355, "step": 2510 }, { "epoch": 4.33678756476684, "grad_norm": 0.11142181859877048, "learning_rate": 4.21309490108981e-06, "loss": 0.8577, "step": 2511 }, { "epoch": 4.338514680483592, "grad_norm": 0.12020881098186523, "learning_rate": 4.1915712988758275e-06, "loss": 0.8443, "step": 2512 }, { "epoch": 4.3402417962003454, "grad_norm": 0.1138737544678231, "learning_rate": 4.170099776485117e-06, "loss": 0.8528, "step": 2513 }, { "epoch": 4.341968911917099, "grad_norm": 0.11422648145988384, "learning_rate": 4.148680365145881e-06, "loss": 0.8453, "step": 2514 }, { "epoch": 4.343696027633851, "grad_norm": 0.12222352889108526, "learning_rate": 4.127313096010532e-06, "loss": 0.8565, "step": 2515 }, { "epoch": 4.3454231433506045, "grad_norm": 0.12357239419714587, "learning_rate": 4.105998000155657e-06, "loss": 0.8435, "step": 2516 }, { "epoch": 4.347150259067358, "grad_norm": 0.11795376461411378, "learning_rate": 4.084735108581961e-06, "loss": 0.8575, "step": 2517 }, { "epoch": 4.34887737478411, "grad_norm": 0.11472081774759492, "learning_rate": 4.063524452214225e-06, "loss": 0.8528, "step": 2518 }, { "epoch": 4.350604490500864, "grad_norm": 0.1213205999459636, "learning_rate": 4.042366061901235e-06, "loss": 0.8337, "step": 2519 }, { "epoch": 4.352331606217617, "grad_norm": 0.12285202805544866, "learning_rate": 4.021259968415798e-06, "loss": 0.8541, "step": 2520 }, { "epoch": 4.354058721934369, "grad_norm": 0.11630729223728932, "learning_rate": 4.0002062024546395e-06, "loss": 0.8263, "step": 2521 }, { "epoch": 4.355785837651123, "grad_norm": 0.1145394898409773, "learning_rate": 3.979204794638372e-06, "loss": 0.8766, "step": 2522 }, { "epoch": 4.357512953367876, "grad_norm": 0.11261740186031016, "learning_rate": 3.958255775511486e-06, "loss": 0.8405, "step": 2523 }, { "epoch": 4.3592400690846285, "grad_norm": 0.1165394248393951, "learning_rate": 3.937359175542251e-06, "loss": 0.8644, "step": 2524 }, { "epoch": 4.360967184801382, "grad_norm": 0.12538222047392175, "learning_rate": 3.9165150251227e-06, "loss": 0.8586, "step": 2525 }, { "epoch": 4.362694300518135, "grad_norm": 0.12475989864257615, "learning_rate": 3.895723354568612e-06, "loss": 0.8401, "step": 2526 }, { "epoch": 4.3644214162348876, "grad_norm": 0.2638074585022236, "learning_rate": 3.8749841941194065e-06, "loss": 0.8696, "step": 2527 }, { "epoch": 4.366148531951641, "grad_norm": 0.11949071535645889, "learning_rate": 3.854297573938142e-06, "loss": 0.8647, "step": 2528 }, { "epoch": 4.367875647668393, "grad_norm": 0.12338441072670334, "learning_rate": 3.8336635241114616e-06, "loss": 0.8615, "step": 2529 }, { "epoch": 4.369602763385147, "grad_norm": 0.14755744698732007, "learning_rate": 3.81308207464957e-06, "loss": 0.8608, "step": 2530 }, { "epoch": 4.3713298791019, "grad_norm": 0.10702034601456659, "learning_rate": 3.79255325548614e-06, "loss": 0.826, "step": 2531 }, { "epoch": 4.373056994818652, "grad_norm": 0.1298397956308217, "learning_rate": 3.7720770964783015e-06, "loss": 0.8517, "step": 2532 }, { "epoch": 4.374784110535406, "grad_norm": 0.11642699385781097, "learning_rate": 3.751653627406633e-06, "loss": 0.8561, "step": 2533 }, { "epoch": 4.376511226252159, "grad_norm": 0.1198010000393195, "learning_rate": 3.731282877975031e-06, "loss": 0.8485, "step": 2534 }, { "epoch": 4.3782383419689115, "grad_norm": 0.12583487626073014, "learning_rate": 3.710964877810743e-06, "loss": 0.8457, "step": 2535 }, { "epoch": 4.379965457685665, "grad_norm": 0.1192197056738803, "learning_rate": 3.690699656464305e-06, "loss": 0.8584, "step": 2536 }, { "epoch": 4.381692573402418, "grad_norm": 0.10927081529423255, "learning_rate": 3.6704872434094687e-06, "loss": 0.8603, "step": 2537 }, { "epoch": 4.383419689119171, "grad_norm": 0.14241797798970676, "learning_rate": 3.6503276680431945e-06, "loss": 0.8575, "step": 2538 }, { "epoch": 4.385146804835924, "grad_norm": 0.12131394190228854, "learning_rate": 3.630220959685593e-06, "loss": 0.8675, "step": 2539 }, { "epoch": 4.386873920552677, "grad_norm": 0.10065558807724283, "learning_rate": 3.6101671475798903e-06, "loss": 0.8405, "step": 2540 }, { "epoch": 4.38860103626943, "grad_norm": 0.1171438648537587, "learning_rate": 3.5901662608923784e-06, "loss": 0.8489, "step": 2541 }, { "epoch": 4.390328151986183, "grad_norm": 0.1238807516436274, "learning_rate": 3.5702183287123603e-06, "loss": 0.8336, "step": 2542 }, { "epoch": 4.392055267702936, "grad_norm": 0.11357691405190204, "learning_rate": 3.550323380052141e-06, "loss": 0.8466, "step": 2543 }, { "epoch": 4.393782383419689, "grad_norm": 0.11231825773863795, "learning_rate": 3.530481443846965e-06, "loss": 0.8457, "step": 2544 }, { "epoch": 4.395509499136442, "grad_norm": 0.11160129123218582, "learning_rate": 3.510692548954948e-06, "loss": 0.8397, "step": 2545 }, { "epoch": 4.397236614853195, "grad_norm": 0.14228402917458619, "learning_rate": 3.4909567241571084e-06, "loss": 0.8692, "step": 2546 }, { "epoch": 4.398963730569948, "grad_norm": 0.12259015371557208, "learning_rate": 3.4712739981572407e-06, "loss": 0.8706, "step": 2547 }, { "epoch": 4.400690846286701, "grad_norm": 0.11096918317173607, "learning_rate": 3.451644399581926e-06, "loss": 0.8573, "step": 2548 }, { "epoch": 4.4024179620034545, "grad_norm": 0.1095835098025916, "learning_rate": 3.4320679569804825e-06, "loss": 0.8502, "step": 2549 }, { "epoch": 4.404145077720207, "grad_norm": 0.13122853473830384, "learning_rate": 3.412544698824909e-06, "loss": 0.8392, "step": 2550 }, { "epoch": 4.40587219343696, "grad_norm": 0.12059514764649233, "learning_rate": 3.393074653509856e-06, "loss": 0.8468, "step": 2551 }, { "epoch": 4.407599309153714, "grad_norm": 0.10379807637538974, "learning_rate": 3.3736578493525786e-06, "loss": 0.8537, "step": 2552 }, { "epoch": 4.409326424870466, "grad_norm": 0.12081040652173156, "learning_rate": 3.3542943145929096e-06, "loss": 0.8535, "step": 2553 }, { "epoch": 4.411053540587219, "grad_norm": 0.11384988718297306, "learning_rate": 3.334984077393193e-06, "loss": 0.87, "step": 2554 }, { "epoch": 4.412780656303973, "grad_norm": 0.10205235940290262, "learning_rate": 3.315727165838256e-06, "loss": 0.8436, "step": 2555 }, { "epoch": 4.414507772020725, "grad_norm": 0.11132261387751057, "learning_rate": 3.296523607935389e-06, "loss": 0.8628, "step": 2556 }, { "epoch": 4.416234887737478, "grad_norm": 0.12988464277597797, "learning_rate": 3.2773734316142637e-06, "loss": 0.8638, "step": 2557 }, { "epoch": 4.417962003454232, "grad_norm": 0.11030858270899636, "learning_rate": 3.258276664726907e-06, "loss": 0.8575, "step": 2558 }, { "epoch": 4.419689119170984, "grad_norm": 0.11214096626642689, "learning_rate": 3.2392333350477068e-06, "loss": 0.8543, "step": 2559 }, { "epoch": 4.4214162348877375, "grad_norm": 0.10401253375766685, "learning_rate": 3.2202434702732855e-06, "loss": 0.8392, "step": 2560 }, { "epoch": 4.423143350604491, "grad_norm": 0.12299536609368306, "learning_rate": 3.2013070980225326e-06, "loss": 0.8647, "step": 2561 }, { "epoch": 4.424870466321243, "grad_norm": 0.11185744452753903, "learning_rate": 3.1824242458365238e-06, "loss": 0.855, "step": 2562 }, { "epoch": 4.426597582037997, "grad_norm": 0.09905430857542875, "learning_rate": 3.1635949411785184e-06, "loss": 0.8575, "step": 2563 }, { "epoch": 4.42832469775475, "grad_norm": 0.10690787718330151, "learning_rate": 3.144819211433867e-06, "loss": 0.8505, "step": 2564 }, { "epoch": 4.430051813471502, "grad_norm": 0.15429712260803874, "learning_rate": 3.126097083910016e-06, "loss": 0.8386, "step": 2565 }, { "epoch": 4.431778929188256, "grad_norm": 0.13784889890583538, "learning_rate": 3.1074285858364627e-06, "loss": 0.8422, "step": 2566 }, { "epoch": 4.433506044905009, "grad_norm": 0.13200775435349393, "learning_rate": 3.0888137443646806e-06, "loss": 0.8591, "step": 2567 }, { "epoch": 4.435233160621761, "grad_norm": 0.14598877204218583, "learning_rate": 3.0702525865681188e-06, "loss": 0.8661, "step": 2568 }, { "epoch": 4.436960276338515, "grad_norm": 0.1365859953890488, "learning_rate": 3.051745139442157e-06, "loss": 0.8436, "step": 2569 }, { "epoch": 4.438687392055268, "grad_norm": 0.1349481035333031, "learning_rate": 3.0332914299040505e-06, "loss": 0.8448, "step": 2570 }, { "epoch": 4.4404145077720205, "grad_norm": 0.13435129173551721, "learning_rate": 3.0148914847928857e-06, "loss": 0.8575, "step": 2571 }, { "epoch": 4.442141623488774, "grad_norm": 0.11956517347599087, "learning_rate": 2.996545330869567e-06, "loss": 0.8577, "step": 2572 }, { "epoch": 4.443868739205527, "grad_norm": 0.13782474934783395, "learning_rate": 2.9782529948167726e-06, "loss": 0.8702, "step": 2573 }, { "epoch": 4.44559585492228, "grad_norm": 0.15293563794379067, "learning_rate": 2.960014503238897e-06, "loss": 0.8621, "step": 2574 }, { "epoch": 4.447322970639033, "grad_norm": 0.14848258063483138, "learning_rate": 2.9418298826620152e-06, "loss": 0.8691, "step": 2575 }, { "epoch": 4.449050086355786, "grad_norm": 0.14537747119397912, "learning_rate": 2.923699159533877e-06, "loss": 0.8443, "step": 2576 }, { "epoch": 4.450777202072539, "grad_norm": 0.1275717183946161, "learning_rate": 2.905622360223821e-06, "loss": 0.861, "step": 2577 }, { "epoch": 4.452504317789292, "grad_norm": 0.18379340666638194, "learning_rate": 2.887599511022758e-06, "loss": 0.8495, "step": 2578 }, { "epoch": 4.454231433506045, "grad_norm": 0.1679376248494305, "learning_rate": 2.869630638143157e-06, "loss": 0.8513, "step": 2579 }, { "epoch": 4.455958549222798, "grad_norm": 0.2291980304519739, "learning_rate": 2.8517157677189657e-06, "loss": 0.8424, "step": 2580 }, { "epoch": 4.457685664939551, "grad_norm": 0.15365884773447203, "learning_rate": 2.8338549258055812e-06, "loss": 0.8559, "step": 2581 }, { "epoch": 4.459412780656304, "grad_norm": 0.17435060954691117, "learning_rate": 2.8160481383798567e-06, "loss": 0.8515, "step": 2582 }, { "epoch": 4.461139896373057, "grad_norm": 0.15978199511852206, "learning_rate": 2.798295431339986e-06, "loss": 0.8595, "step": 2583 }, { "epoch": 4.46286701208981, "grad_norm": 0.157054893700398, "learning_rate": 2.780596830505542e-06, "loss": 0.8372, "step": 2584 }, { "epoch": 4.4645941278065635, "grad_norm": 0.14860927236536572, "learning_rate": 2.7629523616173747e-06, "loss": 0.8445, "step": 2585 }, { "epoch": 4.466321243523316, "grad_norm": 0.15591659224275034, "learning_rate": 2.745362050337641e-06, "loss": 0.8477, "step": 2586 }, { "epoch": 4.468048359240069, "grad_norm": 0.1304997845710612, "learning_rate": 2.727825922249703e-06, "loss": 0.8529, "step": 2587 }, { "epoch": 4.469775474956822, "grad_norm": 0.14283976917517993, "learning_rate": 2.7103440028581184e-06, "loss": 0.8422, "step": 2588 }, { "epoch": 4.471502590673575, "grad_norm": 0.12593720954596482, "learning_rate": 2.6929163175886298e-06, "loss": 0.8661, "step": 2589 }, { "epoch": 4.473229706390328, "grad_norm": 0.12063209064171485, "learning_rate": 2.6755428917880764e-06, "loss": 0.8363, "step": 2590 }, { "epoch": 4.474956822107081, "grad_norm": 0.10914373109374105, "learning_rate": 2.6582237507243802e-06, "loss": 0.8628, "step": 2591 }, { "epoch": 4.476683937823834, "grad_norm": 0.18169846223864453, "learning_rate": 2.640958919586538e-06, "loss": 0.8328, "step": 2592 }, { "epoch": 4.478411053540587, "grad_norm": 0.1072675236331911, "learning_rate": 2.6237484234845354e-06, "loss": 0.8507, "step": 2593 }, { "epoch": 4.48013816925734, "grad_norm": 0.10391739094923619, "learning_rate": 2.606592287449341e-06, "loss": 0.8688, "step": 2594 }, { "epoch": 4.481865284974093, "grad_norm": 0.1260142658204185, "learning_rate": 2.589490536432853e-06, "loss": 0.8526, "step": 2595 }, { "epoch": 4.4835924006908465, "grad_norm": 0.11029851307249616, "learning_rate": 2.5724431953078853e-06, "loss": 0.8601, "step": 2596 }, { "epoch": 4.485319516407599, "grad_norm": 0.10680939104062992, "learning_rate": 2.555450288868113e-06, "loss": 0.8576, "step": 2597 }, { "epoch": 4.487046632124352, "grad_norm": 0.20841489946304853, "learning_rate": 2.538511841828033e-06, "loss": 0.8494, "step": 2598 }, { "epoch": 4.488773747841106, "grad_norm": 0.10558540951294565, "learning_rate": 2.5216278788229563e-06, "loss": 0.863, "step": 2599 }, { "epoch": 4.490500863557858, "grad_norm": 0.11399233228075124, "learning_rate": 2.504798424408934e-06, "loss": 0.8495, "step": 2600 }, { "epoch": 4.492227979274611, "grad_norm": 0.12481215796315617, "learning_rate": 2.4880235030627376e-06, "loss": 0.832, "step": 2601 }, { "epoch": 4.493955094991365, "grad_norm": 0.1230386201144327, "learning_rate": 2.471303139181842e-06, "loss": 0.8577, "step": 2602 }, { "epoch": 4.495682210708117, "grad_norm": 0.10460978256129207, "learning_rate": 2.4546373570843684e-06, "loss": 0.8496, "step": 2603 }, { "epoch": 4.4974093264248705, "grad_norm": 0.10943374128617726, "learning_rate": 2.438026181009039e-06, "loss": 0.8591, "step": 2604 }, { "epoch": 4.499136442141624, "grad_norm": 0.18125928713568626, "learning_rate": 2.4214696351151813e-06, "loss": 0.8544, "step": 2605 }, { "epoch": 4.500863557858376, "grad_norm": 0.11078729928216753, "learning_rate": 2.404967743482649e-06, "loss": 0.8481, "step": 2606 }, { "epoch": 4.5025906735751295, "grad_norm": 0.09952718011356464, "learning_rate": 2.3885205301118132e-06, "loss": 0.8563, "step": 2607 }, { "epoch": 4.504317789291883, "grad_norm": 0.11020778982828988, "learning_rate": 2.3721280189235297e-06, "loss": 0.8464, "step": 2608 }, { "epoch": 4.506044905008635, "grad_norm": 0.2364452843544742, "learning_rate": 2.3557902337590787e-06, "loss": 0.8625, "step": 2609 }, { "epoch": 4.507772020725389, "grad_norm": 0.11775602250803478, "learning_rate": 2.3395071983801554e-06, "loss": 0.8399, "step": 2610 }, { "epoch": 4.509499136442142, "grad_norm": 0.10689839094954191, "learning_rate": 2.3232789364688337e-06, "loss": 0.8376, "step": 2611 }, { "epoch": 4.511226252158894, "grad_norm": 0.20294071625861496, "learning_rate": 2.3071054716275133e-06, "loss": 0.8459, "step": 2612 }, { "epoch": 4.512953367875648, "grad_norm": 0.11323526266390863, "learning_rate": 2.290986827378898e-06, "loss": 0.8377, "step": 2613 }, { "epoch": 4.514680483592401, "grad_norm": 0.1051093313641547, "learning_rate": 2.274923027165974e-06, "loss": 0.8427, "step": 2614 }, { "epoch": 4.5164075993091535, "grad_norm": 0.1199683072596203, "learning_rate": 2.2589140943519407e-06, "loss": 0.8411, "step": 2615 }, { "epoch": 4.518134715025907, "grad_norm": 0.10479008394956517, "learning_rate": 2.242960052220218e-06, "loss": 0.8543, "step": 2616 }, { "epoch": 4.51986183074266, "grad_norm": 0.11215616165340754, "learning_rate": 2.227060923974387e-06, "loss": 0.8566, "step": 2617 }, { "epoch": 4.521588946459413, "grad_norm": 0.10929711120287718, "learning_rate": 2.2112167327381463e-06, "loss": 0.846, "step": 2618 }, { "epoch": 4.523316062176166, "grad_norm": 0.11087283013436425, "learning_rate": 2.1954275015553206e-06, "loss": 0.8519, "step": 2619 }, { "epoch": 4.525043177892919, "grad_norm": 0.1075304928834679, "learning_rate": 2.179693253389785e-06, "loss": 0.8699, "step": 2620 }, { "epoch": 4.526770293609672, "grad_norm": 0.11527359484603042, "learning_rate": 2.1640140111254393e-06, "loss": 0.8543, "step": 2621 }, { "epoch": 4.528497409326425, "grad_norm": 0.1096295167202731, "learning_rate": 2.1483897975662104e-06, "loss": 0.8297, "step": 2622 }, { "epoch": 4.530224525043177, "grad_norm": 0.11636209818964653, "learning_rate": 2.1328206354359616e-06, "loss": 0.8498, "step": 2623 }, { "epoch": 4.531951640759931, "grad_norm": 0.10355193389436852, "learning_rate": 2.1173065473784994e-06, "loss": 0.864, "step": 2624 }, { "epoch": 4.533678756476684, "grad_norm": 0.10513645509893031, "learning_rate": 2.1018475559575434e-06, "loss": 0.8563, "step": 2625 }, { "epoch": 4.5354058721934365, "grad_norm": 0.10266736227774696, "learning_rate": 2.0864436836566646e-06, "loss": 0.8595, "step": 2626 }, { "epoch": 4.53713298791019, "grad_norm": 0.10026184977902118, "learning_rate": 2.0710949528792666e-06, "loss": 0.8546, "step": 2627 }, { "epoch": 4.538860103626943, "grad_norm": 0.11139437460769407, "learning_rate": 2.055801385948577e-06, "loss": 0.8604, "step": 2628 }, { "epoch": 4.540587219343696, "grad_norm": 0.09988589138401587, "learning_rate": 2.040563005107572e-06, "loss": 0.8358, "step": 2629 }, { "epoch": 4.542314335060449, "grad_norm": 0.09779601914693559, "learning_rate": 2.0253798325189722e-06, "loss": 0.8464, "step": 2630 }, { "epoch": 4.544041450777202, "grad_norm": 0.09962221970243772, "learning_rate": 2.0102518902651935e-06, "loss": 0.8499, "step": 2631 }, { "epoch": 4.545768566493955, "grad_norm": 0.11089981408991725, "learning_rate": 1.9951792003483473e-06, "loss": 0.8333, "step": 2632 }, { "epoch": 4.547495682210708, "grad_norm": 0.1002736151571401, "learning_rate": 1.9801617846901688e-06, "loss": 0.84, "step": 2633 }, { "epoch": 4.549222797927461, "grad_norm": 0.10589801212505356, "learning_rate": 1.9651996651320006e-06, "loss": 0.8495, "step": 2634 }, { "epoch": 4.550949913644214, "grad_norm": 0.10420887571940908, "learning_rate": 1.950292863434777e-06, "loss": 0.8505, "step": 2635 }, { "epoch": 4.552677029360967, "grad_norm": 0.1091224860018779, "learning_rate": 1.9354414012789613e-06, "loss": 0.858, "step": 2636 }, { "epoch": 4.55440414507772, "grad_norm": 0.09857366940256375, "learning_rate": 1.9206453002645363e-06, "loss": 0.8622, "step": 2637 }, { "epoch": 4.556131260794473, "grad_norm": 0.105718372033913, "learning_rate": 1.9059045819109823e-06, "loss": 0.8565, "step": 2638 }, { "epoch": 4.557858376511226, "grad_norm": 0.12074916421918677, "learning_rate": 1.8912192676572117e-06, "loss": 0.8517, "step": 2639 }, { "epoch": 4.5595854922279795, "grad_norm": 0.10433047103618372, "learning_rate": 1.8765893788615574e-06, "loss": 0.863, "step": 2640 }, { "epoch": 4.561312607944732, "grad_norm": 0.10527403572891926, "learning_rate": 1.862014936801755e-06, "loss": 0.8637, "step": 2641 }, { "epoch": 4.563039723661485, "grad_norm": 0.11445647785705275, "learning_rate": 1.8474959626748878e-06, "loss": 0.8518, "step": 2642 }, { "epoch": 4.564766839378239, "grad_norm": 0.11338813464315939, "learning_rate": 1.8330324775973762e-06, "loss": 0.8572, "step": 2643 }, { "epoch": 4.566493955094991, "grad_norm": 0.10293675152435117, "learning_rate": 1.8186245026049177e-06, "loss": 0.8529, "step": 2644 }, { "epoch": 4.568221070811744, "grad_norm": 0.09587838713739152, "learning_rate": 1.8042720586525054e-06, "loss": 0.8237, "step": 2645 }, { "epoch": 4.569948186528498, "grad_norm": 0.09625746589660492, "learning_rate": 1.7899751666143438e-06, "loss": 0.8594, "step": 2646 }, { "epoch": 4.57167530224525, "grad_norm": 0.10478606145911483, "learning_rate": 1.7757338472838448e-06, "loss": 0.8472, "step": 2647 }, { "epoch": 4.573402417962003, "grad_norm": 0.09695916142830067, "learning_rate": 1.7615481213736086e-06, "loss": 0.8481, "step": 2648 }, { "epoch": 4.575129533678757, "grad_norm": 0.10096054699793071, "learning_rate": 1.747418009515376e-06, "loss": 0.8312, "step": 2649 }, { "epoch": 4.576856649395509, "grad_norm": 0.10275904924724491, "learning_rate": 1.7333435322599834e-06, "loss": 0.8414, "step": 2650 }, { "epoch": 4.5785837651122625, "grad_norm": 0.1129612907870064, "learning_rate": 1.7193247100773858e-06, "loss": 0.8601, "step": 2651 }, { "epoch": 4.580310880829016, "grad_norm": 0.11096868369383865, "learning_rate": 1.7053615633565712e-06, "loss": 0.86, "step": 2652 }, { "epoch": 4.582037996545768, "grad_norm": 0.09580057650105976, "learning_rate": 1.6914541124055528e-06, "loss": 0.8566, "step": 2653 }, { "epoch": 4.583765112262522, "grad_norm": 0.09261148347038847, "learning_rate": 1.6776023774513416e-06, "loss": 0.8423, "step": 2654 }, { "epoch": 4.585492227979275, "grad_norm": 0.10168609183717849, "learning_rate": 1.663806378639925e-06, "loss": 0.8779, "step": 2655 }, { "epoch": 4.587219343696027, "grad_norm": 0.09762733311283578, "learning_rate": 1.6500661360362213e-06, "loss": 0.853, "step": 2656 }, { "epoch": 4.588946459412781, "grad_norm": 0.10167190836631788, "learning_rate": 1.6363816696240498e-06, "loss": 0.8391, "step": 2657 }, { "epoch": 4.590673575129534, "grad_norm": 0.0987324692692998, "learning_rate": 1.6227529993061254e-06, "loss": 0.8398, "step": 2658 }, { "epoch": 4.592400690846286, "grad_norm": 0.10242651638078878, "learning_rate": 1.6091801449039967e-06, "loss": 0.8558, "step": 2659 }, { "epoch": 4.59412780656304, "grad_norm": 0.09911494799671759, "learning_rate": 1.5956631261580336e-06, "loss": 0.8495, "step": 2660 }, { "epoch": 4.595854922279793, "grad_norm": 0.13648545549563232, "learning_rate": 1.582201962727421e-06, "loss": 0.8664, "step": 2661 }, { "epoch": 4.5975820379965455, "grad_norm": 0.09799095908601797, "learning_rate": 1.5687966741900807e-06, "loss": 0.8464, "step": 2662 }, { "epoch": 4.599309153713299, "grad_norm": 0.10377658264190456, "learning_rate": 1.5554472800426834e-06, "loss": 0.8428, "step": 2663 }, { "epoch": 4.601036269430052, "grad_norm": 0.0978035258119922, "learning_rate": 1.542153799700601e-06, "loss": 0.8426, "step": 2664 }, { "epoch": 4.602763385146805, "grad_norm": 0.09779324713014156, "learning_rate": 1.5289162524978963e-06, "loss": 0.8395, "step": 2665 }, { "epoch": 4.604490500863558, "grad_norm": 0.09379018090808328, "learning_rate": 1.5157346576872666e-06, "loss": 0.8421, "step": 2666 }, { "epoch": 4.606217616580311, "grad_norm": 0.20090618232139545, "learning_rate": 1.502609034440039e-06, "loss": 0.8684, "step": 2667 }, { "epoch": 4.607944732297064, "grad_norm": 0.09554573444184702, "learning_rate": 1.4895394018461474e-06, "loss": 0.8526, "step": 2668 }, { "epoch": 4.609671848013817, "grad_norm": 0.09200627457040406, "learning_rate": 1.4765257789140708e-06, "loss": 0.8634, "step": 2669 }, { "epoch": 4.61139896373057, "grad_norm": 0.1002446400546579, "learning_rate": 1.4635681845708382e-06, "loss": 0.8579, "step": 2670 }, { "epoch": 4.613126079447323, "grad_norm": 0.10488702534323212, "learning_rate": 1.450666637661997e-06, "loss": 0.856, "step": 2671 }, { "epoch": 4.614853195164076, "grad_norm": 0.09885992950981361, "learning_rate": 1.4378211569515777e-06, "loss": 0.8304, "step": 2672 }, { "epoch": 4.616580310880829, "grad_norm": 0.09830487502190065, "learning_rate": 1.4250317611220443e-06, "loss": 0.8594, "step": 2673 }, { "epoch": 4.618307426597582, "grad_norm": 0.10491621217103253, "learning_rate": 1.4122984687743312e-06, "loss": 0.8684, "step": 2674 }, { "epoch": 4.620034542314335, "grad_norm": 0.0994962783690584, "learning_rate": 1.3996212984277447e-06, "loss": 0.8487, "step": 2675 }, { "epoch": 4.6217616580310885, "grad_norm": 0.09867400870176475, "learning_rate": 1.3870002685199801e-06, "loss": 0.8423, "step": 2676 }, { "epoch": 4.623488773747841, "grad_norm": 0.1030436057240703, "learning_rate": 1.3744353974070702e-06, "loss": 0.8523, "step": 2677 }, { "epoch": 4.625215889464594, "grad_norm": 0.10201575033741761, "learning_rate": 1.361926703363392e-06, "loss": 0.8278, "step": 2678 }, { "epoch": 4.626943005181348, "grad_norm": 0.09500770424102252, "learning_rate": 1.3494742045815978e-06, "loss": 0.8376, "step": 2679 }, { "epoch": 4.6286701208981, "grad_norm": 0.09867259441716493, "learning_rate": 1.3370779191726136e-06, "loss": 0.8377, "step": 2680 }, { "epoch": 4.630397236614853, "grad_norm": 0.10066169917301117, "learning_rate": 1.324737865165626e-06, "loss": 0.849, "step": 2681 }, { "epoch": 4.632124352331607, "grad_norm": 0.09821878968799513, "learning_rate": 1.3124540605080173e-06, "loss": 0.8554, "step": 2682 }, { "epoch": 4.633851468048359, "grad_norm": 0.10423298787894593, "learning_rate": 1.3002265230653622e-06, "loss": 0.8455, "step": 2683 }, { "epoch": 4.6355785837651124, "grad_norm": 0.10325107541460841, "learning_rate": 1.2880552706214134e-06, "loss": 0.8531, "step": 2684 }, { "epoch": 4.637305699481866, "grad_norm": 0.11036732676056801, "learning_rate": 1.2759403208780552e-06, "loss": 0.8619, "step": 2685 }, { "epoch": 4.639032815198618, "grad_norm": 0.10647438669267417, "learning_rate": 1.2638816914552864e-06, "loss": 0.839, "step": 2686 }, { "epoch": 4.6407599309153715, "grad_norm": 0.0988601162764577, "learning_rate": 1.2518793998911894e-06, "loss": 0.871, "step": 2687 }, { "epoch": 4.642487046632124, "grad_norm": 0.09154452460966409, "learning_rate": 1.2399334636419113e-06, "loss": 0.8442, "step": 2688 }, { "epoch": 4.644214162348877, "grad_norm": 0.09154989727858184, "learning_rate": 1.2280439000816435e-06, "loss": 0.8417, "step": 2689 }, { "epoch": 4.645941278065631, "grad_norm": 0.10085872730456583, "learning_rate": 1.2162107265025714e-06, "loss": 0.8627, "step": 2690 }, { "epoch": 4.647668393782383, "grad_norm": 0.09820875152616564, "learning_rate": 1.204433960114888e-06, "loss": 0.8284, "step": 2691 }, { "epoch": 4.649395509499136, "grad_norm": 0.10453984984759185, "learning_rate": 1.1927136180467324e-06, "loss": 0.8576, "step": 2692 }, { "epoch": 4.65112262521589, "grad_norm": 0.11009667455417635, "learning_rate": 1.1810497173441804e-06, "loss": 0.8479, "step": 2693 }, { "epoch": 4.652849740932642, "grad_norm": 0.09450702204963533, "learning_rate": 1.1694422749712309e-06, "loss": 0.8409, "step": 2694 }, { "epoch": 4.6545768566493955, "grad_norm": 0.10193053806953786, "learning_rate": 1.1578913078097577e-06, "loss": 0.8402, "step": 2695 }, { "epoch": 4.656303972366149, "grad_norm": 0.09437675911510215, "learning_rate": 1.1463968326595088e-06, "loss": 0.8562, "step": 2696 }, { "epoch": 4.658031088082901, "grad_norm": 0.10683759882988889, "learning_rate": 1.1349588662380495e-06, "loss": 0.8464, "step": 2697 }, { "epoch": 4.6597582037996546, "grad_norm": 0.09878607097566004, "learning_rate": 1.123577425180784e-06, "loss": 0.8487, "step": 2698 }, { "epoch": 4.661485319516408, "grad_norm": 0.09256343126232901, "learning_rate": 1.1122525260408978e-06, "loss": 0.8396, "step": 2699 }, { "epoch": 4.66321243523316, "grad_norm": 0.1036338569502869, "learning_rate": 1.1009841852893221e-06, "loss": 0.8404, "step": 2700 }, { "epoch": 4.664939550949914, "grad_norm": 0.2745817400094403, "learning_rate": 1.0897724193147608e-06, "loss": 0.8668, "step": 2701 }, { "epoch": 4.666666666666667, "grad_norm": 0.09260180153028272, "learning_rate": 1.0786172444236143e-06, "loss": 0.8531, "step": 2702 }, { "epoch": 4.668393782383419, "grad_norm": 0.09489659055737745, "learning_rate": 1.0675186768399803e-06, "loss": 0.8505, "step": 2703 }, { "epoch": 4.670120898100173, "grad_norm": 0.22079923096638945, "learning_rate": 1.0564767327056357e-06, "loss": 0.8591, "step": 2704 }, { "epoch": 4.671848013816926, "grad_norm": 0.0974896435400368, "learning_rate": 1.0454914280799966e-06, "loss": 0.8691, "step": 2705 }, { "epoch": 4.6735751295336785, "grad_norm": 0.10063144254449459, "learning_rate": 1.034562778940096e-06, "loss": 0.8471, "step": 2706 }, { "epoch": 4.675302245250432, "grad_norm": 0.09557030331747161, "learning_rate": 1.0236908011805834e-06, "loss": 0.8406, "step": 2707 }, { "epoch": 4.677029360967185, "grad_norm": 0.0960717200075178, "learning_rate": 1.0128755106136734e-06, "loss": 0.8385, "step": 2708 }, { "epoch": 4.678756476683938, "grad_norm": 0.09095013078535409, "learning_rate": 1.0021169229691386e-06, "loss": 0.8471, "step": 2709 }, { "epoch": 4.680483592400691, "grad_norm": 0.0977162531843797, "learning_rate": 9.914150538942803e-07, "loss": 0.8393, "step": 2710 }, { "epoch": 4.682210708117444, "grad_norm": 0.09448678443961032, "learning_rate": 9.807699189539145e-07, "loss": 0.8632, "step": 2711 }, { "epoch": 4.683937823834197, "grad_norm": 0.09451020281539635, "learning_rate": 9.701815336303366e-07, "loss": 0.8471, "step": 2712 }, { "epoch": 4.68566493955095, "grad_norm": 0.09189539727378304, "learning_rate": 9.59649913323304e-07, "loss": 0.8631, "step": 2713 }, { "epoch": 4.687392055267703, "grad_norm": 0.09850064789248306, "learning_rate": 9.491750733500216e-07, "loss": 0.8584, "step": 2714 }, { "epoch": 4.689119170984456, "grad_norm": 0.09698028312256785, "learning_rate": 9.38757028945112e-07, "loss": 0.8679, "step": 2715 }, { "epoch": 4.690846286701209, "grad_norm": 0.09188804869056239, "learning_rate": 9.283957952605882e-07, "loss": 0.8525, "step": 2716 }, { "epoch": 4.692573402417962, "grad_norm": 0.09947626187495687, "learning_rate": 9.180913873658403e-07, "loss": 0.8634, "step": 2717 }, { "epoch": 4.694300518134715, "grad_norm": 0.10226751289633622, "learning_rate": 9.078438202476181e-07, "loss": 0.8451, "step": 2718 }, { "epoch": 4.696027633851468, "grad_norm": 0.10268772297890706, "learning_rate": 8.976531088099904e-07, "loss": 0.8414, "step": 2719 }, { "epoch": 4.697754749568221, "grad_norm": 0.09793159365021188, "learning_rate": 8.875192678743416e-07, "loss": 0.8485, "step": 2720 }, { "epoch": 4.699481865284974, "grad_norm": 0.09715227061683834, "learning_rate": 8.774423121793441e-07, "loss": 0.8478, "step": 2721 }, { "epoch": 4.701208981001727, "grad_norm": 0.0997694334955947, "learning_rate": 8.67422256380932e-07, "loss": 0.8504, "step": 2722 }, { "epoch": 4.70293609671848, "grad_norm": 0.10422175499458215, "learning_rate": 8.574591150522926e-07, "loss": 0.8416, "step": 2723 }, { "epoch": 4.704663212435233, "grad_norm": 0.09753003901889372, "learning_rate": 8.475529026838259e-07, "loss": 0.8796, "step": 2724 }, { "epoch": 4.706390328151986, "grad_norm": 0.09590354078051178, "learning_rate": 8.377036336831401e-07, "loss": 0.8525, "step": 2725 }, { "epoch": 4.708117443868739, "grad_norm": 0.1035125888538228, "learning_rate": 8.279113223750257e-07, "loss": 0.8649, "step": 2726 }, { "epoch": 4.709844559585492, "grad_norm": 0.09693771394920324, "learning_rate": 8.18175983001428e-07, "loss": 0.8652, "step": 2727 }, { "epoch": 4.711571675302245, "grad_norm": 0.08845334382827706, "learning_rate": 8.084976297214475e-07, "loss": 0.8551, "step": 2728 }, { "epoch": 4.713298791018998, "grad_norm": 0.09408777377765781, "learning_rate": 7.988762766112823e-07, "loss": 0.8605, "step": 2729 }, { "epoch": 4.715025906735751, "grad_norm": 0.0939373303159284, "learning_rate": 7.893119376642456e-07, "loss": 0.8591, "step": 2730 }, { "epoch": 4.7167530224525045, "grad_norm": 0.10581479133788914, "learning_rate": 7.798046267907211e-07, "loss": 0.8704, "step": 2731 }, { "epoch": 4.718480138169257, "grad_norm": 0.0927936684586258, "learning_rate": 7.703543578181594e-07, "loss": 0.8673, "step": 2732 }, { "epoch": 4.72020725388601, "grad_norm": 0.09374178744427873, "learning_rate": 7.609611444910325e-07, "loss": 0.833, "step": 2733 }, { "epoch": 4.721934369602764, "grad_norm": 0.08780951244245118, "learning_rate": 7.516250004708525e-07, "loss": 0.8519, "step": 2734 }, { "epoch": 4.723661485319516, "grad_norm": 0.09000933880079445, "learning_rate": 7.423459393361132e-07, "loss": 0.8524, "step": 2735 }, { "epoch": 4.725388601036269, "grad_norm": 0.09362298859661679, "learning_rate": 7.331239745822949e-07, "loss": 0.8418, "step": 2736 }, { "epoch": 4.727115716753023, "grad_norm": 0.09455088839364428, "learning_rate": 7.239591196218332e-07, "loss": 0.8414, "step": 2737 }, { "epoch": 4.728842832469775, "grad_norm": 0.08879069866958764, "learning_rate": 7.148513877841057e-07, "loss": 0.8421, "step": 2738 }, { "epoch": 4.730569948186528, "grad_norm": 0.10313157175293555, "learning_rate": 7.058007923154053e-07, "loss": 0.847, "step": 2739 }, { "epoch": 4.732297063903282, "grad_norm": 0.08842119639001808, "learning_rate": 6.96807346378936e-07, "loss": 0.8496, "step": 2740 }, { "epoch": 4.734024179620034, "grad_norm": 0.10046855924390541, "learning_rate": 6.87871063054777e-07, "loss": 0.8503, "step": 2741 }, { "epoch": 4.7357512953367875, "grad_norm": 0.09350826388983248, "learning_rate": 6.789919553398606e-07, "loss": 0.8515, "step": 2742 }, { "epoch": 4.737478411053541, "grad_norm": 0.0958381508876051, "learning_rate": 6.701700361479769e-07, "loss": 0.8395, "step": 2743 }, { "epoch": 4.739205526770293, "grad_norm": 0.09389986163973855, "learning_rate": 6.61405318309738e-07, "loss": 0.8596, "step": 2744 }, { "epoch": 4.740932642487047, "grad_norm": 0.08790121381975073, "learning_rate": 6.526978145725604e-07, "loss": 0.8767, "step": 2745 }, { "epoch": 4.7426597582038, "grad_norm": 0.09391564131625872, "learning_rate": 6.440475376006428e-07, "loss": 0.8336, "step": 2746 }, { "epoch": 4.744386873920552, "grad_norm": 0.09698707241797223, "learning_rate": 6.354544999749701e-07, "loss": 0.8453, "step": 2747 }, { "epoch": 4.746113989637306, "grad_norm": 0.09859657254475768, "learning_rate": 6.269187141932565e-07, "loss": 0.8444, "step": 2748 }, { "epoch": 4.747841105354059, "grad_norm": 0.09209951164174894, "learning_rate": 6.184401926699579e-07, "loss": 0.8437, "step": 2749 }, { "epoch": 4.7495682210708114, "grad_norm": 0.10284811676639138, "learning_rate": 6.100189477362595e-07, "loss": 0.8625, "step": 2750 }, { "epoch": 4.751295336787565, "grad_norm": 0.08944095953319249, "learning_rate": 6.016549916400216e-07, "loss": 0.8351, "step": 2751 }, { "epoch": 4.753022452504318, "grad_norm": 0.09438298378947962, "learning_rate": 5.933483365457893e-07, "loss": 0.842, "step": 2752 }, { "epoch": 4.7547495682210705, "grad_norm": 0.09114858676600175, "learning_rate": 5.850989945347785e-07, "loss": 0.8244, "step": 2753 }, { "epoch": 4.756476683937824, "grad_norm": 0.09367864607063814, "learning_rate": 5.769069776048409e-07, "loss": 0.8546, "step": 2754 }, { "epoch": 4.758203799654577, "grad_norm": 0.09577092174974595, "learning_rate": 5.687722976704546e-07, "loss": 0.8372, "step": 2755 }, { "epoch": 4.75993091537133, "grad_norm": 0.09308048133813021, "learning_rate": 5.606949665627027e-07, "loss": 0.8275, "step": 2756 }, { "epoch": 4.761658031088083, "grad_norm": 0.08809798975089624, "learning_rate": 5.52674996029281e-07, "loss": 0.8516, "step": 2757 }, { "epoch": 4.763385146804836, "grad_norm": 0.09262400902700962, "learning_rate": 5.447123977344326e-07, "loss": 0.8476, "step": 2758 }, { "epoch": 4.765112262521589, "grad_norm": 0.09104709593386791, "learning_rate": 5.368071832589694e-07, "loss": 0.8635, "step": 2759 }, { "epoch": 4.766839378238342, "grad_norm": 0.0900639951066174, "learning_rate": 5.289593641002544e-07, "loss": 0.8693, "step": 2760 }, { "epoch": 4.768566493955095, "grad_norm": 0.09811334717032297, "learning_rate": 5.211689516721574e-07, "loss": 0.8627, "step": 2761 }, { "epoch": 4.770293609671848, "grad_norm": 0.0962059745199304, "learning_rate": 5.13435957305064e-07, "loss": 0.834, "step": 2762 }, { "epoch": 4.772020725388601, "grad_norm": 0.09461756319225181, "learning_rate": 5.057603922458575e-07, "loss": 0.8576, "step": 2763 }, { "epoch": 4.773747841105354, "grad_norm": 0.0927794599833669, "learning_rate": 4.98142267657884e-07, "loss": 0.8422, "step": 2764 }, { "epoch": 4.775474956822107, "grad_norm": 0.0875821376153362, "learning_rate": 4.905815946209558e-07, "loss": 0.8606, "step": 2765 }, { "epoch": 4.77720207253886, "grad_norm": 0.09283989056768052, "learning_rate": 4.830783841313214e-07, "loss": 0.8622, "step": 2766 }, { "epoch": 4.7789291882556135, "grad_norm": 0.09688974658976082, "learning_rate": 4.756326471016648e-07, "loss": 0.843, "step": 2767 }, { "epoch": 4.780656303972366, "grad_norm": 0.08902593971733165, "learning_rate": 4.6824439436107037e-07, "loss": 0.8395, "step": 2768 }, { "epoch": 4.782383419689119, "grad_norm": 0.0891373944039902, "learning_rate": 4.6091363665502707e-07, "loss": 0.8297, "step": 2769 }, { "epoch": 4.784110535405873, "grad_norm": 0.09825015547398312, "learning_rate": 4.5364038464539294e-07, "loss": 0.8474, "step": 2770 }, { "epoch": 4.785837651122625, "grad_norm": 0.09442926232333387, "learning_rate": 4.4642464891039516e-07, "loss": 0.8363, "step": 2771 }, { "epoch": 4.787564766839378, "grad_norm": 0.09258866776190736, "learning_rate": 4.3926643994461224e-07, "loss": 0.8428, "step": 2772 }, { "epoch": 4.789291882556132, "grad_norm": 0.08651464282555502, "learning_rate": 4.3216576815895197e-07, "loss": 0.8413, "step": 2773 }, { "epoch": 4.791018998272884, "grad_norm": 0.09957463361931047, "learning_rate": 4.2512264388063775e-07, "loss": 0.8576, "step": 2774 }, { "epoch": 4.7927461139896375, "grad_norm": 0.09377969010793659, "learning_rate": 4.181370773532001e-07, "loss": 0.827, "step": 2775 }, { "epoch": 4.794473229706391, "grad_norm": 0.08811860418106035, "learning_rate": 4.112090787364542e-07, "loss": 0.8387, "step": 2776 }, { "epoch": 4.796200345423143, "grad_norm": 0.08910017094927838, "learning_rate": 4.043386581064912e-07, "loss": 0.8472, "step": 2777 }, { "epoch": 4.7979274611398965, "grad_norm": 0.09451292809003459, "learning_rate": 3.975258254556602e-07, "loss": 0.8633, "step": 2778 }, { "epoch": 4.79965457685665, "grad_norm": 0.09194489865966443, "learning_rate": 3.907705906925463e-07, "loss": 0.8451, "step": 2779 }, { "epoch": 4.801381692573402, "grad_norm": 0.08526777425050146, "learning_rate": 3.8407296364198377e-07, "loss": 0.8493, "step": 2780 }, { "epoch": 4.803108808290156, "grad_norm": 0.09596823744323062, "learning_rate": 3.774329540450028e-07, "loss": 0.8455, "step": 2781 }, { "epoch": 4.804835924006909, "grad_norm": 0.14255715463015814, "learning_rate": 3.70850571558834e-07, "loss": 0.8403, "step": 2782 }, { "epoch": 4.806563039723661, "grad_norm": 0.08461333536333171, "learning_rate": 3.643258257569171e-07, "loss": 0.8324, "step": 2783 }, { "epoch": 4.808290155440415, "grad_norm": 0.088114228611236, "learning_rate": 3.5785872612884355e-07, "loss": 0.8566, "step": 2784 }, { "epoch": 4.810017271157167, "grad_norm": 0.10474412163568952, "learning_rate": 3.5144928208036943e-07, "loss": 0.8504, "step": 2785 }, { "epoch": 4.8117443868739205, "grad_norm": 0.10034207290501165, "learning_rate": 3.450975029334025e-07, "loss": 0.8601, "step": 2786 }, { "epoch": 4.813471502590674, "grad_norm": 0.0951134619048099, "learning_rate": 3.388033979259753e-07, "loss": 0.835, "step": 2787 }, { "epoch": 4.815198618307426, "grad_norm": 0.0923995362452693, "learning_rate": 3.325669762122452e-07, "loss": 0.8456, "step": 2788 }, { "epoch": 4.81692573402418, "grad_norm": 0.09160155434332358, "learning_rate": 3.263882468624635e-07, "loss": 0.8487, "step": 2789 }, { "epoch": 4.818652849740933, "grad_norm": 0.08892791837935488, "learning_rate": 3.2026721886299297e-07, "loss": 0.8528, "step": 2790 }, { "epoch": 4.820379965457685, "grad_norm": 0.0867670151861978, "learning_rate": 3.1420390111625454e-07, "loss": 0.8621, "step": 2791 }, { "epoch": 4.822107081174439, "grad_norm": 0.08555437652495226, "learning_rate": 3.081983024407498e-07, "loss": 0.8503, "step": 2792 }, { "epoch": 4.823834196891192, "grad_norm": 0.09323733469575224, "learning_rate": 3.0225043157103396e-07, "loss": 0.8656, "step": 2793 }, { "epoch": 4.825561312607944, "grad_norm": 0.0962170742031377, "learning_rate": 2.9636029715768954e-07, "loss": 0.8311, "step": 2794 }, { "epoch": 4.827288428324698, "grad_norm": 0.09185152598992025, "learning_rate": 2.9052790776733506e-07, "loss": 0.8488, "step": 2795 }, { "epoch": 4.829015544041451, "grad_norm": 0.08824824404137159, "learning_rate": 2.8475327188261184e-07, "loss": 0.8358, "step": 2796 }, { "epoch": 4.8307426597582035, "grad_norm": 0.08515591337181791, "learning_rate": 2.7903639790215266e-07, "loss": 0.8602, "step": 2797 }, { "epoch": 4.832469775474957, "grad_norm": 0.3038775532524218, "learning_rate": 2.7337729414058657e-07, "loss": 0.8584, "step": 2798 }, { "epoch": 4.83419689119171, "grad_norm": 0.09094546406582815, "learning_rate": 2.6777596882852085e-07, "loss": 0.8483, "step": 2799 }, { "epoch": 4.835924006908463, "grad_norm": 0.09132967402122893, "learning_rate": 2.622324301125323e-07, "loss": 0.8583, "step": 2800 }, { "epoch": 4.837651122625216, "grad_norm": 0.09085418018967611, "learning_rate": 2.5674668605514927e-07, "loss": 0.8531, "step": 2801 }, { "epoch": 4.839378238341969, "grad_norm": 0.09284578886673736, "learning_rate": 2.513187446348431e-07, "loss": 0.8446, "step": 2802 }, { "epoch": 4.841105354058722, "grad_norm": 0.09000384497477217, "learning_rate": 2.459486137460232e-07, "loss": 0.8661, "step": 2803 }, { "epoch": 4.842832469775475, "grad_norm": 0.08757914464699851, "learning_rate": 2.406363011990109e-07, "loss": 0.8485, "step": 2804 }, { "epoch": 4.844559585492228, "grad_norm": 0.08793892668430878, "learning_rate": 2.3538181472003928e-07, "loss": 0.8414, "step": 2805 }, { "epoch": 4.846286701208981, "grad_norm": 0.08639253680222128, "learning_rate": 2.3018516195124407e-07, "loss": 0.8347, "step": 2806 }, { "epoch": 4.848013816925734, "grad_norm": 0.09445777562113172, "learning_rate": 2.2504635045063727e-07, "loss": 0.8358, "step": 2807 }, { "epoch": 4.849740932642487, "grad_norm": 0.10472052781523, "learning_rate": 2.1996538769211594e-07, "loss": 0.8556, "step": 2808 }, { "epoch": 4.85146804835924, "grad_norm": 0.08249427795192067, "learning_rate": 2.1494228106544445e-07, "loss": 0.8386, "step": 2809 }, { "epoch": 4.853195164075993, "grad_norm": 0.09055779386078898, "learning_rate": 2.099770378762278e-07, "loss": 0.8495, "step": 2810 }, { "epoch": 4.8549222797927465, "grad_norm": 0.09388773977607132, "learning_rate": 2.0506966534592497e-07, "loss": 0.8582, "step": 2811 }, { "epoch": 4.856649395509499, "grad_norm": 0.09438508224537305, "learning_rate": 2.0022017061181786e-07, "loss": 0.8546, "step": 2812 }, { "epoch": 4.858376511226252, "grad_norm": 0.08648032182451555, "learning_rate": 1.9542856072702899e-07, "loss": 0.8523, "step": 2813 }, { "epoch": 4.860103626943005, "grad_norm": 0.08996762075209852, "learning_rate": 1.906948426604771e-07, "loss": 0.8522, "step": 2814 }, { "epoch": 4.861830742659758, "grad_norm": 0.0876413175864079, "learning_rate": 1.860190232968817e-07, "loss": 0.8477, "step": 2815 }, { "epoch": 4.863557858376511, "grad_norm": 0.08893274818651377, "learning_rate": 1.814011094367718e-07, "loss": 0.8626, "step": 2816 }, { "epoch": 4.865284974093264, "grad_norm": 0.08724369460034267, "learning_rate": 1.7684110779643716e-07, "loss": 0.8388, "step": 2817 }, { "epoch": 4.867012089810017, "grad_norm": 0.08522554464553989, "learning_rate": 1.7233902500795486e-07, "loss": 0.8324, "step": 2818 }, { "epoch": 4.86873920552677, "grad_norm": 0.0977636090953524, "learning_rate": 1.6789486761915385e-07, "loss": 0.8281, "step": 2819 }, { "epoch": 4.870466321243523, "grad_norm": 0.08981379657781019, "learning_rate": 1.635086420936327e-07, "loss": 0.8445, "step": 2820 }, { "epoch": 4.872193436960276, "grad_norm": 0.09037288891578224, "learning_rate": 1.5918035481071515e-07, "loss": 0.8563, "step": 2821 }, { "epoch": 4.8739205526770295, "grad_norm": 0.09280275706818167, "learning_rate": 1.5491001206546786e-07, "loss": 0.8418, "step": 2822 }, { "epoch": 4.875647668393782, "grad_norm": 0.08941696431411064, "learning_rate": 1.5069762006868716e-07, "loss": 0.8688, "step": 2823 }, { "epoch": 4.877374784110535, "grad_norm": 0.09057848474516217, "learning_rate": 1.465431849468768e-07, "loss": 0.8253, "step": 2824 }, { "epoch": 4.879101899827289, "grad_norm": 0.08780517645206588, "learning_rate": 1.4244671274225242e-07, "loss": 0.8364, "step": 2825 }, { "epoch": 4.880829015544041, "grad_norm": 0.09933620131837967, "learning_rate": 1.3840820941273257e-07, "loss": 0.8504, "step": 2826 }, { "epoch": 4.882556131260794, "grad_norm": 0.08934555535825034, "learning_rate": 1.344276808319167e-07, "loss": 0.835, "step": 2827 }, { "epoch": 4.884283246977548, "grad_norm": 0.088346574760994, "learning_rate": 1.3050513278909382e-07, "loss": 0.8852, "step": 2828 }, { "epoch": 4.8860103626943, "grad_norm": 0.0897018855881674, "learning_rate": 1.2664057098922045e-07, "loss": 0.8437, "step": 2829 }, { "epoch": 4.887737478411053, "grad_norm": 0.08768312947748612, "learning_rate": 1.2283400105292498e-07, "loss": 0.8719, "step": 2830 }, { "epoch": 4.889464594127807, "grad_norm": 0.09048994640140583, "learning_rate": 1.1908542851647665e-07, "loss": 0.8702, "step": 2831 }, { "epoch": 4.891191709844559, "grad_norm": 0.10137528452356696, "learning_rate": 1.1539485883181655e-07, "loss": 0.8504, "step": 2832 }, { "epoch": 4.8929188255613125, "grad_norm": 0.09163711013022365, "learning_rate": 1.117622973665089e-07, "loss": 0.8455, "step": 2833 }, { "epoch": 4.894645941278066, "grad_norm": 0.08849936802763414, "learning_rate": 1.0818774940374976e-07, "loss": 0.8572, "step": 2834 }, { "epoch": 4.896373056994818, "grad_norm": 0.08724823720998617, "learning_rate": 1.0467122014237608e-07, "loss": 0.8539, "step": 2835 }, { "epoch": 4.898100172711572, "grad_norm": 0.09023885703737043, "learning_rate": 1.0121271469682559e-07, "loss": 0.8519, "step": 2836 }, { "epoch": 4.899827288428325, "grad_norm": 0.0936372205088587, "learning_rate": 9.781223809715467e-08, "loss": 0.8618, "step": 2837 }, { "epoch": 4.901554404145077, "grad_norm": 0.09064179461360028, "learning_rate": 9.4469795289025e-08, "loss": 0.8619, "step": 2838 }, { "epoch": 4.903281519861831, "grad_norm": 0.08994435249303727, "learning_rate": 9.11853911336813e-08, "loss": 0.8553, "step": 2839 }, { "epoch": 4.905008635578584, "grad_norm": 0.08707991073572492, "learning_rate": 8.79590304079736e-08, "loss": 0.8338, "step": 2840 }, { "epoch": 4.9067357512953365, "grad_norm": 0.09030051871771638, "learning_rate": 8.479071780431725e-08, "loss": 0.8516, "step": 2841 }, { "epoch": 4.90846286701209, "grad_norm": 0.08440845144094093, "learning_rate": 8.168045793071511e-08, "loss": 0.8429, "step": 2842 }, { "epoch": 4.910189982728843, "grad_norm": 0.487314424330009, "learning_rate": 7.862825531073093e-08, "loss": 0.8456, "step": 2843 }, { "epoch": 4.9119170984455955, "grad_norm": 0.09835886267240837, "learning_rate": 7.563411438348933e-08, "loss": 0.8522, "step": 2844 }, { "epoch": 4.913644214162349, "grad_norm": 0.09772168652138687, "learning_rate": 7.269803950368026e-08, "loss": 0.8667, "step": 2845 }, { "epoch": 4.915371329879102, "grad_norm": 0.0949560211385369, "learning_rate": 6.982003494152789e-08, "loss": 0.8524, "step": 2846 }, { "epoch": 4.917098445595855, "grad_norm": 0.08841219169796942, "learning_rate": 6.700010488280839e-08, "loss": 0.8531, "step": 2847 }, { "epoch": 4.918825561312608, "grad_norm": 0.08797381845371956, "learning_rate": 6.423825342883216e-08, "loss": 0.8445, "step": 2848 }, { "epoch": 4.920552677029361, "grad_norm": 0.08969778914169148, "learning_rate": 6.15344845964394e-08, "loss": 0.857, "step": 2849 }, { "epoch": 4.922279792746114, "grad_norm": 0.0862103843986873, "learning_rate": 5.888880231799121e-08, "loss": 0.8535, "step": 2850 }, { "epoch": 4.924006908462867, "grad_norm": 0.08762595694134329, "learning_rate": 5.630121044137404e-08, "loss": 0.8417, "step": 2851 }, { "epoch": 4.92573402417962, "grad_norm": 0.087084655814445, "learning_rate": 5.377171272998638e-08, "loss": 0.8578, "step": 2852 }, { "epoch": 4.927461139896373, "grad_norm": 0.08869710105083903, "learning_rate": 5.1300312862729854e-08, "loss": 0.8271, "step": 2853 }, { "epoch": 4.929188255613126, "grad_norm": 0.08768653546957852, "learning_rate": 4.888701443400923e-08, "loss": 0.8592, "step": 2854 }, { "epoch": 4.9309153713298794, "grad_norm": 0.08428112834730216, "learning_rate": 4.6531820953736875e-08, "loss": 0.857, "step": 2855 }, { "epoch": 4.932642487046632, "grad_norm": 0.08736444016333945, "learning_rate": 4.4234735847301646e-08, "loss": 0.8542, "step": 2856 }, { "epoch": 4.934369602763385, "grad_norm": 0.08638999702529569, "learning_rate": 4.1995762455591114e-08, "loss": 0.8574, "step": 2857 }, { "epoch": 4.9360967184801385, "grad_norm": 0.09145204563345018, "learning_rate": 3.981490403496935e-08, "loss": 0.8582, "step": 2858 }, { "epoch": 4.937823834196891, "grad_norm": 0.09307205069826392, "learning_rate": 3.769216375728135e-08, "loss": 0.8645, "step": 2859 }, { "epoch": 4.939550949913644, "grad_norm": 0.0923221257172985, "learning_rate": 3.5627544709839755e-08, "loss": 0.8472, "step": 2860 }, { "epoch": 4.941278065630398, "grad_norm": 0.09274000787254869, "learning_rate": 3.3621049895429246e-08, "loss": 0.8599, "step": 2861 }, { "epoch": 4.94300518134715, "grad_norm": 0.08880607344572344, "learning_rate": 3.1672682232302134e-08, "loss": 0.8602, "step": 2862 }, { "epoch": 4.944732297063903, "grad_norm": 0.09253414132896749, "learning_rate": 2.9782444554165013e-08, "loss": 0.8453, "step": 2863 }, { "epoch": 4.946459412780657, "grad_norm": 0.08833141126573281, "learning_rate": 2.795033961017879e-08, "loss": 0.8389, "step": 2864 }, { "epoch": 4.948186528497409, "grad_norm": 0.08970910748213977, "learning_rate": 2.6176370064967537e-08, "loss": 0.8636, "step": 2865 }, { "epoch": 4.9499136442141625, "grad_norm": 0.09230321196826917, "learning_rate": 2.4460538498582987e-08, "loss": 0.8665, "step": 2866 }, { "epoch": 4.951640759930916, "grad_norm": 0.0885082843223843, "learning_rate": 2.280284740654448e-08, "loss": 0.8599, "step": 2867 }, { "epoch": 4.953367875647668, "grad_norm": 0.08543608307552009, "learning_rate": 2.120329919979014e-08, "loss": 0.8334, "step": 2868 }, { "epoch": 4.9550949913644216, "grad_norm": 0.08975108597509476, "learning_rate": 1.9661896204712373e-08, "loss": 0.8501, "step": 2869 }, { "epoch": 4.956822107081175, "grad_norm": 0.09102755059730211, "learning_rate": 1.81786406631268e-08, "loss": 0.8586, "step": 2870 }, { "epoch": 4.958549222797927, "grad_norm": 0.08550066396916416, "learning_rate": 1.6753534732276698e-08, "loss": 0.8398, "step": 2871 }, { "epoch": 4.960276338514681, "grad_norm": 0.09066869693514265, "learning_rate": 1.5386580484846314e-08, "loss": 0.8517, "step": 2872 }, { "epoch": 4.962003454231434, "grad_norm": 0.14159067613570886, "learning_rate": 1.4077779908934219e-08, "loss": 0.8633, "step": 2873 }, { "epoch": 4.963730569948186, "grad_norm": 0.08622578128304675, "learning_rate": 1.282713490805776e-08, "loss": 0.8524, "step": 2874 }, { "epoch": 4.96545768566494, "grad_norm": 0.08533948627646179, "learning_rate": 1.1634647301157487e-08, "loss": 0.8543, "step": 2875 }, { "epoch": 4.967184801381693, "grad_norm": 0.08514049802954736, "learning_rate": 1.0500318822588285e-08, "loss": 0.8298, "step": 2876 }, { "epoch": 4.9689119170984455, "grad_norm": 0.088290409735351, "learning_rate": 9.424151122123803e-09, "loss": 0.8569, "step": 2877 }, { "epoch": 4.970639032815199, "grad_norm": 0.08746768688006866, "learning_rate": 8.4061457649387e-09, "loss": 0.8605, "step": 2878 }, { "epoch": 4.972366148531952, "grad_norm": 0.08917129104840091, "learning_rate": 7.4463042316264e-09, "loss": 0.863, "step": 2879 }, { "epoch": 4.974093264248705, "grad_norm": 0.0853603651239903, "learning_rate": 6.544627918172452e-09, "loss": 0.8521, "step": 2880 }, { "epoch": 4.975820379965458, "grad_norm": 0.09005358665320454, "learning_rate": 5.701118135985617e-09, "loss": 0.8705, "step": 2881 }, { "epoch": 4.97754749568221, "grad_norm": 0.0898410220572187, "learning_rate": 4.915776111862336e-09, "loss": 0.8635, "step": 2882 }, { "epoch": 4.979274611398964, "grad_norm": 0.08768368681695879, "learning_rate": 4.188602988008939e-09, "loss": 0.855, "step": 2883 }, { "epoch": 4.981001727115717, "grad_norm": 0.09183163726263269, "learning_rate": 3.519599822019437e-09, "loss": 0.8459, "step": 2884 }, { "epoch": 4.982728842832469, "grad_norm": 0.09116845023331276, "learning_rate": 2.9087675869021725e-09, "loss": 0.8343, "step": 2885 }, { "epoch": 4.984455958549223, "grad_norm": 0.08661879751718919, "learning_rate": 2.3561071710442862e-09, "loss": 0.8353, "step": 2886 }, { "epoch": 4.986183074265976, "grad_norm": 0.09428529104672133, "learning_rate": 1.8616193782383662e-09, "loss": 0.852, "step": 2887 }, { "epoch": 4.9879101899827285, "grad_norm": 0.08556088551551068, "learning_rate": 1.4253049276735654e-09, "loss": 0.8499, "step": 2888 }, { "epoch": 4.989637305699482, "grad_norm": 0.08735667661533125, "learning_rate": 1.0471644539178373e-09, "loss": 0.8408, "step": 2889 }, { "epoch": 4.991364421416235, "grad_norm": 0.09012234424025771, "learning_rate": 7.27198506944582e-10, "loss": 0.8498, "step": 2890 }, { "epoch": 4.993091537132988, "grad_norm": 0.08482766047891142, "learning_rate": 4.654075521104418e-10, "loss": 0.8591, "step": 2891 }, { "epoch": 4.994818652849741, "grad_norm": 0.08903487800793554, "learning_rate": 2.617919701686233e-10, "loss": 0.8431, "step": 2892 }, { "epoch": 4.996545768566494, "grad_norm": 0.08898016466757822, "learning_rate": 1.1635205725113452e-10, "loss": 0.8564, "step": 2893 }, { "epoch": 4.998272884283247, "grad_norm": 0.08786492951387836, "learning_rate": 2.9088024890988607e-11, "loss": 0.8376, "step": 2894 }, { "epoch": 5.0, "grad_norm": 0.08915190458373352, "learning_rate": 0.0, "loss": 0.8376, "step": 2895 }, { "epoch": 5.0, "step": 2895, "total_flos": 3.975437533368759e+19, "train_loss": 0.9977235792214388, "train_runtime": 110165.0044, "train_samples_per_second": 13.452, "train_steps_per_second": 0.026 } ], "logging_steps": 1.0, "max_steps": 2895, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.975437533368759e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }