{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 27.0, "eval_steps": 500, "global_step": 4347, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006230529595015576, "grad_norm": 21.625, "learning_rate": 0.0, "loss": 4.4544, "step": 1 }, { "epoch": 0.012461059190031152, "grad_norm": 21.75, "learning_rate": 1.5527950310559006e-07, "loss": 4.4809, "step": 2 }, { "epoch": 0.018691588785046728, "grad_norm": 20.5, "learning_rate": 3.1055900621118013e-07, "loss": 4.3941, "step": 3 }, { "epoch": 0.024922118380062305, "grad_norm": 20.5, "learning_rate": 4.6583850931677024e-07, "loss": 4.4228, "step": 4 }, { "epoch": 0.03115264797507788, "grad_norm": 20.875, "learning_rate": 6.211180124223603e-07, "loss": 4.3986, "step": 5 }, { "epoch": 0.037383177570093455, "grad_norm": 21.375, "learning_rate": 7.763975155279503e-07, "loss": 4.5342, "step": 6 }, { "epoch": 0.04361370716510903, "grad_norm": 21.375, "learning_rate": 9.316770186335405e-07, "loss": 4.4011, "step": 7 }, { "epoch": 0.04984423676012461, "grad_norm": 21.0, "learning_rate": 1.0869565217391306e-06, "loss": 4.4421, "step": 8 }, { "epoch": 0.056074766355140186, "grad_norm": 20.875, "learning_rate": 1.2422360248447205e-06, "loss": 4.4386, "step": 9 }, { "epoch": 0.06230529595015576, "grad_norm": 21.0, "learning_rate": 1.3975155279503107e-06, "loss": 4.4903, "step": 10 }, { "epoch": 0.06853582554517133, "grad_norm": 20.875, "learning_rate": 1.5527950310559006e-06, "loss": 4.4575, "step": 11 }, { "epoch": 0.07476635514018691, "grad_norm": 21.125, "learning_rate": 1.7080745341614908e-06, "loss": 4.5027, "step": 12 }, { "epoch": 0.08099688473520249, "grad_norm": 21.0, "learning_rate": 1.863354037267081e-06, "loss": 4.4064, "step": 13 }, { "epoch": 0.08722741433021806, "grad_norm": 20.125, "learning_rate": 2.018633540372671e-06, "loss": 4.4408, "step": 14 }, { "epoch": 0.09345794392523364, "grad_norm": 21.375, "learning_rate": 2.173913043478261e-06, "loss": 4.4542, "step": 15 }, { "epoch": 0.09968847352024922, "grad_norm": 21.5, "learning_rate": 2.329192546583851e-06, "loss": 4.4183, "step": 16 }, { "epoch": 0.1059190031152648, "grad_norm": 20.625, "learning_rate": 2.484472049689441e-06, "loss": 4.2817, "step": 17 }, { "epoch": 0.11214953271028037, "grad_norm": 20.375, "learning_rate": 2.639751552795031e-06, "loss": 4.4335, "step": 18 }, { "epoch": 0.11838006230529595, "grad_norm": 20.875, "learning_rate": 2.7950310559006214e-06, "loss": 4.3935, "step": 19 }, { "epoch": 0.12461059190031153, "grad_norm": 20.5, "learning_rate": 2.9503105590062115e-06, "loss": 4.3969, "step": 20 }, { "epoch": 0.1308411214953271, "grad_norm": 20.25, "learning_rate": 3.1055900621118013e-06, "loss": 4.3285, "step": 21 }, { "epoch": 0.13707165109034267, "grad_norm": 20.375, "learning_rate": 3.2608695652173914e-06, "loss": 4.2989, "step": 22 }, { "epoch": 0.14330218068535824, "grad_norm": 20.875, "learning_rate": 3.4161490683229816e-06, "loss": 4.3591, "step": 23 }, { "epoch": 0.14953271028037382, "grad_norm": 20.625, "learning_rate": 3.5714285714285714e-06, "loss": 4.3529, "step": 24 }, { "epoch": 0.1557632398753894, "grad_norm": 20.375, "learning_rate": 3.726708074534162e-06, "loss": 4.3015, "step": 25 }, { "epoch": 0.16199376947040497, "grad_norm": 20.5, "learning_rate": 3.881987577639752e-06, "loss": 4.2567, "step": 26 }, { "epoch": 0.16822429906542055, "grad_norm": 20.625, "learning_rate": 4.037267080745342e-06, "loss": 4.3496, "step": 27 }, { "epoch": 0.17445482866043613, "grad_norm": 20.125, "learning_rate": 4.192546583850932e-06, "loss": 4.29, "step": 28 }, { "epoch": 0.1806853582554517, "grad_norm": 20.75, "learning_rate": 4.347826086956522e-06, "loss": 4.2808, "step": 29 }, { "epoch": 0.18691588785046728, "grad_norm": 19.125, "learning_rate": 4.503105590062112e-06, "loss": 4.1188, "step": 30 }, { "epoch": 0.19314641744548286, "grad_norm": 19.375, "learning_rate": 4.658385093167702e-06, "loss": 4.1089, "step": 31 }, { "epoch": 0.19937694704049844, "grad_norm": 19.125, "learning_rate": 4.813664596273292e-06, "loss": 4.0274, "step": 32 }, { "epoch": 0.205607476635514, "grad_norm": 19.375, "learning_rate": 4.968944099378882e-06, "loss": 4.0535, "step": 33 }, { "epoch": 0.2118380062305296, "grad_norm": 18.25, "learning_rate": 5.124223602484472e-06, "loss": 4.0217, "step": 34 }, { "epoch": 0.21806853582554517, "grad_norm": 19.125, "learning_rate": 5.279503105590062e-06, "loss": 4.124, "step": 35 }, { "epoch": 0.22429906542056074, "grad_norm": 19.25, "learning_rate": 5.4347826086956525e-06, "loss": 4.1087, "step": 36 }, { "epoch": 0.23052959501557632, "grad_norm": 18.5, "learning_rate": 5.590062111801243e-06, "loss": 3.9983, "step": 37 }, { "epoch": 0.2367601246105919, "grad_norm": 17.625, "learning_rate": 5.745341614906832e-06, "loss": 3.8408, "step": 38 }, { "epoch": 0.24299065420560748, "grad_norm": 17.625, "learning_rate": 5.900621118012423e-06, "loss": 3.8556, "step": 39 }, { "epoch": 0.24922118380062305, "grad_norm": 18.0, "learning_rate": 6.055900621118013e-06, "loss": 3.8305, "step": 40 }, { "epoch": 0.2554517133956386, "grad_norm": 17.125, "learning_rate": 6.2111801242236025e-06, "loss": 3.7877, "step": 41 }, { "epoch": 0.2616822429906542, "grad_norm": 17.125, "learning_rate": 6.366459627329193e-06, "loss": 3.8079, "step": 42 }, { "epoch": 0.26791277258566976, "grad_norm": 16.5, "learning_rate": 6.521739130434783e-06, "loss": 3.7283, "step": 43 }, { "epoch": 0.27414330218068533, "grad_norm": 15.5625, "learning_rate": 6.677018633540373e-06, "loss": 3.6548, "step": 44 }, { "epoch": 0.2803738317757009, "grad_norm": 16.5, "learning_rate": 6.832298136645963e-06, "loss": 3.7493, "step": 45 }, { "epoch": 0.2866043613707165, "grad_norm": 15.1875, "learning_rate": 6.9875776397515525e-06, "loss": 3.621, "step": 46 }, { "epoch": 0.29283489096573206, "grad_norm": 15.75, "learning_rate": 7.142857142857143e-06, "loss": 3.6482, "step": 47 }, { "epoch": 0.29906542056074764, "grad_norm": 16.125, "learning_rate": 7.298136645962733e-06, "loss": 3.534, "step": 48 }, { "epoch": 0.3052959501557632, "grad_norm": 15.4375, "learning_rate": 7.453416149068324e-06, "loss": 3.5081, "step": 49 }, { "epoch": 0.3115264797507788, "grad_norm": 14.6875, "learning_rate": 7.608695652173914e-06, "loss": 3.4946, "step": 50 }, { "epoch": 0.3177570093457944, "grad_norm": 13.875, "learning_rate": 7.763975155279503e-06, "loss": 3.328, "step": 51 }, { "epoch": 0.32398753894080995, "grad_norm": 13.9375, "learning_rate": 7.919254658385093e-06, "loss": 3.3846, "step": 52 }, { "epoch": 0.3302180685358255, "grad_norm": 12.875, "learning_rate": 8.074534161490684e-06, "loss": 3.2337, "step": 53 }, { "epoch": 0.3364485981308411, "grad_norm": 12.8125, "learning_rate": 8.229813664596275e-06, "loss": 3.1147, "step": 54 }, { "epoch": 0.3426791277258567, "grad_norm": 12.0625, "learning_rate": 8.385093167701864e-06, "loss": 3.1117, "step": 55 }, { "epoch": 0.34890965732087226, "grad_norm": 11.4375, "learning_rate": 8.540372670807453e-06, "loss": 3.0921, "step": 56 }, { "epoch": 0.35514018691588783, "grad_norm": 11.625, "learning_rate": 8.695652173913044e-06, "loss": 3.038, "step": 57 }, { "epoch": 0.3613707165109034, "grad_norm": 10.5625, "learning_rate": 8.850931677018634e-06, "loss": 2.9558, "step": 58 }, { "epoch": 0.367601246105919, "grad_norm": 8.9375, "learning_rate": 9.006211180124225e-06, "loss": 2.9024, "step": 59 }, { "epoch": 0.37383177570093457, "grad_norm": 9.0, "learning_rate": 9.161490683229814e-06, "loss": 2.8925, "step": 60 }, { "epoch": 0.38006230529595014, "grad_norm": 8.125, "learning_rate": 9.316770186335403e-06, "loss": 2.8254, "step": 61 }, { "epoch": 0.3862928348909657, "grad_norm": 7.65625, "learning_rate": 9.472049689440994e-06, "loss": 2.7155, "step": 62 }, { "epoch": 0.3925233644859813, "grad_norm": 6.75, "learning_rate": 9.627329192546584e-06, "loss": 2.71, "step": 63 }, { "epoch": 0.3987538940809969, "grad_norm": 6.125, "learning_rate": 9.782608695652175e-06, "loss": 2.6395, "step": 64 }, { "epoch": 0.40498442367601245, "grad_norm": 6.59375, "learning_rate": 9.937888198757764e-06, "loss": 2.4913, "step": 65 }, { "epoch": 0.411214953271028, "grad_norm": 5.34375, "learning_rate": 1.0093167701863353e-05, "loss": 2.6048, "step": 66 }, { "epoch": 0.4174454828660436, "grad_norm": 5.1875, "learning_rate": 1.0248447204968944e-05, "loss": 2.6173, "step": 67 }, { "epoch": 0.4236760124610592, "grad_norm": 5.25, "learning_rate": 1.0403726708074535e-05, "loss": 2.599, "step": 68 }, { "epoch": 0.42990654205607476, "grad_norm": 4.25, "learning_rate": 1.0559006211180125e-05, "loss": 2.4109, "step": 69 }, { "epoch": 0.43613707165109034, "grad_norm": 4.0625, "learning_rate": 1.0714285714285714e-05, "loss": 2.4219, "step": 70 }, { "epoch": 0.4423676012461059, "grad_norm": 4.09375, "learning_rate": 1.0869565217391305e-05, "loss": 2.5139, "step": 71 }, { "epoch": 0.4485981308411215, "grad_norm": 3.65625, "learning_rate": 1.1024844720496894e-05, "loss": 2.3694, "step": 72 }, { "epoch": 0.45482866043613707, "grad_norm": 3.6875, "learning_rate": 1.1180124223602485e-05, "loss": 2.4366, "step": 73 }, { "epoch": 0.46105919003115264, "grad_norm": 3.421875, "learning_rate": 1.1335403726708076e-05, "loss": 2.3894, "step": 74 }, { "epoch": 0.4672897196261682, "grad_norm": 3.484375, "learning_rate": 1.1490683229813664e-05, "loss": 2.3738, "step": 75 }, { "epoch": 0.4735202492211838, "grad_norm": 3.21875, "learning_rate": 1.1645962732919255e-05, "loss": 2.3221, "step": 76 }, { "epoch": 0.4797507788161994, "grad_norm": 3.328125, "learning_rate": 1.1801242236024846e-05, "loss": 2.2973, "step": 77 }, { "epoch": 0.48598130841121495, "grad_norm": 3.5, "learning_rate": 1.1956521739130435e-05, "loss": 2.2083, "step": 78 }, { "epoch": 0.49221183800623053, "grad_norm": 3.265625, "learning_rate": 1.2111801242236026e-05, "loss": 2.3984, "step": 79 }, { "epoch": 0.4984423676012461, "grad_norm": 4.0625, "learning_rate": 1.2267080745341616e-05, "loss": 2.4198, "step": 80 }, { "epoch": 0.5046728971962616, "grad_norm": 4.1875, "learning_rate": 1.2422360248447205e-05, "loss": 2.4435, "step": 81 }, { "epoch": 0.5109034267912772, "grad_norm": 3.421875, "learning_rate": 1.2577639751552794e-05, "loss": 2.3175, "step": 82 }, { "epoch": 0.5171339563862928, "grad_norm": 3.65625, "learning_rate": 1.2732919254658385e-05, "loss": 2.415, "step": 83 }, { "epoch": 0.5233644859813084, "grad_norm": 3.53125, "learning_rate": 1.2888198757763975e-05, "loss": 2.4788, "step": 84 }, { "epoch": 0.5295950155763239, "grad_norm": 3.546875, "learning_rate": 1.3043478260869566e-05, "loss": 2.4395, "step": 85 }, { "epoch": 0.5358255451713395, "grad_norm": 3.171875, "learning_rate": 1.3198757763975155e-05, "loss": 2.4031, "step": 86 }, { "epoch": 0.5420560747663551, "grad_norm": 3.078125, "learning_rate": 1.3354037267080746e-05, "loss": 2.3965, "step": 87 }, { "epoch": 0.5482866043613707, "grad_norm": 4.1875, "learning_rate": 1.3509316770186337e-05, "loss": 2.2563, "step": 88 }, { "epoch": 0.5545171339563862, "grad_norm": 3.453125, "learning_rate": 1.3664596273291926e-05, "loss": 2.3873, "step": 89 }, { "epoch": 0.5607476635514018, "grad_norm": 3.484375, "learning_rate": 1.3819875776397517e-05, "loss": 2.4077, "step": 90 }, { "epoch": 0.5669781931464174, "grad_norm": 3.53125, "learning_rate": 1.3975155279503105e-05, "loss": 2.3796, "step": 91 }, { "epoch": 0.573208722741433, "grad_norm": 3.34375, "learning_rate": 1.4130434782608694e-05, "loss": 2.3295, "step": 92 }, { "epoch": 0.5794392523364486, "grad_norm": 2.984375, "learning_rate": 1.4285714285714285e-05, "loss": 2.3338, "step": 93 }, { "epoch": 0.5856697819314641, "grad_norm": 3.15625, "learning_rate": 1.4440993788819876e-05, "loss": 2.3082, "step": 94 }, { "epoch": 0.5919003115264797, "grad_norm": 3.328125, "learning_rate": 1.4596273291925466e-05, "loss": 2.3224, "step": 95 }, { "epoch": 0.5981308411214953, "grad_norm": 3.375, "learning_rate": 1.4751552795031057e-05, "loss": 2.3802, "step": 96 }, { "epoch": 0.6043613707165109, "grad_norm": 3.203125, "learning_rate": 1.4906832298136648e-05, "loss": 2.4522, "step": 97 }, { "epoch": 0.6105919003115264, "grad_norm": 3.453125, "learning_rate": 1.5062111801242237e-05, "loss": 2.3779, "step": 98 }, { "epoch": 0.616822429906542, "grad_norm": 2.84375, "learning_rate": 1.5217391304347828e-05, "loss": 2.3352, "step": 99 }, { "epoch": 0.6230529595015576, "grad_norm": 2.859375, "learning_rate": 1.537267080745342e-05, "loss": 2.4483, "step": 100 }, { "epoch": 0.6292834890965732, "grad_norm": 2.96875, "learning_rate": 1.5527950310559007e-05, "loss": 2.3519, "step": 101 }, { "epoch": 0.6355140186915887, "grad_norm": 4.0625, "learning_rate": 1.5683229813664594e-05, "loss": 2.2655, "step": 102 }, { "epoch": 0.6417445482866043, "grad_norm": 3.109375, "learning_rate": 1.5838509316770185e-05, "loss": 2.3304, "step": 103 }, { "epoch": 0.6479750778816199, "grad_norm": 3.140625, "learning_rate": 1.5993788819875776e-05, "loss": 2.3793, "step": 104 }, { "epoch": 0.6542056074766355, "grad_norm": 3.28125, "learning_rate": 1.6149068322981367e-05, "loss": 2.241, "step": 105 }, { "epoch": 0.660436137071651, "grad_norm": 2.96875, "learning_rate": 1.630434782608696e-05, "loss": 2.3329, "step": 106 }, { "epoch": 0.6666666666666666, "grad_norm": 3.09375, "learning_rate": 1.645962732919255e-05, "loss": 2.3001, "step": 107 }, { "epoch": 0.6728971962616822, "grad_norm": 2.875, "learning_rate": 1.6614906832298137e-05, "loss": 2.3532, "step": 108 }, { "epoch": 0.6791277258566978, "grad_norm": 3.21875, "learning_rate": 1.6770186335403728e-05, "loss": 2.3358, "step": 109 }, { "epoch": 0.6853582554517134, "grad_norm": 3.0625, "learning_rate": 1.6925465838509316e-05, "loss": 2.4115, "step": 110 }, { "epoch": 0.6915887850467289, "grad_norm": 2.90625, "learning_rate": 1.7080745341614907e-05, "loss": 2.2682, "step": 111 }, { "epoch": 0.6978193146417445, "grad_norm": 3.046875, "learning_rate": 1.7236024844720498e-05, "loss": 2.3385, "step": 112 }, { "epoch": 0.7040498442367601, "grad_norm": 3.890625, "learning_rate": 1.739130434782609e-05, "loss": 2.4165, "step": 113 }, { "epoch": 0.7102803738317757, "grad_norm": 3.203125, "learning_rate": 1.7546583850931676e-05, "loss": 2.3243, "step": 114 }, { "epoch": 0.7165109034267912, "grad_norm": 3.015625, "learning_rate": 1.7701863354037267e-05, "loss": 2.375, "step": 115 }, { "epoch": 0.7227414330218068, "grad_norm": 3.0, "learning_rate": 1.785714285714286e-05, "loss": 2.3254, "step": 116 }, { "epoch": 0.7289719626168224, "grad_norm": 3.328125, "learning_rate": 1.801242236024845e-05, "loss": 2.37, "step": 117 }, { "epoch": 0.735202492211838, "grad_norm": 3.46875, "learning_rate": 1.816770186335404e-05, "loss": 2.3857, "step": 118 }, { "epoch": 0.7414330218068536, "grad_norm": 3.34375, "learning_rate": 1.8322981366459628e-05, "loss": 2.3181, "step": 119 }, { "epoch": 0.7476635514018691, "grad_norm": 3.296875, "learning_rate": 1.8478260869565216e-05, "loss": 2.2694, "step": 120 }, { "epoch": 0.7538940809968847, "grad_norm": 3.046875, "learning_rate": 1.8633540372670807e-05, "loss": 2.3064, "step": 121 }, { "epoch": 0.7601246105919003, "grad_norm": 3.46875, "learning_rate": 1.8788819875776398e-05, "loss": 2.3837, "step": 122 }, { "epoch": 0.7663551401869159, "grad_norm": 3.203125, "learning_rate": 1.894409937888199e-05, "loss": 2.3447, "step": 123 }, { "epoch": 0.7725856697819314, "grad_norm": 2.828125, "learning_rate": 1.909937888198758e-05, "loss": 2.2986, "step": 124 }, { "epoch": 0.778816199376947, "grad_norm": 3.390625, "learning_rate": 1.9254658385093167e-05, "loss": 2.3389, "step": 125 }, { "epoch": 0.7850467289719626, "grad_norm": 2.875, "learning_rate": 1.940993788819876e-05, "loss": 2.3204, "step": 126 }, { "epoch": 0.7912772585669782, "grad_norm": 3.3125, "learning_rate": 1.956521739130435e-05, "loss": 2.279, "step": 127 }, { "epoch": 0.7975077881619937, "grad_norm": 2.84375, "learning_rate": 1.972049689440994e-05, "loss": 2.3519, "step": 128 }, { "epoch": 0.8037383177570093, "grad_norm": 3.375, "learning_rate": 1.9875776397515528e-05, "loss": 2.3865, "step": 129 }, { "epoch": 0.8099688473520249, "grad_norm": 3.140625, "learning_rate": 2.003105590062112e-05, "loss": 2.3323, "step": 130 }, { "epoch": 0.8161993769470405, "grad_norm": 3.453125, "learning_rate": 2.0186335403726707e-05, "loss": 2.2402, "step": 131 }, { "epoch": 0.822429906542056, "grad_norm": 3.140625, "learning_rate": 2.0341614906832298e-05, "loss": 2.3353, "step": 132 }, { "epoch": 0.8286604361370716, "grad_norm": 3.078125, "learning_rate": 2.049689440993789e-05, "loss": 2.2439, "step": 133 }, { "epoch": 0.8348909657320872, "grad_norm": 3.078125, "learning_rate": 2.065217391304348e-05, "loss": 2.3234, "step": 134 }, { "epoch": 0.8411214953271028, "grad_norm": 3.640625, "learning_rate": 2.080745341614907e-05, "loss": 2.4313, "step": 135 }, { "epoch": 0.8473520249221184, "grad_norm": 3.5, "learning_rate": 2.096273291925466e-05, "loss": 2.3773, "step": 136 }, { "epoch": 0.8535825545171339, "grad_norm": 2.96875, "learning_rate": 2.111801242236025e-05, "loss": 2.3734, "step": 137 }, { "epoch": 0.8598130841121495, "grad_norm": 3.3125, "learning_rate": 2.127329192546584e-05, "loss": 2.2644, "step": 138 }, { "epoch": 0.8660436137071651, "grad_norm": 2.75, "learning_rate": 2.1428571428571428e-05, "loss": 2.2843, "step": 139 }, { "epoch": 0.8722741433021807, "grad_norm": 2.578125, "learning_rate": 2.158385093167702e-05, "loss": 2.3232, "step": 140 }, { "epoch": 0.8785046728971962, "grad_norm": 3.203125, "learning_rate": 2.173913043478261e-05, "loss": 2.2752, "step": 141 }, { "epoch": 0.8847352024922118, "grad_norm": 2.765625, "learning_rate": 2.1894409937888198e-05, "loss": 2.3752, "step": 142 }, { "epoch": 0.8909657320872274, "grad_norm": 2.9375, "learning_rate": 2.204968944099379e-05, "loss": 2.3247, "step": 143 }, { "epoch": 0.897196261682243, "grad_norm": 2.765625, "learning_rate": 2.220496894409938e-05, "loss": 2.3056, "step": 144 }, { "epoch": 0.9034267912772586, "grad_norm": 2.953125, "learning_rate": 2.236024844720497e-05, "loss": 2.3318, "step": 145 }, { "epoch": 0.9096573208722741, "grad_norm": 3.265625, "learning_rate": 2.2515527950310562e-05, "loss": 2.2385, "step": 146 }, { "epoch": 0.9158878504672897, "grad_norm": 2.859375, "learning_rate": 2.2670807453416153e-05, "loss": 2.2781, "step": 147 }, { "epoch": 0.9221183800623053, "grad_norm": 3.203125, "learning_rate": 2.282608695652174e-05, "loss": 2.267, "step": 148 }, { "epoch": 0.9283489096573209, "grad_norm": 3.109375, "learning_rate": 2.2981366459627328e-05, "loss": 2.2643, "step": 149 }, { "epoch": 0.9345794392523364, "grad_norm": 2.984375, "learning_rate": 2.313664596273292e-05, "loss": 2.3215, "step": 150 }, { "epoch": 0.940809968847352, "grad_norm": 3.03125, "learning_rate": 2.329192546583851e-05, "loss": 2.3077, "step": 151 }, { "epoch": 0.9470404984423676, "grad_norm": 3.171875, "learning_rate": 2.34472049689441e-05, "loss": 2.3486, "step": 152 }, { "epoch": 0.9532710280373832, "grad_norm": 3.078125, "learning_rate": 2.3602484472049692e-05, "loss": 2.2695, "step": 153 }, { "epoch": 0.9595015576323987, "grad_norm": 2.734375, "learning_rate": 2.375776397515528e-05, "loss": 2.383, "step": 154 }, { "epoch": 0.9657320872274143, "grad_norm": 3.46875, "learning_rate": 2.391304347826087e-05, "loss": 2.155, "step": 155 }, { "epoch": 0.9719626168224299, "grad_norm": 3.125, "learning_rate": 2.4068322981366462e-05, "loss": 2.1467, "step": 156 }, { "epoch": 0.9781931464174455, "grad_norm": 3.1875, "learning_rate": 2.4223602484472053e-05, "loss": 2.2839, "step": 157 }, { "epoch": 0.9844236760124611, "grad_norm": 3.109375, "learning_rate": 2.437888198757764e-05, "loss": 2.3656, "step": 158 }, { "epoch": 0.9906542056074766, "grad_norm": 3.0625, "learning_rate": 2.453416149068323e-05, "loss": 2.1615, "step": 159 }, { "epoch": 0.9968847352024922, "grad_norm": 3.046875, "learning_rate": 2.468944099378882e-05, "loss": 2.1974, "step": 160 }, { "epoch": 1.0, "grad_norm": 2.5, "learning_rate": 2.484472049689441e-05, "loss": 1.1735, "step": 161 }, { "epoch": 1.0062305295950156, "grad_norm": 3.859375, "learning_rate": 2.5e-05, "loss": 2.379, "step": 162 }, { "epoch": 1.0124610591900312, "grad_norm": 2.953125, "learning_rate": 2.515527950310559e-05, "loss": 2.2372, "step": 163 }, { "epoch": 1.0186915887850467, "grad_norm": 2.765625, "learning_rate": 2.5310559006211183e-05, "loss": 2.3076, "step": 164 }, { "epoch": 1.0249221183800623, "grad_norm": 3.3125, "learning_rate": 2.546583850931677e-05, "loss": 2.2718, "step": 165 }, { "epoch": 1.0311526479750779, "grad_norm": 2.8125, "learning_rate": 2.5621118012422362e-05, "loss": 2.2971, "step": 166 }, { "epoch": 1.0373831775700935, "grad_norm": 3.515625, "learning_rate": 2.577639751552795e-05, "loss": 2.2479, "step": 167 }, { "epoch": 1.043613707165109, "grad_norm": 3.703125, "learning_rate": 2.5931677018633544e-05, "loss": 2.2062, "step": 168 }, { "epoch": 1.0498442367601246, "grad_norm": 2.953125, "learning_rate": 2.608695652173913e-05, "loss": 2.2874, "step": 169 }, { "epoch": 1.0560747663551402, "grad_norm": 2.953125, "learning_rate": 2.6242236024844723e-05, "loss": 2.2644, "step": 170 }, { "epoch": 1.0623052959501558, "grad_norm": 2.765625, "learning_rate": 2.639751552795031e-05, "loss": 2.2658, "step": 171 }, { "epoch": 1.0685358255451713, "grad_norm": 4.0, "learning_rate": 2.6552795031055898e-05, "loss": 2.1962, "step": 172 }, { "epoch": 1.074766355140187, "grad_norm": 3.09375, "learning_rate": 2.6708074534161492e-05, "loss": 2.1297, "step": 173 }, { "epoch": 1.0809968847352025, "grad_norm": 2.796875, "learning_rate": 2.686335403726708e-05, "loss": 2.2106, "step": 174 }, { "epoch": 1.087227414330218, "grad_norm": 5.21875, "learning_rate": 2.7018633540372674e-05, "loss": 2.2945, "step": 175 }, { "epoch": 1.0934579439252337, "grad_norm": 4.375, "learning_rate": 2.7173913043478262e-05, "loss": 2.2794, "step": 176 }, { "epoch": 1.0996884735202492, "grad_norm": 3.6875, "learning_rate": 2.7329192546583853e-05, "loss": 2.3223, "step": 177 }, { "epoch": 1.1059190031152648, "grad_norm": 3.234375, "learning_rate": 2.748447204968944e-05, "loss": 2.1913, "step": 178 }, { "epoch": 1.1121495327102804, "grad_norm": 3.359375, "learning_rate": 2.7639751552795035e-05, "loss": 2.2562, "step": 179 }, { "epoch": 1.118380062305296, "grad_norm": 3.34375, "learning_rate": 2.7795031055900623e-05, "loss": 2.2384, "step": 180 }, { "epoch": 1.1246105919003115, "grad_norm": 3.40625, "learning_rate": 2.795031055900621e-05, "loss": 2.1845, "step": 181 }, { "epoch": 1.1308411214953271, "grad_norm": 4.1875, "learning_rate": 2.8105590062111805e-05, "loss": 2.1903, "step": 182 }, { "epoch": 1.1370716510903427, "grad_norm": 3.046875, "learning_rate": 2.826086956521739e-05, "loss": 2.2417, "step": 183 }, { "epoch": 1.1433021806853583, "grad_norm": 3.484375, "learning_rate": 2.8416149068322983e-05, "loss": 2.2426, "step": 184 }, { "epoch": 1.1495327102803738, "grad_norm": 3.15625, "learning_rate": 2.857142857142857e-05, "loss": 2.1961, "step": 185 }, { "epoch": 1.1557632398753894, "grad_norm": 3.59375, "learning_rate": 2.8726708074534165e-05, "loss": 2.2203, "step": 186 }, { "epoch": 1.161993769470405, "grad_norm": 4.46875, "learning_rate": 2.8881987577639753e-05, "loss": 2.2275, "step": 187 }, { "epoch": 1.1682242990654206, "grad_norm": 3.453125, "learning_rate": 2.9037267080745344e-05, "loss": 2.201, "step": 188 }, { "epoch": 1.1744548286604362, "grad_norm": 3.328125, "learning_rate": 2.919254658385093e-05, "loss": 2.2425, "step": 189 }, { "epoch": 1.1806853582554517, "grad_norm": 3.53125, "learning_rate": 2.9347826086956526e-05, "loss": 2.1958, "step": 190 }, { "epoch": 1.1869158878504673, "grad_norm": 4.03125, "learning_rate": 2.9503105590062114e-05, "loss": 2.1724, "step": 191 }, { "epoch": 1.1931464174454829, "grad_norm": 3.1875, "learning_rate": 2.96583850931677e-05, "loss": 2.1394, "step": 192 }, { "epoch": 1.1993769470404985, "grad_norm": 2.859375, "learning_rate": 2.9813664596273296e-05, "loss": 2.2307, "step": 193 }, { "epoch": 1.205607476635514, "grad_norm": 2.8125, "learning_rate": 2.9968944099378883e-05, "loss": 2.2231, "step": 194 }, { "epoch": 1.2118380062305296, "grad_norm": 3.515625, "learning_rate": 3.0124223602484474e-05, "loss": 2.1515, "step": 195 }, { "epoch": 1.2180685358255452, "grad_norm": 3.078125, "learning_rate": 3.0279503105590062e-05, "loss": 2.1269, "step": 196 }, { "epoch": 1.2242990654205608, "grad_norm": 3.6875, "learning_rate": 3.0434782608695656e-05, "loss": 2.1719, "step": 197 }, { "epoch": 1.2305295950155763, "grad_norm": 3.046875, "learning_rate": 3.059006211180124e-05, "loss": 2.1608, "step": 198 }, { "epoch": 1.236760124610592, "grad_norm": 3.0625, "learning_rate": 3.074534161490684e-05, "loss": 2.162, "step": 199 }, { "epoch": 1.2429906542056075, "grad_norm": 3.3125, "learning_rate": 3.090062111801242e-05, "loss": 2.2085, "step": 200 }, { "epoch": 1.249221183800623, "grad_norm": 2.890625, "learning_rate": 3.1055900621118014e-05, "loss": 2.1744, "step": 201 }, { "epoch": 1.2554517133956387, "grad_norm": 3.359375, "learning_rate": 3.1211180124223605e-05, "loss": 2.1267, "step": 202 }, { "epoch": 1.2616822429906542, "grad_norm": 3.328125, "learning_rate": 3.136645962732919e-05, "loss": 2.1804, "step": 203 }, { "epoch": 1.2679127725856698, "grad_norm": 3.859375, "learning_rate": 3.152173913043479e-05, "loss": 2.1822, "step": 204 }, { "epoch": 1.2741433021806854, "grad_norm": 3.046875, "learning_rate": 3.167701863354037e-05, "loss": 2.1019, "step": 205 }, { "epoch": 1.280373831775701, "grad_norm": 3.125, "learning_rate": 3.183229813664597e-05, "loss": 2.1302, "step": 206 }, { "epoch": 1.2866043613707165, "grad_norm": 3.03125, "learning_rate": 3.198757763975155e-05, "loss": 2.1363, "step": 207 }, { "epoch": 1.2928348909657321, "grad_norm": 2.84375, "learning_rate": 3.2142857142857144e-05, "loss": 2.0888, "step": 208 }, { "epoch": 1.2990654205607477, "grad_norm": 2.84375, "learning_rate": 3.2298136645962735e-05, "loss": 2.1384, "step": 209 }, { "epoch": 1.3052959501557633, "grad_norm": 3.15625, "learning_rate": 3.245341614906832e-05, "loss": 2.1225, "step": 210 }, { "epoch": 1.3115264797507789, "grad_norm": 3.21875, "learning_rate": 3.260869565217392e-05, "loss": 2.1774, "step": 211 }, { "epoch": 1.3177570093457944, "grad_norm": 3.109375, "learning_rate": 3.27639751552795e-05, "loss": 2.1079, "step": 212 }, { "epoch": 1.32398753894081, "grad_norm": 3.046875, "learning_rate": 3.29192546583851e-05, "loss": 2.1944, "step": 213 }, { "epoch": 1.3302180685358256, "grad_norm": 3.09375, "learning_rate": 3.307453416149068e-05, "loss": 2.2024, "step": 214 }, { "epoch": 1.3364485981308412, "grad_norm": 2.84375, "learning_rate": 3.3229813664596274e-05, "loss": 2.1546, "step": 215 }, { "epoch": 1.3426791277258567, "grad_norm": 3.046875, "learning_rate": 3.3385093167701865e-05, "loss": 2.1006, "step": 216 }, { "epoch": 1.3489096573208723, "grad_norm": 2.859375, "learning_rate": 3.3540372670807456e-05, "loss": 2.1005, "step": 217 }, { "epoch": 1.355140186915888, "grad_norm": 3.4375, "learning_rate": 3.369565217391305e-05, "loss": 2.1224, "step": 218 }, { "epoch": 1.3613707165109035, "grad_norm": 2.984375, "learning_rate": 3.385093167701863e-05, "loss": 2.2004, "step": 219 }, { "epoch": 1.367601246105919, "grad_norm": 2.765625, "learning_rate": 3.400621118012422e-05, "loss": 2.1711, "step": 220 }, { "epoch": 1.3738317757009346, "grad_norm": 3.390625, "learning_rate": 3.4161490683229814e-05, "loss": 2.1242, "step": 221 }, { "epoch": 1.3800623052959502, "grad_norm": 3.546875, "learning_rate": 3.4316770186335405e-05, "loss": 2.1552, "step": 222 }, { "epoch": 1.3862928348909658, "grad_norm": 2.65625, "learning_rate": 3.4472049689440996e-05, "loss": 2.1365, "step": 223 }, { "epoch": 1.3925233644859814, "grad_norm": 3.96875, "learning_rate": 3.462732919254659e-05, "loss": 2.085, "step": 224 }, { "epoch": 1.398753894080997, "grad_norm": 3.203125, "learning_rate": 3.478260869565218e-05, "loss": 2.1696, "step": 225 }, { "epoch": 1.4049844236760125, "grad_norm": 3.375, "learning_rate": 3.493788819875777e-05, "loss": 2.1544, "step": 226 }, { "epoch": 1.411214953271028, "grad_norm": 3.25, "learning_rate": 3.509316770186335e-05, "loss": 2.161, "step": 227 }, { "epoch": 1.4174454828660437, "grad_norm": 3.25, "learning_rate": 3.524844720496895e-05, "loss": 2.2387, "step": 228 }, { "epoch": 1.4236760124610592, "grad_norm": 3.046875, "learning_rate": 3.5403726708074535e-05, "loss": 2.0604, "step": 229 }, { "epoch": 1.4299065420560748, "grad_norm": 3.09375, "learning_rate": 3.5559006211180126e-05, "loss": 2.0908, "step": 230 }, { "epoch": 1.4361370716510904, "grad_norm": 2.71875, "learning_rate": 3.571428571428572e-05, "loss": 2.0388, "step": 231 }, { "epoch": 1.442367601246106, "grad_norm": 3.078125, "learning_rate": 3.58695652173913e-05, "loss": 2.1781, "step": 232 }, { "epoch": 1.4485981308411215, "grad_norm": 3.484375, "learning_rate": 3.60248447204969e-05, "loss": 2.1808, "step": 233 }, { "epoch": 1.4548286604361371, "grad_norm": 3.34375, "learning_rate": 3.618012422360248e-05, "loss": 2.1055, "step": 234 }, { "epoch": 1.4610591900311527, "grad_norm": 3.5625, "learning_rate": 3.633540372670808e-05, "loss": 2.1293, "step": 235 }, { "epoch": 1.4672897196261683, "grad_norm": 2.84375, "learning_rate": 3.6490683229813665e-05, "loss": 2.1053, "step": 236 }, { "epoch": 1.4735202492211839, "grad_norm": 3.65625, "learning_rate": 3.6645962732919256e-05, "loss": 2.1123, "step": 237 }, { "epoch": 1.4797507788161994, "grad_norm": 4.28125, "learning_rate": 3.680124223602485e-05, "loss": 2.0667, "step": 238 }, { "epoch": 1.485981308411215, "grad_norm": 3.984375, "learning_rate": 3.695652173913043e-05, "loss": 2.0825, "step": 239 }, { "epoch": 1.4922118380062306, "grad_norm": 2.78125, "learning_rate": 3.711180124223603e-05, "loss": 2.0364, "step": 240 }, { "epoch": 1.4984423676012462, "grad_norm": 6.5, "learning_rate": 3.7267080745341614e-05, "loss": 2.1739, "step": 241 }, { "epoch": 1.5046728971962615, "grad_norm": 3.90625, "learning_rate": 3.742236024844721e-05, "loss": 2.0776, "step": 242 }, { "epoch": 1.5109034267912773, "grad_norm": 4.71875, "learning_rate": 3.7577639751552796e-05, "loss": 2.0737, "step": 243 }, { "epoch": 1.5171339563862927, "grad_norm": 4.15625, "learning_rate": 3.773291925465839e-05, "loss": 2.1228, "step": 244 }, { "epoch": 1.5233644859813085, "grad_norm": 4.78125, "learning_rate": 3.788819875776398e-05, "loss": 2.0697, "step": 245 }, { "epoch": 1.5295950155763238, "grad_norm": 4.84375, "learning_rate": 3.804347826086957e-05, "loss": 2.0748, "step": 246 }, { "epoch": 1.5358255451713396, "grad_norm": 3.125, "learning_rate": 3.819875776397516e-05, "loss": 2.046, "step": 247 }, { "epoch": 1.542056074766355, "grad_norm": 3.53125, "learning_rate": 3.8354037267080744e-05, "loss": 2.0953, "step": 248 }, { "epoch": 1.5482866043613708, "grad_norm": 3.9375, "learning_rate": 3.8509316770186335e-05, "loss": 2.074, "step": 249 }, { "epoch": 1.5545171339563861, "grad_norm": 2.609375, "learning_rate": 3.8664596273291926e-05, "loss": 2.1043, "step": 250 }, { "epoch": 1.560747663551402, "grad_norm": 3.09375, "learning_rate": 3.881987577639752e-05, "loss": 2.0944, "step": 251 }, { "epoch": 1.5669781931464173, "grad_norm": 3.046875, "learning_rate": 3.897515527950311e-05, "loss": 2.0783, "step": 252 }, { "epoch": 1.573208722741433, "grad_norm": 4.5, "learning_rate": 3.91304347826087e-05, "loss": 2.103, "step": 253 }, { "epoch": 1.5794392523364484, "grad_norm": 3.46875, "learning_rate": 3.928571428571429e-05, "loss": 2.0657, "step": 254 }, { "epoch": 1.5856697819314642, "grad_norm": 2.9375, "learning_rate": 3.944099378881988e-05, "loss": 2.0382, "step": 255 }, { "epoch": 1.5919003115264796, "grad_norm": 3.90625, "learning_rate": 3.9596273291925465e-05, "loss": 2.1199, "step": 256 }, { "epoch": 1.5981308411214954, "grad_norm": 3.5625, "learning_rate": 3.9751552795031056e-05, "loss": 2.0703, "step": 257 }, { "epoch": 1.6043613707165107, "grad_norm": 3.59375, "learning_rate": 3.990683229813665e-05, "loss": 2.0715, "step": 258 }, { "epoch": 1.6105919003115265, "grad_norm": 3.34375, "learning_rate": 4.006211180124224e-05, "loss": 2.0849, "step": 259 }, { "epoch": 1.616822429906542, "grad_norm": 2.765625, "learning_rate": 4.021739130434783e-05, "loss": 2.0223, "step": 260 }, { "epoch": 1.6230529595015577, "grad_norm": 4.9375, "learning_rate": 4.0372670807453414e-05, "loss": 2.0735, "step": 261 }, { "epoch": 1.629283489096573, "grad_norm": 3.796875, "learning_rate": 4.052795031055901e-05, "loss": 2.1129, "step": 262 }, { "epoch": 1.6355140186915889, "grad_norm": 2.890625, "learning_rate": 4.0683229813664596e-05, "loss": 2.0814, "step": 263 }, { "epoch": 1.6417445482866042, "grad_norm": 3.265625, "learning_rate": 4.0838509316770193e-05, "loss": 2.0845, "step": 264 }, { "epoch": 1.64797507788162, "grad_norm": 4.90625, "learning_rate": 4.099378881987578e-05, "loss": 2.071, "step": 265 }, { "epoch": 1.6542056074766354, "grad_norm": 5.09375, "learning_rate": 4.114906832298137e-05, "loss": 2.066, "step": 266 }, { "epoch": 1.6604361370716512, "grad_norm": 3.390625, "learning_rate": 4.130434782608696e-05, "loss": 2.1095, "step": 267 }, { "epoch": 1.6666666666666665, "grad_norm": 3.375, "learning_rate": 4.1459627329192544e-05, "loss": 2.1222, "step": 268 }, { "epoch": 1.6728971962616823, "grad_norm": 3.375, "learning_rate": 4.161490683229814e-05, "loss": 1.9753, "step": 269 }, { "epoch": 1.6791277258566977, "grad_norm": 2.859375, "learning_rate": 4.1770186335403726e-05, "loss": 1.9943, "step": 270 }, { "epoch": 1.6853582554517135, "grad_norm": 3.078125, "learning_rate": 4.192546583850932e-05, "loss": 2.0052, "step": 271 }, { "epoch": 1.6915887850467288, "grad_norm": 2.90625, "learning_rate": 4.208074534161491e-05, "loss": 2.0468, "step": 272 }, { "epoch": 1.6978193146417446, "grad_norm": 3.6875, "learning_rate": 4.22360248447205e-05, "loss": 2.0488, "step": 273 }, { "epoch": 1.70404984423676, "grad_norm": 3.390625, "learning_rate": 4.239130434782609e-05, "loss": 1.9979, "step": 274 }, { "epoch": 1.7102803738317758, "grad_norm": 5.125, "learning_rate": 4.254658385093168e-05, "loss": 2.082, "step": 275 }, { "epoch": 1.7165109034267911, "grad_norm": 4.6875, "learning_rate": 4.270186335403727e-05, "loss": 2.0598, "step": 276 }, { "epoch": 1.722741433021807, "grad_norm": 3.90625, "learning_rate": 4.2857142857142856e-05, "loss": 2.0573, "step": 277 }, { "epoch": 1.7289719626168223, "grad_norm": 3.109375, "learning_rate": 4.301242236024845e-05, "loss": 2.0267, "step": 278 }, { "epoch": 1.735202492211838, "grad_norm": 2.984375, "learning_rate": 4.316770186335404e-05, "loss": 2.1038, "step": 279 }, { "epoch": 1.7414330218068534, "grad_norm": 5.3125, "learning_rate": 4.332298136645963e-05, "loss": 2.0721, "step": 280 }, { "epoch": 1.7476635514018692, "grad_norm": 5.40625, "learning_rate": 4.347826086956522e-05, "loss": 2.0936, "step": 281 }, { "epoch": 1.7538940809968846, "grad_norm": 2.875, "learning_rate": 4.363354037267081e-05, "loss": 2.0286, "step": 282 }, { "epoch": 1.7601246105919004, "grad_norm": 4.125, "learning_rate": 4.3788819875776396e-05, "loss": 1.9836, "step": 283 }, { "epoch": 1.7663551401869158, "grad_norm": 5.96875, "learning_rate": 4.3944099378881993e-05, "loss": 2.0922, "step": 284 }, { "epoch": 1.7725856697819315, "grad_norm": 5.25, "learning_rate": 4.409937888198758e-05, "loss": 2.0774, "step": 285 }, { "epoch": 1.778816199376947, "grad_norm": 3.421875, "learning_rate": 4.425465838509317e-05, "loss": 1.9435, "step": 286 }, { "epoch": 1.7850467289719627, "grad_norm": 8.1875, "learning_rate": 4.440993788819876e-05, "loss": 2.0752, "step": 287 }, { "epoch": 1.791277258566978, "grad_norm": 7.75, "learning_rate": 4.456521739130435e-05, "loss": 2.0397, "step": 288 }, { "epoch": 1.7975077881619939, "grad_norm": 6.46875, "learning_rate": 4.472049689440994e-05, "loss": 1.9849, "step": 289 }, { "epoch": 1.8037383177570092, "grad_norm": 4.71875, "learning_rate": 4.4875776397515526e-05, "loss": 2.0359, "step": 290 }, { "epoch": 1.809968847352025, "grad_norm": 7.3125, "learning_rate": 4.5031055900621124e-05, "loss": 1.9925, "step": 291 }, { "epoch": 1.8161993769470404, "grad_norm": 7.59375, "learning_rate": 4.518633540372671e-05, "loss": 2.0804, "step": 292 }, { "epoch": 1.8224299065420562, "grad_norm": 7.59375, "learning_rate": 4.5341614906832306e-05, "loss": 2.0163, "step": 293 }, { "epoch": 1.8286604361370715, "grad_norm": 6.03125, "learning_rate": 4.549689440993789e-05, "loss": 2.0441, "step": 294 }, { "epoch": 1.8348909657320873, "grad_norm": 3.828125, "learning_rate": 4.565217391304348e-05, "loss": 2.0724, "step": 295 }, { "epoch": 1.8411214953271027, "grad_norm": 9.0, "learning_rate": 4.580745341614907e-05, "loss": 2.1115, "step": 296 }, { "epoch": 1.8473520249221185, "grad_norm": 11.625, "learning_rate": 4.5962732919254656e-05, "loss": 2.144, "step": 297 }, { "epoch": 1.8535825545171338, "grad_norm": 13.0, "learning_rate": 4.6118012422360254e-05, "loss": 2.0399, "step": 298 }, { "epoch": 1.8598130841121496, "grad_norm": 11.1875, "learning_rate": 4.627329192546584e-05, "loss": 2.0735, "step": 299 }, { "epoch": 1.866043613707165, "grad_norm": 3.25, "learning_rate": 4.642857142857143e-05, "loss": 2.0003, "step": 300 }, { "epoch": 1.8722741433021808, "grad_norm": 4.8125, "learning_rate": 4.658385093167702e-05, "loss": 1.9765, "step": 301 }, { "epoch": 1.8785046728971961, "grad_norm": 4.75, "learning_rate": 4.673913043478261e-05, "loss": 1.9863, "step": 302 }, { "epoch": 1.884735202492212, "grad_norm": 4.46875, "learning_rate": 4.68944099378882e-05, "loss": 1.9618, "step": 303 }, { "epoch": 1.8909657320872273, "grad_norm": 3.015625, "learning_rate": 4.7049689440993793e-05, "loss": 2.0481, "step": 304 }, { "epoch": 1.897196261682243, "grad_norm": 3.71875, "learning_rate": 4.7204968944099384e-05, "loss": 2.0122, "step": 305 }, { "epoch": 1.9034267912772584, "grad_norm": 7.375, "learning_rate": 4.736024844720497e-05, "loss": 2.0421, "step": 306 }, { "epoch": 1.9096573208722742, "grad_norm": 7.71875, "learning_rate": 4.751552795031056e-05, "loss": 2.0222, "step": 307 }, { "epoch": 1.9158878504672896, "grad_norm": 7.3125, "learning_rate": 4.767080745341615e-05, "loss": 1.9617, "step": 308 }, { "epoch": 1.9221183800623054, "grad_norm": 4.09375, "learning_rate": 4.782608695652174e-05, "loss": 2.0196, "step": 309 }, { "epoch": 1.9283489096573208, "grad_norm": 4.71875, "learning_rate": 4.798136645962733e-05, "loss": 2.0554, "step": 310 }, { "epoch": 1.9345794392523366, "grad_norm": 7.25, "learning_rate": 4.8136645962732924e-05, "loss": 1.9988, "step": 311 }, { "epoch": 1.940809968847352, "grad_norm": 6.34375, "learning_rate": 4.829192546583851e-05, "loss": 1.9776, "step": 312 }, { "epoch": 1.9470404984423677, "grad_norm": 4.96875, "learning_rate": 4.8447204968944106e-05, "loss": 1.9908, "step": 313 }, { "epoch": 1.953271028037383, "grad_norm": 3.234375, "learning_rate": 4.860248447204969e-05, "loss": 2.0315, "step": 314 }, { "epoch": 1.9595015576323989, "grad_norm": 4.84375, "learning_rate": 4.875776397515528e-05, "loss": 2.0451, "step": 315 }, { "epoch": 1.9657320872274142, "grad_norm": 7.09375, "learning_rate": 4.891304347826087e-05, "loss": 1.9959, "step": 316 }, { "epoch": 1.97196261682243, "grad_norm": 8.0, "learning_rate": 4.906832298136646e-05, "loss": 2.0517, "step": 317 }, { "epoch": 1.9781931464174454, "grad_norm": 3.78125, "learning_rate": 4.9223602484472054e-05, "loss": 1.9954, "step": 318 }, { "epoch": 1.9844236760124612, "grad_norm": 3.578125, "learning_rate": 4.937888198757764e-05, "loss": 1.9955, "step": 319 }, { "epoch": 1.9906542056074765, "grad_norm": 5.65625, "learning_rate": 4.9534161490683236e-05, "loss": 1.9768, "step": 320 }, { "epoch": 1.9968847352024923, "grad_norm": 5.0625, "learning_rate": 4.968944099378882e-05, "loss": 1.9917, "step": 321 }, { "epoch": 2.0, "grad_norm": 3.1875, "learning_rate": 4.984472049689442e-05, "loss": 0.977, "step": 322 }, { "epoch": 2.0062305295950154, "grad_norm": 3.3125, "learning_rate": 5e-05, "loss": 1.9575, "step": 323 }, { "epoch": 2.012461059190031, "grad_norm": 3.796875, "learning_rate": 4.99999998785975e-05, "loss": 2.0026, "step": 324 }, { "epoch": 2.0186915887850465, "grad_norm": 6.5625, "learning_rate": 4.999999951439e-05, "loss": 2.0183, "step": 325 }, { "epoch": 2.0249221183800623, "grad_norm": 4.65625, "learning_rate": 4.999999890737752e-05, "loss": 2.018, "step": 326 }, { "epoch": 2.0311526479750777, "grad_norm": 3.53125, "learning_rate": 4.9999998057560046e-05, "loss": 1.9771, "step": 327 }, { "epoch": 2.0373831775700935, "grad_norm": 5.4375, "learning_rate": 4.999999696493759e-05, "loss": 2.0226, "step": 328 }, { "epoch": 2.043613707165109, "grad_norm": 5.9375, "learning_rate": 4.999999562951018e-05, "loss": 2.0068, "step": 329 }, { "epoch": 2.0498442367601246, "grad_norm": 5.40625, "learning_rate": 4.99999940512778e-05, "loss": 1.9871, "step": 330 }, { "epoch": 2.05607476635514, "grad_norm": 3.640625, "learning_rate": 4.9999992230240476e-05, "loss": 1.9577, "step": 331 }, { "epoch": 2.0623052959501558, "grad_norm": 5.78125, "learning_rate": 4.999999016639825e-05, "loss": 1.9738, "step": 332 }, { "epoch": 2.068535825545171, "grad_norm": 6.15625, "learning_rate": 4.9999987859751106e-05, "loss": 1.9128, "step": 333 }, { "epoch": 2.074766355140187, "grad_norm": 3.625, "learning_rate": 4.999998531029908e-05, "loss": 1.9824, "step": 334 }, { "epoch": 2.0809968847352023, "grad_norm": 3.71875, "learning_rate": 4.999998251804221e-05, "loss": 1.9932, "step": 335 }, { "epoch": 2.087227414330218, "grad_norm": 4.09375, "learning_rate": 4.999997948298051e-05, "loss": 1.9354, "step": 336 }, { "epoch": 2.0934579439252334, "grad_norm": 5.1875, "learning_rate": 4.999997620511401e-05, "loss": 1.9062, "step": 337 }, { "epoch": 2.0996884735202492, "grad_norm": 3.453125, "learning_rate": 4.999997268444275e-05, "loss": 1.9358, "step": 338 }, { "epoch": 2.1059190031152646, "grad_norm": 4.65625, "learning_rate": 4.999996892096675e-05, "loss": 2.0227, "step": 339 }, { "epoch": 2.1121495327102804, "grad_norm": 4.125, "learning_rate": 4.999996491468606e-05, "loss": 1.884, "step": 340 }, { "epoch": 2.1183800623052957, "grad_norm": 3.859375, "learning_rate": 4.999996066560071e-05, "loss": 1.9747, "step": 341 }, { "epoch": 2.1246105919003115, "grad_norm": 4.375, "learning_rate": 4.999995617371074e-05, "loss": 1.9467, "step": 342 }, { "epoch": 2.130841121495327, "grad_norm": 3.53125, "learning_rate": 4.99999514390162e-05, "loss": 1.9679, "step": 343 }, { "epoch": 2.1370716510903427, "grad_norm": 3.71875, "learning_rate": 4.9999946461517144e-05, "loss": 1.924, "step": 344 }, { "epoch": 2.143302180685358, "grad_norm": 5.5625, "learning_rate": 4.99999412412136e-05, "loss": 1.9707, "step": 345 }, { "epoch": 2.149532710280374, "grad_norm": 4.28125, "learning_rate": 4.999993577810563e-05, "loss": 1.9218, "step": 346 }, { "epoch": 2.155763239875389, "grad_norm": 4.71875, "learning_rate": 4.999993007219329e-05, "loss": 1.9323, "step": 347 }, { "epoch": 2.161993769470405, "grad_norm": 4.03125, "learning_rate": 4.9999924123476636e-05, "loss": 1.9096, "step": 348 }, { "epoch": 2.1682242990654204, "grad_norm": 3.390625, "learning_rate": 4.999991793195572e-05, "loss": 1.8822, "step": 349 }, { "epoch": 2.174454828660436, "grad_norm": 3.421875, "learning_rate": 4.9999911497630595e-05, "loss": 1.9357, "step": 350 }, { "epoch": 2.1806853582554515, "grad_norm": 3.359375, "learning_rate": 4.999990482050134e-05, "loss": 1.9405, "step": 351 }, { "epoch": 2.1869158878504673, "grad_norm": 4.65625, "learning_rate": 4.9999897900568005e-05, "loss": 1.9718, "step": 352 }, { "epoch": 2.1931464174454827, "grad_norm": 3.484375, "learning_rate": 4.999989073783067e-05, "loss": 1.8096, "step": 353 }, { "epoch": 2.1993769470404985, "grad_norm": 4.6875, "learning_rate": 4.99998833322894e-05, "loss": 1.81, "step": 354 }, { "epoch": 2.205607476635514, "grad_norm": 3.984375, "learning_rate": 4.999987568394426e-05, "loss": 1.8839, "step": 355 }, { "epoch": 2.2118380062305296, "grad_norm": 4.03125, "learning_rate": 4.999986779279533e-05, "loss": 1.8987, "step": 356 }, { "epoch": 2.218068535825545, "grad_norm": 3.875, "learning_rate": 4.9999859658842694e-05, "loss": 1.9525, "step": 357 }, { "epoch": 2.2242990654205608, "grad_norm": 3.484375, "learning_rate": 4.999985128208642e-05, "loss": 1.8755, "step": 358 }, { "epoch": 2.230529595015576, "grad_norm": 4.03125, "learning_rate": 4.9999842662526596e-05, "loss": 1.8876, "step": 359 }, { "epoch": 2.236760124610592, "grad_norm": 4.375, "learning_rate": 4.99998338001633e-05, "loss": 1.8129, "step": 360 }, { "epoch": 2.2429906542056073, "grad_norm": 5.65625, "learning_rate": 4.999982469499662e-05, "loss": 1.8265, "step": 361 }, { "epoch": 2.249221183800623, "grad_norm": 3.953125, "learning_rate": 4.9999815347026644e-05, "loss": 1.8478, "step": 362 }, { "epoch": 2.2554517133956384, "grad_norm": 4.8125, "learning_rate": 4.999980575625346e-05, "loss": 1.8146, "step": 363 }, { "epoch": 2.2616822429906542, "grad_norm": 5.5, "learning_rate": 4.999979592267717e-05, "loss": 1.8744, "step": 364 }, { "epoch": 2.2679127725856696, "grad_norm": 4.34375, "learning_rate": 4.999978584629787e-05, "loss": 1.7636, "step": 365 }, { "epoch": 2.2741433021806854, "grad_norm": 4.375, "learning_rate": 4.9999775527115644e-05, "loss": 1.7741, "step": 366 }, { "epoch": 2.2803738317757007, "grad_norm": 5.5, "learning_rate": 4.99997649651306e-05, "loss": 1.6826, "step": 367 }, { "epoch": 2.2866043613707165, "grad_norm": 5.15625, "learning_rate": 4.999975416034285e-05, "loss": 1.8612, "step": 368 }, { "epoch": 2.292834890965732, "grad_norm": 4.3125, "learning_rate": 4.9999743112752485e-05, "loss": 1.7158, "step": 369 }, { "epoch": 2.2990654205607477, "grad_norm": 6.25, "learning_rate": 4.9999731822359616e-05, "loss": 1.7513, "step": 370 }, { "epoch": 2.305295950155763, "grad_norm": 4.65625, "learning_rate": 4.999972028916436e-05, "loss": 1.7027, "step": 371 }, { "epoch": 2.311526479750779, "grad_norm": 4.375, "learning_rate": 4.9999708513166824e-05, "loss": 1.7601, "step": 372 }, { "epoch": 2.317757009345794, "grad_norm": 4.8125, "learning_rate": 4.999969649436711e-05, "loss": 1.8029, "step": 373 }, { "epoch": 2.32398753894081, "grad_norm": 5.40625, "learning_rate": 4.9999684232765357e-05, "loss": 1.7567, "step": 374 }, { "epoch": 2.3302180685358254, "grad_norm": 5.4375, "learning_rate": 4.999967172836167e-05, "loss": 1.7159, "step": 375 }, { "epoch": 2.336448598130841, "grad_norm": 4.8125, "learning_rate": 4.999965898115617e-05, "loss": 1.7067, "step": 376 }, { "epoch": 2.3426791277258565, "grad_norm": 4.625, "learning_rate": 4.9999645991148994e-05, "loss": 1.7078, "step": 377 }, { "epoch": 2.3489096573208723, "grad_norm": 5.125, "learning_rate": 4.999963275834025e-05, "loss": 1.7997, "step": 378 }, { "epoch": 2.3551401869158877, "grad_norm": 4.84375, "learning_rate": 4.9999619282730084e-05, "loss": 1.6324, "step": 379 }, { "epoch": 2.3613707165109035, "grad_norm": 4.875, "learning_rate": 4.9999605564318606e-05, "loss": 1.746, "step": 380 }, { "epoch": 2.367601246105919, "grad_norm": 4.625, "learning_rate": 4.999959160310597e-05, "loss": 1.5321, "step": 381 }, { "epoch": 2.3738317757009346, "grad_norm": 5.15625, "learning_rate": 4.9999577399092304e-05, "loss": 1.5984, "step": 382 }, { "epoch": 2.38006230529595, "grad_norm": 5.90625, "learning_rate": 4.999956295227774e-05, "loss": 1.5551, "step": 383 }, { "epoch": 2.3862928348909658, "grad_norm": 5.34375, "learning_rate": 4.999954826266242e-05, "loss": 1.6489, "step": 384 }, { "epoch": 2.392523364485981, "grad_norm": 5.0, "learning_rate": 4.99995333302465e-05, "loss": 1.6522, "step": 385 }, { "epoch": 2.398753894080997, "grad_norm": 5.5, "learning_rate": 4.999951815503011e-05, "loss": 1.7643, "step": 386 }, { "epoch": 2.4049844236760123, "grad_norm": 5.875, "learning_rate": 4.99995027370134e-05, "loss": 1.6512, "step": 387 }, { "epoch": 2.411214953271028, "grad_norm": 5.5, "learning_rate": 4.999948707619653e-05, "loss": 1.6847, "step": 388 }, { "epoch": 2.4174454828660434, "grad_norm": 5.4375, "learning_rate": 4.999947117257964e-05, "loss": 1.5806, "step": 389 }, { "epoch": 2.4236760124610592, "grad_norm": 7.0, "learning_rate": 4.999945502616289e-05, "loss": 1.7021, "step": 390 }, { "epoch": 2.4299065420560746, "grad_norm": 7.90625, "learning_rate": 4.9999438636946427e-05, "loss": 1.6021, "step": 391 }, { "epoch": 2.4361370716510904, "grad_norm": 6.03125, "learning_rate": 4.999942200493043e-05, "loss": 1.5013, "step": 392 }, { "epoch": 2.4423676012461057, "grad_norm": 6.34375, "learning_rate": 4.999940513011504e-05, "loss": 1.6075, "step": 393 }, { "epoch": 2.4485981308411215, "grad_norm": 5.4375, "learning_rate": 4.999938801250044e-05, "loss": 1.4568, "step": 394 }, { "epoch": 2.454828660436137, "grad_norm": 5.15625, "learning_rate": 4.9999370652086775e-05, "loss": 1.5305, "step": 395 }, { "epoch": 2.4610591900311527, "grad_norm": 6.78125, "learning_rate": 4.9999353048874234e-05, "loss": 1.538, "step": 396 }, { "epoch": 2.467289719626168, "grad_norm": 5.96875, "learning_rate": 4.999933520286297e-05, "loss": 1.5722, "step": 397 }, { "epoch": 2.473520249221184, "grad_norm": 5.65625, "learning_rate": 4.999931711405318e-05, "loss": 1.5706, "step": 398 }, { "epoch": 2.479750778816199, "grad_norm": 6.03125, "learning_rate": 4.9999298782445007e-05, "loss": 1.4663, "step": 399 }, { "epoch": 2.485981308411215, "grad_norm": 6.15625, "learning_rate": 4.999928020803866e-05, "loss": 1.4853, "step": 400 }, { "epoch": 2.4922118380062304, "grad_norm": 5.9375, "learning_rate": 4.99992613908343e-05, "loss": 1.4933, "step": 401 }, { "epoch": 2.498442367601246, "grad_norm": 7.375, "learning_rate": 4.999924233083212e-05, "loss": 1.4772, "step": 402 }, { "epoch": 2.5046728971962615, "grad_norm": 5.46875, "learning_rate": 4.9999223028032295e-05, "loss": 1.4427, "step": 403 }, { "epoch": 2.5109034267912773, "grad_norm": 6.53125, "learning_rate": 4.9999203482435024e-05, "loss": 1.402, "step": 404 }, { "epoch": 2.5171339563862927, "grad_norm": 8.125, "learning_rate": 4.999918369404049e-05, "loss": 1.527, "step": 405 }, { "epoch": 2.5233644859813085, "grad_norm": 6.21875, "learning_rate": 4.999916366284889e-05, "loss": 1.5608, "step": 406 }, { "epoch": 2.529595015576324, "grad_norm": 7.3125, "learning_rate": 4.999914338886042e-05, "loss": 1.5399, "step": 407 }, { "epoch": 2.5358255451713396, "grad_norm": 5.59375, "learning_rate": 4.999912287207525e-05, "loss": 1.4036, "step": 408 }, { "epoch": 2.542056074766355, "grad_norm": 5.4375, "learning_rate": 4.9999102112493625e-05, "loss": 1.4973, "step": 409 }, { "epoch": 2.5482866043613708, "grad_norm": 5.875, "learning_rate": 4.999908111011571e-05, "loss": 1.402, "step": 410 }, { "epoch": 2.554517133956386, "grad_norm": 5.84375, "learning_rate": 4.9999059864941726e-05, "loss": 1.4288, "step": 411 }, { "epoch": 2.560747663551402, "grad_norm": 6.90625, "learning_rate": 4.9999038376971875e-05, "loss": 1.5201, "step": 412 }, { "epoch": 2.5669781931464173, "grad_norm": 6.3125, "learning_rate": 4.9999016646206366e-05, "loss": 1.4728, "step": 413 }, { "epoch": 2.573208722741433, "grad_norm": 5.75, "learning_rate": 4.9998994672645415e-05, "loss": 1.3927, "step": 414 }, { "epoch": 2.5794392523364484, "grad_norm": 5.96875, "learning_rate": 4.9998972456289226e-05, "loss": 1.4501, "step": 415 }, { "epoch": 2.5856697819314642, "grad_norm": 7.1875, "learning_rate": 4.9998949997138015e-05, "loss": 1.4107, "step": 416 }, { "epoch": 2.5919003115264796, "grad_norm": 5.875, "learning_rate": 4.999892729519201e-05, "loss": 1.4302, "step": 417 }, { "epoch": 2.5981308411214954, "grad_norm": 7.625, "learning_rate": 4.999890435045142e-05, "loss": 1.2293, "step": 418 }, { "epoch": 2.6043613707165107, "grad_norm": 7.28125, "learning_rate": 4.999888116291649e-05, "loss": 1.4372, "step": 419 }, { "epoch": 2.6105919003115265, "grad_norm": 5.625, "learning_rate": 4.9998857732587414e-05, "loss": 1.2784, "step": 420 }, { "epoch": 2.616822429906542, "grad_norm": 5.71875, "learning_rate": 4.9998834059464436e-05, "loss": 1.3441, "step": 421 }, { "epoch": 2.6230529595015577, "grad_norm": 5.625, "learning_rate": 4.999881014354779e-05, "loss": 1.4206, "step": 422 }, { "epoch": 2.629283489096573, "grad_norm": 6.84375, "learning_rate": 4.99987859848377e-05, "loss": 1.4206, "step": 423 }, { "epoch": 2.635514018691589, "grad_norm": 5.84375, "learning_rate": 4.99987615833344e-05, "loss": 1.2258, "step": 424 }, { "epoch": 2.641744548286604, "grad_norm": 5.46875, "learning_rate": 4.999873693903814e-05, "loss": 1.291, "step": 425 }, { "epoch": 2.64797507788162, "grad_norm": 6.21875, "learning_rate": 4.999871205194914e-05, "loss": 1.4762, "step": 426 }, { "epoch": 2.6542056074766354, "grad_norm": 7.375, "learning_rate": 4.999868692206765e-05, "loss": 1.3724, "step": 427 }, { "epoch": 2.660436137071651, "grad_norm": 6.25, "learning_rate": 4.9998661549393924e-05, "loss": 1.2008, "step": 428 }, { "epoch": 2.6666666666666665, "grad_norm": 6.0, "learning_rate": 4.9998635933928196e-05, "loss": 1.2335, "step": 429 }, { "epoch": 2.6728971962616823, "grad_norm": 6.21875, "learning_rate": 4.999861007567072e-05, "loss": 1.5353, "step": 430 }, { "epoch": 2.6791277258566977, "grad_norm": 6.53125, "learning_rate": 4.9998583974621736e-05, "loss": 1.2602, "step": 431 }, { "epoch": 2.6853582554517135, "grad_norm": 5.90625, "learning_rate": 4.9998557630781514e-05, "loss": 1.3727, "step": 432 }, { "epoch": 2.691588785046729, "grad_norm": 5.15625, "learning_rate": 4.999853104415031e-05, "loss": 1.1685, "step": 433 }, { "epoch": 2.6978193146417446, "grad_norm": 5.78125, "learning_rate": 4.999850421472837e-05, "loss": 1.3324, "step": 434 }, { "epoch": 2.70404984423676, "grad_norm": 5.40625, "learning_rate": 4.999847714251596e-05, "loss": 1.2054, "step": 435 }, { "epoch": 2.710280373831776, "grad_norm": 5.65625, "learning_rate": 4.9998449827513344e-05, "loss": 1.3234, "step": 436 }, { "epoch": 2.716510903426791, "grad_norm": 7.125, "learning_rate": 4.999842226972078e-05, "loss": 1.3857, "step": 437 }, { "epoch": 2.722741433021807, "grad_norm": 5.34375, "learning_rate": 4.999839446913855e-05, "loss": 1.1641, "step": 438 }, { "epoch": 2.7289719626168223, "grad_norm": 6.25, "learning_rate": 4.999836642576692e-05, "loss": 1.3435, "step": 439 }, { "epoch": 2.735202492211838, "grad_norm": 6.3125, "learning_rate": 4.999833813960615e-05, "loss": 1.2676, "step": 440 }, { "epoch": 2.7414330218068534, "grad_norm": 6.5625, "learning_rate": 4.999830961065652e-05, "loss": 1.3203, "step": 441 }, { "epoch": 2.7476635514018692, "grad_norm": 6.28125, "learning_rate": 4.999828083891831e-05, "loss": 1.2945, "step": 442 }, { "epoch": 2.7538940809968846, "grad_norm": 5.71875, "learning_rate": 4.999825182439181e-05, "loss": 1.2872, "step": 443 }, { "epoch": 2.7601246105919004, "grad_norm": 6.78125, "learning_rate": 4.999822256707728e-05, "loss": 1.1761, "step": 444 }, { "epoch": 2.7663551401869158, "grad_norm": 5.28125, "learning_rate": 4.9998193066975016e-05, "loss": 1.174, "step": 445 }, { "epoch": 2.7725856697819315, "grad_norm": 5.9375, "learning_rate": 4.999816332408531e-05, "loss": 1.207, "step": 446 }, { "epoch": 2.778816199376947, "grad_norm": 5.5625, "learning_rate": 4.999813333840845e-05, "loss": 1.1859, "step": 447 }, { "epoch": 2.7850467289719627, "grad_norm": 6.21875, "learning_rate": 4.9998103109944714e-05, "loss": 1.3363, "step": 448 }, { "epoch": 2.791277258566978, "grad_norm": 5.96875, "learning_rate": 4.99980726386944e-05, "loss": 1.125, "step": 449 }, { "epoch": 2.797507788161994, "grad_norm": 5.53125, "learning_rate": 4.9998041924657815e-05, "loss": 1.2218, "step": 450 }, { "epoch": 2.803738317757009, "grad_norm": 6.3125, "learning_rate": 4.999801096783524e-05, "loss": 1.3417, "step": 451 }, { "epoch": 2.809968847352025, "grad_norm": 5.0, "learning_rate": 4.9997979768227e-05, "loss": 1.0431, "step": 452 }, { "epoch": 2.8161993769470404, "grad_norm": 6.1875, "learning_rate": 4.999794832583337e-05, "loss": 1.0923, "step": 453 }, { "epoch": 2.822429906542056, "grad_norm": 5.34375, "learning_rate": 4.999791664065467e-05, "loss": 1.3176, "step": 454 }, { "epoch": 2.8286604361370715, "grad_norm": 5.5, "learning_rate": 4.999788471269122e-05, "loss": 1.2173, "step": 455 }, { "epoch": 2.8348909657320873, "grad_norm": 5.3125, "learning_rate": 4.99978525419433e-05, "loss": 1.1774, "step": 456 }, { "epoch": 2.8411214953271027, "grad_norm": 6.125, "learning_rate": 4.999782012841126e-05, "loss": 1.062, "step": 457 }, { "epoch": 2.8473520249221185, "grad_norm": 5.875, "learning_rate": 4.999778747209537e-05, "loss": 1.0994, "step": 458 }, { "epoch": 2.853582554517134, "grad_norm": 6.125, "learning_rate": 4.999775457299598e-05, "loss": 1.2723, "step": 459 }, { "epoch": 2.8598130841121496, "grad_norm": 5.9375, "learning_rate": 4.999772143111341e-05, "loss": 1.2173, "step": 460 }, { "epoch": 2.866043613707165, "grad_norm": 5.96875, "learning_rate": 4.999768804644796e-05, "loss": 1.1241, "step": 461 }, { "epoch": 2.872274143302181, "grad_norm": 5.34375, "learning_rate": 4.999765441899997e-05, "loss": 1.1008, "step": 462 }, { "epoch": 2.878504672897196, "grad_norm": 5.46875, "learning_rate": 4.999762054876977e-05, "loss": 0.985, "step": 463 }, { "epoch": 2.884735202492212, "grad_norm": 6.0625, "learning_rate": 4.999758643575768e-05, "loss": 1.1343, "step": 464 }, { "epoch": 2.8909657320872273, "grad_norm": 6.375, "learning_rate": 4.999755207996403e-05, "loss": 1.2552, "step": 465 }, { "epoch": 2.897196261682243, "grad_norm": 6.0625, "learning_rate": 4.9997517481389156e-05, "loss": 1.1362, "step": 466 }, { "epoch": 2.9034267912772584, "grad_norm": 5.34375, "learning_rate": 4.99974826400334e-05, "loss": 0.9222, "step": 467 }, { "epoch": 2.9096573208722742, "grad_norm": 5.25, "learning_rate": 4.999744755589709e-05, "loss": 1.3016, "step": 468 }, { "epoch": 2.9158878504672896, "grad_norm": 5.25, "learning_rate": 4.999741222898058e-05, "loss": 1.0441, "step": 469 }, { "epoch": 2.9221183800623054, "grad_norm": 5.8125, "learning_rate": 4.99973766592842e-05, "loss": 1.0088, "step": 470 }, { "epoch": 2.9283489096573208, "grad_norm": 6.0625, "learning_rate": 4.99973408468083e-05, "loss": 1.1416, "step": 471 }, { "epoch": 2.9345794392523366, "grad_norm": 6.6875, "learning_rate": 4.999730479155323e-05, "loss": 0.9618, "step": 472 }, { "epoch": 2.940809968847352, "grad_norm": 6.28125, "learning_rate": 4.9997268493519345e-05, "loss": 1.0906, "step": 473 }, { "epoch": 2.9470404984423677, "grad_norm": 7.15625, "learning_rate": 4.999723195270698e-05, "loss": 1.0354, "step": 474 }, { "epoch": 2.953271028037383, "grad_norm": 6.40625, "learning_rate": 4.9997195169116514e-05, "loss": 1.047, "step": 475 }, { "epoch": 2.959501557632399, "grad_norm": 6.78125, "learning_rate": 4.999715814274828e-05, "loss": 1.0395, "step": 476 }, { "epoch": 2.965732087227414, "grad_norm": 5.25, "learning_rate": 4.9997120873602654e-05, "loss": 1.193, "step": 477 }, { "epoch": 2.97196261682243, "grad_norm": 5.28125, "learning_rate": 4.9997083361679995e-05, "loss": 1.1083, "step": 478 }, { "epoch": 2.9781931464174454, "grad_norm": 6.03125, "learning_rate": 4.999704560698067e-05, "loss": 0.9952, "step": 479 }, { "epoch": 2.984423676012461, "grad_norm": 5.125, "learning_rate": 4.999700760950503e-05, "loss": 0.9787, "step": 480 }, { "epoch": 2.9906542056074765, "grad_norm": 5.71875, "learning_rate": 4.999696936925347e-05, "loss": 1.0184, "step": 481 }, { "epoch": 2.9968847352024923, "grad_norm": 6.9375, "learning_rate": 4.999693088622633e-05, "loss": 1.0158, "step": 482 }, { "epoch": 3.0, "grad_norm": 3.53125, "learning_rate": 4.9996892160424016e-05, "loss": 0.5845, "step": 483 }, { "epoch": 3.0062305295950154, "grad_norm": 7.0, "learning_rate": 4.9996853191846885e-05, "loss": 1.3792, "step": 484 }, { "epoch": 3.012461059190031, "grad_norm": 6.65625, "learning_rate": 4.9996813980495305e-05, "loss": 1.2228, "step": 485 }, { "epoch": 3.0186915887850465, "grad_norm": 5.375, "learning_rate": 4.999677452636968e-05, "loss": 1.0208, "step": 486 }, { "epoch": 3.0249221183800623, "grad_norm": 5.53125, "learning_rate": 4.999673482947039e-05, "loss": 0.9694, "step": 487 }, { "epoch": 3.0311526479750777, "grad_norm": 6.84375, "learning_rate": 4.999669488979781e-05, "loss": 0.9819, "step": 488 }, { "epoch": 3.0373831775700935, "grad_norm": 6.46875, "learning_rate": 4.999665470735233e-05, "loss": 0.9272, "step": 489 }, { "epoch": 3.043613707165109, "grad_norm": 5.5625, "learning_rate": 4.999661428213435e-05, "loss": 0.8949, "step": 490 }, { "epoch": 3.0498442367601246, "grad_norm": 7.25, "learning_rate": 4.999657361414425e-05, "loss": 1.0846, "step": 491 }, { "epoch": 3.05607476635514, "grad_norm": 7.59375, "learning_rate": 4.999653270338242e-05, "loss": 0.9503, "step": 492 }, { "epoch": 3.0623052959501558, "grad_norm": 5.625, "learning_rate": 4.9996491549849286e-05, "loss": 0.9352, "step": 493 }, { "epoch": 3.068535825545171, "grad_norm": 8.25, "learning_rate": 4.999645015354522e-05, "loss": 0.9721, "step": 494 }, { "epoch": 3.074766355140187, "grad_norm": 9.875, "learning_rate": 4.999640851447064e-05, "loss": 0.7818, "step": 495 }, { "epoch": 3.0809968847352023, "grad_norm": 5.46875, "learning_rate": 4.999636663262594e-05, "loss": 0.8147, "step": 496 }, { "epoch": 3.087227414330218, "grad_norm": 6.03125, "learning_rate": 4.999632450801153e-05, "loss": 0.9981, "step": 497 }, { "epoch": 3.0934579439252334, "grad_norm": 7.03125, "learning_rate": 4.999628214062782e-05, "loss": 0.9878, "step": 498 }, { "epoch": 3.0996884735202492, "grad_norm": 6.5, "learning_rate": 4.999623953047522e-05, "loss": 0.9994, "step": 499 }, { "epoch": 3.1059190031152646, "grad_norm": 5.03125, "learning_rate": 4.999619667755414e-05, "loss": 0.7525, "step": 500 }, { "epoch": 3.1121495327102804, "grad_norm": 5.53125, "learning_rate": 4.9996153581865006e-05, "loss": 0.9349, "step": 501 }, { "epoch": 3.1183800623052957, "grad_norm": 7.125, "learning_rate": 4.9996110243408234e-05, "loss": 0.9448, "step": 502 }, { "epoch": 3.1246105919003115, "grad_norm": 6.0, "learning_rate": 4.999606666218424e-05, "loss": 0.7907, "step": 503 }, { "epoch": 3.130841121495327, "grad_norm": 8.8125, "learning_rate": 4.9996022838193454e-05, "loss": 0.886, "step": 504 }, { "epoch": 3.1370716510903427, "grad_norm": 8.5625, "learning_rate": 4.9995978771436294e-05, "loss": 0.9978, "step": 505 }, { "epoch": 3.143302180685358, "grad_norm": 5.8125, "learning_rate": 4.999593446191319e-05, "loss": 0.9223, "step": 506 }, { "epoch": 3.149532710280374, "grad_norm": 4.8125, "learning_rate": 4.999588990962458e-05, "loss": 0.78, "step": 507 }, { "epoch": 3.155763239875389, "grad_norm": 11.0625, "learning_rate": 4.9995845114570886e-05, "loss": 1.0236, "step": 508 }, { "epoch": 3.161993769470405, "grad_norm": 11.5625, "learning_rate": 4.999580007675255e-05, "loss": 1.0602, "step": 509 }, { "epoch": 3.1682242990654204, "grad_norm": 6.78125, "learning_rate": 4.999575479617e-05, "loss": 0.8545, "step": 510 }, { "epoch": 3.174454828660436, "grad_norm": 7.28125, "learning_rate": 4.9995709272823686e-05, "loss": 1.1637, "step": 511 }, { "epoch": 3.1806853582554515, "grad_norm": 8.3125, "learning_rate": 4.9995663506714054e-05, "loss": 0.8286, "step": 512 }, { "epoch": 3.1869158878504673, "grad_norm": 8.25, "learning_rate": 4.999561749784154e-05, "loss": 0.9416, "step": 513 }, { "epoch": 3.1931464174454827, "grad_norm": 5.34375, "learning_rate": 4.9995571246206586e-05, "loss": 0.6763, "step": 514 }, { "epoch": 3.1993769470404985, "grad_norm": 8.3125, "learning_rate": 4.999552475180965e-05, "loss": 0.8274, "step": 515 }, { "epoch": 3.205607476635514, "grad_norm": 12.0625, "learning_rate": 4.999547801465119e-05, "loss": 0.9876, "step": 516 }, { "epoch": 3.2118380062305296, "grad_norm": 8.875, "learning_rate": 4.999543103473164e-05, "loss": 0.8322, "step": 517 }, { "epoch": 3.218068535825545, "grad_norm": 4.65625, "learning_rate": 4.999538381205147e-05, "loss": 0.8505, "step": 518 }, { "epoch": 3.2242990654205608, "grad_norm": 6.34375, "learning_rate": 4.999533634661113e-05, "loss": 0.9754, "step": 519 }, { "epoch": 3.230529595015576, "grad_norm": 7.1875, "learning_rate": 4.99952886384111e-05, "loss": 0.989, "step": 520 }, { "epoch": 3.236760124610592, "grad_norm": 5.3125, "learning_rate": 4.999524068745182e-05, "loss": 0.66, "step": 521 }, { "epoch": 3.2429906542056073, "grad_norm": 5.34375, "learning_rate": 4.999519249373377e-05, "loss": 0.91, "step": 522 }, { "epoch": 3.249221183800623, "grad_norm": 8.25, "learning_rate": 4.9995144057257413e-05, "loss": 0.9608, "step": 523 }, { "epoch": 3.2554517133956384, "grad_norm": 7.0, "learning_rate": 4.999509537802322e-05, "loss": 0.7531, "step": 524 }, { "epoch": 3.2616822429906542, "grad_norm": 4.65625, "learning_rate": 4.999504645603167e-05, "loss": 0.7666, "step": 525 }, { "epoch": 3.2679127725856696, "grad_norm": 6.09375, "learning_rate": 4.999499729128322e-05, "loss": 0.7864, "step": 526 }, { "epoch": 3.2741433021806854, "grad_norm": 6.28125, "learning_rate": 4.9994947883778375e-05, "loss": 0.8694, "step": 527 }, { "epoch": 3.2803738317757007, "grad_norm": 6.96875, "learning_rate": 4.999489823351759e-05, "loss": 1.0523, "step": 528 }, { "epoch": 3.2866043613707165, "grad_norm": 5.875, "learning_rate": 4.999484834050136e-05, "loss": 0.9617, "step": 529 }, { "epoch": 3.292834890965732, "grad_norm": 9.75, "learning_rate": 4.9994798204730166e-05, "loss": 0.9407, "step": 530 }, { "epoch": 3.2990654205607477, "grad_norm": 10.75, "learning_rate": 4.999474782620449e-05, "loss": 1.0201, "step": 531 }, { "epoch": 3.305295950155763, "grad_norm": 7.21875, "learning_rate": 4.999469720492483e-05, "loss": 0.8005, "step": 532 }, { "epoch": 3.311526479750779, "grad_norm": 5.9375, "learning_rate": 4.999464634089168e-05, "loss": 1.0173, "step": 533 }, { "epoch": 3.317757009345794, "grad_norm": 7.3125, "learning_rate": 4.999459523410553e-05, "loss": 1.0147, "step": 534 }, { "epoch": 3.32398753894081, "grad_norm": 9.125, "learning_rate": 4.999454388456687e-05, "loss": 0.9798, "step": 535 }, { "epoch": 3.3302180685358254, "grad_norm": 6.40625, "learning_rate": 4.99944922922762e-05, "loss": 0.7245, "step": 536 }, { "epoch": 3.336448598130841, "grad_norm": 5.96875, "learning_rate": 4.999444045723403e-05, "loss": 1.0686, "step": 537 }, { "epoch": 3.3426791277258565, "grad_norm": 4.625, "learning_rate": 4.9994388379440854e-05, "loss": 0.7834, "step": 538 }, { "epoch": 3.3489096573208723, "grad_norm": 5.5, "learning_rate": 4.9994336058897184e-05, "loss": 0.9634, "step": 539 }, { "epoch": 3.3551401869158877, "grad_norm": 6.0625, "learning_rate": 4.999428349560352e-05, "loss": 0.8822, "step": 540 }, { "epoch": 3.3613707165109035, "grad_norm": 5.46875, "learning_rate": 4.9994230689560387e-05, "loss": 0.9426, "step": 541 }, { "epoch": 3.367601246105919, "grad_norm": 6.375, "learning_rate": 4.999417764076829e-05, "loss": 1.0417, "step": 542 }, { "epoch": 3.3738317757009346, "grad_norm": 5.25, "learning_rate": 4.9994124349227736e-05, "loss": 0.6999, "step": 543 }, { "epoch": 3.38006230529595, "grad_norm": 5.9375, "learning_rate": 4.9994070814939256e-05, "loss": 0.9116, "step": 544 }, { "epoch": 3.3862928348909658, "grad_norm": 6.3125, "learning_rate": 4.9994017037903365e-05, "loss": 1.1158, "step": 545 }, { "epoch": 3.392523364485981, "grad_norm": 4.5625, "learning_rate": 4.999396301812058e-05, "loss": 0.7059, "step": 546 }, { "epoch": 3.398753894080997, "grad_norm": 5.84375, "learning_rate": 4.999390875559143e-05, "loss": 0.747, "step": 547 }, { "epoch": 3.4049844236760123, "grad_norm": 5.53125, "learning_rate": 4.999385425031644e-05, "loss": 0.8145, "step": 548 }, { "epoch": 3.411214953271028, "grad_norm": 6.09375, "learning_rate": 4.999379950229615e-05, "loss": 0.7708, "step": 549 }, { "epoch": 3.4174454828660434, "grad_norm": 6.21875, "learning_rate": 4.999374451153107e-05, "loss": 0.8079, "step": 550 }, { "epoch": 3.4236760124610592, "grad_norm": 5.3125, "learning_rate": 4.999368927802176e-05, "loss": 0.7345, "step": 551 }, { "epoch": 3.4299065420560746, "grad_norm": 6.28125, "learning_rate": 4.999363380176874e-05, "loss": 0.9347, "step": 552 }, { "epoch": 3.4361370716510904, "grad_norm": 5.34375, "learning_rate": 4.999357808277255e-05, "loss": 0.9109, "step": 553 }, { "epoch": 3.4423676012461057, "grad_norm": 5.25, "learning_rate": 4.999352212103373e-05, "loss": 0.781, "step": 554 }, { "epoch": 3.4485981308411215, "grad_norm": 6.34375, "learning_rate": 4.999346591655284e-05, "loss": 1.1715, "step": 555 }, { "epoch": 3.454828660436137, "grad_norm": 5.96875, "learning_rate": 4.99934094693304e-05, "loss": 0.9869, "step": 556 }, { "epoch": 3.4610591900311527, "grad_norm": 4.96875, "learning_rate": 4.999335277936698e-05, "loss": 0.8633, "step": 557 }, { "epoch": 3.467289719626168, "grad_norm": 5.03125, "learning_rate": 4.999329584666311e-05, "loss": 0.726, "step": 558 }, { "epoch": 3.473520249221184, "grad_norm": 6.71875, "learning_rate": 4.999323867121936e-05, "loss": 0.8241, "step": 559 }, { "epoch": 3.479750778816199, "grad_norm": 5.1875, "learning_rate": 4.999318125303628e-05, "loss": 0.7236, "step": 560 }, { "epoch": 3.485981308411215, "grad_norm": 4.15625, "learning_rate": 4.9993123592114435e-05, "loss": 0.6526, "step": 561 }, { "epoch": 3.4922118380062304, "grad_norm": 4.65625, "learning_rate": 4.9993065688454365e-05, "loss": 0.8017, "step": 562 }, { "epoch": 3.498442367601246, "grad_norm": 4.21875, "learning_rate": 4.999300754205666e-05, "loss": 0.6326, "step": 563 }, { "epoch": 3.5046728971962615, "grad_norm": 5.59375, "learning_rate": 4.999294915292186e-05, "loss": 0.9381, "step": 564 }, { "epoch": 3.5109034267912773, "grad_norm": 5.96875, "learning_rate": 4.999289052105054e-05, "loss": 0.8984, "step": 565 }, { "epoch": 3.5171339563862927, "grad_norm": 5.90625, "learning_rate": 4.999283164644328e-05, "loss": 0.9072, "step": 566 }, { "epoch": 3.5233644859813085, "grad_norm": 5.9375, "learning_rate": 4.999277252910063e-05, "loss": 0.7769, "step": 567 }, { "epoch": 3.529595015576324, "grad_norm": 5.9375, "learning_rate": 4.9992713169023184e-05, "loss": 0.8851, "step": 568 }, { "epoch": 3.5358255451713396, "grad_norm": 5.84375, "learning_rate": 4.9992653566211525e-05, "loss": 0.8517, "step": 569 }, { "epoch": 3.542056074766355, "grad_norm": 6.0, "learning_rate": 4.99925937206662e-05, "loss": 0.6443, "step": 570 }, { "epoch": 3.5482866043613708, "grad_norm": 7.875, "learning_rate": 4.9992533632387815e-05, "loss": 0.9067, "step": 571 }, { "epoch": 3.554517133956386, "grad_norm": 6.71875, "learning_rate": 4.999247330137695e-05, "loss": 1.0451, "step": 572 }, { "epoch": 3.560747663551402, "grad_norm": 5.34375, "learning_rate": 4.9992412727634186e-05, "loss": 0.8341, "step": 573 }, { "epoch": 3.5669781931464173, "grad_norm": 4.96875, "learning_rate": 4.999235191116011e-05, "loss": 0.8516, "step": 574 }, { "epoch": 3.573208722741433, "grad_norm": 6.21875, "learning_rate": 4.9992290851955325e-05, "loss": 0.8938, "step": 575 }, { "epoch": 3.5794392523364484, "grad_norm": 4.75, "learning_rate": 4.999222955002041e-05, "loss": 0.8985, "step": 576 }, { "epoch": 3.5856697819314642, "grad_norm": 6.125, "learning_rate": 4.9992168005355963e-05, "loss": 0.7256, "step": 577 }, { "epoch": 3.5919003115264796, "grad_norm": 6.71875, "learning_rate": 4.999210621796259e-05, "loss": 0.7756, "step": 578 }, { "epoch": 3.5981308411214954, "grad_norm": 5.09375, "learning_rate": 4.999204418784088e-05, "loss": 0.6746, "step": 579 }, { "epoch": 3.6043613707165107, "grad_norm": 6.1875, "learning_rate": 4.999198191499145e-05, "loss": 0.9187, "step": 580 }, { "epoch": 3.6105919003115265, "grad_norm": 6.09375, "learning_rate": 4.9991919399414887e-05, "loss": 0.7648, "step": 581 }, { "epoch": 3.616822429906542, "grad_norm": 4.75, "learning_rate": 4.999185664111181e-05, "loss": 0.6462, "step": 582 }, { "epoch": 3.6230529595015577, "grad_norm": 5.84375, "learning_rate": 4.999179364008283e-05, "loss": 0.6839, "step": 583 }, { "epoch": 3.629283489096573, "grad_norm": 5.0625, "learning_rate": 4.999173039632855e-05, "loss": 0.6885, "step": 584 }, { "epoch": 3.635514018691589, "grad_norm": 9.6875, "learning_rate": 4.9991666909849586e-05, "loss": 0.8094, "step": 585 }, { "epoch": 3.641744548286604, "grad_norm": 7.09375, "learning_rate": 4.999160318064656e-05, "loss": 0.8606, "step": 586 }, { "epoch": 3.64797507788162, "grad_norm": 6.46875, "learning_rate": 4.9991539208720096e-05, "loss": 0.8589, "step": 587 }, { "epoch": 3.6542056074766354, "grad_norm": 6.53125, "learning_rate": 4.99914749940708e-05, "loss": 0.6127, "step": 588 }, { "epoch": 3.660436137071651, "grad_norm": 7.59375, "learning_rate": 4.9991410536699304e-05, "loss": 0.5971, "step": 589 }, { "epoch": 3.6666666666666665, "grad_norm": 7.21875, "learning_rate": 4.999134583660623e-05, "loss": 0.7092, "step": 590 }, { "epoch": 3.6728971962616823, "grad_norm": 4.5625, "learning_rate": 4.999128089379221e-05, "loss": 0.6235, "step": 591 }, { "epoch": 3.6791277258566977, "grad_norm": 8.5625, "learning_rate": 4.999121570825788e-05, "loss": 0.7013, "step": 592 }, { "epoch": 3.6853582554517135, "grad_norm": 10.75, "learning_rate": 4.9991150280003864e-05, "loss": 0.9585, "step": 593 }, { "epoch": 3.691588785046729, "grad_norm": 6.96875, "learning_rate": 4.99910846090308e-05, "loss": 0.7, "step": 594 }, { "epoch": 3.6978193146417446, "grad_norm": 4.375, "learning_rate": 4.999101869533932e-05, "loss": 0.6818, "step": 595 }, { "epoch": 3.70404984423676, "grad_norm": 6.53125, "learning_rate": 4.999095253893008e-05, "loss": 0.6139, "step": 596 }, { "epoch": 3.710280373831776, "grad_norm": 6.0, "learning_rate": 4.9990886139803714e-05, "loss": 0.7346, "step": 597 }, { "epoch": 3.716510903426791, "grad_norm": 7.28125, "learning_rate": 4.999081949796086e-05, "loss": 0.9631, "step": 598 }, { "epoch": 3.722741433021807, "grad_norm": 5.375, "learning_rate": 4.999075261340218e-05, "loss": 0.5707, "step": 599 }, { "epoch": 3.7289719626168223, "grad_norm": 8.1875, "learning_rate": 4.999068548612831e-05, "loss": 0.8877, "step": 600 }, { "epoch": 3.735202492211838, "grad_norm": 8.125, "learning_rate": 4.9990618116139904e-05, "loss": 0.7875, "step": 601 }, { "epoch": 3.7414330218068534, "grad_norm": 5.875, "learning_rate": 4.999055050343762e-05, "loss": 0.9689, "step": 602 }, { "epoch": 3.7476635514018692, "grad_norm": 4.3125, "learning_rate": 4.999048264802212e-05, "loss": 0.8712, "step": 603 }, { "epoch": 3.7538940809968846, "grad_norm": 6.71875, "learning_rate": 4.999041454989405e-05, "loss": 0.8917, "step": 604 }, { "epoch": 3.7601246105919004, "grad_norm": 6.34375, "learning_rate": 4.999034620905408e-05, "loss": 0.7225, "step": 605 }, { "epoch": 3.7663551401869158, "grad_norm": 5.375, "learning_rate": 4.9990277625502876e-05, "loss": 0.5459, "step": 606 }, { "epoch": 3.7725856697819315, "grad_norm": 4.90625, "learning_rate": 4.99902087992411e-05, "loss": 0.8627, "step": 607 }, { "epoch": 3.778816199376947, "grad_norm": 4.40625, "learning_rate": 4.999013973026942e-05, "loss": 0.7303, "step": 608 }, { "epoch": 3.7850467289719627, "grad_norm": 4.8125, "learning_rate": 4.999007041858851e-05, "loss": 0.7531, "step": 609 }, { "epoch": 3.791277258566978, "grad_norm": 5.4375, "learning_rate": 4.999000086419903e-05, "loss": 0.8009, "step": 610 }, { "epoch": 3.797507788161994, "grad_norm": 5.375, "learning_rate": 4.998993106710168e-05, "loss": 0.7496, "step": 611 }, { "epoch": 3.803738317757009, "grad_norm": 4.59375, "learning_rate": 4.9989861027297115e-05, "loss": 0.859, "step": 612 }, { "epoch": 3.809968847352025, "grad_norm": 5.3125, "learning_rate": 4.998979074478603e-05, "loss": 0.9045, "step": 613 }, { "epoch": 3.8161993769470404, "grad_norm": 5.1875, "learning_rate": 4.998972021956909e-05, "loss": 0.756, "step": 614 }, { "epoch": 3.822429906542056, "grad_norm": 4.125, "learning_rate": 4.9989649451647e-05, "loss": 0.5173, "step": 615 }, { "epoch": 3.8286604361370715, "grad_norm": 4.5, "learning_rate": 4.998957844102044e-05, "loss": 0.5987, "step": 616 }, { "epoch": 3.8348909657320873, "grad_norm": 5.25, "learning_rate": 4.9989507187690106e-05, "loss": 0.7363, "step": 617 }, { "epoch": 3.8411214953271027, "grad_norm": 5.25, "learning_rate": 4.9989435691656674e-05, "loss": 0.8524, "step": 618 }, { "epoch": 3.8473520249221185, "grad_norm": 5.6875, "learning_rate": 4.9989363952920846e-05, "loss": 0.8353, "step": 619 }, { "epoch": 3.853582554517134, "grad_norm": 4.71875, "learning_rate": 4.9989291971483325e-05, "loss": 0.4733, "step": 620 }, { "epoch": 3.8598130841121496, "grad_norm": 5.0625, "learning_rate": 4.9989219747344804e-05, "loss": 0.855, "step": 621 }, { "epoch": 3.866043613707165, "grad_norm": 4.9375, "learning_rate": 4.998914728050598e-05, "loss": 0.6563, "step": 622 }, { "epoch": 3.872274143302181, "grad_norm": 5.875, "learning_rate": 4.998907457096757e-05, "loss": 0.7815, "step": 623 }, { "epoch": 3.878504672897196, "grad_norm": 4.53125, "learning_rate": 4.998900161873027e-05, "loss": 0.593, "step": 624 }, { "epoch": 3.884735202492212, "grad_norm": 4.78125, "learning_rate": 4.998892842379479e-05, "loss": 0.5736, "step": 625 }, { "epoch": 3.8909657320872273, "grad_norm": 5.03125, "learning_rate": 4.998885498616184e-05, "loss": 0.7603, "step": 626 }, { "epoch": 3.897196261682243, "grad_norm": 5.09375, "learning_rate": 4.998878130583214e-05, "loss": 0.8182, "step": 627 }, { "epoch": 3.9034267912772584, "grad_norm": 5.84375, "learning_rate": 4.99887073828064e-05, "loss": 0.782, "step": 628 }, { "epoch": 3.9096573208722742, "grad_norm": 5.65625, "learning_rate": 4.998863321708534e-05, "loss": 0.7215, "step": 629 }, { "epoch": 3.9158878504672896, "grad_norm": 5.78125, "learning_rate": 4.998855880866967e-05, "loss": 0.7405, "step": 630 }, { "epoch": 3.9221183800623054, "grad_norm": 5.375, "learning_rate": 4.9988484157560136e-05, "loss": 0.6579, "step": 631 }, { "epoch": 3.9283489096573208, "grad_norm": 3.953125, "learning_rate": 4.9988409263757436e-05, "loss": 0.5864, "step": 632 }, { "epoch": 3.9345794392523366, "grad_norm": 6.5, "learning_rate": 4.998833412726231e-05, "loss": 0.7875, "step": 633 }, { "epoch": 3.940809968847352, "grad_norm": 5.09375, "learning_rate": 4.99882587480755e-05, "loss": 0.5504, "step": 634 }, { "epoch": 3.9470404984423677, "grad_norm": 4.6875, "learning_rate": 4.998818312619772e-05, "loss": 0.5876, "step": 635 }, { "epoch": 3.953271028037383, "grad_norm": 4.5625, "learning_rate": 4.99881072616297e-05, "loss": 0.5262, "step": 636 }, { "epoch": 3.959501557632399, "grad_norm": 5.0625, "learning_rate": 4.99880311543722e-05, "loss": 0.7, "step": 637 }, { "epoch": 3.965732087227414, "grad_norm": 4.90625, "learning_rate": 4.998795480442595e-05, "loss": 0.7841, "step": 638 }, { "epoch": 3.97196261682243, "grad_norm": 5.96875, "learning_rate": 4.9987878211791674e-05, "loss": 0.7631, "step": 639 }, { "epoch": 3.9781931464174454, "grad_norm": 4.40625, "learning_rate": 4.998780137647014e-05, "loss": 0.7605, "step": 640 }, { "epoch": 3.984423676012461, "grad_norm": 5.09375, "learning_rate": 4.998772429846208e-05, "loss": 0.6811, "step": 641 }, { "epoch": 3.9906542056074765, "grad_norm": 6.96875, "learning_rate": 4.998764697776824e-05, "loss": 0.7619, "step": 642 }, { "epoch": 3.9968847352024923, "grad_norm": 4.65625, "learning_rate": 4.998756941438939e-05, "loss": 0.5984, "step": 643 }, { "epoch": 4.0, "grad_norm": 2.828125, "learning_rate": 4.9987491608326267e-05, "loss": 0.3537, "step": 644 }, { "epoch": 4.006230529595015, "grad_norm": 6.46875, "learning_rate": 4.9987413559579636e-05, "loss": 0.8512, "step": 645 }, { "epoch": 4.012461059190031, "grad_norm": 5.21875, "learning_rate": 4.998733526815024e-05, "loss": 0.5929, "step": 646 }, { "epoch": 4.018691588785047, "grad_norm": 6.65625, "learning_rate": 4.998725673403885e-05, "loss": 0.5444, "step": 647 }, { "epoch": 4.024922118380062, "grad_norm": 6.71875, "learning_rate": 4.998717795724624e-05, "loss": 0.7869, "step": 648 }, { "epoch": 4.031152647975078, "grad_norm": 6.5625, "learning_rate": 4.998709893777315e-05, "loss": 0.6958, "step": 649 }, { "epoch": 4.037383177570093, "grad_norm": 5.59375, "learning_rate": 4.998701967562035e-05, "loss": 0.6092, "step": 650 }, { "epoch": 4.043613707165109, "grad_norm": 5.125, "learning_rate": 4.9986940170788646e-05, "loss": 0.7544, "step": 651 }, { "epoch": 4.049844236760125, "grad_norm": 6.0625, "learning_rate": 4.998686042327877e-05, "loss": 0.5201, "step": 652 }, { "epoch": 4.05607476635514, "grad_norm": 4.5625, "learning_rate": 4.998678043309151e-05, "loss": 0.4694, "step": 653 }, { "epoch": 4.062305295950155, "grad_norm": 4.8125, "learning_rate": 4.9986700200227645e-05, "loss": 0.6531, "step": 654 }, { "epoch": 4.068535825545172, "grad_norm": 6.40625, "learning_rate": 4.9986619724687954e-05, "loss": 0.8526, "step": 655 }, { "epoch": 4.074766355140187, "grad_norm": 5.3125, "learning_rate": 4.9986539006473215e-05, "loss": 0.6849, "step": 656 }, { "epoch": 4.080996884735202, "grad_norm": 4.875, "learning_rate": 4.9986458045584215e-05, "loss": 0.5885, "step": 657 }, { "epoch": 4.087227414330218, "grad_norm": 5.28125, "learning_rate": 4.9986376842021745e-05, "loss": 0.6471, "step": 658 }, { "epoch": 4.093457943925234, "grad_norm": 5.78125, "learning_rate": 4.9986295395786586e-05, "loss": 0.6092, "step": 659 }, { "epoch": 4.099688473520249, "grad_norm": 4.34375, "learning_rate": 4.998621370687952e-05, "loss": 0.6097, "step": 660 }, { "epoch": 4.105919003115265, "grad_norm": 5.34375, "learning_rate": 4.998613177530137e-05, "loss": 0.7251, "step": 661 }, { "epoch": 4.11214953271028, "grad_norm": 5.6875, "learning_rate": 4.998604960105291e-05, "loss": 0.7224, "step": 662 }, { "epoch": 4.118380062305296, "grad_norm": 4.46875, "learning_rate": 4.9985967184134936e-05, "loss": 0.6455, "step": 663 }, { "epoch": 4.1246105919003115, "grad_norm": 5.8125, "learning_rate": 4.9985884524548254e-05, "loss": 0.4878, "step": 664 }, { "epoch": 4.130841121495327, "grad_norm": 4.96875, "learning_rate": 4.9985801622293674e-05, "loss": 0.5062, "step": 665 }, { "epoch": 4.137071651090342, "grad_norm": 3.90625, "learning_rate": 4.998571847737198e-05, "loss": 0.6016, "step": 666 }, { "epoch": 4.1433021806853585, "grad_norm": 5.46875, "learning_rate": 4.9985635089784e-05, "loss": 0.524, "step": 667 }, { "epoch": 4.149532710280374, "grad_norm": 4.0625, "learning_rate": 4.998555145953054e-05, "loss": 0.458, "step": 668 }, { "epoch": 4.155763239875389, "grad_norm": 4.59375, "learning_rate": 4.998546758661241e-05, "loss": 0.4851, "step": 669 }, { "epoch": 4.161993769470405, "grad_norm": 4.40625, "learning_rate": 4.9985383471030425e-05, "loss": 0.5916, "step": 670 }, { "epoch": 4.168224299065421, "grad_norm": 6.09375, "learning_rate": 4.9985299112785406e-05, "loss": 0.5923, "step": 671 }, { "epoch": 4.174454828660436, "grad_norm": 5.4375, "learning_rate": 4.998521451187816e-05, "loss": 0.6733, "step": 672 }, { "epoch": 4.1806853582554515, "grad_norm": 5.625, "learning_rate": 4.998512966830951e-05, "loss": 0.8544, "step": 673 }, { "epoch": 4.186915887850467, "grad_norm": 5.5625, "learning_rate": 4.99850445820803e-05, "loss": 0.6227, "step": 674 }, { "epoch": 4.193146417445483, "grad_norm": 4.65625, "learning_rate": 4.998495925319134e-05, "loss": 0.4555, "step": 675 }, { "epoch": 4.1993769470404985, "grad_norm": 5.5625, "learning_rate": 4.998487368164346e-05, "loss": 0.6757, "step": 676 }, { "epoch": 4.205607476635514, "grad_norm": 4.90625, "learning_rate": 4.998478786743749e-05, "loss": 0.6447, "step": 677 }, { "epoch": 4.211838006230529, "grad_norm": 4.75, "learning_rate": 4.998470181057426e-05, "loss": 0.613, "step": 678 }, { "epoch": 4.218068535825545, "grad_norm": 5.0625, "learning_rate": 4.998461551105462e-05, "loss": 0.6352, "step": 679 }, { "epoch": 4.224299065420561, "grad_norm": 5.46875, "learning_rate": 4.9984528968879394e-05, "loss": 0.7015, "step": 680 }, { "epoch": 4.230529595015576, "grad_norm": 6.53125, "learning_rate": 4.998444218404944e-05, "loss": 0.7203, "step": 681 }, { "epoch": 4.2367601246105915, "grad_norm": 4.8125, "learning_rate": 4.9984355156565576e-05, "loss": 0.6342, "step": 682 }, { "epoch": 4.242990654205608, "grad_norm": 5.0625, "learning_rate": 4.998426788642867e-05, "loss": 0.6383, "step": 683 }, { "epoch": 4.249221183800623, "grad_norm": 5.4375, "learning_rate": 4.9984180373639555e-05, "loss": 0.5606, "step": 684 }, { "epoch": 4.255451713395638, "grad_norm": 5.0, "learning_rate": 4.998409261819909e-05, "loss": 0.5101, "step": 685 }, { "epoch": 4.261682242990654, "grad_norm": 4.21875, "learning_rate": 4.998400462010812e-05, "loss": 0.5743, "step": 686 }, { "epoch": 4.26791277258567, "grad_norm": 5.28125, "learning_rate": 4.99839163793675e-05, "loss": 0.6679, "step": 687 }, { "epoch": 4.274143302180685, "grad_norm": 5.46875, "learning_rate": 4.998382789597809e-05, "loss": 0.5584, "step": 688 }, { "epoch": 4.280373831775701, "grad_norm": 5.125, "learning_rate": 4.9983739169940756e-05, "loss": 0.5341, "step": 689 }, { "epoch": 4.286604361370716, "grad_norm": 4.90625, "learning_rate": 4.998365020125635e-05, "loss": 0.5683, "step": 690 }, { "epoch": 4.292834890965732, "grad_norm": 5.21875, "learning_rate": 4.9983560989925736e-05, "loss": 0.4846, "step": 691 }, { "epoch": 4.299065420560748, "grad_norm": 5.125, "learning_rate": 4.998347153594979e-05, "loss": 0.6068, "step": 692 }, { "epoch": 4.305295950155763, "grad_norm": 4.4375, "learning_rate": 4.998338183932937e-05, "loss": 0.4914, "step": 693 }, { "epoch": 4.311526479750778, "grad_norm": 5.5, "learning_rate": 4.998329190006535e-05, "loss": 0.7465, "step": 694 }, { "epoch": 4.317757009345795, "grad_norm": 5.1875, "learning_rate": 4.99832017181586e-05, "loss": 0.7215, "step": 695 }, { "epoch": 4.32398753894081, "grad_norm": 4.28125, "learning_rate": 4.9983111293610015e-05, "loss": 0.6023, "step": 696 }, { "epoch": 4.330218068535825, "grad_norm": 5.375, "learning_rate": 4.998302062642045e-05, "loss": 0.7708, "step": 697 }, { "epoch": 4.336448598130841, "grad_norm": 5.375, "learning_rate": 4.99829297165908e-05, "loss": 0.5709, "step": 698 }, { "epoch": 4.342679127725857, "grad_norm": 4.0625, "learning_rate": 4.998283856412194e-05, "loss": 0.4831, "step": 699 }, { "epoch": 4.348909657320872, "grad_norm": 4.28125, "learning_rate": 4.998274716901475e-05, "loss": 0.5089, "step": 700 }, { "epoch": 4.355140186915888, "grad_norm": 3.90625, "learning_rate": 4.998265553127013e-05, "loss": 0.5293, "step": 701 }, { "epoch": 4.361370716510903, "grad_norm": 8.4375, "learning_rate": 4.998256365088897e-05, "loss": 0.6808, "step": 702 }, { "epoch": 4.367601246105919, "grad_norm": 7.125, "learning_rate": 4.998247152787216e-05, "loss": 0.4541, "step": 703 }, { "epoch": 4.373831775700935, "grad_norm": 6.1875, "learning_rate": 4.9982379162220586e-05, "loss": 0.6551, "step": 704 }, { "epoch": 4.38006230529595, "grad_norm": 7.125, "learning_rate": 4.998228655393515e-05, "loss": 0.8244, "step": 705 }, { "epoch": 4.386292834890965, "grad_norm": 7.25, "learning_rate": 4.9982193703016765e-05, "loss": 0.5893, "step": 706 }, { "epoch": 4.392523364485982, "grad_norm": 7.875, "learning_rate": 4.998210060946631e-05, "loss": 0.4978, "step": 707 }, { "epoch": 4.398753894080997, "grad_norm": 4.46875, "learning_rate": 4.99820072732847e-05, "loss": 0.6438, "step": 708 }, { "epoch": 4.404984423676012, "grad_norm": 9.125, "learning_rate": 4.9981913694472846e-05, "loss": 0.5922, "step": 709 }, { "epoch": 4.411214953271028, "grad_norm": 6.96875, "learning_rate": 4.9981819873031656e-05, "loss": 0.6971, "step": 710 }, { "epoch": 4.417445482866044, "grad_norm": 6.65625, "learning_rate": 4.998172580896203e-05, "loss": 0.5714, "step": 711 }, { "epoch": 4.423676012461059, "grad_norm": 7.3125, "learning_rate": 4.998163150226489e-05, "loss": 0.6378, "step": 712 }, { "epoch": 4.429906542056075, "grad_norm": 4.78125, "learning_rate": 4.998153695294116e-05, "loss": 0.4493, "step": 713 }, { "epoch": 4.43613707165109, "grad_norm": 4.1875, "learning_rate": 4.998144216099174e-05, "loss": 0.3921, "step": 714 }, { "epoch": 4.442367601246106, "grad_norm": 4.625, "learning_rate": 4.9981347126417564e-05, "loss": 0.5869, "step": 715 }, { "epoch": 4.4485981308411215, "grad_norm": 4.96875, "learning_rate": 4.998125184921955e-05, "loss": 0.4429, "step": 716 }, { "epoch": 4.454828660436137, "grad_norm": 5.03125, "learning_rate": 4.9981156329398626e-05, "loss": 0.5401, "step": 717 }, { "epoch": 4.461059190031152, "grad_norm": 5.46875, "learning_rate": 4.9981060566955715e-05, "loss": 0.6392, "step": 718 }, { "epoch": 4.4672897196261685, "grad_norm": 6.15625, "learning_rate": 4.998096456189175e-05, "loss": 0.4644, "step": 719 }, { "epoch": 4.473520249221184, "grad_norm": 4.375, "learning_rate": 4.9980868314207665e-05, "loss": 0.5941, "step": 720 }, { "epoch": 4.479750778816199, "grad_norm": 4.90625, "learning_rate": 4.9980771823904396e-05, "loss": 0.637, "step": 721 }, { "epoch": 4.485981308411215, "grad_norm": 4.375, "learning_rate": 4.998067509098288e-05, "loss": 0.597, "step": 722 }, { "epoch": 4.492211838006231, "grad_norm": 5.15625, "learning_rate": 4.9980578115444045e-05, "loss": 0.8217, "step": 723 }, { "epoch": 4.498442367601246, "grad_norm": 5.1875, "learning_rate": 4.998048089728884e-05, "loss": 0.8387, "step": 724 }, { "epoch": 4.5046728971962615, "grad_norm": 6.625, "learning_rate": 4.998038343651822e-05, "loss": 0.6024, "step": 725 }, { "epoch": 4.510903426791277, "grad_norm": 4.0, "learning_rate": 4.998028573313312e-05, "loss": 0.5134, "step": 726 }, { "epoch": 4.517133956386293, "grad_norm": 5.34375, "learning_rate": 4.9980187787134495e-05, "loss": 0.6402, "step": 727 }, { "epoch": 4.5233644859813085, "grad_norm": 6.5625, "learning_rate": 4.998008959852328e-05, "loss": 0.8354, "step": 728 }, { "epoch": 4.529595015576324, "grad_norm": 5.75, "learning_rate": 4.9979991167300453e-05, "loss": 0.7811, "step": 729 }, { "epoch": 4.535825545171339, "grad_norm": 4.46875, "learning_rate": 4.997989249346695e-05, "loss": 0.4961, "step": 730 }, { "epoch": 4.542056074766355, "grad_norm": 4.1875, "learning_rate": 4.9979793577023746e-05, "loss": 0.3693, "step": 731 }, { "epoch": 4.548286604361371, "grad_norm": 4.09375, "learning_rate": 4.997969441797179e-05, "loss": 0.4789, "step": 732 }, { "epoch": 4.554517133956386, "grad_norm": 4.03125, "learning_rate": 4.997959501631205e-05, "loss": 0.5738, "step": 733 }, { "epoch": 4.5607476635514015, "grad_norm": 3.796875, "learning_rate": 4.997949537204549e-05, "loss": 0.5126, "step": 734 }, { "epoch": 4.566978193146418, "grad_norm": 4.0, "learning_rate": 4.997939548517308e-05, "loss": 0.4648, "step": 735 }, { "epoch": 4.573208722741433, "grad_norm": 3.453125, "learning_rate": 4.9979295355695776e-05, "loss": 0.5322, "step": 736 }, { "epoch": 4.579439252336448, "grad_norm": 4.96875, "learning_rate": 4.997919498361457e-05, "loss": 0.554, "step": 737 }, { "epoch": 4.585669781931464, "grad_norm": 3.59375, "learning_rate": 4.997909436893043e-05, "loss": 0.6386, "step": 738 }, { "epoch": 4.59190031152648, "grad_norm": 4.28125, "learning_rate": 4.9978993511644334e-05, "loss": 0.5965, "step": 739 }, { "epoch": 4.598130841121495, "grad_norm": 3.890625, "learning_rate": 4.997889241175726e-05, "loss": 0.416, "step": 740 }, { "epoch": 4.604361370716511, "grad_norm": 4.6875, "learning_rate": 4.9978791069270184e-05, "loss": 0.3923, "step": 741 }, { "epoch": 4.610591900311526, "grad_norm": 4.96875, "learning_rate": 4.9978689484184104e-05, "loss": 0.5956, "step": 742 }, { "epoch": 4.616822429906542, "grad_norm": 4.09375, "learning_rate": 4.9978587656499995e-05, "loss": 0.4647, "step": 743 }, { "epoch": 4.623052959501558, "grad_norm": 3.40625, "learning_rate": 4.9978485586218846e-05, "loss": 0.4451, "step": 744 }, { "epoch": 4.629283489096573, "grad_norm": 4.21875, "learning_rate": 4.9978383273341655e-05, "loss": 0.5009, "step": 745 }, { "epoch": 4.635514018691588, "grad_norm": 4.625, "learning_rate": 4.997828071786941e-05, "loss": 0.5868, "step": 746 }, { "epoch": 4.641744548286605, "grad_norm": 5.125, "learning_rate": 4.997817791980311e-05, "loss": 0.7593, "step": 747 }, { "epoch": 4.64797507788162, "grad_norm": 6.625, "learning_rate": 4.9978074879143754e-05, "loss": 0.5129, "step": 748 }, { "epoch": 4.654205607476635, "grad_norm": 4.53125, "learning_rate": 4.997797159589235e-05, "loss": 0.4364, "step": 749 }, { "epoch": 4.660436137071651, "grad_norm": 5.53125, "learning_rate": 4.997786807004988e-05, "loss": 0.5048, "step": 750 }, { "epoch": 4.666666666666667, "grad_norm": 7.40625, "learning_rate": 4.997776430161737e-05, "loss": 0.7256, "step": 751 }, { "epoch": 4.672897196261682, "grad_norm": 3.78125, "learning_rate": 4.997766029059582e-05, "loss": 0.4477, "step": 752 }, { "epoch": 4.679127725856698, "grad_norm": 4.3125, "learning_rate": 4.9977556036986236e-05, "loss": 0.4604, "step": 753 }, { "epoch": 4.685358255451713, "grad_norm": 5.96875, "learning_rate": 4.9977451540789635e-05, "loss": 0.6264, "step": 754 }, { "epoch": 4.691588785046729, "grad_norm": 3.421875, "learning_rate": 4.997734680200703e-05, "loss": 0.5397, "step": 755 }, { "epoch": 4.697819314641745, "grad_norm": 5.0, "learning_rate": 4.997724182063945e-05, "loss": 0.4754, "step": 756 }, { "epoch": 4.70404984423676, "grad_norm": 4.1875, "learning_rate": 4.99771365966879e-05, "loss": 0.542, "step": 757 }, { "epoch": 4.710280373831775, "grad_norm": 6.4375, "learning_rate": 4.997703113015339e-05, "loss": 0.4681, "step": 758 }, { "epoch": 4.716510903426792, "grad_norm": 3.703125, "learning_rate": 4.9976925421036986e-05, "loss": 0.5615, "step": 759 }, { "epoch": 4.722741433021807, "grad_norm": 6.53125, "learning_rate": 4.997681946933967e-05, "loss": 0.687, "step": 760 }, { "epoch": 4.728971962616822, "grad_norm": 5.0625, "learning_rate": 4.9976713275062506e-05, "loss": 0.5605, "step": 761 }, { "epoch": 4.735202492211838, "grad_norm": 4.65625, "learning_rate": 4.9976606838206506e-05, "loss": 0.6131, "step": 762 }, { "epoch": 4.741433021806854, "grad_norm": 4.78125, "learning_rate": 4.9976500158772706e-05, "loss": 0.5277, "step": 763 }, { "epoch": 4.747663551401869, "grad_norm": 5.03125, "learning_rate": 4.997639323676214e-05, "loss": 0.4088, "step": 764 }, { "epoch": 4.753894080996885, "grad_norm": 4.59375, "learning_rate": 4.997628607217585e-05, "loss": 0.6742, "step": 765 }, { "epoch": 4.7601246105919, "grad_norm": 5.90625, "learning_rate": 4.997617866501488e-05, "loss": 0.6446, "step": 766 }, { "epoch": 4.766355140186916, "grad_norm": 4.4375, "learning_rate": 4.9976071015280276e-05, "loss": 0.392, "step": 767 }, { "epoch": 4.7725856697819315, "grad_norm": 4.4375, "learning_rate": 4.9975963122973066e-05, "loss": 0.6358, "step": 768 }, { "epoch": 4.778816199376947, "grad_norm": 3.828125, "learning_rate": 4.9975854988094316e-05, "loss": 0.5524, "step": 769 }, { "epoch": 4.785046728971962, "grad_norm": 5.125, "learning_rate": 4.9975746610645075e-05, "loss": 0.423, "step": 770 }, { "epoch": 4.7912772585669785, "grad_norm": 4.375, "learning_rate": 4.997563799062638e-05, "loss": 0.4124, "step": 771 }, { "epoch": 4.797507788161994, "grad_norm": 4.9375, "learning_rate": 4.9975529128039305e-05, "loss": 0.5648, "step": 772 }, { "epoch": 4.803738317757009, "grad_norm": 6.59375, "learning_rate": 4.9975420022884896e-05, "loss": 0.4473, "step": 773 }, { "epoch": 4.809968847352025, "grad_norm": 4.375, "learning_rate": 4.997531067516421e-05, "loss": 0.548, "step": 774 }, { "epoch": 4.816199376947041, "grad_norm": 4.125, "learning_rate": 4.997520108487832e-05, "loss": 0.3893, "step": 775 }, { "epoch": 4.822429906542056, "grad_norm": 3.78125, "learning_rate": 4.997509125202828e-05, "loss": 0.5388, "step": 776 }, { "epoch": 4.8286604361370715, "grad_norm": 4.75, "learning_rate": 4.997498117661517e-05, "loss": 0.5643, "step": 777 }, { "epoch": 4.834890965732087, "grad_norm": 6.15625, "learning_rate": 4.997487085864004e-05, "loss": 0.6649, "step": 778 }, { "epoch": 4.841121495327103, "grad_norm": 4.46875, "learning_rate": 4.997476029810398e-05, "loss": 0.5124, "step": 779 }, { "epoch": 4.8473520249221185, "grad_norm": 4.46875, "learning_rate": 4.9974649495008055e-05, "loss": 0.6564, "step": 780 }, { "epoch": 4.853582554517134, "grad_norm": 3.21875, "learning_rate": 4.9974538449353335e-05, "loss": 0.4781, "step": 781 }, { "epoch": 4.859813084112149, "grad_norm": 7.4375, "learning_rate": 4.997442716114091e-05, "loss": 0.447, "step": 782 }, { "epoch": 4.866043613707165, "grad_norm": 6.65625, "learning_rate": 4.9974315630371855e-05, "loss": 0.3421, "step": 783 }, { "epoch": 4.872274143302181, "grad_norm": 4.84375, "learning_rate": 4.997420385704725e-05, "loss": 0.4854, "step": 784 }, { "epoch": 4.878504672897196, "grad_norm": 4.6875, "learning_rate": 4.9974091841168195e-05, "loss": 0.4635, "step": 785 }, { "epoch": 4.8847352024922115, "grad_norm": 6.5, "learning_rate": 4.997397958273576e-05, "loss": 0.5343, "step": 786 }, { "epoch": 4.890965732087228, "grad_norm": 5.125, "learning_rate": 4.997386708175105e-05, "loss": 0.4888, "step": 787 }, { "epoch": 4.897196261682243, "grad_norm": 3.484375, "learning_rate": 4.997375433821514e-05, "loss": 0.3721, "step": 788 }, { "epoch": 4.9034267912772584, "grad_norm": 4.3125, "learning_rate": 4.9973641352129144e-05, "loss": 0.4948, "step": 789 }, { "epoch": 4.909657320872274, "grad_norm": 9.4375, "learning_rate": 4.9973528123494146e-05, "loss": 0.485, "step": 790 }, { "epoch": 4.91588785046729, "grad_norm": 8.0625, "learning_rate": 4.997341465231126e-05, "loss": 0.5562, "step": 791 }, { "epoch": 4.922118380062305, "grad_norm": 5.15625, "learning_rate": 4.997330093858157e-05, "loss": 0.5911, "step": 792 }, { "epoch": 4.928348909657321, "grad_norm": 5.28125, "learning_rate": 4.997318698230619e-05, "loss": 0.5905, "step": 793 }, { "epoch": 4.934579439252336, "grad_norm": 6.5625, "learning_rate": 4.9973072783486226e-05, "loss": 0.5666, "step": 794 }, { "epoch": 4.940809968847352, "grad_norm": 5.34375, "learning_rate": 4.997295834212279e-05, "loss": 0.5233, "step": 795 }, { "epoch": 4.947040498442368, "grad_norm": 3.28125, "learning_rate": 4.997284365821699e-05, "loss": 0.5307, "step": 796 }, { "epoch": 4.953271028037383, "grad_norm": 4.78125, "learning_rate": 4.997272873176994e-05, "loss": 0.6502, "step": 797 }, { "epoch": 4.959501557632398, "grad_norm": 4.34375, "learning_rate": 4.9972613562782756e-05, "loss": 0.4848, "step": 798 }, { "epoch": 4.965732087227415, "grad_norm": 3.796875, "learning_rate": 4.9972498151256555e-05, "loss": 0.4976, "step": 799 }, { "epoch": 4.97196261682243, "grad_norm": 4.8125, "learning_rate": 4.9972382497192464e-05, "loss": 0.7667, "step": 800 }, { "epoch": 4.978193146417445, "grad_norm": 5.3125, "learning_rate": 4.99722666005916e-05, "loss": 0.4827, "step": 801 }, { "epoch": 4.984423676012461, "grad_norm": 5.15625, "learning_rate": 4.997215046145509e-05, "loss": 0.4081, "step": 802 }, { "epoch": 4.990654205607477, "grad_norm": 4.71875, "learning_rate": 4.997203407978407e-05, "loss": 0.4031, "step": 803 }, { "epoch": 4.996884735202492, "grad_norm": 6.75, "learning_rate": 4.9971917455579654e-05, "loss": 0.3994, "step": 804 }, { "epoch": 5.0, "grad_norm": 2.9375, "learning_rate": 4.997180058884299e-05, "loss": 0.29, "step": 805 }, { "epoch": 5.006230529595015, "grad_norm": 5.625, "learning_rate": 4.99716834795752e-05, "loss": 0.5152, "step": 806 }, { "epoch": 5.012461059190031, "grad_norm": 5.53125, "learning_rate": 4.9971566127777436e-05, "loss": 0.5324, "step": 807 }, { "epoch": 5.018691588785047, "grad_norm": 3.15625, "learning_rate": 4.9971448533450825e-05, "loss": 0.4167, "step": 808 }, { "epoch": 5.024922118380062, "grad_norm": 4.21875, "learning_rate": 4.997133069659652e-05, "loss": 0.3634, "step": 809 }, { "epoch": 5.031152647975078, "grad_norm": 5.09375, "learning_rate": 4.997121261721565e-05, "loss": 0.5812, "step": 810 }, { "epoch": 5.037383177570093, "grad_norm": 4.21875, "learning_rate": 4.997109429530938e-05, "loss": 0.5952, "step": 811 }, { "epoch": 5.043613707165109, "grad_norm": 4.34375, "learning_rate": 4.997097573087884e-05, "loss": 0.5939, "step": 812 }, { "epoch": 5.049844236760125, "grad_norm": 4.8125, "learning_rate": 4.997085692392521e-05, "loss": 0.6911, "step": 813 }, { "epoch": 5.05607476635514, "grad_norm": 4.96875, "learning_rate": 4.9970737874449615e-05, "loss": 0.5382, "step": 814 }, { "epoch": 5.062305295950155, "grad_norm": 4.90625, "learning_rate": 4.997061858245322e-05, "loss": 0.4517, "step": 815 }, { "epoch": 5.068535825545172, "grad_norm": 5.1875, "learning_rate": 4.997049904793719e-05, "loss": 0.7311, "step": 816 }, { "epoch": 5.074766355140187, "grad_norm": 6.59375, "learning_rate": 4.997037927090268e-05, "loss": 0.5173, "step": 817 }, { "epoch": 5.080996884735202, "grad_norm": 5.40625, "learning_rate": 4.997025925135086e-05, "loss": 0.4499, "step": 818 }, { "epoch": 5.087227414330218, "grad_norm": 5.375, "learning_rate": 4.997013898928288e-05, "loss": 0.5604, "step": 819 }, { "epoch": 5.093457943925234, "grad_norm": 4.71875, "learning_rate": 4.997001848469993e-05, "loss": 0.6147, "step": 820 }, { "epoch": 5.099688473520249, "grad_norm": 4.34375, "learning_rate": 4.996989773760316e-05, "loss": 0.4573, "step": 821 }, { "epoch": 5.105919003115265, "grad_norm": 4.6875, "learning_rate": 4.996977674799376e-05, "loss": 0.538, "step": 822 }, { "epoch": 5.11214953271028, "grad_norm": 3.359375, "learning_rate": 4.9969655515872895e-05, "loss": 0.4125, "step": 823 }, { "epoch": 5.118380062305296, "grad_norm": 4.25, "learning_rate": 4.996953404124174e-05, "loss": 0.5349, "step": 824 }, { "epoch": 5.1246105919003115, "grad_norm": 4.46875, "learning_rate": 4.996941232410148e-05, "loss": 0.5448, "step": 825 }, { "epoch": 5.130841121495327, "grad_norm": 5.21875, "learning_rate": 4.996929036445329e-05, "loss": 0.6183, "step": 826 }, { "epoch": 5.137071651090342, "grad_norm": 4.03125, "learning_rate": 4.996916816229837e-05, "loss": 0.5212, "step": 827 }, { "epoch": 5.1433021806853585, "grad_norm": 6.0, "learning_rate": 4.9969045717637896e-05, "loss": 0.5104, "step": 828 }, { "epoch": 5.149532710280374, "grad_norm": 4.59375, "learning_rate": 4.996892303047306e-05, "loss": 0.455, "step": 829 }, { "epoch": 5.155763239875389, "grad_norm": 4.3125, "learning_rate": 4.9968800100805054e-05, "loss": 0.5589, "step": 830 }, { "epoch": 5.161993769470405, "grad_norm": 3.859375, "learning_rate": 4.996867692863507e-05, "loss": 0.4931, "step": 831 }, { "epoch": 5.168224299065421, "grad_norm": 5.34375, "learning_rate": 4.99685535139643e-05, "loss": 0.8132, "step": 832 }, { "epoch": 5.174454828660436, "grad_norm": 4.40625, "learning_rate": 4.9968429856793945e-05, "loss": 0.3563, "step": 833 }, { "epoch": 5.1806853582554515, "grad_norm": 4.25, "learning_rate": 4.9968305957125215e-05, "loss": 0.3753, "step": 834 }, { "epoch": 5.186915887850467, "grad_norm": 3.515625, "learning_rate": 4.99681818149593e-05, "loss": 0.5692, "step": 835 }, { "epoch": 5.193146417445483, "grad_norm": 6.75, "learning_rate": 4.9968057430297415e-05, "loss": 0.4894, "step": 836 }, { "epoch": 5.1993769470404985, "grad_norm": 6.53125, "learning_rate": 4.9967932803140764e-05, "loss": 0.4608, "step": 837 }, { "epoch": 5.205607476635514, "grad_norm": 4.75, "learning_rate": 4.996780793349056e-05, "loss": 0.412, "step": 838 }, { "epoch": 5.211838006230529, "grad_norm": 4.1875, "learning_rate": 4.996768282134802e-05, "loss": 0.6614, "step": 839 }, { "epoch": 5.218068535825545, "grad_norm": 4.9375, "learning_rate": 4.996755746671434e-05, "loss": 0.3526, "step": 840 }, { "epoch": 5.224299065420561, "grad_norm": 5.6875, "learning_rate": 4.9967431869590766e-05, "loss": 0.5028, "step": 841 }, { "epoch": 5.230529595015576, "grad_norm": 6.03125, "learning_rate": 4.9967306029978497e-05, "loss": 0.4771, "step": 842 }, { "epoch": 5.2367601246105915, "grad_norm": 5.34375, "learning_rate": 4.996717994787876e-05, "loss": 0.5306, "step": 843 }, { "epoch": 5.242990654205608, "grad_norm": 5.75, "learning_rate": 4.996705362329278e-05, "loss": 0.6897, "step": 844 }, { "epoch": 5.249221183800623, "grad_norm": 4.875, "learning_rate": 4.996692705622178e-05, "loss": 0.4081, "step": 845 }, { "epoch": 5.255451713395638, "grad_norm": 5.6875, "learning_rate": 4.9966800246667e-05, "loss": 0.5947, "step": 846 }, { "epoch": 5.261682242990654, "grad_norm": 4.0, "learning_rate": 4.996667319462966e-05, "loss": 0.4354, "step": 847 }, { "epoch": 5.26791277258567, "grad_norm": 4.9375, "learning_rate": 4.9966545900111006e-05, "loss": 0.5428, "step": 848 }, { "epoch": 5.274143302180685, "grad_norm": 5.40625, "learning_rate": 4.996641836311226e-05, "loss": 0.5329, "step": 849 }, { "epoch": 5.280373831775701, "grad_norm": 3.953125, "learning_rate": 4.996629058363468e-05, "loss": 0.4647, "step": 850 }, { "epoch": 5.286604361370716, "grad_norm": 3.046875, "learning_rate": 4.9966162561679486e-05, "loss": 0.4125, "step": 851 }, { "epoch": 5.292834890965732, "grad_norm": 4.5625, "learning_rate": 4.996603429724793e-05, "loss": 0.5113, "step": 852 }, { "epoch": 5.299065420560748, "grad_norm": 4.90625, "learning_rate": 4.996590579034126e-05, "loss": 0.3235, "step": 853 }, { "epoch": 5.305295950155763, "grad_norm": 3.609375, "learning_rate": 4.996577704096072e-05, "loss": 0.3765, "step": 854 }, { "epoch": 5.311526479750778, "grad_norm": 4.90625, "learning_rate": 4.9965648049107575e-05, "loss": 0.569, "step": 855 }, { "epoch": 5.317757009345795, "grad_norm": 4.3125, "learning_rate": 4.996551881478305e-05, "loss": 0.5853, "step": 856 }, { "epoch": 5.32398753894081, "grad_norm": 4.9375, "learning_rate": 4.996538933798843e-05, "loss": 0.5972, "step": 857 }, { "epoch": 5.330218068535825, "grad_norm": 6.34375, "learning_rate": 4.996525961872495e-05, "loss": 0.8639, "step": 858 }, { "epoch": 5.336448598130841, "grad_norm": 6.0625, "learning_rate": 4.9965129656993875e-05, "loss": 0.5853, "step": 859 }, { "epoch": 5.342679127725857, "grad_norm": 5.375, "learning_rate": 4.996499945279648e-05, "loss": 0.3496, "step": 860 }, { "epoch": 5.348909657320872, "grad_norm": 6.28125, "learning_rate": 4.9964869006134004e-05, "loss": 0.6613, "step": 861 }, { "epoch": 5.355140186915888, "grad_norm": 4.53125, "learning_rate": 4.996473831700774e-05, "loss": 0.4269, "step": 862 }, { "epoch": 5.361370716510903, "grad_norm": 7.28125, "learning_rate": 4.996460738541895e-05, "loss": 0.789, "step": 863 }, { "epoch": 5.367601246105919, "grad_norm": 3.015625, "learning_rate": 4.996447621136891e-05, "loss": 0.4311, "step": 864 }, { "epoch": 5.373831775700935, "grad_norm": 4.3125, "learning_rate": 4.996434479485886e-05, "loss": 0.4077, "step": 865 }, { "epoch": 5.38006230529595, "grad_norm": 6.21875, "learning_rate": 4.996421313589013e-05, "loss": 0.4448, "step": 866 }, { "epoch": 5.386292834890965, "grad_norm": 8.4375, "learning_rate": 4.996408123446396e-05, "loss": 0.5301, "step": 867 }, { "epoch": 5.392523364485982, "grad_norm": 4.0625, "learning_rate": 4.996394909058164e-05, "loss": 0.4028, "step": 868 }, { "epoch": 5.398753894080997, "grad_norm": 6.5, "learning_rate": 4.996381670424446e-05, "loss": 0.6803, "step": 869 }, { "epoch": 5.404984423676012, "grad_norm": 10.8125, "learning_rate": 4.99636840754537e-05, "loss": 0.4817, "step": 870 }, { "epoch": 5.411214953271028, "grad_norm": 4.21875, "learning_rate": 4.9963551204210655e-05, "loss": 0.3013, "step": 871 }, { "epoch": 5.417445482866044, "grad_norm": 3.515625, "learning_rate": 4.996341809051661e-05, "loss": 0.3417, "step": 872 }, { "epoch": 5.423676012461059, "grad_norm": 10.0, "learning_rate": 4.9963284734372855e-05, "loss": 0.6224, "step": 873 }, { "epoch": 5.429906542056075, "grad_norm": 10.75, "learning_rate": 4.996315113578069e-05, "loss": 0.4565, "step": 874 }, { "epoch": 5.43613707165109, "grad_norm": 9.5625, "learning_rate": 4.9963017294741407e-05, "loss": 0.5562, "step": 875 }, { "epoch": 5.442367601246106, "grad_norm": 4.53125, "learning_rate": 4.9962883211256315e-05, "loss": 0.398, "step": 876 }, { "epoch": 5.4485981308411215, "grad_norm": 6.625, "learning_rate": 4.996274888532672e-05, "loss": 0.5051, "step": 877 }, { "epoch": 5.454828660436137, "grad_norm": 9.875, "learning_rate": 4.99626143169539e-05, "loss": 0.5218, "step": 878 }, { "epoch": 5.461059190031152, "grad_norm": 7.96875, "learning_rate": 4.9962479506139195e-05, "loss": 0.4092, "step": 879 }, { "epoch": 5.4672897196261685, "grad_norm": 3.578125, "learning_rate": 4.996234445288389e-05, "loss": 0.5676, "step": 880 }, { "epoch": 5.473520249221184, "grad_norm": 5.125, "learning_rate": 4.996220915718931e-05, "loss": 0.3629, "step": 881 }, { "epoch": 5.479750778816199, "grad_norm": 5.40625, "learning_rate": 4.996207361905676e-05, "loss": 0.5503, "step": 882 }, { "epoch": 5.485981308411215, "grad_norm": 7.875, "learning_rate": 4.996193783848756e-05, "loss": 0.5268, "step": 883 }, { "epoch": 5.492211838006231, "grad_norm": 3.921875, "learning_rate": 4.996180181548303e-05, "loss": 0.4549, "step": 884 }, { "epoch": 5.498442367601246, "grad_norm": 6.625, "learning_rate": 4.99616655500445e-05, "loss": 0.3825, "step": 885 }, { "epoch": 5.5046728971962615, "grad_norm": 6.71875, "learning_rate": 4.996152904217328e-05, "loss": 0.4499, "step": 886 }, { "epoch": 5.510903426791277, "grad_norm": 4.3125, "learning_rate": 4.99613922918707e-05, "loss": 0.4498, "step": 887 }, { "epoch": 5.517133956386293, "grad_norm": 3.953125, "learning_rate": 4.996125529913809e-05, "loss": 0.4688, "step": 888 }, { "epoch": 5.5233644859813085, "grad_norm": 3.203125, "learning_rate": 4.996111806397677e-05, "loss": 0.4939, "step": 889 }, { "epoch": 5.529595015576324, "grad_norm": 5.65625, "learning_rate": 4.996098058638809e-05, "loss": 0.6239, "step": 890 }, { "epoch": 5.535825545171339, "grad_norm": 3.125, "learning_rate": 4.996084286637338e-05, "loss": 0.5127, "step": 891 }, { "epoch": 5.542056074766355, "grad_norm": 4.375, "learning_rate": 4.996070490393397e-05, "loss": 0.601, "step": 892 }, { "epoch": 5.548286604361371, "grad_norm": 2.796875, "learning_rate": 4.9960566699071204e-05, "loss": 0.3069, "step": 893 }, { "epoch": 5.554517133956386, "grad_norm": 3.484375, "learning_rate": 4.9960428251786426e-05, "loss": 0.4631, "step": 894 }, { "epoch": 5.5607476635514015, "grad_norm": 4.53125, "learning_rate": 4.9960289562080974e-05, "loss": 0.4522, "step": 895 }, { "epoch": 5.566978193146418, "grad_norm": 4.40625, "learning_rate": 4.996015062995621e-05, "loss": 0.3931, "step": 896 }, { "epoch": 5.573208722741433, "grad_norm": 3.703125, "learning_rate": 4.996001145541347e-05, "loss": 0.5091, "step": 897 }, { "epoch": 5.579439252336448, "grad_norm": 3.34375, "learning_rate": 4.99598720384541e-05, "loss": 0.4879, "step": 898 }, { "epoch": 5.585669781931464, "grad_norm": 6.21875, "learning_rate": 4.9959732379079475e-05, "loss": 0.6151, "step": 899 }, { "epoch": 5.59190031152648, "grad_norm": 7.34375, "learning_rate": 4.9959592477290936e-05, "loss": 0.4314, "step": 900 }, { "epoch": 5.598130841121495, "grad_norm": 4.65625, "learning_rate": 4.995945233308985e-05, "loss": 0.5442, "step": 901 }, { "epoch": 5.604361370716511, "grad_norm": 4.25, "learning_rate": 4.995931194647757e-05, "loss": 0.4462, "step": 902 }, { "epoch": 5.610591900311526, "grad_norm": 6.25, "learning_rate": 4.9959171317455455e-05, "loss": 0.6712, "step": 903 }, { "epoch": 5.616822429906542, "grad_norm": 5.03125, "learning_rate": 4.995903044602489e-05, "loss": 0.3918, "step": 904 }, { "epoch": 5.623052959501558, "grad_norm": 3.96875, "learning_rate": 4.995888933218723e-05, "loss": 0.4826, "step": 905 }, { "epoch": 5.629283489096573, "grad_norm": 4.96875, "learning_rate": 4.995874797594384e-05, "loss": 0.4262, "step": 906 }, { "epoch": 5.635514018691588, "grad_norm": 5.03125, "learning_rate": 4.995860637729611e-05, "loss": 0.5213, "step": 907 }, { "epoch": 5.641744548286605, "grad_norm": 3.453125, "learning_rate": 4.9958464536245406e-05, "loss": 0.3488, "step": 908 }, { "epoch": 5.64797507788162, "grad_norm": 4.40625, "learning_rate": 4.9958322452793096e-05, "loss": 0.3977, "step": 909 }, { "epoch": 5.654205607476635, "grad_norm": 3.828125, "learning_rate": 4.9958180126940577e-05, "loss": 0.4696, "step": 910 }, { "epoch": 5.660436137071651, "grad_norm": 5.71875, "learning_rate": 4.9958037558689216e-05, "loss": 0.6562, "step": 911 }, { "epoch": 5.666666666666667, "grad_norm": 4.1875, "learning_rate": 4.995789474804042e-05, "loss": 0.4133, "step": 912 }, { "epoch": 5.672897196261682, "grad_norm": 4.34375, "learning_rate": 4.9957751694995545e-05, "loss": 0.5714, "step": 913 }, { "epoch": 5.679127725856698, "grad_norm": 3.625, "learning_rate": 4.995760839955601e-05, "loss": 0.4018, "step": 914 }, { "epoch": 5.685358255451713, "grad_norm": 4.125, "learning_rate": 4.995746486172318e-05, "loss": 0.4565, "step": 915 }, { "epoch": 5.691588785046729, "grad_norm": 5.375, "learning_rate": 4.9957321081498475e-05, "loss": 0.3611, "step": 916 }, { "epoch": 5.697819314641745, "grad_norm": 4.875, "learning_rate": 4.995717705888327e-05, "loss": 0.3855, "step": 917 }, { "epoch": 5.70404984423676, "grad_norm": 3.828125, "learning_rate": 4.995703279387898e-05, "loss": 0.68, "step": 918 }, { "epoch": 5.710280373831775, "grad_norm": 4.15625, "learning_rate": 4.995688828648699e-05, "loss": 0.7096, "step": 919 }, { "epoch": 5.716510903426792, "grad_norm": 4.09375, "learning_rate": 4.995674353670872e-05, "loss": 0.4275, "step": 920 }, { "epoch": 5.722741433021807, "grad_norm": 4.625, "learning_rate": 4.9956598544545566e-05, "loss": 0.5967, "step": 921 }, { "epoch": 5.728971962616822, "grad_norm": 4.09375, "learning_rate": 4.995645330999893e-05, "loss": 0.3085, "step": 922 }, { "epoch": 5.735202492211838, "grad_norm": 3.703125, "learning_rate": 4.995630783307024e-05, "loss": 0.5602, "step": 923 }, { "epoch": 5.741433021806854, "grad_norm": 3.953125, "learning_rate": 4.995616211376089e-05, "loss": 0.5846, "step": 924 }, { "epoch": 5.747663551401869, "grad_norm": 5.4375, "learning_rate": 4.9956016152072316e-05, "loss": 0.8082, "step": 925 }, { "epoch": 5.753894080996885, "grad_norm": 3.859375, "learning_rate": 4.995586994800592e-05, "loss": 0.3642, "step": 926 }, { "epoch": 5.7601246105919, "grad_norm": 4.03125, "learning_rate": 4.995572350156313e-05, "loss": 0.6573, "step": 927 }, { "epoch": 5.766355140186916, "grad_norm": 3.65625, "learning_rate": 4.9955576812745366e-05, "loss": 0.4611, "step": 928 }, { "epoch": 5.7725856697819315, "grad_norm": 4.28125, "learning_rate": 4.995542988155404e-05, "loss": 0.5372, "step": 929 }, { "epoch": 5.778816199376947, "grad_norm": 3.765625, "learning_rate": 4.99552827079906e-05, "loss": 0.4261, "step": 930 }, { "epoch": 5.785046728971962, "grad_norm": 4.6875, "learning_rate": 4.995513529205647e-05, "loss": 0.7184, "step": 931 }, { "epoch": 5.7912772585669785, "grad_norm": 4.28125, "learning_rate": 4.995498763375306e-05, "loss": 0.5393, "step": 932 }, { "epoch": 5.797507788161994, "grad_norm": 3.078125, "learning_rate": 4.995483973308184e-05, "loss": 0.4515, "step": 933 }, { "epoch": 5.803738317757009, "grad_norm": 3.234375, "learning_rate": 4.9954691590044215e-05, "loss": 0.3811, "step": 934 }, { "epoch": 5.809968847352025, "grad_norm": 4.53125, "learning_rate": 4.9954543204641644e-05, "loss": 0.5241, "step": 935 }, { "epoch": 5.816199376947041, "grad_norm": 4.0, "learning_rate": 4.9954394576875564e-05, "loss": 0.5225, "step": 936 }, { "epoch": 5.822429906542056, "grad_norm": 4.15625, "learning_rate": 4.9954245706747405e-05, "loss": 0.4893, "step": 937 }, { "epoch": 5.8286604361370715, "grad_norm": 3.734375, "learning_rate": 4.9954096594258637e-05, "loss": 0.4085, "step": 938 }, { "epoch": 5.834890965732087, "grad_norm": 3.46875, "learning_rate": 4.995394723941068e-05, "loss": 0.3327, "step": 939 }, { "epoch": 5.841121495327103, "grad_norm": 3.453125, "learning_rate": 4.995379764220501e-05, "loss": 0.3659, "step": 940 }, { "epoch": 5.8473520249221185, "grad_norm": 3.109375, "learning_rate": 4.9953647802643065e-05, "loss": 0.3575, "step": 941 }, { "epoch": 5.853582554517134, "grad_norm": 4.0625, "learning_rate": 4.995349772072631e-05, "loss": 0.479, "step": 942 }, { "epoch": 5.859813084112149, "grad_norm": 4.25, "learning_rate": 4.9953347396456194e-05, "loss": 0.5983, "step": 943 }, { "epoch": 5.866043613707165, "grad_norm": 4.34375, "learning_rate": 4.995319682983418e-05, "loss": 0.4943, "step": 944 }, { "epoch": 5.872274143302181, "grad_norm": 4.1875, "learning_rate": 4.9953046020861724e-05, "loss": 0.3514, "step": 945 }, { "epoch": 5.878504672897196, "grad_norm": 3.375, "learning_rate": 4.9952894969540296e-05, "loss": 0.4107, "step": 946 }, { "epoch": 5.8847352024922115, "grad_norm": 4.96875, "learning_rate": 4.995274367587137e-05, "loss": 0.3171, "step": 947 }, { "epoch": 5.890965732087228, "grad_norm": 7.21875, "learning_rate": 4.9952592139856413e-05, "loss": 0.605, "step": 948 }, { "epoch": 5.897196261682243, "grad_norm": 3.4375, "learning_rate": 4.9952440361496886e-05, "loss": 0.4364, "step": 949 }, { "epoch": 5.9034267912772584, "grad_norm": 3.59375, "learning_rate": 4.995228834079428e-05, "loss": 0.5196, "step": 950 }, { "epoch": 5.909657320872274, "grad_norm": 9.5625, "learning_rate": 4.995213607775005e-05, "loss": 0.4771, "step": 951 }, { "epoch": 5.91588785046729, "grad_norm": 4.96875, "learning_rate": 4.9951983572365694e-05, "loss": 0.4298, "step": 952 }, { "epoch": 5.922118380062305, "grad_norm": 5.84375, "learning_rate": 4.995183082464269e-05, "loss": 0.5363, "step": 953 }, { "epoch": 5.928348909657321, "grad_norm": 3.421875, "learning_rate": 4.9951677834582514e-05, "loss": 0.42, "step": 954 }, { "epoch": 5.934579439252336, "grad_norm": 6.15625, "learning_rate": 4.995152460218665e-05, "loss": 0.3586, "step": 955 }, { "epoch": 5.940809968847352, "grad_norm": 4.90625, "learning_rate": 4.995137112745659e-05, "loss": 0.5767, "step": 956 }, { "epoch": 5.947040498442368, "grad_norm": 4.09375, "learning_rate": 4.995121741039383e-05, "loss": 0.5265, "step": 957 }, { "epoch": 5.953271028037383, "grad_norm": 4.03125, "learning_rate": 4.9951063450999856e-05, "loss": 0.5286, "step": 958 }, { "epoch": 5.959501557632398, "grad_norm": 4.125, "learning_rate": 4.9950909249276166e-05, "loss": 0.508, "step": 959 }, { "epoch": 5.965732087227415, "grad_norm": 4.1875, "learning_rate": 4.9950754805224265e-05, "loss": 0.5642, "step": 960 }, { "epoch": 5.97196261682243, "grad_norm": 3.84375, "learning_rate": 4.9950600118845644e-05, "loss": 0.5373, "step": 961 }, { "epoch": 5.978193146417445, "grad_norm": 3.40625, "learning_rate": 4.9950445190141805e-05, "loss": 0.3396, "step": 962 }, { "epoch": 5.984423676012461, "grad_norm": 4.4375, "learning_rate": 4.9950290019114255e-05, "loss": 0.4789, "step": 963 }, { "epoch": 5.990654205607477, "grad_norm": 8.25, "learning_rate": 4.99501346057645e-05, "loss": 0.4476, "step": 964 }, { "epoch": 5.996884735202492, "grad_norm": 3.765625, "learning_rate": 4.994997895009404e-05, "loss": 0.4666, "step": 965 }, { "epoch": 6.0, "grad_norm": 2.546875, "learning_rate": 4.9949823052104416e-05, "loss": 0.2556, "step": 966 }, { "epoch": 6.006230529595015, "grad_norm": 2.5625, "learning_rate": 4.994966691179711e-05, "loss": 0.2487, "step": 967 }, { "epoch": 6.012461059190031, "grad_norm": 5.0625, "learning_rate": 4.994951052917366e-05, "loss": 0.4049, "step": 968 }, { "epoch": 6.018691588785047, "grad_norm": 3.90625, "learning_rate": 4.9949353904235574e-05, "loss": 0.3957, "step": 969 }, { "epoch": 6.024922118380062, "grad_norm": 4.75, "learning_rate": 4.994919703698438e-05, "loss": 0.4916, "step": 970 }, { "epoch": 6.031152647975078, "grad_norm": 5.3125, "learning_rate": 4.9949039927421584e-05, "loss": 0.3957, "step": 971 }, { "epoch": 6.037383177570093, "grad_norm": 5.46875, "learning_rate": 4.9948882575548726e-05, "loss": 0.4501, "step": 972 }, { "epoch": 6.043613707165109, "grad_norm": 4.21875, "learning_rate": 4.994872498136734e-05, "loss": 0.5637, "step": 973 }, { "epoch": 6.049844236760125, "grad_norm": 3.015625, "learning_rate": 4.9948567144878956e-05, "loss": 0.3254, "step": 974 }, { "epoch": 6.05607476635514, "grad_norm": 6.03125, "learning_rate": 4.9948409066085086e-05, "loss": 0.6274, "step": 975 }, { "epoch": 6.062305295950155, "grad_norm": 4.1875, "learning_rate": 4.994825074498729e-05, "loss": 0.3241, "step": 976 }, { "epoch": 6.068535825545172, "grad_norm": 4.40625, "learning_rate": 4.9948092181587095e-05, "loss": 0.3067, "step": 977 }, { "epoch": 6.074766355140187, "grad_norm": 3.796875, "learning_rate": 4.994793337588605e-05, "loss": 0.4279, "step": 978 }, { "epoch": 6.080996884735202, "grad_norm": 6.125, "learning_rate": 4.994777432788568e-05, "loss": 0.7804, "step": 979 }, { "epoch": 6.087227414330218, "grad_norm": 4.46875, "learning_rate": 4.994761503758753e-05, "loss": 0.4237, "step": 980 }, { "epoch": 6.093457943925234, "grad_norm": 4.78125, "learning_rate": 4.994745550499317e-05, "loss": 0.4527, "step": 981 }, { "epoch": 6.099688473520249, "grad_norm": 6.71875, "learning_rate": 4.994729573010413e-05, "loss": 0.5885, "step": 982 }, { "epoch": 6.105919003115265, "grad_norm": 3.015625, "learning_rate": 4.994713571292196e-05, "loss": 0.4919, "step": 983 }, { "epoch": 6.11214953271028, "grad_norm": 4.875, "learning_rate": 4.994697545344823e-05, "loss": 0.3116, "step": 984 }, { "epoch": 6.118380062305296, "grad_norm": 3.3125, "learning_rate": 4.9946814951684484e-05, "loss": 0.3766, "step": 985 }, { "epoch": 6.1246105919003115, "grad_norm": 7.65625, "learning_rate": 4.994665420763229e-05, "loss": 0.4124, "step": 986 }, { "epoch": 6.130841121495327, "grad_norm": 6.90625, "learning_rate": 4.994649322129319e-05, "loss": 0.4193, "step": 987 }, { "epoch": 6.137071651090342, "grad_norm": 4.65625, "learning_rate": 4.9946331992668775e-05, "loss": 0.4742, "step": 988 }, { "epoch": 6.1433021806853585, "grad_norm": 6.3125, "learning_rate": 4.994617052176059e-05, "loss": 0.6975, "step": 989 }, { "epoch": 6.149532710280374, "grad_norm": 5.9375, "learning_rate": 4.994600880857022e-05, "loss": 0.2893, "step": 990 }, { "epoch": 6.155763239875389, "grad_norm": 5.25, "learning_rate": 4.994584685309921e-05, "loss": 0.2645, "step": 991 }, { "epoch": 6.161993769470405, "grad_norm": 3.375, "learning_rate": 4.994568465534916e-05, "loss": 0.5542, "step": 992 }, { "epoch": 6.168224299065421, "grad_norm": 5.0625, "learning_rate": 4.9945522215321635e-05, "loss": 0.4482, "step": 993 }, { "epoch": 6.174454828660436, "grad_norm": 8.25, "learning_rate": 4.994535953301821e-05, "loss": 0.3642, "step": 994 }, { "epoch": 6.1806853582554515, "grad_norm": 7.875, "learning_rate": 4.9945196608440456e-05, "loss": 0.4384, "step": 995 }, { "epoch": 6.186915887850467, "grad_norm": 3.46875, "learning_rate": 4.994503344158998e-05, "loss": 0.5048, "step": 996 }, { "epoch": 6.193146417445483, "grad_norm": 2.734375, "learning_rate": 4.9944870032468346e-05, "loss": 0.3639, "step": 997 }, { "epoch": 6.1993769470404985, "grad_norm": 6.15625, "learning_rate": 4.9944706381077154e-05, "loss": 0.3666, "step": 998 }, { "epoch": 6.205607476635514, "grad_norm": 7.4375, "learning_rate": 4.994454248741798e-05, "loss": 0.4868, "step": 999 }, { "epoch": 6.211838006230529, "grad_norm": 6.6875, "learning_rate": 4.9944378351492424e-05, "loss": 0.6463, "step": 1000 }, { "epoch": 6.218068535825545, "grad_norm": 5.96875, "learning_rate": 4.9944213973302084e-05, "loss": 0.5097, "step": 1001 }, { "epoch": 6.224299065420561, "grad_norm": 5.03125, "learning_rate": 4.994404935284855e-05, "loss": 0.3176, "step": 1002 }, { "epoch": 6.230529595015576, "grad_norm": 6.3125, "learning_rate": 4.994388449013342e-05, "loss": 0.4336, "step": 1003 }, { "epoch": 6.2367601246105915, "grad_norm": 3.15625, "learning_rate": 4.99437193851583e-05, "loss": 0.4893, "step": 1004 }, { "epoch": 6.242990654205608, "grad_norm": 7.25, "learning_rate": 4.994355403792479e-05, "loss": 0.6182, "step": 1005 }, { "epoch": 6.249221183800623, "grad_norm": 7.65625, "learning_rate": 4.994338844843449e-05, "loss": 0.4564, "step": 1006 }, { "epoch": 6.255451713395638, "grad_norm": 5.84375, "learning_rate": 4.9943222616689025e-05, "loss": 0.6317, "step": 1007 }, { "epoch": 6.261682242990654, "grad_norm": 4.0, "learning_rate": 4.994305654268999e-05, "loss": 0.6705, "step": 1008 }, { "epoch": 6.26791277258567, "grad_norm": 4.625, "learning_rate": 4.994289022643901e-05, "loss": 0.4719, "step": 1009 }, { "epoch": 6.274143302180685, "grad_norm": 4.125, "learning_rate": 4.99427236679377e-05, "loss": 0.3821, "step": 1010 }, { "epoch": 6.280373831775701, "grad_norm": 3.03125, "learning_rate": 4.9942556867187654e-05, "loss": 0.3258, "step": 1011 }, { "epoch": 6.286604361370716, "grad_norm": 5.25, "learning_rate": 4.994238982419052e-05, "loss": 0.5844, "step": 1012 }, { "epoch": 6.292834890965732, "grad_norm": 4.3125, "learning_rate": 4.994222253894791e-05, "loss": 0.4033, "step": 1013 }, { "epoch": 6.299065420560748, "grad_norm": 2.890625, "learning_rate": 4.9942055011461455e-05, "loss": 0.2575, "step": 1014 }, { "epoch": 6.305295950155763, "grad_norm": 2.609375, "learning_rate": 4.994188724173276e-05, "loss": 0.3217, "step": 1015 }, { "epoch": 6.311526479750778, "grad_norm": 4.96875, "learning_rate": 4.994171922976348e-05, "loss": 0.3689, "step": 1016 }, { "epoch": 6.317757009345795, "grad_norm": 6.9375, "learning_rate": 4.994155097555524e-05, "loss": 0.502, "step": 1017 }, { "epoch": 6.32398753894081, "grad_norm": 4.25, "learning_rate": 4.9941382479109665e-05, "loss": 0.3992, "step": 1018 }, { "epoch": 6.330218068535825, "grad_norm": 3.65625, "learning_rate": 4.99412137404284e-05, "loss": 0.3359, "step": 1019 }, { "epoch": 6.336448598130841, "grad_norm": 2.96875, "learning_rate": 4.994104475951308e-05, "loss": 0.3605, "step": 1020 }, { "epoch": 6.342679127725857, "grad_norm": 3.5625, "learning_rate": 4.994087553636535e-05, "loss": 0.3842, "step": 1021 }, { "epoch": 6.348909657320872, "grad_norm": 4.65625, "learning_rate": 4.9940706070986854e-05, "loss": 0.6774, "step": 1022 }, { "epoch": 6.355140186915888, "grad_norm": 3.46875, "learning_rate": 4.9940536363379234e-05, "loss": 0.2911, "step": 1023 }, { "epoch": 6.361370716510903, "grad_norm": 3.65625, "learning_rate": 4.994036641354413e-05, "loss": 0.4656, "step": 1024 }, { "epoch": 6.367601246105919, "grad_norm": 3.703125, "learning_rate": 4.9940196221483214e-05, "loss": 0.3486, "step": 1025 }, { "epoch": 6.373831775700935, "grad_norm": 3.6875, "learning_rate": 4.994002578719812e-05, "loss": 0.5038, "step": 1026 }, { "epoch": 6.38006230529595, "grad_norm": 4.46875, "learning_rate": 4.993985511069051e-05, "loss": 0.5796, "step": 1027 }, { "epoch": 6.386292834890965, "grad_norm": 4.5, "learning_rate": 4.993968419196204e-05, "loss": 0.4349, "step": 1028 }, { "epoch": 6.392523364485982, "grad_norm": 4.84375, "learning_rate": 4.9939513031014376e-05, "loss": 0.3734, "step": 1029 }, { "epoch": 6.398753894080997, "grad_norm": 3.5, "learning_rate": 4.9939341627849176e-05, "loss": 0.4435, "step": 1030 }, { "epoch": 6.404984423676012, "grad_norm": 2.84375, "learning_rate": 4.993916998246811e-05, "loss": 0.3772, "step": 1031 }, { "epoch": 6.411214953271028, "grad_norm": 4.25, "learning_rate": 4.993899809487284e-05, "loss": 0.264, "step": 1032 }, { "epoch": 6.417445482866044, "grad_norm": 3.09375, "learning_rate": 4.993882596506502e-05, "loss": 0.484, "step": 1033 }, { "epoch": 6.423676012461059, "grad_norm": 3.3125, "learning_rate": 4.9938653593046345e-05, "loss": 0.3647, "step": 1034 }, { "epoch": 6.429906542056075, "grad_norm": 3.6875, "learning_rate": 4.993848097881849e-05, "loss": 0.3569, "step": 1035 }, { "epoch": 6.43613707165109, "grad_norm": 3.890625, "learning_rate": 4.993830812238311e-05, "loss": 0.5153, "step": 1036 }, { "epoch": 6.442367601246106, "grad_norm": 3.9375, "learning_rate": 4.9938135023741904e-05, "loss": 0.4638, "step": 1037 }, { "epoch": 6.4485981308411215, "grad_norm": 3.703125, "learning_rate": 4.993796168289654e-05, "loss": 0.3317, "step": 1038 }, { "epoch": 6.454828660436137, "grad_norm": 2.96875, "learning_rate": 4.9937788099848713e-05, "loss": 0.3571, "step": 1039 }, { "epoch": 6.461059190031152, "grad_norm": 4.125, "learning_rate": 4.99376142746001e-05, "loss": 0.4932, "step": 1040 }, { "epoch": 6.4672897196261685, "grad_norm": 3.578125, "learning_rate": 4.99374402071524e-05, "loss": 0.4275, "step": 1041 }, { "epoch": 6.473520249221184, "grad_norm": 2.28125, "learning_rate": 4.993726589750728e-05, "loss": 0.2699, "step": 1042 }, { "epoch": 6.479750778816199, "grad_norm": 3.359375, "learning_rate": 4.993709134566646e-05, "loss": 0.5922, "step": 1043 }, { "epoch": 6.485981308411215, "grad_norm": 7.1875, "learning_rate": 4.993691655163162e-05, "loss": 0.4393, "step": 1044 }, { "epoch": 6.492211838006231, "grad_norm": 4.34375, "learning_rate": 4.993674151540446e-05, "loss": 0.2813, "step": 1045 }, { "epoch": 6.498442367601246, "grad_norm": 3.4375, "learning_rate": 4.993656623698669e-05, "loss": 0.4869, "step": 1046 }, { "epoch": 6.5046728971962615, "grad_norm": 4.59375, "learning_rate": 4.993639071638e-05, "loss": 0.6756, "step": 1047 }, { "epoch": 6.510903426791277, "grad_norm": 4.0625, "learning_rate": 4.993621495358609e-05, "loss": 0.3896, "step": 1048 }, { "epoch": 6.517133956386293, "grad_norm": 3.828125, "learning_rate": 4.993603894860669e-05, "loss": 0.3984, "step": 1049 }, { "epoch": 6.5233644859813085, "grad_norm": 3.84375, "learning_rate": 4.993586270144349e-05, "loss": 0.3634, "step": 1050 }, { "epoch": 6.529595015576324, "grad_norm": 4.03125, "learning_rate": 4.99356862120982e-05, "loss": 0.5371, "step": 1051 }, { "epoch": 6.535825545171339, "grad_norm": 3.65625, "learning_rate": 4.993550948057255e-05, "loss": 0.5111, "step": 1052 }, { "epoch": 6.542056074766355, "grad_norm": 3.109375, "learning_rate": 4.993533250686824e-05, "loss": 0.3913, "step": 1053 }, { "epoch": 6.548286604361371, "grad_norm": 3.75, "learning_rate": 4.9935155290987e-05, "loss": 0.446, "step": 1054 }, { "epoch": 6.554517133956386, "grad_norm": 4.28125, "learning_rate": 4.9934977832930546e-05, "loss": 0.4998, "step": 1055 }, { "epoch": 6.5607476635514015, "grad_norm": 5.21875, "learning_rate": 4.99348001327006e-05, "loss": 0.5589, "step": 1056 }, { "epoch": 6.566978193146418, "grad_norm": 3.53125, "learning_rate": 4.9934622190298896e-05, "loss": 0.5125, "step": 1057 }, { "epoch": 6.573208722741433, "grad_norm": 4.375, "learning_rate": 4.993444400572716e-05, "loss": 0.4332, "step": 1058 }, { "epoch": 6.579439252336448, "grad_norm": 3.40625, "learning_rate": 4.993426557898711e-05, "loss": 0.4533, "step": 1059 }, { "epoch": 6.585669781931464, "grad_norm": 2.71875, "learning_rate": 4.9934086910080505e-05, "loss": 0.3055, "step": 1060 }, { "epoch": 6.59190031152648, "grad_norm": 4.5625, "learning_rate": 4.993390799900904e-05, "loss": 0.4728, "step": 1061 }, { "epoch": 6.598130841121495, "grad_norm": 6.25, "learning_rate": 4.9933728845774496e-05, "loss": 0.6341, "step": 1062 }, { "epoch": 6.604361370716511, "grad_norm": 4.03125, "learning_rate": 4.993354945037858e-05, "loss": 0.4096, "step": 1063 }, { "epoch": 6.610591900311526, "grad_norm": 3.265625, "learning_rate": 4.993336981282306e-05, "loss": 0.3311, "step": 1064 }, { "epoch": 6.616822429906542, "grad_norm": 4.875, "learning_rate": 4.9933189933109657e-05, "loss": 0.5484, "step": 1065 }, { "epoch": 6.623052959501558, "grad_norm": 3.53125, "learning_rate": 4.9933009811240136e-05, "loss": 0.4002, "step": 1066 }, { "epoch": 6.629283489096573, "grad_norm": 4.5625, "learning_rate": 4.993282944721624e-05, "loss": 0.4628, "step": 1067 }, { "epoch": 6.635514018691588, "grad_norm": 3.5, "learning_rate": 4.9932648841039716e-05, "loss": 0.3231, "step": 1068 }, { "epoch": 6.641744548286605, "grad_norm": 4.28125, "learning_rate": 4.993246799271233e-05, "loss": 0.3077, "step": 1069 }, { "epoch": 6.64797507788162, "grad_norm": 3.84375, "learning_rate": 4.9932286902235824e-05, "loss": 0.3891, "step": 1070 }, { "epoch": 6.654205607476635, "grad_norm": 2.828125, "learning_rate": 4.993210556961197e-05, "loss": 0.3203, "step": 1071 }, { "epoch": 6.660436137071651, "grad_norm": 3.734375, "learning_rate": 4.993192399484251e-05, "loss": 0.3305, "step": 1072 }, { "epoch": 6.666666666666667, "grad_norm": 3.484375, "learning_rate": 4.993174217792923e-05, "loss": 0.281, "step": 1073 }, { "epoch": 6.672897196261682, "grad_norm": 4.71875, "learning_rate": 4.993156011887389e-05, "loss": 0.6112, "step": 1074 }, { "epoch": 6.679127725856698, "grad_norm": 5.0625, "learning_rate": 4.9931377817678246e-05, "loss": 0.5132, "step": 1075 }, { "epoch": 6.685358255451713, "grad_norm": 5.1875, "learning_rate": 4.993119527434408e-05, "loss": 0.5102, "step": 1076 }, { "epoch": 6.691588785046729, "grad_norm": 5.0, "learning_rate": 4.993101248887316e-05, "loss": 0.5021, "step": 1077 }, { "epoch": 6.697819314641745, "grad_norm": 7.53125, "learning_rate": 4.993082946126727e-05, "loss": 0.5001, "step": 1078 }, { "epoch": 6.70404984423676, "grad_norm": 3.84375, "learning_rate": 4.9930646191528175e-05, "loss": 0.4475, "step": 1079 }, { "epoch": 6.710280373831775, "grad_norm": 3.0, "learning_rate": 4.993046267965766e-05, "loss": 0.3082, "step": 1080 }, { "epoch": 6.716510903426792, "grad_norm": 7.03125, "learning_rate": 4.993027892565751e-05, "loss": 0.6205, "step": 1081 }, { "epoch": 6.722741433021807, "grad_norm": 6.9375, "learning_rate": 4.9930094929529506e-05, "loss": 0.3602, "step": 1082 }, { "epoch": 6.728971962616822, "grad_norm": 4.84375, "learning_rate": 4.992991069127544e-05, "loss": 0.3406, "step": 1083 }, { "epoch": 6.735202492211838, "grad_norm": 6.71875, "learning_rate": 4.992972621089709e-05, "loss": 0.4045, "step": 1084 }, { "epoch": 6.741433021806854, "grad_norm": 8.25, "learning_rate": 4.9929541488396266e-05, "loss": 0.4651, "step": 1085 }, { "epoch": 6.747663551401869, "grad_norm": 4.46875, "learning_rate": 4.9929356523774745e-05, "loss": 0.3925, "step": 1086 }, { "epoch": 6.753894080996885, "grad_norm": 3.53125, "learning_rate": 4.992917131703433e-05, "loss": 0.4, "step": 1087 }, { "epoch": 6.7601246105919, "grad_norm": 3.828125, "learning_rate": 4.992898586817682e-05, "loss": 0.2424, "step": 1088 }, { "epoch": 6.766355140186916, "grad_norm": 6.84375, "learning_rate": 4.9928800177204025e-05, "loss": 0.4258, "step": 1089 }, { "epoch": 6.7725856697819315, "grad_norm": 4.09375, "learning_rate": 4.9928614244117735e-05, "loss": 0.428, "step": 1090 }, { "epoch": 6.778816199376947, "grad_norm": 4.78125, "learning_rate": 4.9928428068919755e-05, "loss": 0.5986, "step": 1091 }, { "epoch": 6.785046728971962, "grad_norm": 8.375, "learning_rate": 4.99282416516119e-05, "loss": 0.5953, "step": 1092 }, { "epoch": 6.7912772585669785, "grad_norm": 5.34375, "learning_rate": 4.9928054992195985e-05, "loss": 0.4983, "step": 1093 }, { "epoch": 6.797507788161994, "grad_norm": 4.78125, "learning_rate": 4.992786809067381e-05, "loss": 0.5495, "step": 1094 }, { "epoch": 6.803738317757009, "grad_norm": 3.0, "learning_rate": 4.99276809470472e-05, "loss": 0.3395, "step": 1095 }, { "epoch": 6.809968847352025, "grad_norm": 3.578125, "learning_rate": 4.9927493561317974e-05, "loss": 0.2733, "step": 1096 }, { "epoch": 6.816199376947041, "grad_norm": 6.5625, "learning_rate": 4.992730593348794e-05, "loss": 0.5006, "step": 1097 }, { "epoch": 6.822429906542056, "grad_norm": 4.6875, "learning_rate": 4.992711806355893e-05, "loss": 0.4895, "step": 1098 }, { "epoch": 6.8286604361370715, "grad_norm": 4.78125, "learning_rate": 4.992692995153276e-05, "loss": 0.5019, "step": 1099 }, { "epoch": 6.834890965732087, "grad_norm": 7.6875, "learning_rate": 4.992674159741128e-05, "loss": 0.4564, "step": 1100 }, { "epoch": 6.841121495327103, "grad_norm": 4.59375, "learning_rate": 4.992655300119629e-05, "loss": 0.2719, "step": 1101 }, { "epoch": 6.8473520249221185, "grad_norm": 3.34375, "learning_rate": 4.992636416288964e-05, "loss": 0.4087, "step": 1102 }, { "epoch": 6.853582554517134, "grad_norm": 3.140625, "learning_rate": 4.992617508249316e-05, "loss": 0.2165, "step": 1103 }, { "epoch": 6.859813084112149, "grad_norm": 6.78125, "learning_rate": 4.992598576000868e-05, "loss": 0.5951, "step": 1104 }, { "epoch": 6.866043613707165, "grad_norm": 3.375, "learning_rate": 4.9925796195438044e-05, "loss": 0.3582, "step": 1105 }, { "epoch": 6.872274143302181, "grad_norm": 2.59375, "learning_rate": 4.99256063887831e-05, "loss": 0.3054, "step": 1106 }, { "epoch": 6.878504672897196, "grad_norm": 6.5625, "learning_rate": 4.992541634004567e-05, "loss": 0.4324, "step": 1107 }, { "epoch": 6.8847352024922115, "grad_norm": 5.21875, "learning_rate": 4.992522604922762e-05, "loss": 0.2989, "step": 1108 }, { "epoch": 6.890965732087228, "grad_norm": 3.21875, "learning_rate": 4.9925035516330796e-05, "loss": 0.4028, "step": 1109 }, { "epoch": 6.897196261682243, "grad_norm": 5.34375, "learning_rate": 4.992484474135705e-05, "loss": 0.4615, "step": 1110 }, { "epoch": 6.9034267912772584, "grad_norm": 7.84375, "learning_rate": 4.9924653724308225e-05, "loss": 0.4422, "step": 1111 }, { "epoch": 6.909657320872274, "grad_norm": 5.84375, "learning_rate": 4.9924462465186174e-05, "loss": 0.3251, "step": 1112 }, { "epoch": 6.91588785046729, "grad_norm": 3.875, "learning_rate": 4.992427096399277e-05, "loss": 0.3764, "step": 1113 }, { "epoch": 6.922118380062305, "grad_norm": 6.8125, "learning_rate": 4.9924079220729855e-05, "loss": 0.3444, "step": 1114 }, { "epoch": 6.928348909657321, "grad_norm": 6.96875, "learning_rate": 4.992388723539931e-05, "loss": 0.2993, "step": 1115 }, { "epoch": 6.934579439252336, "grad_norm": 7.5625, "learning_rate": 4.992369500800299e-05, "loss": 0.5292, "step": 1116 }, { "epoch": 6.940809968847352, "grad_norm": 3.46875, "learning_rate": 4.992350253854276e-05, "loss": 0.3734, "step": 1117 }, { "epoch": 6.947040498442368, "grad_norm": 6.15625, "learning_rate": 4.992330982702048e-05, "loss": 0.4628, "step": 1118 }, { "epoch": 6.953271028037383, "grad_norm": 5.90625, "learning_rate": 4.992311687343804e-05, "loss": 0.3497, "step": 1119 }, { "epoch": 6.959501557632398, "grad_norm": 8.25, "learning_rate": 4.992292367779731e-05, "loss": 0.3545, "step": 1120 }, { "epoch": 6.965732087227415, "grad_norm": 4.28125, "learning_rate": 4.992273024010016e-05, "loss": 0.3712, "step": 1121 }, { "epoch": 6.97196261682243, "grad_norm": 6.3125, "learning_rate": 4.9922536560348473e-05, "loss": 0.549, "step": 1122 }, { "epoch": 6.978193146417445, "grad_norm": 6.53125, "learning_rate": 4.9922342638544125e-05, "loss": 0.294, "step": 1123 }, { "epoch": 6.984423676012461, "grad_norm": 6.46875, "learning_rate": 4.992214847468901e-05, "loss": 0.5027, "step": 1124 }, { "epoch": 6.990654205607477, "grad_norm": 4.0625, "learning_rate": 4.992195406878501e-05, "loss": 0.4068, "step": 1125 }, { "epoch": 6.996884735202492, "grad_norm": 5.84375, "learning_rate": 4.9921759420834e-05, "loss": 0.299, "step": 1126 }, { "epoch": 7.0, "grad_norm": 4.9375, "learning_rate": 4.9921564530837894e-05, "loss": 0.3624, "step": 1127 }, { "epoch": 7.006230529595015, "grad_norm": 7.46875, "learning_rate": 4.992136939879856e-05, "loss": 0.5005, "step": 1128 }, { "epoch": 7.012461059190031, "grad_norm": 3.234375, "learning_rate": 4.992117402471791e-05, "loss": 0.4705, "step": 1129 }, { "epoch": 7.018691588785047, "grad_norm": 3.921875, "learning_rate": 4.9920978408597837e-05, "loss": 0.3865, "step": 1130 }, { "epoch": 7.024922118380062, "grad_norm": 6.46875, "learning_rate": 4.9920782550440236e-05, "loss": 0.5202, "step": 1131 }, { "epoch": 7.031152647975078, "grad_norm": 4.5, "learning_rate": 4.992058645024702e-05, "loss": 0.2611, "step": 1132 }, { "epoch": 7.037383177570093, "grad_norm": 4.125, "learning_rate": 4.9920390108020075e-05, "loss": 0.3743, "step": 1133 }, { "epoch": 7.043613707165109, "grad_norm": 2.4375, "learning_rate": 4.992019352376133e-05, "loss": 0.2646, "step": 1134 }, { "epoch": 7.049844236760125, "grad_norm": 5.28125, "learning_rate": 4.9919996697472684e-05, "loss": 0.3622, "step": 1135 }, { "epoch": 7.05607476635514, "grad_norm": 4.5625, "learning_rate": 4.9919799629156036e-05, "loss": 0.4537, "step": 1136 }, { "epoch": 7.062305295950155, "grad_norm": 4.46875, "learning_rate": 4.991960231881333e-05, "loss": 0.4496, "step": 1137 }, { "epoch": 7.068535825545172, "grad_norm": 5.71875, "learning_rate": 4.9919404766446456e-05, "loss": 0.3433, "step": 1138 }, { "epoch": 7.074766355140187, "grad_norm": 3.96875, "learning_rate": 4.991920697205734e-05, "loss": 0.3525, "step": 1139 }, { "epoch": 7.080996884735202, "grad_norm": 3.578125, "learning_rate": 4.9919008935647904e-05, "loss": 0.512, "step": 1140 }, { "epoch": 7.087227414330218, "grad_norm": 3.421875, "learning_rate": 4.9918810657220075e-05, "loss": 0.366, "step": 1141 }, { "epoch": 7.093457943925234, "grad_norm": 3.859375, "learning_rate": 4.9918612136775776e-05, "loss": 0.2886, "step": 1142 }, { "epoch": 7.099688473520249, "grad_norm": 3.71875, "learning_rate": 4.991841337431694e-05, "loss": 0.4487, "step": 1143 }, { "epoch": 7.105919003115265, "grad_norm": 3.53125, "learning_rate": 4.991821436984548e-05, "loss": 0.4561, "step": 1144 }, { "epoch": 7.11214953271028, "grad_norm": 4.71875, "learning_rate": 4.9918015123363355e-05, "loss": 0.5704, "step": 1145 }, { "epoch": 7.118380062305296, "grad_norm": 4.59375, "learning_rate": 4.991781563487248e-05, "loss": 0.5129, "step": 1146 }, { "epoch": 7.1246105919003115, "grad_norm": 3.359375, "learning_rate": 4.991761590437479e-05, "loss": 0.2844, "step": 1147 }, { "epoch": 7.130841121495327, "grad_norm": 3.4375, "learning_rate": 4.991741593187225e-05, "loss": 0.5552, "step": 1148 }, { "epoch": 7.137071651090342, "grad_norm": 3.765625, "learning_rate": 4.991721571736677e-05, "loss": 0.5532, "step": 1149 }, { "epoch": 7.1433021806853585, "grad_norm": 5.65625, "learning_rate": 4.991701526086032e-05, "loss": 0.2973, "step": 1150 }, { "epoch": 7.149532710280374, "grad_norm": 3.90625, "learning_rate": 4.991681456235483e-05, "loss": 0.3134, "step": 1151 }, { "epoch": 7.155763239875389, "grad_norm": 3.609375, "learning_rate": 4.9916613621852265e-05, "loss": 0.3036, "step": 1152 }, { "epoch": 7.161993769470405, "grad_norm": 6.8125, "learning_rate": 4.991641243935457e-05, "loss": 0.3793, "step": 1153 }, { "epoch": 7.168224299065421, "grad_norm": 6.15625, "learning_rate": 4.991621101486368e-05, "loss": 0.439, "step": 1154 }, { "epoch": 7.174454828660436, "grad_norm": 6.03125, "learning_rate": 4.9916009348381574e-05, "loss": 0.4148, "step": 1155 }, { "epoch": 7.1806853582554515, "grad_norm": 4.9375, "learning_rate": 4.9915807439910213e-05, "loss": 0.481, "step": 1156 }, { "epoch": 7.186915887850467, "grad_norm": 8.3125, "learning_rate": 4.991560528945154e-05, "loss": 0.4532, "step": 1157 }, { "epoch": 7.193146417445483, "grad_norm": 7.375, "learning_rate": 4.991540289700754e-05, "loss": 0.4857, "step": 1158 }, { "epoch": 7.1993769470404985, "grad_norm": 4.90625, "learning_rate": 4.991520026258015e-05, "loss": 0.387, "step": 1159 }, { "epoch": 7.205607476635514, "grad_norm": 3.0, "learning_rate": 4.991499738617137e-05, "loss": 0.2769, "step": 1160 }, { "epoch": 7.211838006230529, "grad_norm": 4.78125, "learning_rate": 4.9914794267783146e-05, "loss": 0.3248, "step": 1161 }, { "epoch": 7.218068535825545, "grad_norm": 5.1875, "learning_rate": 4.991459090741746e-05, "loss": 0.436, "step": 1162 }, { "epoch": 7.224299065420561, "grad_norm": 1.8671875, "learning_rate": 4.991438730507629e-05, "loss": 0.2429, "step": 1163 }, { "epoch": 7.230529595015576, "grad_norm": 3.328125, "learning_rate": 4.991418346076161e-05, "loss": 0.3676, "step": 1164 }, { "epoch": 7.2367601246105915, "grad_norm": 6.625, "learning_rate": 4.991397937447541e-05, "loss": 0.4761, "step": 1165 }, { "epoch": 7.242990654205608, "grad_norm": 5.59375, "learning_rate": 4.991377504621965e-05, "loss": 0.6998, "step": 1166 }, { "epoch": 7.249221183800623, "grad_norm": 4.21875, "learning_rate": 4.991357047599632e-05, "loss": 0.4308, "step": 1167 }, { "epoch": 7.255451713395638, "grad_norm": 4.84375, "learning_rate": 4.991336566380742e-05, "loss": 0.4071, "step": 1168 }, { "epoch": 7.261682242990654, "grad_norm": 5.9375, "learning_rate": 4.991316060965494e-05, "loss": 0.3881, "step": 1169 }, { "epoch": 7.26791277258567, "grad_norm": 4.90625, "learning_rate": 4.991295531354085e-05, "loss": 0.4372, "step": 1170 }, { "epoch": 7.274143302180685, "grad_norm": 3.328125, "learning_rate": 4.991274977546717e-05, "loss": 0.424, "step": 1171 }, { "epoch": 7.280373831775701, "grad_norm": 8.3125, "learning_rate": 4.9912543995435875e-05, "loss": 0.4042, "step": 1172 }, { "epoch": 7.286604361370716, "grad_norm": 6.8125, "learning_rate": 4.9912337973448984e-05, "loss": 0.3438, "step": 1173 }, { "epoch": 7.292834890965732, "grad_norm": 9.25, "learning_rate": 4.991213170950848e-05, "loss": 0.4577, "step": 1174 }, { "epoch": 7.299065420560748, "grad_norm": 2.90625, "learning_rate": 4.9911925203616374e-05, "loss": 0.3981, "step": 1175 }, { "epoch": 7.305295950155763, "grad_norm": 3.5625, "learning_rate": 4.991171845577467e-05, "loss": 0.4004, "step": 1176 }, { "epoch": 7.311526479750778, "grad_norm": 7.9375, "learning_rate": 4.9911511465985386e-05, "loss": 0.6137, "step": 1177 }, { "epoch": 7.317757009345795, "grad_norm": 3.96875, "learning_rate": 4.9911304234250514e-05, "loss": 0.3317, "step": 1178 }, { "epoch": 7.32398753894081, "grad_norm": 4.21875, "learning_rate": 4.9911096760572066e-05, "loss": 0.4097, "step": 1179 }, { "epoch": 7.330218068535825, "grad_norm": 4.84375, "learning_rate": 4.991088904495208e-05, "loss": 0.424, "step": 1180 }, { "epoch": 7.336448598130841, "grad_norm": 5.90625, "learning_rate": 4.9910681087392554e-05, "loss": 0.3834, "step": 1181 }, { "epoch": 7.342679127725857, "grad_norm": 2.734375, "learning_rate": 4.9910472887895524e-05, "loss": 0.3285, "step": 1182 }, { "epoch": 7.348909657320872, "grad_norm": 3.578125, "learning_rate": 4.9910264446462986e-05, "loss": 0.6835, "step": 1183 }, { "epoch": 7.355140186915888, "grad_norm": 3.234375, "learning_rate": 4.991005576309699e-05, "loss": 0.3767, "step": 1184 }, { "epoch": 7.361370716510903, "grad_norm": 3.90625, "learning_rate": 4.990984683779955e-05, "loss": 0.4639, "step": 1185 }, { "epoch": 7.367601246105919, "grad_norm": 3.859375, "learning_rate": 4.99096376705727e-05, "loss": 0.595, "step": 1186 }, { "epoch": 7.373831775700935, "grad_norm": 4.78125, "learning_rate": 4.990942826141847e-05, "loss": 0.3647, "step": 1187 }, { "epoch": 7.38006230529595, "grad_norm": 4.90625, "learning_rate": 4.990921861033889e-05, "loss": 0.31, "step": 1188 }, { "epoch": 7.386292834890965, "grad_norm": 3.328125, "learning_rate": 4.9909008717336e-05, "loss": 0.4912, "step": 1189 }, { "epoch": 7.392523364485982, "grad_norm": 3.359375, "learning_rate": 4.9908798582411845e-05, "loss": 0.3261, "step": 1190 }, { "epoch": 7.398753894080997, "grad_norm": 4.1875, "learning_rate": 4.990858820556845e-05, "loss": 0.2408, "step": 1191 }, { "epoch": 7.404984423676012, "grad_norm": 5.125, "learning_rate": 4.990837758680787e-05, "loss": 0.576, "step": 1192 }, { "epoch": 7.411214953271028, "grad_norm": 4.3125, "learning_rate": 4.990816672613216e-05, "loss": 0.4293, "step": 1193 }, { "epoch": 7.417445482866044, "grad_norm": 4.15625, "learning_rate": 4.990795562354333e-05, "loss": 0.4167, "step": 1194 }, { "epoch": 7.423676012461059, "grad_norm": 2.734375, "learning_rate": 4.9907744279043475e-05, "loss": 0.317, "step": 1195 }, { "epoch": 7.429906542056075, "grad_norm": 6.03125, "learning_rate": 4.990753269263463e-05, "loss": 0.4487, "step": 1196 }, { "epoch": 7.43613707165109, "grad_norm": 5.375, "learning_rate": 4.990732086431884e-05, "loss": 0.4674, "step": 1197 }, { "epoch": 7.442367601246106, "grad_norm": 4.21875, "learning_rate": 4.9907108794098176e-05, "loss": 0.5019, "step": 1198 }, { "epoch": 7.4485981308411215, "grad_norm": 3.5625, "learning_rate": 4.990689648197469e-05, "loss": 0.39, "step": 1199 }, { "epoch": 7.454828660436137, "grad_norm": 5.96875, "learning_rate": 4.9906683927950446e-05, "loss": 0.3816, "step": 1200 }, { "epoch": 7.461059190031152, "grad_norm": 4.625, "learning_rate": 4.990647113202751e-05, "loss": 0.4683, "step": 1201 }, { "epoch": 7.4672897196261685, "grad_norm": 4.96875, "learning_rate": 4.9906258094207945e-05, "loss": 0.5677, "step": 1202 }, { "epoch": 7.473520249221184, "grad_norm": 2.96875, "learning_rate": 4.9906044814493824e-05, "loss": 0.243, "step": 1203 }, { "epoch": 7.479750778816199, "grad_norm": 4.90625, "learning_rate": 4.990583129288722e-05, "loss": 0.3101, "step": 1204 }, { "epoch": 7.485981308411215, "grad_norm": 5.375, "learning_rate": 4.9905617529390203e-05, "loss": 0.433, "step": 1205 }, { "epoch": 7.492211838006231, "grad_norm": 5.8125, "learning_rate": 4.9905403524004844e-05, "loss": 0.4636, "step": 1206 }, { "epoch": 7.498442367601246, "grad_norm": 5.96875, "learning_rate": 4.990518927673324e-05, "loss": 0.4162, "step": 1207 }, { "epoch": 7.5046728971962615, "grad_norm": 7.25, "learning_rate": 4.9904974787577455e-05, "loss": 0.4913, "step": 1208 }, { "epoch": 7.510903426791277, "grad_norm": 6.1875, "learning_rate": 4.9904760056539565e-05, "loss": 0.3615, "step": 1209 }, { "epoch": 7.517133956386293, "grad_norm": 5.21875, "learning_rate": 4.990454508362168e-05, "loss": 0.7119, "step": 1210 }, { "epoch": 7.5233644859813085, "grad_norm": 3.703125, "learning_rate": 4.9904329868825874e-05, "loss": 0.4234, "step": 1211 }, { "epoch": 7.529595015576324, "grad_norm": 3.0, "learning_rate": 4.990411441215423e-05, "loss": 0.3564, "step": 1212 }, { "epoch": 7.535825545171339, "grad_norm": 5.9375, "learning_rate": 4.9903898713608856e-05, "loss": 0.6968, "step": 1213 }, { "epoch": 7.542056074766355, "grad_norm": 6.0625, "learning_rate": 4.990368277319183e-05, "loss": 0.2672, "step": 1214 }, { "epoch": 7.548286604361371, "grad_norm": 6.75, "learning_rate": 4.990346659090527e-05, "loss": 0.4244, "step": 1215 }, { "epoch": 7.554517133956386, "grad_norm": 6.03125, "learning_rate": 4.990325016675126e-05, "loss": 0.5732, "step": 1216 }, { "epoch": 7.5607476635514015, "grad_norm": 6.28125, "learning_rate": 4.9903033500731906e-05, "loss": 0.3504, "step": 1217 }, { "epoch": 7.566978193146418, "grad_norm": 6.46875, "learning_rate": 4.990281659284931e-05, "loss": 0.4776, "step": 1218 }, { "epoch": 7.573208722741433, "grad_norm": 4.84375, "learning_rate": 4.9902599443105584e-05, "loss": 0.5786, "step": 1219 }, { "epoch": 7.579439252336448, "grad_norm": 3.5625, "learning_rate": 4.990238205150284e-05, "loss": 0.2676, "step": 1220 }, { "epoch": 7.585669781931464, "grad_norm": 5.46875, "learning_rate": 4.990216441804318e-05, "loss": 0.7115, "step": 1221 }, { "epoch": 7.59190031152648, "grad_norm": 5.65625, "learning_rate": 4.9901946542728714e-05, "loss": 0.2844, "step": 1222 }, { "epoch": 7.598130841121495, "grad_norm": 8.8125, "learning_rate": 4.9901728425561564e-05, "loss": 0.5509, "step": 1223 }, { "epoch": 7.604361370716511, "grad_norm": 4.90625, "learning_rate": 4.990151006654386e-05, "loss": 0.5199, "step": 1224 }, { "epoch": 7.610591900311526, "grad_norm": 4.65625, "learning_rate": 4.9901291465677706e-05, "loss": 0.5587, "step": 1225 }, { "epoch": 7.616822429906542, "grad_norm": 4.25, "learning_rate": 4.990107262296524e-05, "loss": 0.3305, "step": 1226 }, { "epoch": 7.623052959501558, "grad_norm": 3.46875, "learning_rate": 4.990085353840857e-05, "loss": 0.307, "step": 1227 }, { "epoch": 7.629283489096573, "grad_norm": 4.125, "learning_rate": 4.9900634212009834e-05, "loss": 0.3406, "step": 1228 }, { "epoch": 7.635514018691588, "grad_norm": 5.71875, "learning_rate": 4.990041464377117e-05, "loss": 0.7568, "step": 1229 }, { "epoch": 7.641744548286605, "grad_norm": 4.75, "learning_rate": 4.990019483369469e-05, "loss": 0.3004, "step": 1230 }, { "epoch": 7.64797507788162, "grad_norm": 4.78125, "learning_rate": 4.9899974781782546e-05, "loss": 0.3819, "step": 1231 }, { "epoch": 7.654205607476635, "grad_norm": 3.21875, "learning_rate": 4.9899754488036864e-05, "loss": 0.3096, "step": 1232 }, { "epoch": 7.660436137071651, "grad_norm": 3.71875, "learning_rate": 4.98995339524598e-05, "loss": 0.3275, "step": 1233 }, { "epoch": 7.666666666666667, "grad_norm": 4.03125, "learning_rate": 4.9899313175053474e-05, "loss": 0.3751, "step": 1234 }, { "epoch": 7.672897196261682, "grad_norm": 2.84375, "learning_rate": 4.989909215582005e-05, "loss": 0.2836, "step": 1235 }, { "epoch": 7.679127725856698, "grad_norm": 3.609375, "learning_rate": 4.9898870894761664e-05, "loss": 0.5198, "step": 1236 }, { "epoch": 7.685358255451713, "grad_norm": 2.953125, "learning_rate": 4.989864939188046e-05, "loss": 0.3102, "step": 1237 }, { "epoch": 7.691588785046729, "grad_norm": 3.21875, "learning_rate": 4.9898427647178604e-05, "loss": 0.2529, "step": 1238 }, { "epoch": 7.697819314641745, "grad_norm": 4.03125, "learning_rate": 4.989820566065824e-05, "loss": 0.5137, "step": 1239 }, { "epoch": 7.70404984423676, "grad_norm": 3.234375, "learning_rate": 4.989798343232151e-05, "loss": 0.3336, "step": 1240 }, { "epoch": 7.710280373831775, "grad_norm": 2.984375, "learning_rate": 4.989776096217061e-05, "loss": 0.3, "step": 1241 }, { "epoch": 7.716510903426792, "grad_norm": 3.078125, "learning_rate": 4.989753825020766e-05, "loss": 0.3739, "step": 1242 }, { "epoch": 7.722741433021807, "grad_norm": 4.875, "learning_rate": 4.989731529643486e-05, "loss": 0.6294, "step": 1243 }, { "epoch": 7.728971962616822, "grad_norm": 4.5, "learning_rate": 4.989709210085435e-05, "loss": 0.4821, "step": 1244 }, { "epoch": 7.735202492211838, "grad_norm": 3.3125, "learning_rate": 4.9896868663468296e-05, "loss": 0.36, "step": 1245 }, { "epoch": 7.741433021806854, "grad_norm": 4.40625, "learning_rate": 4.989664498427889e-05, "loss": 0.4021, "step": 1246 }, { "epoch": 7.747663551401869, "grad_norm": 3.765625, "learning_rate": 4.9896421063288286e-05, "loss": 0.281, "step": 1247 }, { "epoch": 7.753894080996885, "grad_norm": 3.421875, "learning_rate": 4.989619690049866e-05, "loss": 0.4404, "step": 1248 }, { "epoch": 7.7601246105919, "grad_norm": 3.890625, "learning_rate": 4.98959724959122e-05, "loss": 0.3461, "step": 1249 }, { "epoch": 7.766355140186916, "grad_norm": 4.0, "learning_rate": 4.989574784953107e-05, "loss": 0.3134, "step": 1250 }, { "epoch": 7.7725856697819315, "grad_norm": 4.3125, "learning_rate": 4.989552296135747e-05, "loss": 0.4651, "step": 1251 }, { "epoch": 7.778816199376947, "grad_norm": 3.390625, "learning_rate": 4.989529783139356e-05, "loss": 0.3497, "step": 1252 }, { "epoch": 7.785046728971962, "grad_norm": 5.375, "learning_rate": 4.9895072459641554e-05, "loss": 0.2959, "step": 1253 }, { "epoch": 7.7912772585669785, "grad_norm": 4.09375, "learning_rate": 4.9894846846103625e-05, "loss": 0.353, "step": 1254 }, { "epoch": 7.797507788161994, "grad_norm": 2.546875, "learning_rate": 4.989462099078197e-05, "loss": 0.2935, "step": 1255 }, { "epoch": 7.803738317757009, "grad_norm": 3.5625, "learning_rate": 4.989439489367877e-05, "loss": 0.317, "step": 1256 }, { "epoch": 7.809968847352025, "grad_norm": 3.921875, "learning_rate": 4.989416855479624e-05, "loss": 0.3485, "step": 1257 }, { "epoch": 7.816199376947041, "grad_norm": 5.625, "learning_rate": 4.989394197413656e-05, "loss": 0.3505, "step": 1258 }, { "epoch": 7.822429906542056, "grad_norm": 4.09375, "learning_rate": 4.989371515170195e-05, "loss": 0.3131, "step": 1259 }, { "epoch": 7.8286604361370715, "grad_norm": 2.796875, "learning_rate": 4.989348808749459e-05, "loss": 0.2893, "step": 1260 }, { "epoch": 7.834890965732087, "grad_norm": 5.0625, "learning_rate": 4.989326078151671e-05, "loss": 0.5849, "step": 1261 }, { "epoch": 7.841121495327103, "grad_norm": 4.78125, "learning_rate": 4.98930332337705e-05, "loss": 0.2642, "step": 1262 }, { "epoch": 7.8473520249221185, "grad_norm": 6.875, "learning_rate": 4.9892805444258174e-05, "loss": 0.4094, "step": 1263 }, { "epoch": 7.853582554517134, "grad_norm": 3.859375, "learning_rate": 4.9892577412981946e-05, "loss": 0.271, "step": 1264 }, { "epoch": 7.859813084112149, "grad_norm": 4.28125, "learning_rate": 4.989234913994403e-05, "loss": 0.4152, "step": 1265 }, { "epoch": 7.866043613707165, "grad_norm": 6.0625, "learning_rate": 4.989212062514664e-05, "loss": 0.3011, "step": 1266 }, { "epoch": 7.872274143302181, "grad_norm": 3.921875, "learning_rate": 4.989189186859201e-05, "loss": 0.3179, "step": 1267 }, { "epoch": 7.878504672897196, "grad_norm": 4.46875, "learning_rate": 4.989166287028234e-05, "loss": 0.3416, "step": 1268 }, { "epoch": 7.8847352024922115, "grad_norm": 2.515625, "learning_rate": 4.989143363021987e-05, "loss": 0.2686, "step": 1269 }, { "epoch": 7.890965732087228, "grad_norm": 4.375, "learning_rate": 4.989120414840681e-05, "loss": 0.3523, "step": 1270 }, { "epoch": 7.897196261682243, "grad_norm": 3.53125, "learning_rate": 4.989097442484542e-05, "loss": 0.4397, "step": 1271 }, { "epoch": 7.9034267912772584, "grad_norm": 5.5, "learning_rate": 4.989074445953789e-05, "loss": 0.3698, "step": 1272 }, { "epoch": 7.909657320872274, "grad_norm": 3.328125, "learning_rate": 4.9890514252486486e-05, "loss": 0.2742, "step": 1273 }, { "epoch": 7.91588785046729, "grad_norm": 5.21875, "learning_rate": 4.989028380369343e-05, "loss": 0.4423, "step": 1274 }, { "epoch": 7.922118380062305, "grad_norm": 5.46875, "learning_rate": 4.989005311316096e-05, "loss": 0.5309, "step": 1275 }, { "epoch": 7.928348909657321, "grad_norm": 3.75, "learning_rate": 4.9889822180891324e-05, "loss": 0.4353, "step": 1276 }, { "epoch": 7.934579439252336, "grad_norm": 3.609375, "learning_rate": 4.988959100688675e-05, "loss": 0.2706, "step": 1277 }, { "epoch": 7.940809968847352, "grad_norm": 5.28125, "learning_rate": 4.9889359591149496e-05, "loss": 0.3171, "step": 1278 }, { "epoch": 7.947040498442368, "grad_norm": 3.296875, "learning_rate": 4.988912793368181e-05, "loss": 0.3996, "step": 1279 }, { "epoch": 7.953271028037383, "grad_norm": 3.171875, "learning_rate": 4.9888896034485935e-05, "loss": 0.4366, "step": 1280 }, { "epoch": 7.959501557632398, "grad_norm": 5.6875, "learning_rate": 4.988866389356413e-05, "loss": 0.5389, "step": 1281 }, { "epoch": 7.965732087227415, "grad_norm": 4.59375, "learning_rate": 4.9888431510918645e-05, "loss": 0.3711, "step": 1282 }, { "epoch": 7.97196261682243, "grad_norm": 3.28125, "learning_rate": 4.988819888655174e-05, "loss": 0.4037, "step": 1283 }, { "epoch": 7.978193146417445, "grad_norm": 5.25, "learning_rate": 4.988796602046567e-05, "loss": 0.3255, "step": 1284 }, { "epoch": 7.984423676012461, "grad_norm": 3.6875, "learning_rate": 4.98877329126627e-05, "loss": 0.3119, "step": 1285 }, { "epoch": 7.990654205607477, "grad_norm": 4.21875, "learning_rate": 4.988749956314509e-05, "loss": 0.4487, "step": 1286 }, { "epoch": 7.996884735202492, "grad_norm": 3.109375, "learning_rate": 4.988726597191511e-05, "loss": 0.3174, "step": 1287 }, { "epoch": 8.0, "grad_norm": 3.28125, "learning_rate": 4.988703213897503e-05, "loss": 0.366, "step": 1288 }, { "epoch": 8.006230529595015, "grad_norm": 3.453125, "learning_rate": 4.988679806432712e-05, "loss": 0.3347, "step": 1289 }, { "epoch": 8.01246105919003, "grad_norm": 4.65625, "learning_rate": 4.988656374797365e-05, "loss": 0.4169, "step": 1290 }, { "epoch": 8.018691588785046, "grad_norm": 3.640625, "learning_rate": 4.98863291899169e-05, "loss": 0.4884, "step": 1291 }, { "epoch": 8.024922118380061, "grad_norm": 3.203125, "learning_rate": 4.9886094390159144e-05, "loss": 0.3929, "step": 1292 }, { "epoch": 8.031152647975079, "grad_norm": 3.53125, "learning_rate": 4.9885859348702665e-05, "loss": 0.4501, "step": 1293 }, { "epoch": 8.037383177570094, "grad_norm": 3.015625, "learning_rate": 4.9885624065549747e-05, "loss": 0.3085, "step": 1294 }, { "epoch": 8.04361370716511, "grad_norm": 4.34375, "learning_rate": 4.988538854070267e-05, "loss": 0.486, "step": 1295 }, { "epoch": 8.049844236760125, "grad_norm": 3.78125, "learning_rate": 4.988515277416373e-05, "loss": 0.4864, "step": 1296 }, { "epoch": 8.05607476635514, "grad_norm": 4.40625, "learning_rate": 4.9884916765935205e-05, "loss": 0.39, "step": 1297 }, { "epoch": 8.062305295950155, "grad_norm": 3.96875, "learning_rate": 4.98846805160194e-05, "loss": 0.2917, "step": 1298 }, { "epoch": 8.06853582554517, "grad_norm": 3.125, "learning_rate": 4.988444402441861e-05, "loss": 0.2848, "step": 1299 }, { "epoch": 8.074766355140186, "grad_norm": 3.890625, "learning_rate": 4.988420729113511e-05, "loss": 0.3825, "step": 1300 }, { "epoch": 8.080996884735203, "grad_norm": 4.03125, "learning_rate": 4.9883970316171226e-05, "loss": 0.5276, "step": 1301 }, { "epoch": 8.087227414330219, "grad_norm": 3.59375, "learning_rate": 4.988373309952924e-05, "loss": 0.4851, "step": 1302 }, { "epoch": 8.093457943925234, "grad_norm": 4.84375, "learning_rate": 4.988349564121147e-05, "loss": 0.4123, "step": 1303 }, { "epoch": 8.09968847352025, "grad_norm": 4.375, "learning_rate": 4.988325794122022e-05, "loss": 0.3266, "step": 1304 }, { "epoch": 8.105919003115265, "grad_norm": 2.140625, "learning_rate": 4.9883019999557786e-05, "loss": 0.2324, "step": 1305 }, { "epoch": 8.11214953271028, "grad_norm": 3.765625, "learning_rate": 4.9882781816226485e-05, "loss": 0.4671, "step": 1306 }, { "epoch": 8.118380062305295, "grad_norm": 4.09375, "learning_rate": 4.988254339122864e-05, "loss": 0.3285, "step": 1307 }, { "epoch": 8.12461059190031, "grad_norm": 2.90625, "learning_rate": 4.9882304724566565e-05, "loss": 0.465, "step": 1308 }, { "epoch": 8.130841121495328, "grad_norm": 4.3125, "learning_rate": 4.988206581624256e-05, "loss": 0.5063, "step": 1309 }, { "epoch": 8.137071651090343, "grad_norm": 4.3125, "learning_rate": 4.988182666625897e-05, "loss": 0.4262, "step": 1310 }, { "epoch": 8.143302180685358, "grad_norm": 3.90625, "learning_rate": 4.988158727461809e-05, "loss": 0.2515, "step": 1311 }, { "epoch": 8.149532710280374, "grad_norm": 3.578125, "learning_rate": 4.9881347641322277e-05, "loss": 0.4284, "step": 1312 }, { "epoch": 8.15576323987539, "grad_norm": 3.25, "learning_rate": 4.988110776637383e-05, "loss": 0.2837, "step": 1313 }, { "epoch": 8.161993769470405, "grad_norm": 5.21875, "learning_rate": 4.9880867649775096e-05, "loss": 0.2663, "step": 1314 }, { "epoch": 8.16822429906542, "grad_norm": 4.8125, "learning_rate": 4.9880627291528403e-05, "loss": 0.3982, "step": 1315 }, { "epoch": 8.174454828660435, "grad_norm": 3.109375, "learning_rate": 4.9880386691636086e-05, "loss": 0.2316, "step": 1316 }, { "epoch": 8.180685358255452, "grad_norm": 2.40625, "learning_rate": 4.988014585010048e-05, "loss": 0.3048, "step": 1317 }, { "epoch": 8.186915887850468, "grad_norm": 4.6875, "learning_rate": 4.987990476692392e-05, "loss": 0.3945, "step": 1318 }, { "epoch": 8.193146417445483, "grad_norm": 5.6875, "learning_rate": 4.987966344210875e-05, "loss": 0.558, "step": 1319 }, { "epoch": 8.199376947040498, "grad_norm": 5.96875, "learning_rate": 4.9879421875657314e-05, "loss": 0.4607, "step": 1320 }, { "epoch": 8.205607476635514, "grad_norm": 4.4375, "learning_rate": 4.9879180067571966e-05, "loss": 0.4133, "step": 1321 }, { "epoch": 8.21183800623053, "grad_norm": 4.40625, "learning_rate": 4.9878938017855046e-05, "loss": 0.3801, "step": 1322 }, { "epoch": 8.218068535825545, "grad_norm": 5.3125, "learning_rate": 4.987869572650891e-05, "loss": 0.613, "step": 1323 }, { "epoch": 8.22429906542056, "grad_norm": 4.59375, "learning_rate": 4.98784531935359e-05, "loss": 0.4078, "step": 1324 }, { "epoch": 8.230529595015577, "grad_norm": 3.40625, "learning_rate": 4.987821041893839e-05, "loss": 0.4506, "step": 1325 }, { "epoch": 8.236760124610592, "grad_norm": 3.1875, "learning_rate": 4.987796740271872e-05, "loss": 0.3616, "step": 1326 }, { "epoch": 8.242990654205608, "grad_norm": 4.875, "learning_rate": 4.987772414487926e-05, "loss": 0.6186, "step": 1327 }, { "epoch": 8.249221183800623, "grad_norm": 4.875, "learning_rate": 4.9877480645422365e-05, "loss": 0.3585, "step": 1328 }, { "epoch": 8.255451713395638, "grad_norm": 3.390625, "learning_rate": 4.987723690435041e-05, "loss": 0.2844, "step": 1329 }, { "epoch": 8.261682242990654, "grad_norm": 3.390625, "learning_rate": 4.9876992921665754e-05, "loss": 0.404, "step": 1330 }, { "epoch": 8.26791277258567, "grad_norm": 3.6875, "learning_rate": 4.987674869737077e-05, "loss": 0.3388, "step": 1331 }, { "epoch": 8.274143302180685, "grad_norm": 3.484375, "learning_rate": 4.987650423146783e-05, "loss": 0.3457, "step": 1332 }, { "epoch": 8.280373831775702, "grad_norm": 3.3125, "learning_rate": 4.987625952395931e-05, "loss": 0.2975, "step": 1333 }, { "epoch": 8.286604361370717, "grad_norm": 3.265625, "learning_rate": 4.987601457484758e-05, "loss": 0.519, "step": 1334 }, { "epoch": 8.292834890965732, "grad_norm": 3.203125, "learning_rate": 4.987576938413504e-05, "loss": 0.2646, "step": 1335 }, { "epoch": 8.299065420560748, "grad_norm": 3.765625, "learning_rate": 4.9875523951824045e-05, "loss": 0.5175, "step": 1336 }, { "epoch": 8.305295950155763, "grad_norm": 5.0, "learning_rate": 4.987527827791698e-05, "loss": 0.5623, "step": 1337 }, { "epoch": 8.311526479750778, "grad_norm": 3.296875, "learning_rate": 4.9875032362416254e-05, "loss": 0.5291, "step": 1338 }, { "epoch": 8.317757009345794, "grad_norm": 3.515625, "learning_rate": 4.987478620532423e-05, "loss": 0.4635, "step": 1339 }, { "epoch": 8.32398753894081, "grad_norm": 3.40625, "learning_rate": 4.987453980664332e-05, "loss": 0.4083, "step": 1340 }, { "epoch": 8.330218068535826, "grad_norm": 3.59375, "learning_rate": 4.98742931663759e-05, "loss": 0.5085, "step": 1341 }, { "epoch": 8.336448598130842, "grad_norm": 4.90625, "learning_rate": 4.987404628452438e-05, "loss": 0.4653, "step": 1342 }, { "epoch": 8.342679127725857, "grad_norm": 4.125, "learning_rate": 4.987379916109114e-05, "loss": 0.489, "step": 1343 }, { "epoch": 8.348909657320872, "grad_norm": 4.0, "learning_rate": 4.987355179607859e-05, "loss": 0.5742, "step": 1344 }, { "epoch": 8.355140186915888, "grad_norm": 4.875, "learning_rate": 4.9873304189489144e-05, "loss": 0.5105, "step": 1345 }, { "epoch": 8.361370716510903, "grad_norm": 4.40625, "learning_rate": 4.987305634132519e-05, "loss": 0.3681, "step": 1346 }, { "epoch": 8.367601246105918, "grad_norm": 5.25, "learning_rate": 4.9872808251589144e-05, "loss": 0.375, "step": 1347 }, { "epoch": 8.373831775700934, "grad_norm": 3.40625, "learning_rate": 4.98725599202834e-05, "loss": 0.4207, "step": 1348 }, { "epoch": 8.38006230529595, "grad_norm": 4.65625, "learning_rate": 4.98723113474104e-05, "loss": 0.308, "step": 1349 }, { "epoch": 8.386292834890966, "grad_norm": 5.21875, "learning_rate": 4.987206253297253e-05, "loss": 0.405, "step": 1350 }, { "epoch": 8.392523364485982, "grad_norm": 2.609375, "learning_rate": 4.987181347697221e-05, "loss": 0.2778, "step": 1351 }, { "epoch": 8.398753894080997, "grad_norm": 4.4375, "learning_rate": 4.987156417941188e-05, "loss": 0.363, "step": 1352 }, { "epoch": 8.404984423676012, "grad_norm": 4.34375, "learning_rate": 4.987131464029394e-05, "loss": 0.2555, "step": 1353 }, { "epoch": 8.411214953271028, "grad_norm": 3.78125, "learning_rate": 4.987106485962082e-05, "loss": 0.4244, "step": 1354 }, { "epoch": 8.417445482866043, "grad_norm": 2.640625, "learning_rate": 4.987081483739495e-05, "loss": 0.3264, "step": 1355 }, { "epoch": 8.423676012461058, "grad_norm": 3.265625, "learning_rate": 4.9870564573618747e-05, "loss": 0.3783, "step": 1356 }, { "epoch": 8.429906542056075, "grad_norm": 8.125, "learning_rate": 4.9870314068294655e-05, "loss": 0.5975, "step": 1357 }, { "epoch": 8.43613707165109, "grad_norm": 5.375, "learning_rate": 4.9870063321425105e-05, "loss": 0.4025, "step": 1358 }, { "epoch": 8.442367601246106, "grad_norm": 2.921875, "learning_rate": 4.986981233301252e-05, "loss": 0.2771, "step": 1359 }, { "epoch": 8.448598130841122, "grad_norm": 3.765625, "learning_rate": 4.9869561103059354e-05, "loss": 0.3953, "step": 1360 }, { "epoch": 8.454828660436137, "grad_norm": 5.875, "learning_rate": 4.986930963156804e-05, "loss": 0.4403, "step": 1361 }, { "epoch": 8.461059190031152, "grad_norm": 4.5625, "learning_rate": 4.986905791854101e-05, "loss": 0.4021, "step": 1362 }, { "epoch": 8.467289719626168, "grad_norm": 3.75, "learning_rate": 4.9868805963980726e-05, "loss": 0.601, "step": 1363 }, { "epoch": 8.473520249221183, "grad_norm": 3.40625, "learning_rate": 4.986855376788963e-05, "loss": 0.515, "step": 1364 }, { "epoch": 8.4797507788162, "grad_norm": 3.625, "learning_rate": 4.986830133027016e-05, "loss": 0.3628, "step": 1365 }, { "epoch": 8.485981308411215, "grad_norm": 4.6875, "learning_rate": 4.986804865112478e-05, "loss": 0.5495, "step": 1366 }, { "epoch": 8.49221183800623, "grad_norm": 3.890625, "learning_rate": 4.986779573045594e-05, "loss": 0.2991, "step": 1367 }, { "epoch": 8.498442367601246, "grad_norm": 4.75, "learning_rate": 4.98675425682661e-05, "loss": 0.3734, "step": 1368 }, { "epoch": 8.504672897196262, "grad_norm": 4.03125, "learning_rate": 4.986728916455771e-05, "loss": 0.48, "step": 1369 }, { "epoch": 8.510903426791277, "grad_norm": 2.71875, "learning_rate": 4.986703551933324e-05, "loss": 0.2212, "step": 1370 }, { "epoch": 8.517133956386292, "grad_norm": 3.140625, "learning_rate": 4.986678163259515e-05, "loss": 0.2972, "step": 1371 }, { "epoch": 8.523364485981308, "grad_norm": 3.875, "learning_rate": 4.986652750434591e-05, "loss": 0.5386, "step": 1372 }, { "epoch": 8.529595015576325, "grad_norm": 3.859375, "learning_rate": 4.986627313458798e-05, "loss": 0.2935, "step": 1373 }, { "epoch": 8.53582554517134, "grad_norm": 3.921875, "learning_rate": 4.986601852332383e-05, "loss": 0.4043, "step": 1374 }, { "epoch": 8.542056074766355, "grad_norm": 2.890625, "learning_rate": 4.986576367055594e-05, "loss": 0.2797, "step": 1375 }, { "epoch": 8.54828660436137, "grad_norm": 4.59375, "learning_rate": 4.986550857628679e-05, "loss": 0.4316, "step": 1376 }, { "epoch": 8.554517133956386, "grad_norm": 3.65625, "learning_rate": 4.986525324051885e-05, "loss": 0.3692, "step": 1377 }, { "epoch": 8.560747663551401, "grad_norm": 3.796875, "learning_rate": 4.986499766325459e-05, "loss": 0.408, "step": 1378 }, { "epoch": 8.566978193146417, "grad_norm": 4.65625, "learning_rate": 4.9864741844496506e-05, "loss": 0.4166, "step": 1379 }, { "epoch": 8.573208722741432, "grad_norm": 4.65625, "learning_rate": 4.9864485784247086e-05, "loss": 0.381, "step": 1380 }, { "epoch": 8.57943925233645, "grad_norm": 3.0625, "learning_rate": 4.9864229482508804e-05, "loss": 0.2736, "step": 1381 }, { "epoch": 8.585669781931465, "grad_norm": 4.3125, "learning_rate": 4.986397293928415e-05, "loss": 0.2526, "step": 1382 }, { "epoch": 8.59190031152648, "grad_norm": 7.5, "learning_rate": 4.986371615457564e-05, "loss": 0.3489, "step": 1383 }, { "epoch": 8.598130841121495, "grad_norm": 5.28125, "learning_rate": 4.986345912838573e-05, "loss": 0.6103, "step": 1384 }, { "epoch": 8.60436137071651, "grad_norm": 4.6875, "learning_rate": 4.986320186071694e-05, "loss": 0.2931, "step": 1385 }, { "epoch": 8.610591900311526, "grad_norm": 2.484375, "learning_rate": 4.986294435157176e-05, "loss": 0.2484, "step": 1386 }, { "epoch": 8.616822429906541, "grad_norm": 6.5625, "learning_rate": 4.986268660095269e-05, "loss": 0.334, "step": 1387 }, { "epoch": 8.623052959501557, "grad_norm": 5.125, "learning_rate": 4.9862428608862253e-05, "loss": 0.5123, "step": 1388 }, { "epoch": 8.629283489096574, "grad_norm": 7.34375, "learning_rate": 4.986217037530293e-05, "loss": 0.4144, "step": 1389 }, { "epoch": 8.63551401869159, "grad_norm": 4.21875, "learning_rate": 4.9861911900277236e-05, "loss": 0.4631, "step": 1390 }, { "epoch": 8.641744548286605, "grad_norm": 4.71875, "learning_rate": 4.986165318378769e-05, "loss": 0.2088, "step": 1391 }, { "epoch": 8.64797507788162, "grad_norm": 7.0, "learning_rate": 4.98613942258368e-05, "loss": 0.5726, "step": 1392 }, { "epoch": 8.654205607476635, "grad_norm": 4.84375, "learning_rate": 4.9861135026427074e-05, "loss": 0.4645, "step": 1393 }, { "epoch": 8.66043613707165, "grad_norm": 3.78125, "learning_rate": 4.986087558556104e-05, "loss": 0.4516, "step": 1394 }, { "epoch": 8.666666666666666, "grad_norm": 4.34375, "learning_rate": 4.98606159032412e-05, "loss": 0.342, "step": 1395 }, { "epoch": 8.672897196261681, "grad_norm": 5.21875, "learning_rate": 4.986035597947011e-05, "loss": 0.6123, "step": 1396 }, { "epoch": 8.679127725856699, "grad_norm": 3.796875, "learning_rate": 4.986009581425026e-05, "loss": 0.3505, "step": 1397 }, { "epoch": 8.685358255451714, "grad_norm": 4.6875, "learning_rate": 4.9859835407584186e-05, "loss": 0.3487, "step": 1398 }, { "epoch": 8.69158878504673, "grad_norm": 3.890625, "learning_rate": 4.985957475947443e-05, "loss": 0.4335, "step": 1399 }, { "epoch": 8.697819314641745, "grad_norm": 4.40625, "learning_rate": 4.985931386992351e-05, "loss": 0.3712, "step": 1400 }, { "epoch": 8.70404984423676, "grad_norm": 4.0, "learning_rate": 4.9859052738933966e-05, "loss": 0.3335, "step": 1401 }, { "epoch": 8.710280373831775, "grad_norm": 2.796875, "learning_rate": 4.985879136650834e-05, "loss": 0.373, "step": 1402 }, { "epoch": 8.71651090342679, "grad_norm": 2.984375, "learning_rate": 4.985852975264915e-05, "loss": 0.3417, "step": 1403 }, { "epoch": 8.722741433021806, "grad_norm": 2.6875, "learning_rate": 4.9858267897358956e-05, "loss": 0.341, "step": 1404 }, { "epoch": 8.728971962616823, "grad_norm": 3.359375, "learning_rate": 4.985800580064029e-05, "loss": 0.3096, "step": 1405 }, { "epoch": 8.735202492211839, "grad_norm": 2.90625, "learning_rate": 4.985774346249571e-05, "loss": 0.3406, "step": 1406 }, { "epoch": 8.741433021806854, "grad_norm": 4.0625, "learning_rate": 4.985748088292775e-05, "loss": 0.6418, "step": 1407 }, { "epoch": 8.74766355140187, "grad_norm": 4.03125, "learning_rate": 4.9857218061938976e-05, "loss": 0.4699, "step": 1408 }, { "epoch": 8.753894080996885, "grad_norm": 3.21875, "learning_rate": 4.985695499953193e-05, "loss": 0.5428, "step": 1409 }, { "epoch": 8.7601246105919, "grad_norm": 3.8125, "learning_rate": 4.985669169570916e-05, "loss": 0.3722, "step": 1410 }, { "epoch": 8.766355140186915, "grad_norm": 3.375, "learning_rate": 4.985642815047323e-05, "loss": 0.4381, "step": 1411 }, { "epoch": 8.77258566978193, "grad_norm": 3.578125, "learning_rate": 4.985616436382671e-05, "loss": 0.4347, "step": 1412 }, { "epoch": 8.778816199376948, "grad_norm": 3.171875, "learning_rate": 4.985590033577215e-05, "loss": 0.2959, "step": 1413 }, { "epoch": 8.785046728971963, "grad_norm": 4.4375, "learning_rate": 4.985563606631212e-05, "loss": 0.433, "step": 1414 }, { "epoch": 8.791277258566979, "grad_norm": 3.53125, "learning_rate": 4.9855371555449185e-05, "loss": 0.3808, "step": 1415 }, { "epoch": 8.797507788161994, "grad_norm": 3.71875, "learning_rate": 4.98551068031859e-05, "loss": 0.3154, "step": 1416 }, { "epoch": 8.80373831775701, "grad_norm": 4.625, "learning_rate": 4.9854841809524866e-05, "loss": 0.4619, "step": 1417 }, { "epoch": 8.809968847352025, "grad_norm": 2.734375, "learning_rate": 4.985457657446863e-05, "loss": 0.3041, "step": 1418 }, { "epoch": 8.81619937694704, "grad_norm": 3.546875, "learning_rate": 4.985431109801978e-05, "loss": 0.3683, "step": 1419 }, { "epoch": 8.822429906542055, "grad_norm": 6.28125, "learning_rate": 4.985404538018089e-05, "loss": 0.3789, "step": 1420 }, { "epoch": 8.828660436137072, "grad_norm": 6.46875, "learning_rate": 4.985377942095455e-05, "loss": 0.4158, "step": 1421 }, { "epoch": 8.834890965732088, "grad_norm": 5.21875, "learning_rate": 4.985351322034333e-05, "loss": 0.4277, "step": 1422 }, { "epoch": 8.841121495327103, "grad_norm": 3.796875, "learning_rate": 4.985324677834983e-05, "loss": 0.4379, "step": 1423 }, { "epoch": 8.847352024922118, "grad_norm": 5.9375, "learning_rate": 4.985298009497662e-05, "loss": 0.4923, "step": 1424 }, { "epoch": 8.853582554517134, "grad_norm": 9.125, "learning_rate": 4.9852713170226294e-05, "loss": 0.6862, "step": 1425 }, { "epoch": 8.85981308411215, "grad_norm": 6.53125, "learning_rate": 4.985244600410146e-05, "loss": 0.3098, "step": 1426 }, { "epoch": 8.866043613707165, "grad_norm": 5.59375, "learning_rate": 4.9852178596604705e-05, "loss": 0.3421, "step": 1427 }, { "epoch": 8.87227414330218, "grad_norm": 4.25, "learning_rate": 4.985191094773861e-05, "loss": 0.352, "step": 1428 }, { "epoch": 8.878504672897197, "grad_norm": 7.125, "learning_rate": 4.9851643057505805e-05, "loss": 0.3544, "step": 1429 }, { "epoch": 8.884735202492212, "grad_norm": 6.25, "learning_rate": 4.985137492590887e-05, "loss": 0.2265, "step": 1430 }, { "epoch": 8.890965732087228, "grad_norm": 4.1875, "learning_rate": 4.9851106552950404e-05, "loss": 0.4324, "step": 1431 }, { "epoch": 8.897196261682243, "grad_norm": 4.375, "learning_rate": 4.985083793863303e-05, "loss": 0.4722, "step": 1432 }, { "epoch": 8.903426791277258, "grad_norm": 3.375, "learning_rate": 4.985056908295935e-05, "loss": 0.4213, "step": 1433 }, { "epoch": 8.909657320872274, "grad_norm": 4.25, "learning_rate": 4.985029998593198e-05, "loss": 0.428, "step": 1434 }, { "epoch": 8.91588785046729, "grad_norm": 4.03125, "learning_rate": 4.9850030647553525e-05, "loss": 0.4185, "step": 1435 }, { "epoch": 8.922118380062305, "grad_norm": 4.59375, "learning_rate": 4.984976106782661e-05, "loss": 0.4633, "step": 1436 }, { "epoch": 8.928348909657322, "grad_norm": 3.03125, "learning_rate": 4.984949124675383e-05, "loss": 0.2845, "step": 1437 }, { "epoch": 8.934579439252337, "grad_norm": 2.84375, "learning_rate": 4.9849221184337843e-05, "loss": 0.2843, "step": 1438 }, { "epoch": 8.940809968847352, "grad_norm": 3.84375, "learning_rate": 4.9848950880581246e-05, "loss": 0.2972, "step": 1439 }, { "epoch": 8.947040498442368, "grad_norm": 3.1875, "learning_rate": 4.984868033548667e-05, "loss": 0.2857, "step": 1440 }, { "epoch": 8.953271028037383, "grad_norm": 3.84375, "learning_rate": 4.9848409549056745e-05, "loss": 0.3891, "step": 1441 }, { "epoch": 8.959501557632398, "grad_norm": 3.890625, "learning_rate": 4.984813852129409e-05, "loss": 0.4106, "step": 1442 }, { "epoch": 8.965732087227414, "grad_norm": 2.5, "learning_rate": 4.984786725220136e-05, "loss": 0.2219, "step": 1443 }, { "epoch": 8.97196261682243, "grad_norm": 4.03125, "learning_rate": 4.9847595741781175e-05, "loss": 0.3955, "step": 1444 }, { "epoch": 8.978193146417446, "grad_norm": 3.484375, "learning_rate": 4.9847323990036174e-05, "loss": 0.4804, "step": 1445 }, { "epoch": 8.984423676012462, "grad_norm": 3.9375, "learning_rate": 4.984705199696899e-05, "loss": 0.3511, "step": 1446 }, { "epoch": 8.990654205607477, "grad_norm": 3.109375, "learning_rate": 4.984677976258227e-05, "loss": 0.2896, "step": 1447 }, { "epoch": 8.996884735202492, "grad_norm": 3.75, "learning_rate": 4.9846507286878665e-05, "loss": 0.4947, "step": 1448 }, { "epoch": 9.0, "grad_norm": 2.453125, "learning_rate": 4.9846234569860806e-05, "loss": 0.24, "step": 1449 }, { "epoch": 9.006230529595015, "grad_norm": 2.515625, "learning_rate": 4.984596161153136e-05, "loss": 0.2993, "step": 1450 }, { "epoch": 9.01246105919003, "grad_norm": 4.1875, "learning_rate": 4.9845688411892956e-05, "loss": 0.4036, "step": 1451 }, { "epoch": 9.018691588785046, "grad_norm": 3.421875, "learning_rate": 4.984541497094827e-05, "loss": 0.3393, "step": 1452 }, { "epoch": 9.024922118380061, "grad_norm": 3.8125, "learning_rate": 4.984514128869994e-05, "loss": 0.411, "step": 1453 }, { "epoch": 9.031152647975079, "grad_norm": 4.34375, "learning_rate": 4.9844867365150636e-05, "loss": 0.4003, "step": 1454 }, { "epoch": 9.037383177570094, "grad_norm": 4.4375, "learning_rate": 4.984459320030301e-05, "loss": 0.3377, "step": 1455 }, { "epoch": 9.04361370716511, "grad_norm": 3.625, "learning_rate": 4.984431879415973e-05, "loss": 0.2611, "step": 1456 }, { "epoch": 9.049844236760125, "grad_norm": 3.984375, "learning_rate": 4.984404414672346e-05, "loss": 0.4152, "step": 1457 }, { "epoch": 9.05607476635514, "grad_norm": 7.40625, "learning_rate": 4.984376925799687e-05, "loss": 0.344, "step": 1458 }, { "epoch": 9.062305295950155, "grad_norm": 8.0625, "learning_rate": 4.984349412798263e-05, "loss": 0.2804, "step": 1459 }, { "epoch": 9.06853582554517, "grad_norm": 9.25, "learning_rate": 4.98432187566834e-05, "loss": 0.4757, "step": 1460 }, { "epoch": 9.074766355140186, "grad_norm": 4.84375, "learning_rate": 4.984294314410186e-05, "loss": 0.4826, "step": 1461 }, { "epoch": 9.080996884735203, "grad_norm": 5.0, "learning_rate": 4.98426672902407e-05, "loss": 0.4146, "step": 1462 }, { "epoch": 9.087227414330219, "grad_norm": 4.0625, "learning_rate": 4.9842391195102585e-05, "loss": 0.4338, "step": 1463 }, { "epoch": 9.093457943925234, "grad_norm": 4.40625, "learning_rate": 4.98421148586902e-05, "loss": 0.4155, "step": 1464 }, { "epoch": 9.09968847352025, "grad_norm": 3.0625, "learning_rate": 4.9841838281006235e-05, "loss": 0.2522, "step": 1465 }, { "epoch": 9.105919003115265, "grad_norm": 3.3125, "learning_rate": 4.984156146205336e-05, "loss": 0.2294, "step": 1466 }, { "epoch": 9.11214953271028, "grad_norm": 4.09375, "learning_rate": 4.984128440183429e-05, "loss": 0.2848, "step": 1467 }, { "epoch": 9.118380062305295, "grad_norm": 3.328125, "learning_rate": 4.984100710035169e-05, "loss": 0.2469, "step": 1468 }, { "epoch": 9.12461059190031, "grad_norm": 2.90625, "learning_rate": 4.9840729557608265e-05, "loss": 0.3061, "step": 1469 }, { "epoch": 9.130841121495328, "grad_norm": 5.0625, "learning_rate": 4.984045177360671e-05, "loss": 0.423, "step": 1470 }, { "epoch": 9.137071651090343, "grad_norm": 3.703125, "learning_rate": 4.9840173748349716e-05, "loss": 0.4468, "step": 1471 }, { "epoch": 9.143302180685358, "grad_norm": 3.078125, "learning_rate": 4.9839895481839994e-05, "loss": 0.2443, "step": 1472 }, { "epoch": 9.149532710280374, "grad_norm": 2.171875, "learning_rate": 4.9839616974080246e-05, "loss": 0.2, "step": 1473 }, { "epoch": 9.15576323987539, "grad_norm": 3.265625, "learning_rate": 4.983933822507316e-05, "loss": 0.3402, "step": 1474 }, { "epoch": 9.161993769470405, "grad_norm": 3.203125, "learning_rate": 4.983905923482147e-05, "loss": 0.3279, "step": 1475 }, { "epoch": 9.16822429906542, "grad_norm": 2.96875, "learning_rate": 4.983878000332787e-05, "loss": 0.2908, "step": 1476 }, { "epoch": 9.174454828660435, "grad_norm": 3.828125, "learning_rate": 4.9838500530595066e-05, "loss": 0.3899, "step": 1477 }, { "epoch": 9.180685358255452, "grad_norm": 1.8203125, "learning_rate": 4.983822081662578e-05, "loss": 0.2113, "step": 1478 }, { "epoch": 9.186915887850468, "grad_norm": 3.453125, "learning_rate": 4.983794086142273e-05, "loss": 0.3908, "step": 1479 }, { "epoch": 9.193146417445483, "grad_norm": 4.0, "learning_rate": 4.9837660664988636e-05, "loss": 0.2624, "step": 1480 }, { "epoch": 9.199376947040498, "grad_norm": 3.0625, "learning_rate": 4.983738022732621e-05, "loss": 0.4495, "step": 1481 }, { "epoch": 9.205607476635514, "grad_norm": 2.484375, "learning_rate": 4.9837099548438195e-05, "loss": 0.2885, "step": 1482 }, { "epoch": 9.21183800623053, "grad_norm": 3.203125, "learning_rate": 4.98368186283273e-05, "loss": 0.3212, "step": 1483 }, { "epoch": 9.218068535825545, "grad_norm": 2.1875, "learning_rate": 4.9836537466996244e-05, "loss": 0.2248, "step": 1484 }, { "epoch": 9.22429906542056, "grad_norm": 2.265625, "learning_rate": 4.983625606444779e-05, "loss": 0.257, "step": 1485 }, { "epoch": 9.230529595015577, "grad_norm": 2.390625, "learning_rate": 4.983597442068464e-05, "loss": 0.257, "step": 1486 }, { "epoch": 9.236760124610592, "grad_norm": 3.21875, "learning_rate": 4.983569253570955e-05, "loss": 0.2767, "step": 1487 }, { "epoch": 9.242990654205608, "grad_norm": 3.671875, "learning_rate": 4.9835410409525246e-05, "loss": 0.3661, "step": 1488 }, { "epoch": 9.249221183800623, "grad_norm": 4.1875, "learning_rate": 4.983512804213447e-05, "loss": 0.4692, "step": 1489 }, { "epoch": 9.255451713395638, "grad_norm": 3.25, "learning_rate": 4.9834845433539966e-05, "loss": 0.3731, "step": 1490 }, { "epoch": 9.261682242990654, "grad_norm": 3.4375, "learning_rate": 4.983456258374448e-05, "loss": 0.2799, "step": 1491 }, { "epoch": 9.26791277258567, "grad_norm": 4.375, "learning_rate": 4.983427949275076e-05, "loss": 0.3337, "step": 1492 }, { "epoch": 9.274143302180685, "grad_norm": 3.78125, "learning_rate": 4.983399616056155e-05, "loss": 0.4058, "step": 1493 }, { "epoch": 9.280373831775702, "grad_norm": 4.75, "learning_rate": 4.983371258717961e-05, "loss": 0.3449, "step": 1494 }, { "epoch": 9.286604361370717, "grad_norm": 6.625, "learning_rate": 4.9833428772607685e-05, "loss": 0.5227, "step": 1495 }, { "epoch": 9.292834890965732, "grad_norm": 5.96875, "learning_rate": 4.983314471684853e-05, "loss": 0.2311, "step": 1496 }, { "epoch": 9.299065420560748, "grad_norm": 2.1875, "learning_rate": 4.983286041990492e-05, "loss": 0.3054, "step": 1497 }, { "epoch": 9.305295950155763, "grad_norm": 5.125, "learning_rate": 4.98325758817796e-05, "loss": 0.379, "step": 1498 }, { "epoch": 9.311526479750778, "grad_norm": 5.46875, "learning_rate": 4.983229110247534e-05, "loss": 0.6068, "step": 1499 }, { "epoch": 9.317757009345794, "grad_norm": 3.84375, "learning_rate": 4.9832006081994907e-05, "loss": 0.4105, "step": 1500 }, { "epoch": 9.32398753894081, "grad_norm": 3.71875, "learning_rate": 4.983172082034106e-05, "loss": 0.5494, "step": 1501 }, { "epoch": 9.330218068535826, "grad_norm": 2.75, "learning_rate": 4.983143531751659e-05, "loss": 0.4175, "step": 1502 }, { "epoch": 9.336448598130842, "grad_norm": 3.765625, "learning_rate": 4.983114957352425e-05, "loss": 0.3544, "step": 1503 }, { "epoch": 9.342679127725857, "grad_norm": 4.8125, "learning_rate": 4.9830863588366815e-05, "loss": 0.2478, "step": 1504 }, { "epoch": 9.348909657320872, "grad_norm": 2.734375, "learning_rate": 4.9830577362047076e-05, "loss": 0.2417, "step": 1505 }, { "epoch": 9.355140186915888, "grad_norm": 3.828125, "learning_rate": 4.98302908945678e-05, "loss": 0.3977, "step": 1506 }, { "epoch": 9.361370716510903, "grad_norm": 3.21875, "learning_rate": 4.983000418593179e-05, "loss": 0.2396, "step": 1507 }, { "epoch": 9.367601246105918, "grad_norm": 2.375, "learning_rate": 4.98297172361418e-05, "loss": 0.2267, "step": 1508 }, { "epoch": 9.373831775700934, "grad_norm": 3.015625, "learning_rate": 4.982943004520064e-05, "loss": 0.2676, "step": 1509 }, { "epoch": 9.38006230529595, "grad_norm": 3.90625, "learning_rate": 4.982914261311109e-05, "loss": 0.37, "step": 1510 }, { "epoch": 9.386292834890966, "grad_norm": 4.3125, "learning_rate": 4.9828854939875945e-05, "loss": 0.3094, "step": 1511 }, { "epoch": 9.392523364485982, "grad_norm": 3.953125, "learning_rate": 4.9828567025498e-05, "loss": 0.3521, "step": 1512 }, { "epoch": 9.398753894080997, "grad_norm": 3.625, "learning_rate": 4.9828278869980044e-05, "loss": 0.3441, "step": 1513 }, { "epoch": 9.404984423676012, "grad_norm": 2.84375, "learning_rate": 4.982799047332488e-05, "loss": 0.3019, "step": 1514 }, { "epoch": 9.411214953271028, "grad_norm": 4.5625, "learning_rate": 4.9827701835535315e-05, "loss": 0.307, "step": 1515 }, { "epoch": 9.417445482866043, "grad_norm": 3.484375, "learning_rate": 4.9827412956614145e-05, "loss": 0.4376, "step": 1516 }, { "epoch": 9.423676012461058, "grad_norm": 4.25, "learning_rate": 4.9827123836564175e-05, "loss": 0.2868, "step": 1517 }, { "epoch": 9.429906542056075, "grad_norm": 2.484375, "learning_rate": 4.982683447538822e-05, "loss": 0.2598, "step": 1518 }, { "epoch": 9.43613707165109, "grad_norm": 4.40625, "learning_rate": 4.982654487308908e-05, "loss": 0.4036, "step": 1519 }, { "epoch": 9.442367601246106, "grad_norm": 2.484375, "learning_rate": 4.9826255029669577e-05, "loss": 0.2996, "step": 1520 }, { "epoch": 9.448598130841122, "grad_norm": 6.375, "learning_rate": 4.982596494513252e-05, "loss": 0.4265, "step": 1521 }, { "epoch": 9.454828660436137, "grad_norm": 4.90625, "learning_rate": 4.982567461948073e-05, "loss": 0.2577, "step": 1522 }, { "epoch": 9.461059190031152, "grad_norm": 5.84375, "learning_rate": 4.982538405271703e-05, "loss": 0.5314, "step": 1523 }, { "epoch": 9.467289719626168, "grad_norm": 3.15625, "learning_rate": 4.982509324484424e-05, "loss": 0.2529, "step": 1524 }, { "epoch": 9.473520249221183, "grad_norm": 6.46875, "learning_rate": 4.9824802195865164e-05, "loss": 0.3395, "step": 1525 }, { "epoch": 9.4797507788162, "grad_norm": 2.921875, "learning_rate": 4.982451090578266e-05, "loss": 0.3474, "step": 1526 }, { "epoch": 9.485981308411215, "grad_norm": 4.78125, "learning_rate": 4.9824219374599543e-05, "loss": 0.335, "step": 1527 }, { "epoch": 9.49221183800623, "grad_norm": 4.125, "learning_rate": 4.982392760231864e-05, "loss": 0.4863, "step": 1528 }, { "epoch": 9.498442367601246, "grad_norm": 2.90625, "learning_rate": 4.98236355889428e-05, "loss": 0.3372, "step": 1529 }, { "epoch": 9.504672897196262, "grad_norm": 3.8125, "learning_rate": 4.9823343334474844e-05, "loss": 0.4272, "step": 1530 }, { "epoch": 9.510903426791277, "grad_norm": 4.0, "learning_rate": 4.982305083891761e-05, "loss": 0.4918, "step": 1531 }, { "epoch": 9.517133956386292, "grad_norm": 2.296875, "learning_rate": 4.982275810227395e-05, "loss": 0.2727, "step": 1532 }, { "epoch": 9.523364485981308, "grad_norm": 3.0, "learning_rate": 4.9822465124546705e-05, "loss": 0.2535, "step": 1533 }, { "epoch": 9.529595015576325, "grad_norm": 3.96875, "learning_rate": 4.982217190573871e-05, "loss": 0.4352, "step": 1534 }, { "epoch": 9.53582554517134, "grad_norm": 2.828125, "learning_rate": 4.982187844585282e-05, "loss": 0.2624, "step": 1535 }, { "epoch": 9.542056074766355, "grad_norm": 2.59375, "learning_rate": 4.982158474489189e-05, "loss": 0.2743, "step": 1536 }, { "epoch": 9.54828660436137, "grad_norm": 1.9453125, "learning_rate": 4.982129080285876e-05, "loss": 0.2233, "step": 1537 }, { "epoch": 9.554517133956386, "grad_norm": 3.828125, "learning_rate": 4.98209966197563e-05, "loss": 0.3479, "step": 1538 }, { "epoch": 9.560747663551401, "grad_norm": 2.53125, "learning_rate": 4.9820702195587356e-05, "loss": 0.2525, "step": 1539 }, { "epoch": 9.566978193146417, "grad_norm": 3.34375, "learning_rate": 4.9820407530354786e-05, "loss": 0.3433, "step": 1540 }, { "epoch": 9.573208722741432, "grad_norm": 4.0625, "learning_rate": 4.982011262406145e-05, "loss": 0.3334, "step": 1541 }, { "epoch": 9.57943925233645, "grad_norm": 3.078125, "learning_rate": 4.981981747671024e-05, "loss": 0.3732, "step": 1542 }, { "epoch": 9.585669781931465, "grad_norm": 3.828125, "learning_rate": 4.981952208830399e-05, "loss": 0.4863, "step": 1543 }, { "epoch": 9.59190031152648, "grad_norm": 4.59375, "learning_rate": 4.981922645884558e-05, "loss": 0.3562, "step": 1544 }, { "epoch": 9.598130841121495, "grad_norm": 4.875, "learning_rate": 4.981893058833788e-05, "loss": 0.3072, "step": 1545 }, { "epoch": 9.60436137071651, "grad_norm": 3.1875, "learning_rate": 4.9818634476783764e-05, "loss": 0.3958, "step": 1546 }, { "epoch": 9.610591900311526, "grad_norm": 3.9375, "learning_rate": 4.981833812418612e-05, "loss": 0.4716, "step": 1547 }, { "epoch": 9.616822429906541, "grad_norm": 3.6875, "learning_rate": 4.9818041530547796e-05, "loss": 0.3006, "step": 1548 }, { "epoch": 9.623052959501557, "grad_norm": 4.40625, "learning_rate": 4.98177446958717e-05, "loss": 0.5452, "step": 1549 }, { "epoch": 9.629283489096574, "grad_norm": 3.609375, "learning_rate": 4.981744762016072e-05, "loss": 0.5232, "step": 1550 }, { "epoch": 9.63551401869159, "grad_norm": 3.734375, "learning_rate": 4.9817150303417716e-05, "loss": 0.3052, "step": 1551 }, { "epoch": 9.641744548286605, "grad_norm": 4.5, "learning_rate": 4.981685274564558e-05, "loss": 0.4713, "step": 1552 }, { "epoch": 9.64797507788162, "grad_norm": 4.4375, "learning_rate": 4.9816554946847216e-05, "loss": 0.3603, "step": 1553 }, { "epoch": 9.654205607476635, "grad_norm": 2.78125, "learning_rate": 4.981625690702552e-05, "loss": 0.2546, "step": 1554 }, { "epoch": 9.66043613707165, "grad_norm": 3.890625, "learning_rate": 4.981595862618336e-05, "loss": 0.4446, "step": 1555 }, { "epoch": 9.666666666666666, "grad_norm": 3.1875, "learning_rate": 4.9815660104323656e-05, "loss": 0.3243, "step": 1556 }, { "epoch": 9.672897196261681, "grad_norm": 5.25, "learning_rate": 4.98153613414493e-05, "loss": 0.3055, "step": 1557 }, { "epoch": 9.679127725856699, "grad_norm": 4.25, "learning_rate": 4.98150623375632e-05, "loss": 0.3349, "step": 1558 }, { "epoch": 9.685358255451714, "grad_norm": 2.5, "learning_rate": 4.9814763092668245e-05, "loss": 0.2742, "step": 1559 }, { "epoch": 9.69158878504673, "grad_norm": 2.921875, "learning_rate": 4.981446360676736e-05, "loss": 0.2665, "step": 1560 }, { "epoch": 9.697819314641745, "grad_norm": 3.734375, "learning_rate": 4.9814163879863436e-05, "loss": 0.2946, "step": 1561 }, { "epoch": 9.70404984423676, "grad_norm": 4.59375, "learning_rate": 4.981386391195939e-05, "loss": 0.4124, "step": 1562 }, { "epoch": 9.710280373831775, "grad_norm": 4.6875, "learning_rate": 4.981356370305815e-05, "loss": 0.2222, "step": 1563 }, { "epoch": 9.71651090342679, "grad_norm": 2.984375, "learning_rate": 4.981326325316261e-05, "loss": 0.2866, "step": 1564 }, { "epoch": 9.722741433021806, "grad_norm": 2.59375, "learning_rate": 4.981296256227569e-05, "loss": 0.2444, "step": 1565 }, { "epoch": 9.728971962616823, "grad_norm": 3.421875, "learning_rate": 4.981266163040033e-05, "loss": 0.2655, "step": 1566 }, { "epoch": 9.735202492211839, "grad_norm": 4.0625, "learning_rate": 4.9812360457539444e-05, "loss": 0.2508, "step": 1567 }, { "epoch": 9.741433021806854, "grad_norm": 5.21875, "learning_rate": 4.981205904369594e-05, "loss": 0.4171, "step": 1568 }, { "epoch": 9.74766355140187, "grad_norm": 5.09375, "learning_rate": 4.9811757388872764e-05, "loss": 0.3575, "step": 1569 }, { "epoch": 9.753894080996885, "grad_norm": 3.390625, "learning_rate": 4.981145549307285e-05, "loss": 0.2845, "step": 1570 }, { "epoch": 9.7601246105919, "grad_norm": 2.953125, "learning_rate": 4.98111533562991e-05, "loss": 0.4146, "step": 1571 }, { "epoch": 9.766355140186915, "grad_norm": 3.25, "learning_rate": 4.981085097855449e-05, "loss": 0.2509, "step": 1572 }, { "epoch": 9.77258566978193, "grad_norm": 3.21875, "learning_rate": 4.981054835984192e-05, "loss": 0.3957, "step": 1573 }, { "epoch": 9.778816199376948, "grad_norm": 2.6875, "learning_rate": 4.981024550016435e-05, "loss": 0.2785, "step": 1574 }, { "epoch": 9.785046728971963, "grad_norm": 2.875, "learning_rate": 4.980994239952472e-05, "loss": 0.2763, "step": 1575 }, { "epoch": 9.791277258566979, "grad_norm": 4.0625, "learning_rate": 4.9809639057925966e-05, "loss": 0.3019, "step": 1576 }, { "epoch": 9.797507788161994, "grad_norm": 4.0625, "learning_rate": 4.9809335475371045e-05, "loss": 0.3584, "step": 1577 }, { "epoch": 9.80373831775701, "grad_norm": 3.609375, "learning_rate": 4.980903165186289e-05, "loss": 0.3667, "step": 1578 }, { "epoch": 9.809968847352025, "grad_norm": 3.296875, "learning_rate": 4.9808727587404464e-05, "loss": 0.3258, "step": 1579 }, { "epoch": 9.81619937694704, "grad_norm": 5.53125, "learning_rate": 4.980842328199872e-05, "loss": 0.3599, "step": 1580 }, { "epoch": 9.822429906542055, "grad_norm": 2.609375, "learning_rate": 4.98081187356486e-05, "loss": 0.3065, "step": 1581 }, { "epoch": 9.828660436137072, "grad_norm": 3.359375, "learning_rate": 4.980781394835707e-05, "loss": 0.3058, "step": 1582 }, { "epoch": 9.834890965732088, "grad_norm": 3.265625, "learning_rate": 4.980750892012711e-05, "loss": 0.4658, "step": 1583 }, { "epoch": 9.841121495327103, "grad_norm": 3.59375, "learning_rate": 4.9807203650961654e-05, "loss": 0.3304, "step": 1584 }, { "epoch": 9.847352024922118, "grad_norm": 2.796875, "learning_rate": 4.980689814086368e-05, "loss": 0.3275, "step": 1585 }, { "epoch": 9.853582554517134, "grad_norm": 2.578125, "learning_rate": 4.980659238983615e-05, "loss": 0.3111, "step": 1586 }, { "epoch": 9.85981308411215, "grad_norm": 2.859375, "learning_rate": 4.980628639788203e-05, "loss": 0.2771, "step": 1587 }, { "epoch": 9.866043613707165, "grad_norm": 2.546875, "learning_rate": 4.9805980165004304e-05, "loss": 0.2486, "step": 1588 }, { "epoch": 9.87227414330218, "grad_norm": 4.96875, "learning_rate": 4.980567369120594e-05, "loss": 0.4979, "step": 1589 }, { "epoch": 9.878504672897197, "grad_norm": 2.296875, "learning_rate": 4.9805366976489915e-05, "loss": 0.2439, "step": 1590 }, { "epoch": 9.884735202492212, "grad_norm": 3.875, "learning_rate": 4.980506002085921e-05, "loss": 0.4273, "step": 1591 }, { "epoch": 9.890965732087228, "grad_norm": 3.234375, "learning_rate": 4.9804752824316794e-05, "loss": 0.3615, "step": 1592 }, { "epoch": 9.897196261682243, "grad_norm": 5.0, "learning_rate": 4.980444538686567e-05, "loss": 0.4937, "step": 1593 }, { "epoch": 9.903426791277258, "grad_norm": 3.203125, "learning_rate": 4.9804137708508814e-05, "loss": 0.3817, "step": 1594 }, { "epoch": 9.909657320872274, "grad_norm": 4.125, "learning_rate": 4.9803829789249205e-05, "loss": 0.3778, "step": 1595 }, { "epoch": 9.91588785046729, "grad_norm": 3.265625, "learning_rate": 4.980352162908985e-05, "loss": 0.4179, "step": 1596 }, { "epoch": 9.922118380062305, "grad_norm": 2.671875, "learning_rate": 4.980321322803374e-05, "loss": 0.2771, "step": 1597 }, { "epoch": 9.928348909657322, "grad_norm": 2.265625, "learning_rate": 4.9802904586083856e-05, "loss": 0.3559, "step": 1598 }, { "epoch": 9.934579439252337, "grad_norm": 2.875, "learning_rate": 4.9802595703243205e-05, "loss": 0.2838, "step": 1599 }, { "epoch": 9.940809968847352, "grad_norm": 2.671875, "learning_rate": 4.9802286579514787e-05, "loss": 0.3266, "step": 1600 }, { "epoch": 9.947040498442368, "grad_norm": 2.171875, "learning_rate": 4.980197721490161e-05, "loss": 0.2071, "step": 1601 }, { "epoch": 9.953271028037383, "grad_norm": 3.875, "learning_rate": 4.980166760940667e-05, "loss": 0.3106, "step": 1602 }, { "epoch": 9.959501557632398, "grad_norm": 2.90625, "learning_rate": 4.980135776303297e-05, "loss": 0.3547, "step": 1603 }, { "epoch": 9.965732087227414, "grad_norm": 3.28125, "learning_rate": 4.980104767578353e-05, "loss": 0.4603, "step": 1604 }, { "epoch": 9.97196261682243, "grad_norm": 3.578125, "learning_rate": 4.980073734766136e-05, "loss": 0.3774, "step": 1605 }, { "epoch": 9.978193146417446, "grad_norm": 2.578125, "learning_rate": 4.980042677866947e-05, "loss": 0.2673, "step": 1606 }, { "epoch": 9.984423676012462, "grad_norm": 3.4375, "learning_rate": 4.980011596881087e-05, "loss": 0.3239, "step": 1607 }, { "epoch": 9.990654205607477, "grad_norm": 3.53125, "learning_rate": 4.97998049180886e-05, "loss": 0.3667, "step": 1608 }, { "epoch": 9.996884735202492, "grad_norm": 2.515625, "learning_rate": 4.9799493626505653e-05, "loss": 0.2338, "step": 1609 }, { "epoch": 10.0, "grad_norm": 2.21875, "learning_rate": 4.9799182094065075e-05, "loss": 0.1516, "step": 1610 }, { "epoch": 10.006230529595015, "grad_norm": 3.578125, "learning_rate": 4.9798870320769886e-05, "loss": 0.4352, "step": 1611 }, { "epoch": 10.01246105919003, "grad_norm": 3.34375, "learning_rate": 4.979855830662311e-05, "loss": 0.2902, "step": 1612 }, { "epoch": 10.018691588785046, "grad_norm": 3.03125, "learning_rate": 4.9798246051627776e-05, "loss": 0.4102, "step": 1613 }, { "epoch": 10.024922118380061, "grad_norm": 4.09375, "learning_rate": 4.979793355578691e-05, "loss": 0.6046, "step": 1614 }, { "epoch": 10.031152647975079, "grad_norm": 4.125, "learning_rate": 4.9797620819103574e-05, "loss": 0.4895, "step": 1615 }, { "epoch": 10.037383177570094, "grad_norm": 3.734375, "learning_rate": 4.979730784158078e-05, "loss": 0.4159, "step": 1616 }, { "epoch": 10.04361370716511, "grad_norm": 3.140625, "learning_rate": 4.979699462322157e-05, "loss": 0.4023, "step": 1617 }, { "epoch": 10.049844236760125, "grad_norm": 6.84375, "learning_rate": 4.9796681164029e-05, "loss": 0.5295, "step": 1618 }, { "epoch": 10.05607476635514, "grad_norm": 4.125, "learning_rate": 4.97963674640061e-05, "loss": 0.3447, "step": 1619 }, { "epoch": 10.062305295950155, "grad_norm": 3.078125, "learning_rate": 4.979605352315592e-05, "loss": 0.269, "step": 1620 }, { "epoch": 10.06853582554517, "grad_norm": 6.0, "learning_rate": 4.979573934148152e-05, "loss": 0.369, "step": 1621 }, { "epoch": 10.074766355140186, "grad_norm": 6.21875, "learning_rate": 4.9795424918985936e-05, "loss": 0.4843, "step": 1622 }, { "epoch": 10.080996884735203, "grad_norm": 4.9375, "learning_rate": 4.979511025567223e-05, "loss": 0.3495, "step": 1623 }, { "epoch": 10.087227414330219, "grad_norm": 3.015625, "learning_rate": 4.979479535154346e-05, "loss": 0.2173, "step": 1624 }, { "epoch": 10.093457943925234, "grad_norm": 2.890625, "learning_rate": 4.979448020660268e-05, "loss": 0.236, "step": 1625 }, { "epoch": 10.09968847352025, "grad_norm": 4.1875, "learning_rate": 4.979416482085295e-05, "loss": 0.3235, "step": 1626 }, { "epoch": 10.105919003115265, "grad_norm": 3.109375, "learning_rate": 4.9793849194297335e-05, "loss": 0.244, "step": 1627 }, { "epoch": 10.11214953271028, "grad_norm": 4.40625, "learning_rate": 4.97935333269389e-05, "loss": 0.3962, "step": 1628 }, { "epoch": 10.118380062305295, "grad_norm": 3.5, "learning_rate": 4.979321721878072e-05, "loss": 0.386, "step": 1629 }, { "epoch": 10.12461059190031, "grad_norm": 3.609375, "learning_rate": 4.9792900869825854e-05, "loss": 0.4172, "step": 1630 }, { "epoch": 10.130841121495328, "grad_norm": 3.828125, "learning_rate": 4.979258428007738e-05, "loss": 0.4187, "step": 1631 }, { "epoch": 10.137071651090343, "grad_norm": 4.21875, "learning_rate": 4.979226744953837e-05, "loss": 0.3376, "step": 1632 }, { "epoch": 10.143302180685358, "grad_norm": 4.46875, "learning_rate": 4.97919503782119e-05, "loss": 0.283, "step": 1633 }, { "epoch": 10.149532710280374, "grad_norm": 3.171875, "learning_rate": 4.979163306610105e-05, "loss": 0.3432, "step": 1634 }, { "epoch": 10.15576323987539, "grad_norm": 4.34375, "learning_rate": 4.979131551320891e-05, "loss": 0.4562, "step": 1635 }, { "epoch": 10.161993769470405, "grad_norm": 3.9375, "learning_rate": 4.9790997719538554e-05, "loss": 0.224, "step": 1636 }, { "epoch": 10.16822429906542, "grad_norm": 3.53125, "learning_rate": 4.9790679685093075e-05, "loss": 0.4093, "step": 1637 }, { "epoch": 10.174454828660435, "grad_norm": 2.78125, "learning_rate": 4.979036140987556e-05, "loss": 0.3773, "step": 1638 }, { "epoch": 10.180685358255452, "grad_norm": 4.65625, "learning_rate": 4.97900428938891e-05, "loss": 0.5101, "step": 1639 }, { "epoch": 10.186915887850468, "grad_norm": 2.5625, "learning_rate": 4.978972413713678e-05, "loss": 0.3439, "step": 1640 }, { "epoch": 10.193146417445483, "grad_norm": 5.40625, "learning_rate": 4.9789405139621714e-05, "loss": 0.2689, "step": 1641 }, { "epoch": 10.199376947040498, "grad_norm": 4.125, "learning_rate": 4.978908590134699e-05, "loss": 0.3486, "step": 1642 }, { "epoch": 10.205607476635514, "grad_norm": 3.875, "learning_rate": 4.97887664223157e-05, "loss": 0.3506, "step": 1643 }, { "epoch": 10.21183800623053, "grad_norm": 4.40625, "learning_rate": 4.978844670253096e-05, "loss": 0.2343, "step": 1644 }, { "epoch": 10.218068535825545, "grad_norm": 4.65625, "learning_rate": 4.978812674199587e-05, "loss": 0.3374, "step": 1645 }, { "epoch": 10.22429906542056, "grad_norm": 4.625, "learning_rate": 4.9787806540713546e-05, "loss": 0.3573, "step": 1646 }, { "epoch": 10.230529595015577, "grad_norm": 2.765625, "learning_rate": 4.9787486098687076e-05, "loss": 0.249, "step": 1647 }, { "epoch": 10.236760124610592, "grad_norm": 2.515625, "learning_rate": 4.97871654159196e-05, "loss": 0.3293, "step": 1648 }, { "epoch": 10.242990654205608, "grad_norm": 5.53125, "learning_rate": 4.978684449241421e-05, "loss": 0.3728, "step": 1649 }, { "epoch": 10.249221183800623, "grad_norm": 4.0, "learning_rate": 4.978652332817403e-05, "loss": 0.2792, "step": 1650 }, { "epoch": 10.255451713395638, "grad_norm": 3.359375, "learning_rate": 4.978620192320218e-05, "loss": 0.3799, "step": 1651 }, { "epoch": 10.261682242990654, "grad_norm": 5.9375, "learning_rate": 4.978588027750178e-05, "loss": 0.328, "step": 1652 }, { "epoch": 10.26791277258567, "grad_norm": 3.984375, "learning_rate": 4.978555839107597e-05, "loss": 0.2989, "step": 1653 }, { "epoch": 10.274143302180685, "grad_norm": 5.3125, "learning_rate": 4.9785236263927854e-05, "loss": 0.2172, "step": 1654 }, { "epoch": 10.280373831775702, "grad_norm": 3.4375, "learning_rate": 4.978491389606057e-05, "loss": 0.3088, "step": 1655 }, { "epoch": 10.286604361370717, "grad_norm": 3.6875, "learning_rate": 4.978459128747724e-05, "loss": 0.2624, "step": 1656 }, { "epoch": 10.292834890965732, "grad_norm": 6.125, "learning_rate": 4.9784268438181016e-05, "loss": 0.4416, "step": 1657 }, { "epoch": 10.299065420560748, "grad_norm": 5.1875, "learning_rate": 4.978394534817502e-05, "loss": 0.2592, "step": 1658 }, { "epoch": 10.305295950155763, "grad_norm": 5.21875, "learning_rate": 4.978362201746239e-05, "loss": 0.4515, "step": 1659 }, { "epoch": 10.311526479750778, "grad_norm": 3.21875, "learning_rate": 4.978329844604627e-05, "loss": 0.2882, "step": 1660 }, { "epoch": 10.317757009345794, "grad_norm": 5.84375, "learning_rate": 4.97829746339298e-05, "loss": 0.4842, "step": 1661 }, { "epoch": 10.32398753894081, "grad_norm": 6.25, "learning_rate": 4.978265058111613e-05, "loss": 0.5348, "step": 1662 }, { "epoch": 10.330218068535826, "grad_norm": 7.15625, "learning_rate": 4.978232628760841e-05, "loss": 0.5213, "step": 1663 }, { "epoch": 10.336448598130842, "grad_norm": 2.265625, "learning_rate": 4.978200175340977e-05, "loss": 0.3275, "step": 1664 }, { "epoch": 10.342679127725857, "grad_norm": 2.53125, "learning_rate": 4.978167697852338e-05, "loss": 0.2535, "step": 1665 }, { "epoch": 10.348909657320872, "grad_norm": 2.328125, "learning_rate": 4.9781351962952384e-05, "loss": 0.3194, "step": 1666 }, { "epoch": 10.355140186915888, "grad_norm": 3.890625, "learning_rate": 4.978102670669995e-05, "loss": 0.2694, "step": 1667 }, { "epoch": 10.361370716510903, "grad_norm": 4.625, "learning_rate": 4.978070120976923e-05, "loss": 0.3476, "step": 1668 }, { "epoch": 10.367601246105918, "grad_norm": 2.578125, "learning_rate": 4.9780375472163385e-05, "loss": 0.3262, "step": 1669 }, { "epoch": 10.373831775700934, "grad_norm": 3.828125, "learning_rate": 4.978004949388558e-05, "loss": 0.4028, "step": 1670 }, { "epoch": 10.38006230529595, "grad_norm": 6.28125, "learning_rate": 4.977972327493899e-05, "loss": 0.3731, "step": 1671 }, { "epoch": 10.386292834890966, "grad_norm": 4.625, "learning_rate": 4.9779396815326765e-05, "loss": 0.4095, "step": 1672 }, { "epoch": 10.392523364485982, "grad_norm": 3.171875, "learning_rate": 4.977907011505208e-05, "loss": 0.2811, "step": 1673 }, { "epoch": 10.398753894080997, "grad_norm": 4.0625, "learning_rate": 4.977874317411813e-05, "loss": 0.2559, "step": 1674 }, { "epoch": 10.404984423676012, "grad_norm": 4.1875, "learning_rate": 4.977841599252806e-05, "loss": 0.3463, "step": 1675 }, { "epoch": 10.411214953271028, "grad_norm": 3.78125, "learning_rate": 4.977808857028506e-05, "loss": 0.3652, "step": 1676 }, { "epoch": 10.417445482866043, "grad_norm": 4.78125, "learning_rate": 4.977776090739231e-05, "loss": 0.4156, "step": 1677 }, { "epoch": 10.423676012461058, "grad_norm": 3.796875, "learning_rate": 4.9777433003853e-05, "loss": 0.2205, "step": 1678 }, { "epoch": 10.429906542056075, "grad_norm": 4.21875, "learning_rate": 4.977710485967031e-05, "loss": 0.2415, "step": 1679 }, { "epoch": 10.43613707165109, "grad_norm": 4.15625, "learning_rate": 4.977677647484741e-05, "loss": 0.449, "step": 1680 }, { "epoch": 10.442367601246106, "grad_norm": 3.421875, "learning_rate": 4.977644784938752e-05, "loss": 0.3045, "step": 1681 }, { "epoch": 10.448598130841122, "grad_norm": 3.359375, "learning_rate": 4.9776118983293805e-05, "loss": 0.3476, "step": 1682 }, { "epoch": 10.454828660436137, "grad_norm": 5.96875, "learning_rate": 4.9775789876569476e-05, "loss": 0.2566, "step": 1683 }, { "epoch": 10.461059190031152, "grad_norm": 4.21875, "learning_rate": 4.9775460529217724e-05, "loss": 0.2245, "step": 1684 }, { "epoch": 10.467289719626168, "grad_norm": 2.765625, "learning_rate": 4.9775130941241747e-05, "loss": 0.3771, "step": 1685 }, { "epoch": 10.473520249221183, "grad_norm": 3.34375, "learning_rate": 4.977480111264474e-05, "loss": 0.2818, "step": 1686 }, { "epoch": 10.4797507788162, "grad_norm": 2.8125, "learning_rate": 4.977447104342992e-05, "loss": 0.3472, "step": 1687 }, { "epoch": 10.485981308411215, "grad_norm": 2.515625, "learning_rate": 4.9774140733600475e-05, "loss": 0.3812, "step": 1688 }, { "epoch": 10.49221183800623, "grad_norm": 3.28125, "learning_rate": 4.977381018315963e-05, "loss": 0.3612, "step": 1689 }, { "epoch": 10.498442367601246, "grad_norm": 2.625, "learning_rate": 4.977347939211058e-05, "loss": 0.193, "step": 1690 }, { "epoch": 10.504672897196262, "grad_norm": 3.234375, "learning_rate": 4.9773148360456555e-05, "loss": 0.4167, "step": 1691 }, { "epoch": 10.510903426791277, "grad_norm": 5.0, "learning_rate": 4.977281708820076e-05, "loss": 0.4475, "step": 1692 }, { "epoch": 10.517133956386292, "grad_norm": 4.0625, "learning_rate": 4.977248557534641e-05, "loss": 0.5483, "step": 1693 }, { "epoch": 10.523364485981308, "grad_norm": 3.28125, "learning_rate": 4.9772153821896725e-05, "loss": 0.5637, "step": 1694 }, { "epoch": 10.529595015576325, "grad_norm": 3.609375, "learning_rate": 4.977182182785494e-05, "loss": 0.406, "step": 1695 }, { "epoch": 10.53582554517134, "grad_norm": 3.203125, "learning_rate": 4.977148959322426e-05, "loss": 0.2589, "step": 1696 }, { "epoch": 10.542056074766355, "grad_norm": 3.09375, "learning_rate": 4.977115711800793e-05, "loss": 0.3682, "step": 1697 }, { "epoch": 10.54828660436137, "grad_norm": 4.125, "learning_rate": 4.977082440220917e-05, "loss": 0.3624, "step": 1698 }, { "epoch": 10.554517133956386, "grad_norm": 3.953125, "learning_rate": 4.977049144583121e-05, "loss": 0.3664, "step": 1699 }, { "epoch": 10.560747663551401, "grad_norm": 3.796875, "learning_rate": 4.9770158248877286e-05, "loss": 0.2837, "step": 1700 }, { "epoch": 10.566978193146417, "grad_norm": 4.90625, "learning_rate": 4.976982481135063e-05, "loss": 0.6075, "step": 1701 }, { "epoch": 10.573208722741432, "grad_norm": 4.3125, "learning_rate": 4.9769491133254485e-05, "loss": 0.36, "step": 1702 }, { "epoch": 10.57943925233645, "grad_norm": 3.140625, "learning_rate": 4.976915721459209e-05, "loss": 0.2108, "step": 1703 }, { "epoch": 10.585669781931465, "grad_norm": 3.453125, "learning_rate": 4.976882305536669e-05, "loss": 0.5151, "step": 1704 }, { "epoch": 10.59190031152648, "grad_norm": 2.796875, "learning_rate": 4.976848865558153e-05, "loss": 0.2693, "step": 1705 }, { "epoch": 10.598130841121495, "grad_norm": 5.28125, "learning_rate": 4.9768154015239854e-05, "loss": 0.3783, "step": 1706 }, { "epoch": 10.60436137071651, "grad_norm": 3.8125, "learning_rate": 4.9767819134344914e-05, "loss": 0.403, "step": 1707 }, { "epoch": 10.610591900311526, "grad_norm": 2.703125, "learning_rate": 4.976748401289997e-05, "loss": 0.3663, "step": 1708 }, { "epoch": 10.616822429906541, "grad_norm": 2.21875, "learning_rate": 4.976714865090827e-05, "loss": 0.2542, "step": 1709 }, { "epoch": 10.623052959501557, "grad_norm": 3.453125, "learning_rate": 4.976681304837307e-05, "loss": 0.3907, "step": 1710 }, { "epoch": 10.629283489096574, "grad_norm": 4.21875, "learning_rate": 4.9766477205297634e-05, "loss": 0.411, "step": 1711 }, { "epoch": 10.63551401869159, "grad_norm": 4.625, "learning_rate": 4.976614112168522e-05, "loss": 0.2636, "step": 1712 }, { "epoch": 10.641744548286605, "grad_norm": 2.421875, "learning_rate": 4.9765804797539086e-05, "loss": 0.2373, "step": 1713 }, { "epoch": 10.64797507788162, "grad_norm": 2.5625, "learning_rate": 4.9765468232862505e-05, "loss": 0.254, "step": 1714 }, { "epoch": 10.654205607476635, "grad_norm": 3.046875, "learning_rate": 4.976513142765875e-05, "loss": 0.4048, "step": 1715 }, { "epoch": 10.66043613707165, "grad_norm": 3.203125, "learning_rate": 4.976479438193109e-05, "loss": 0.3122, "step": 1716 }, { "epoch": 10.666666666666666, "grad_norm": 3.34375, "learning_rate": 4.976445709568279e-05, "loss": 0.3337, "step": 1717 }, { "epoch": 10.672897196261681, "grad_norm": 3.78125, "learning_rate": 4.976411956891714e-05, "loss": 0.4724, "step": 1718 }, { "epoch": 10.679127725856699, "grad_norm": 2.734375, "learning_rate": 4.976378180163741e-05, "loss": 0.3219, "step": 1719 }, { "epoch": 10.685358255451714, "grad_norm": 3.109375, "learning_rate": 4.976344379384688e-05, "loss": 0.413, "step": 1720 }, { "epoch": 10.69158878504673, "grad_norm": 2.421875, "learning_rate": 4.976310554554883e-05, "loss": 0.2792, "step": 1721 }, { "epoch": 10.697819314641745, "grad_norm": 3.171875, "learning_rate": 4.976276705674655e-05, "loss": 0.3504, "step": 1722 }, { "epoch": 10.70404984423676, "grad_norm": 4.125, "learning_rate": 4.976242832744332e-05, "loss": 0.3641, "step": 1723 }, { "epoch": 10.710280373831775, "grad_norm": 3.53125, "learning_rate": 4.976208935764245e-05, "loss": 0.455, "step": 1724 }, { "epoch": 10.71651090342679, "grad_norm": 2.890625, "learning_rate": 4.976175014734721e-05, "loss": 0.4284, "step": 1725 }, { "epoch": 10.722741433021806, "grad_norm": 3.125, "learning_rate": 4.976141069656091e-05, "loss": 0.316, "step": 1726 }, { "epoch": 10.728971962616823, "grad_norm": 2.234375, "learning_rate": 4.976107100528683e-05, "loss": 0.251, "step": 1727 }, { "epoch": 10.735202492211839, "grad_norm": 4.71875, "learning_rate": 4.9760731073528285e-05, "loss": 0.5486, "step": 1728 }, { "epoch": 10.741433021806854, "grad_norm": 2.640625, "learning_rate": 4.976039090128857e-05, "loss": 0.1993, "step": 1729 }, { "epoch": 10.74766355140187, "grad_norm": 4.4375, "learning_rate": 4.9760050488570984e-05, "loss": 0.4187, "step": 1730 }, { "epoch": 10.753894080996885, "grad_norm": 2.65625, "learning_rate": 4.975970983537884e-05, "loss": 0.3085, "step": 1731 }, { "epoch": 10.7601246105919, "grad_norm": 3.8125, "learning_rate": 4.9759368941715445e-05, "loss": 0.3054, "step": 1732 }, { "epoch": 10.766355140186915, "grad_norm": 2.71875, "learning_rate": 4.975902780758411e-05, "loss": 0.2451, "step": 1733 }, { "epoch": 10.77258566978193, "grad_norm": 2.859375, "learning_rate": 4.9758686432988155e-05, "loss": 0.2737, "step": 1734 }, { "epoch": 10.778816199376948, "grad_norm": 2.859375, "learning_rate": 4.975834481793088e-05, "loss": 0.2639, "step": 1735 }, { "epoch": 10.785046728971963, "grad_norm": 4.59375, "learning_rate": 4.975800296241561e-05, "loss": 0.6028, "step": 1736 }, { "epoch": 10.791277258566979, "grad_norm": 3.375, "learning_rate": 4.975766086644567e-05, "loss": 0.3264, "step": 1737 }, { "epoch": 10.797507788161994, "grad_norm": 3.65625, "learning_rate": 4.9757318530024376e-05, "loss": 0.4381, "step": 1738 }, { "epoch": 10.80373831775701, "grad_norm": 3.3125, "learning_rate": 4.975697595315506e-05, "loss": 0.3753, "step": 1739 }, { "epoch": 10.809968847352025, "grad_norm": 3.8125, "learning_rate": 4.975663313584104e-05, "loss": 0.2354, "step": 1740 }, { "epoch": 10.81619937694704, "grad_norm": 3.546875, "learning_rate": 4.975629007808565e-05, "loss": 0.2258, "step": 1741 }, { "epoch": 10.822429906542055, "grad_norm": 2.171875, "learning_rate": 4.9755946779892225e-05, "loss": 0.201, "step": 1742 }, { "epoch": 10.828660436137072, "grad_norm": 6.34375, "learning_rate": 4.97556032412641e-05, "loss": 0.5661, "step": 1743 }, { "epoch": 10.834890965732088, "grad_norm": 7.375, "learning_rate": 4.9755259462204604e-05, "loss": 0.3587, "step": 1744 }, { "epoch": 10.841121495327103, "grad_norm": 5.96875, "learning_rate": 4.975491544271708e-05, "loss": 0.4029, "step": 1745 }, { "epoch": 10.847352024922118, "grad_norm": 4.0, "learning_rate": 4.975457118280487e-05, "loss": 0.2886, "step": 1746 }, { "epoch": 10.853582554517134, "grad_norm": 5.3125, "learning_rate": 4.975422668247131e-05, "loss": 0.3636, "step": 1747 }, { "epoch": 10.85981308411215, "grad_norm": 6.0, "learning_rate": 4.975388194171975e-05, "loss": 0.3863, "step": 1748 }, { "epoch": 10.866043613707165, "grad_norm": 5.8125, "learning_rate": 4.9753536960553545e-05, "loss": 0.2802, "step": 1749 }, { "epoch": 10.87227414330218, "grad_norm": 5.03125, "learning_rate": 4.9753191738976045e-05, "loss": 0.388, "step": 1750 }, { "epoch": 10.878504672897197, "grad_norm": 3.1875, "learning_rate": 4.97528462769906e-05, "loss": 0.2512, "step": 1751 }, { "epoch": 10.884735202492212, "grad_norm": 4.75, "learning_rate": 4.9752500574600556e-05, "loss": 0.3566, "step": 1752 }, { "epoch": 10.890965732087228, "grad_norm": 2.546875, "learning_rate": 4.975215463180929e-05, "loss": 0.1771, "step": 1753 }, { "epoch": 10.897196261682243, "grad_norm": 2.75, "learning_rate": 4.975180844862014e-05, "loss": 0.2113, "step": 1754 }, { "epoch": 10.903426791277258, "grad_norm": 3.375, "learning_rate": 4.975146202503648e-05, "loss": 0.2725, "step": 1755 }, { "epoch": 10.909657320872274, "grad_norm": 4.84375, "learning_rate": 4.975111536106167e-05, "loss": 0.387, "step": 1756 }, { "epoch": 10.91588785046729, "grad_norm": 5.3125, "learning_rate": 4.9750768456699095e-05, "loss": 0.2607, "step": 1757 }, { "epoch": 10.922118380062305, "grad_norm": 6.0, "learning_rate": 4.975042131195209e-05, "loss": 0.2711, "step": 1758 }, { "epoch": 10.928348909657322, "grad_norm": 4.90625, "learning_rate": 4.975007392682406e-05, "loss": 0.386, "step": 1759 }, { "epoch": 10.934579439252337, "grad_norm": 3.578125, "learning_rate": 4.974972630131836e-05, "loss": 0.3828, "step": 1760 }, { "epoch": 10.940809968847352, "grad_norm": 4.1875, "learning_rate": 4.974937843543836e-05, "loss": 0.291, "step": 1761 }, { "epoch": 10.947040498442368, "grad_norm": 7.3125, "learning_rate": 4.9749030329187466e-05, "loss": 0.3035, "step": 1762 }, { "epoch": 10.953271028037383, "grad_norm": 7.09375, "learning_rate": 4.9748681982569035e-05, "loss": 0.3721, "step": 1763 }, { "epoch": 10.959501557632398, "grad_norm": 4.25, "learning_rate": 4.974833339558645e-05, "loss": 0.4957, "step": 1764 }, { "epoch": 10.965732087227414, "grad_norm": 3.359375, "learning_rate": 4.974798456824312e-05, "loss": 0.3321, "step": 1765 }, { "epoch": 10.97196261682243, "grad_norm": 7.15625, "learning_rate": 4.974763550054241e-05, "loss": 0.3322, "step": 1766 }, { "epoch": 10.978193146417446, "grad_norm": 5.4375, "learning_rate": 4.974728619248772e-05, "loss": 0.283, "step": 1767 }, { "epoch": 10.984423676012462, "grad_norm": 3.3125, "learning_rate": 4.974693664408243e-05, "loss": 0.2918, "step": 1768 }, { "epoch": 10.990654205607477, "grad_norm": 4.0, "learning_rate": 4.9746586855329944e-05, "loss": 0.4142, "step": 1769 }, { "epoch": 10.996884735202492, "grad_norm": 4.625, "learning_rate": 4.9746236826233674e-05, "loss": 0.339, "step": 1770 }, { "epoch": 11.0, "grad_norm": 3.5625, "learning_rate": 4.9745886556796995e-05, "loss": 0.1028, "step": 1771 }, { "epoch": 11.006230529595015, "grad_norm": 6.5, "learning_rate": 4.9745536047023324e-05, "loss": 0.4129, "step": 1772 }, { "epoch": 11.01246105919003, "grad_norm": 5.34375, "learning_rate": 4.9745185296916054e-05, "loss": 0.32, "step": 1773 }, { "epoch": 11.018691588785046, "grad_norm": 3.25, "learning_rate": 4.97448343064786e-05, "loss": 0.2749, "step": 1774 }, { "epoch": 11.024922118380061, "grad_norm": 3.515625, "learning_rate": 4.974448307571437e-05, "loss": 0.4339, "step": 1775 }, { "epoch": 11.031152647975079, "grad_norm": 4.375, "learning_rate": 4.974413160462678e-05, "loss": 0.2843, "step": 1776 }, { "epoch": 11.037383177570094, "grad_norm": 4.34375, "learning_rate": 4.974377989321923e-05, "loss": 0.3586, "step": 1777 }, { "epoch": 11.04361370716511, "grad_norm": 2.84375, "learning_rate": 4.974342794149515e-05, "loss": 0.2819, "step": 1778 }, { "epoch": 11.049844236760125, "grad_norm": 2.125, "learning_rate": 4.974307574945794e-05, "loss": 0.2186, "step": 1779 }, { "epoch": 11.05607476635514, "grad_norm": 3.90625, "learning_rate": 4.974272331711104e-05, "loss": 0.493, "step": 1780 }, { "epoch": 11.062305295950155, "grad_norm": 2.234375, "learning_rate": 4.974237064445787e-05, "loss": 0.2194, "step": 1781 }, { "epoch": 11.06853582554517, "grad_norm": 2.484375, "learning_rate": 4.974201773150184e-05, "loss": 0.2184, "step": 1782 }, { "epoch": 11.074766355140186, "grad_norm": 3.578125, "learning_rate": 4.9741664578246394e-05, "loss": 0.2962, "step": 1783 }, { "epoch": 11.080996884735203, "grad_norm": 3.796875, "learning_rate": 4.974131118469495e-05, "loss": 0.286, "step": 1784 }, { "epoch": 11.087227414330219, "grad_norm": 4.21875, "learning_rate": 4.9740957550850956e-05, "loss": 0.2256, "step": 1785 }, { "epoch": 11.093457943925234, "grad_norm": 4.0, "learning_rate": 4.974060367671783e-05, "loss": 0.2587, "step": 1786 }, { "epoch": 11.09968847352025, "grad_norm": 2.6875, "learning_rate": 4.974024956229901e-05, "loss": 0.3186, "step": 1787 }, { "epoch": 11.105919003115265, "grad_norm": 4.09375, "learning_rate": 4.973989520759795e-05, "loss": 0.3566, "step": 1788 }, { "epoch": 11.11214953271028, "grad_norm": 4.03125, "learning_rate": 4.973954061261809e-05, "loss": 0.4185, "step": 1789 }, { "epoch": 11.118380062305295, "grad_norm": 5.4375, "learning_rate": 4.973918577736285e-05, "loss": 0.439, "step": 1790 }, { "epoch": 11.12461059190031, "grad_norm": 2.765625, "learning_rate": 4.97388307018357e-05, "loss": 0.3391, "step": 1791 }, { "epoch": 11.130841121495328, "grad_norm": 4.0625, "learning_rate": 4.973847538604008e-05, "loss": 0.4999, "step": 1792 }, { "epoch": 11.137071651090343, "grad_norm": 3.546875, "learning_rate": 4.973811982997944e-05, "loss": 0.4053, "step": 1793 }, { "epoch": 11.143302180685358, "grad_norm": 3.046875, "learning_rate": 4.973776403365724e-05, "loss": 0.3265, "step": 1794 }, { "epoch": 11.149532710280374, "grad_norm": 4.28125, "learning_rate": 4.973740799707692e-05, "loss": 0.4779, "step": 1795 }, { "epoch": 11.15576323987539, "grad_norm": 2.203125, "learning_rate": 4.973705172024196e-05, "loss": 0.2573, "step": 1796 }, { "epoch": 11.161993769470405, "grad_norm": 4.53125, "learning_rate": 4.9736695203155805e-05, "loss": 0.2318, "step": 1797 }, { "epoch": 11.16822429906542, "grad_norm": 4.78125, "learning_rate": 4.9736338445821925e-05, "loss": 0.5799, "step": 1798 }, { "epoch": 11.174454828660435, "grad_norm": 2.640625, "learning_rate": 4.973598144824377e-05, "loss": 0.2384, "step": 1799 }, { "epoch": 11.180685358255452, "grad_norm": 4.6875, "learning_rate": 4.973562421042483e-05, "loss": 0.4389, "step": 1800 }, { "epoch": 11.186915887850468, "grad_norm": 4.90625, "learning_rate": 4.973526673236856e-05, "loss": 0.3759, "step": 1801 }, { "epoch": 11.193146417445483, "grad_norm": 2.15625, "learning_rate": 4.9734909014078435e-05, "loss": 0.3147, "step": 1802 }, { "epoch": 11.199376947040498, "grad_norm": 1.8828125, "learning_rate": 4.973455105555793e-05, "loss": 0.2257, "step": 1803 }, { "epoch": 11.205607476635514, "grad_norm": 3.734375, "learning_rate": 4.973419285681052e-05, "loss": 0.4167, "step": 1804 }, { "epoch": 11.21183800623053, "grad_norm": 2.515625, "learning_rate": 4.973383441783969e-05, "loss": 0.2767, "step": 1805 }, { "epoch": 11.218068535825545, "grad_norm": 3.5625, "learning_rate": 4.973347573864891e-05, "loss": 0.3753, "step": 1806 }, { "epoch": 11.22429906542056, "grad_norm": 2.15625, "learning_rate": 4.9733116819241664e-05, "loss": 0.2748, "step": 1807 }, { "epoch": 11.230529595015577, "grad_norm": 2.78125, "learning_rate": 4.973275765962145e-05, "loss": 0.3352, "step": 1808 }, { "epoch": 11.236760124610592, "grad_norm": 2.53125, "learning_rate": 4.973239825979175e-05, "loss": 0.3357, "step": 1809 }, { "epoch": 11.242990654205608, "grad_norm": 3.6875, "learning_rate": 4.973203861975605e-05, "loss": 0.2944, "step": 1810 }, { "epoch": 11.249221183800623, "grad_norm": 2.90625, "learning_rate": 4.973167873951785e-05, "loss": 0.2527, "step": 1811 }, { "epoch": 11.255451713395638, "grad_norm": 3.0625, "learning_rate": 4.973131861908064e-05, "loss": 0.296, "step": 1812 }, { "epoch": 11.261682242990654, "grad_norm": 3.265625, "learning_rate": 4.973095825844793e-05, "loss": 0.3883, "step": 1813 }, { "epoch": 11.26791277258567, "grad_norm": 4.75, "learning_rate": 4.973059765762319e-05, "loss": 0.2579, "step": 1814 }, { "epoch": 11.274143302180685, "grad_norm": 5.6875, "learning_rate": 4.973023681660996e-05, "loss": 0.4065, "step": 1815 }, { "epoch": 11.280373831775702, "grad_norm": 3.375, "learning_rate": 4.972987573541172e-05, "loss": 0.2592, "step": 1816 }, { "epoch": 11.286604361370717, "grad_norm": 4.03125, "learning_rate": 4.9729514414031985e-05, "loss": 0.2207, "step": 1817 }, { "epoch": 11.292834890965732, "grad_norm": 5.59375, "learning_rate": 4.972915285247426e-05, "loss": 0.3439, "step": 1818 }, { "epoch": 11.299065420560748, "grad_norm": 6.375, "learning_rate": 4.9728791050742064e-05, "loss": 0.2102, "step": 1819 }, { "epoch": 11.305295950155763, "grad_norm": 3.4375, "learning_rate": 4.9728429008838906e-05, "loss": 0.2149, "step": 1820 }, { "epoch": 11.311526479750778, "grad_norm": 3.453125, "learning_rate": 4.97280667267683e-05, "loss": 0.2055, "step": 1821 }, { "epoch": 11.317757009345794, "grad_norm": 5.6875, "learning_rate": 4.972770420453376e-05, "loss": 0.3262, "step": 1822 }, { "epoch": 11.32398753894081, "grad_norm": 6.6875, "learning_rate": 4.972734144213882e-05, "loss": 0.3478, "step": 1823 }, { "epoch": 11.330218068535826, "grad_norm": 2.828125, "learning_rate": 4.9726978439587e-05, "loss": 0.2307, "step": 1824 }, { "epoch": 11.336448598130842, "grad_norm": 3.078125, "learning_rate": 4.972661519688182e-05, "loss": 0.3896, "step": 1825 }, { "epoch": 11.342679127725857, "grad_norm": 5.71875, "learning_rate": 4.9726251714026804e-05, "loss": 0.2397, "step": 1826 }, { "epoch": 11.348909657320872, "grad_norm": 5.46875, "learning_rate": 4.97258879910255e-05, "loss": 0.3574, "step": 1827 }, { "epoch": 11.355140186915888, "grad_norm": 6.96875, "learning_rate": 4.972552402788142e-05, "loss": 0.2377, "step": 1828 }, { "epoch": 11.361370716510903, "grad_norm": 3.25, "learning_rate": 4.972515982459812e-05, "loss": 0.2967, "step": 1829 }, { "epoch": 11.367601246105918, "grad_norm": 3.6875, "learning_rate": 4.972479538117911e-05, "loss": 0.5257, "step": 1830 }, { "epoch": 11.373831775700934, "grad_norm": 3.8125, "learning_rate": 4.972443069762795e-05, "loss": 0.3176, "step": 1831 }, { "epoch": 11.38006230529595, "grad_norm": 6.53125, "learning_rate": 4.9724065773948185e-05, "loss": 0.3765, "step": 1832 }, { "epoch": 11.386292834890966, "grad_norm": 3.21875, "learning_rate": 4.9723700610143344e-05, "loss": 0.4771, "step": 1833 }, { "epoch": 11.392523364485982, "grad_norm": 3.359375, "learning_rate": 4.972333520621698e-05, "loss": 0.2672, "step": 1834 }, { "epoch": 11.398753894080997, "grad_norm": 2.5, "learning_rate": 4.972296956217265e-05, "loss": 0.2363, "step": 1835 }, { "epoch": 11.404984423676012, "grad_norm": 4.71875, "learning_rate": 4.972260367801389e-05, "loss": 0.3016, "step": 1836 }, { "epoch": 11.411214953271028, "grad_norm": 6.5625, "learning_rate": 4.972223755374426e-05, "loss": 0.4322, "step": 1837 }, { "epoch": 11.417445482866043, "grad_norm": 3.609375, "learning_rate": 4.972187118936732e-05, "loss": 0.4128, "step": 1838 }, { "epoch": 11.423676012461058, "grad_norm": 3.859375, "learning_rate": 4.9721504584886624e-05, "loss": 0.3934, "step": 1839 }, { "epoch": 11.429906542056075, "grad_norm": 2.84375, "learning_rate": 4.9721137740305734e-05, "loss": 0.2586, "step": 1840 }, { "epoch": 11.43613707165109, "grad_norm": 4.28125, "learning_rate": 4.972077065562821e-05, "loss": 0.4579, "step": 1841 }, { "epoch": 11.442367601246106, "grad_norm": 3.703125, "learning_rate": 4.972040333085763e-05, "loss": 0.2475, "step": 1842 }, { "epoch": 11.448598130841122, "grad_norm": 3.921875, "learning_rate": 4.972003576599754e-05, "loss": 0.3643, "step": 1843 }, { "epoch": 11.454828660436137, "grad_norm": 3.96875, "learning_rate": 4.971966796105153e-05, "loss": 0.3893, "step": 1844 }, { "epoch": 11.461059190031152, "grad_norm": 6.53125, "learning_rate": 4.971929991602315e-05, "loss": 0.3468, "step": 1845 }, { "epoch": 11.467289719626168, "grad_norm": 6.53125, "learning_rate": 4.9718931630916e-05, "loss": 0.3757, "step": 1846 }, { "epoch": 11.473520249221183, "grad_norm": 3.765625, "learning_rate": 4.971856310573365e-05, "loss": 0.3579, "step": 1847 }, { "epoch": 11.4797507788162, "grad_norm": 3.5625, "learning_rate": 4.9718194340479663e-05, "loss": 0.2603, "step": 1848 }, { "epoch": 11.485981308411215, "grad_norm": 5.0625, "learning_rate": 4.971782533515763e-05, "loss": 0.2998, "step": 1849 }, { "epoch": 11.49221183800623, "grad_norm": 4.53125, "learning_rate": 4.971745608977114e-05, "loss": 0.2668, "step": 1850 }, { "epoch": 11.498442367601246, "grad_norm": 3.84375, "learning_rate": 4.971708660432378e-05, "loss": 0.35, "step": 1851 }, { "epoch": 11.504672897196262, "grad_norm": 2.53125, "learning_rate": 4.971671687881913e-05, "loss": 0.3376, "step": 1852 }, { "epoch": 11.510903426791277, "grad_norm": 5.0625, "learning_rate": 4.971634691326079e-05, "loss": 0.2987, "step": 1853 }, { "epoch": 11.517133956386292, "grad_norm": 3.421875, "learning_rate": 4.971597670765234e-05, "loss": 0.2326, "step": 1854 }, { "epoch": 11.523364485981308, "grad_norm": 4.8125, "learning_rate": 4.9715606261997386e-05, "loss": 0.2439, "step": 1855 }, { "epoch": 11.529595015576325, "grad_norm": 3.390625, "learning_rate": 4.9715235576299523e-05, "loss": 0.3162, "step": 1856 }, { "epoch": 11.53582554517134, "grad_norm": 5.0, "learning_rate": 4.9714864650562356e-05, "loss": 0.4207, "step": 1857 }, { "epoch": 11.542056074766355, "grad_norm": 2.78125, "learning_rate": 4.9714493484789476e-05, "loss": 0.2445, "step": 1858 }, { "epoch": 11.54828660436137, "grad_norm": 3.34375, "learning_rate": 4.971412207898451e-05, "loss": 0.3018, "step": 1859 }, { "epoch": 11.554517133956386, "grad_norm": 2.4375, "learning_rate": 4.971375043315103e-05, "loss": 0.2303, "step": 1860 }, { "epoch": 11.560747663551401, "grad_norm": 4.09375, "learning_rate": 4.971337854729268e-05, "loss": 0.3378, "step": 1861 }, { "epoch": 11.566978193146417, "grad_norm": 4.53125, "learning_rate": 4.9713006421413047e-05, "loss": 0.4347, "step": 1862 }, { "epoch": 11.573208722741432, "grad_norm": 4.3125, "learning_rate": 4.971263405551576e-05, "loss": 0.3612, "step": 1863 }, { "epoch": 11.57943925233645, "grad_norm": 3.4375, "learning_rate": 4.971226144960443e-05, "loss": 0.375, "step": 1864 }, { "epoch": 11.585669781931465, "grad_norm": 3.0, "learning_rate": 4.971188860368268e-05, "loss": 0.2507, "step": 1865 }, { "epoch": 11.59190031152648, "grad_norm": 4.9375, "learning_rate": 4.971151551775413e-05, "loss": 0.2766, "step": 1866 }, { "epoch": 11.598130841121495, "grad_norm": 2.0625, "learning_rate": 4.971114219182239e-05, "loss": 0.2393, "step": 1867 }, { "epoch": 11.60436137071651, "grad_norm": 2.5, "learning_rate": 4.9710768625891114e-05, "loss": 0.3786, "step": 1868 }, { "epoch": 11.610591900311526, "grad_norm": 3.53125, "learning_rate": 4.97103948199639e-05, "loss": 0.2394, "step": 1869 }, { "epoch": 11.616822429906541, "grad_norm": 3.125, "learning_rate": 4.9710020774044394e-05, "loss": 0.2407, "step": 1870 }, { "epoch": 11.623052959501557, "grad_norm": 3.71875, "learning_rate": 4.970964648813623e-05, "loss": 0.2156, "step": 1871 }, { "epoch": 11.629283489096574, "grad_norm": 4.1875, "learning_rate": 4.9709271962243044e-05, "loss": 0.3862, "step": 1872 }, { "epoch": 11.63551401869159, "grad_norm": 3.1875, "learning_rate": 4.970889719636847e-05, "loss": 0.2339, "step": 1873 }, { "epoch": 11.641744548286605, "grad_norm": 2.171875, "learning_rate": 4.9708522190516135e-05, "loss": 0.2157, "step": 1874 }, { "epoch": 11.64797507788162, "grad_norm": 2.796875, "learning_rate": 4.9708146944689705e-05, "loss": 0.2994, "step": 1875 }, { "epoch": 11.654205607476635, "grad_norm": 2.828125, "learning_rate": 4.9707771458892804e-05, "loss": 0.335, "step": 1876 }, { "epoch": 11.66043613707165, "grad_norm": 3.25, "learning_rate": 4.970739573312909e-05, "loss": 0.3362, "step": 1877 }, { "epoch": 11.666666666666666, "grad_norm": 3.328125, "learning_rate": 4.970701976740222e-05, "loss": 0.313, "step": 1878 }, { "epoch": 11.672897196261681, "grad_norm": 3.390625, "learning_rate": 4.970664356171583e-05, "loss": 0.3884, "step": 1879 }, { "epoch": 11.679127725856699, "grad_norm": 3.546875, "learning_rate": 4.9706267116073565e-05, "loss": 0.2666, "step": 1880 }, { "epoch": 11.685358255451714, "grad_norm": 2.9375, "learning_rate": 4.970589043047911e-05, "loss": 0.3425, "step": 1881 }, { "epoch": 11.69158878504673, "grad_norm": 3.859375, "learning_rate": 4.970551350493611e-05, "loss": 0.2864, "step": 1882 }, { "epoch": 11.697819314641745, "grad_norm": 2.59375, "learning_rate": 4.9705136339448214e-05, "loss": 0.2726, "step": 1883 }, { "epoch": 11.70404984423676, "grad_norm": 3.0625, "learning_rate": 4.97047589340191e-05, "loss": 0.3397, "step": 1884 }, { "epoch": 11.710280373831775, "grad_norm": 3.296875, "learning_rate": 4.9704381288652436e-05, "loss": 0.2343, "step": 1885 }, { "epoch": 11.71651090342679, "grad_norm": 1.78125, "learning_rate": 4.970400340335187e-05, "loss": 0.2233, "step": 1886 }, { "epoch": 11.722741433021806, "grad_norm": 3.921875, "learning_rate": 4.970362527812109e-05, "loss": 0.4212, "step": 1887 }, { "epoch": 11.728971962616823, "grad_norm": 3.28125, "learning_rate": 4.9703246912963764e-05, "loss": 0.2776, "step": 1888 }, { "epoch": 11.735202492211839, "grad_norm": 3.0625, "learning_rate": 4.970286830788357e-05, "loss": 0.4026, "step": 1889 }, { "epoch": 11.741433021806854, "grad_norm": 3.765625, "learning_rate": 4.970248946288417e-05, "loss": 0.5104, "step": 1890 }, { "epoch": 11.74766355140187, "grad_norm": 3.953125, "learning_rate": 4.970211037796927e-05, "loss": 0.632, "step": 1891 }, { "epoch": 11.753894080996885, "grad_norm": 2.90625, "learning_rate": 4.970173105314252e-05, "loss": 0.3062, "step": 1892 }, { "epoch": 11.7601246105919, "grad_norm": 2.640625, "learning_rate": 4.970135148840763e-05, "loss": 0.2463, "step": 1893 }, { "epoch": 11.766355140186915, "grad_norm": 3.34375, "learning_rate": 4.970097168376827e-05, "loss": 0.3414, "step": 1894 }, { "epoch": 11.77258566978193, "grad_norm": 2.90625, "learning_rate": 4.9700591639228144e-05, "loss": 0.266, "step": 1895 }, { "epoch": 11.778816199376948, "grad_norm": 3.984375, "learning_rate": 4.970021135479093e-05, "loss": 0.3885, "step": 1896 }, { "epoch": 11.785046728971963, "grad_norm": 4.125, "learning_rate": 4.9699830830460324e-05, "loss": 0.3919, "step": 1897 }, { "epoch": 11.791277258566979, "grad_norm": 2.78125, "learning_rate": 4.969945006624003e-05, "loss": 0.2688, "step": 1898 }, { "epoch": 11.797507788161994, "grad_norm": 3.53125, "learning_rate": 4.969906906213373e-05, "loss": 0.323, "step": 1899 }, { "epoch": 11.80373831775701, "grad_norm": 1.5546875, "learning_rate": 4.969868781814514e-05, "loss": 0.2089, "step": 1900 }, { "epoch": 11.809968847352025, "grad_norm": 3.34375, "learning_rate": 4.9698306334277956e-05, "loss": 0.4002, "step": 1901 }, { "epoch": 11.81619937694704, "grad_norm": 3.34375, "learning_rate": 4.969792461053588e-05, "loss": 0.3422, "step": 1902 }, { "epoch": 11.822429906542055, "grad_norm": 4.3125, "learning_rate": 4.969754264692264e-05, "loss": 0.2768, "step": 1903 }, { "epoch": 11.828660436137072, "grad_norm": 3.546875, "learning_rate": 4.969716044344191e-05, "loss": 0.3393, "step": 1904 }, { "epoch": 11.834890965732088, "grad_norm": 4.53125, "learning_rate": 4.9696778000097424e-05, "loss": 0.4317, "step": 1905 }, { "epoch": 11.841121495327103, "grad_norm": 4.28125, "learning_rate": 4.9696395316892895e-05, "loss": 0.3583, "step": 1906 }, { "epoch": 11.847352024922118, "grad_norm": 4.03125, "learning_rate": 4.9696012393832034e-05, "loss": 0.2594, "step": 1907 }, { "epoch": 11.853582554517134, "grad_norm": 3.375, "learning_rate": 4.9695629230918574e-05, "loss": 0.3005, "step": 1908 }, { "epoch": 11.85981308411215, "grad_norm": 3.875, "learning_rate": 4.969524582815622e-05, "loss": 0.2791, "step": 1909 }, { "epoch": 11.866043613707165, "grad_norm": 1.9609375, "learning_rate": 4.969486218554871e-05, "loss": 0.1948, "step": 1910 }, { "epoch": 11.87227414330218, "grad_norm": 3.75, "learning_rate": 4.969447830309975e-05, "loss": 0.3182, "step": 1911 }, { "epoch": 11.878504672897197, "grad_norm": 3.9375, "learning_rate": 4.969409418081309e-05, "loss": 0.4386, "step": 1912 }, { "epoch": 11.884735202492212, "grad_norm": 1.703125, "learning_rate": 4.9693709818692444e-05, "loss": 0.2015, "step": 1913 }, { "epoch": 11.890965732087228, "grad_norm": 3.953125, "learning_rate": 4.969332521674155e-05, "loss": 0.3825, "step": 1914 }, { "epoch": 11.897196261682243, "grad_norm": 3.96875, "learning_rate": 4.9692940374964155e-05, "loss": 0.3792, "step": 1915 }, { "epoch": 11.903426791277258, "grad_norm": 3.0, "learning_rate": 4.969255529336398e-05, "loss": 0.4019, "step": 1916 }, { "epoch": 11.909657320872274, "grad_norm": 2.234375, "learning_rate": 4.9692169971944774e-05, "loss": 0.219, "step": 1917 }, { "epoch": 11.91588785046729, "grad_norm": 4.1875, "learning_rate": 4.969178441071028e-05, "loss": 0.4068, "step": 1918 }, { "epoch": 11.922118380062305, "grad_norm": 2.9375, "learning_rate": 4.9691398609664233e-05, "loss": 0.2436, "step": 1919 }, { "epoch": 11.928348909657322, "grad_norm": 2.765625, "learning_rate": 4.969101256881039e-05, "loss": 0.2341, "step": 1920 }, { "epoch": 11.934579439252337, "grad_norm": 2.71875, "learning_rate": 4.9690626288152504e-05, "loss": 0.262, "step": 1921 }, { "epoch": 11.940809968847352, "grad_norm": 1.9921875, "learning_rate": 4.969023976769431e-05, "loss": 0.1892, "step": 1922 }, { "epoch": 11.947040498442368, "grad_norm": 3.578125, "learning_rate": 4.9689853007439577e-05, "loss": 0.2112, "step": 1923 }, { "epoch": 11.953271028037383, "grad_norm": 3.515625, "learning_rate": 4.968946600739206e-05, "loss": 0.3895, "step": 1924 }, { "epoch": 11.959501557632398, "grad_norm": 3.15625, "learning_rate": 4.9689078767555505e-05, "loss": 0.2395, "step": 1925 }, { "epoch": 11.965732087227414, "grad_norm": 3.84375, "learning_rate": 4.968869128793369e-05, "loss": 0.2977, "step": 1926 }, { "epoch": 11.97196261682243, "grad_norm": 3.21875, "learning_rate": 4.968830356853037e-05, "loss": 0.5132, "step": 1927 }, { "epoch": 11.978193146417446, "grad_norm": 3.078125, "learning_rate": 4.9687915609349305e-05, "loss": 0.2922, "step": 1928 }, { "epoch": 11.984423676012462, "grad_norm": 4.03125, "learning_rate": 4.9687527410394275e-05, "loss": 0.3033, "step": 1929 }, { "epoch": 11.990654205607477, "grad_norm": 3.5, "learning_rate": 4.968713897166903e-05, "loss": 0.3741, "step": 1930 }, { "epoch": 11.996884735202492, "grad_norm": 3.59375, "learning_rate": 4.968675029317738e-05, "loss": 0.3112, "step": 1931 }, { "epoch": 12.0, "grad_norm": 3.03125, "learning_rate": 4.968636137492305e-05, "loss": 0.1329, "step": 1932 }, { "epoch": 12.006230529595015, "grad_norm": 2.546875, "learning_rate": 4.968597221690986e-05, "loss": 0.2455, "step": 1933 }, { "epoch": 12.01246105919003, "grad_norm": 3.140625, "learning_rate": 4.968558281914158e-05, "loss": 0.317, "step": 1934 }, { "epoch": 12.018691588785046, "grad_norm": 1.96875, "learning_rate": 4.968519318162196e-05, "loss": 0.203, "step": 1935 }, { "epoch": 12.024922118380061, "grad_norm": 3.90625, "learning_rate": 4.968480330435483e-05, "loss": 0.427, "step": 1936 }, { "epoch": 12.031152647975079, "grad_norm": 4.46875, "learning_rate": 4.9684413187343946e-05, "loss": 0.3895, "step": 1937 }, { "epoch": 12.037383177570094, "grad_norm": 4.34375, "learning_rate": 4.9684022830593116e-05, "loss": 0.2898, "step": 1938 }, { "epoch": 12.04361370716511, "grad_norm": 1.4921875, "learning_rate": 4.968363223410612e-05, "loss": 0.1643, "step": 1939 }, { "epoch": 12.049844236760125, "grad_norm": 4.21875, "learning_rate": 4.9683241397886745e-05, "loss": 0.2907, "step": 1940 }, { "epoch": 12.05607476635514, "grad_norm": 2.953125, "learning_rate": 4.96828503219388e-05, "loss": 0.3146, "step": 1941 }, { "epoch": 12.062305295950155, "grad_norm": 2.65625, "learning_rate": 4.968245900626608e-05, "loss": 0.2888, "step": 1942 }, { "epoch": 12.06853582554517, "grad_norm": 2.640625, "learning_rate": 4.9682067450872385e-05, "loss": 0.2736, "step": 1943 }, { "epoch": 12.074766355140186, "grad_norm": 1.953125, "learning_rate": 4.968167565576151e-05, "loss": 0.1886, "step": 1944 }, { "epoch": 12.080996884735203, "grad_norm": 4.03125, "learning_rate": 4.968128362093728e-05, "loss": 0.2558, "step": 1945 }, { "epoch": 12.087227414330219, "grad_norm": 2.703125, "learning_rate": 4.968089134640348e-05, "loss": 0.2106, "step": 1946 }, { "epoch": 12.093457943925234, "grad_norm": 3.796875, "learning_rate": 4.968049883216393e-05, "loss": 0.4318, "step": 1947 }, { "epoch": 12.09968847352025, "grad_norm": 2.640625, "learning_rate": 4.9680106078222444e-05, "loss": 0.3833, "step": 1948 }, { "epoch": 12.105919003115265, "grad_norm": 4.3125, "learning_rate": 4.967971308458283e-05, "loss": 0.3959, "step": 1949 }, { "epoch": 12.11214953271028, "grad_norm": 3.71875, "learning_rate": 4.967931985124892e-05, "loss": 0.2953, "step": 1950 }, { "epoch": 12.118380062305295, "grad_norm": 3.28125, "learning_rate": 4.967892637822451e-05, "loss": 0.2887, "step": 1951 }, { "epoch": 12.12461059190031, "grad_norm": 3.5625, "learning_rate": 4.9678532665513444e-05, "loss": 0.4137, "step": 1952 }, { "epoch": 12.130841121495328, "grad_norm": 2.875, "learning_rate": 4.9678138713119524e-05, "loss": 0.2699, "step": 1953 }, { "epoch": 12.137071651090343, "grad_norm": 2.984375, "learning_rate": 4.96777445210466e-05, "loss": 0.3674, "step": 1954 }, { "epoch": 12.143302180685358, "grad_norm": 4.0, "learning_rate": 4.967735008929848e-05, "loss": 0.4276, "step": 1955 }, { "epoch": 12.149532710280374, "grad_norm": 2.53125, "learning_rate": 4.967695541787901e-05, "loss": 0.2574, "step": 1956 }, { "epoch": 12.15576323987539, "grad_norm": 4.21875, "learning_rate": 4.967656050679201e-05, "loss": 0.4382, "step": 1957 }, { "epoch": 12.161993769470405, "grad_norm": 4.03125, "learning_rate": 4.967616535604133e-05, "loss": 0.3072, "step": 1958 }, { "epoch": 12.16822429906542, "grad_norm": 3.1875, "learning_rate": 4.967576996563079e-05, "loss": 0.2973, "step": 1959 }, { "epoch": 12.174454828660435, "grad_norm": 2.59375, "learning_rate": 4.967537433556424e-05, "loss": 0.3327, "step": 1960 }, { "epoch": 12.180685358255452, "grad_norm": 3.03125, "learning_rate": 4.967497846584552e-05, "loss": 0.2096, "step": 1961 }, { "epoch": 12.186915887850468, "grad_norm": 4.75, "learning_rate": 4.967458235647849e-05, "loss": 0.2097, "step": 1962 }, { "epoch": 12.193146417445483, "grad_norm": 5.375, "learning_rate": 4.967418600746697e-05, "loss": 0.3963, "step": 1963 }, { "epoch": 12.199376947040498, "grad_norm": 3.875, "learning_rate": 4.967378941881483e-05, "loss": 0.4216, "step": 1964 }, { "epoch": 12.205607476635514, "grad_norm": 3.6875, "learning_rate": 4.9673392590525915e-05, "loss": 0.2706, "step": 1965 }, { "epoch": 12.21183800623053, "grad_norm": 4.25, "learning_rate": 4.9672995522604085e-05, "loss": 0.4475, "step": 1966 }, { "epoch": 12.218068535825545, "grad_norm": 2.828125, "learning_rate": 4.967259821505318e-05, "loss": 0.2232, "step": 1967 }, { "epoch": 12.22429906542056, "grad_norm": 4.6875, "learning_rate": 4.9672200667877065e-05, "loss": 0.2916, "step": 1968 }, { "epoch": 12.230529595015577, "grad_norm": 3.828125, "learning_rate": 4.967180288107961e-05, "loss": 0.3716, "step": 1969 }, { "epoch": 12.236760124610592, "grad_norm": 4.4375, "learning_rate": 4.9671404854664685e-05, "loss": 0.3205, "step": 1970 }, { "epoch": 12.242990654205608, "grad_norm": 4.375, "learning_rate": 4.967100658863613e-05, "loss": 0.2405, "step": 1971 }, { "epoch": 12.249221183800623, "grad_norm": 4.46875, "learning_rate": 4.967060808299784e-05, "loss": 0.4361, "step": 1972 }, { "epoch": 12.255451713395638, "grad_norm": 2.53125, "learning_rate": 4.967020933775366e-05, "loss": 0.218, "step": 1973 }, { "epoch": 12.261682242990654, "grad_norm": 4.53125, "learning_rate": 4.966981035290748e-05, "loss": 0.5257, "step": 1974 }, { "epoch": 12.26791277258567, "grad_norm": 3.203125, "learning_rate": 4.966941112846317e-05, "loss": 0.3195, "step": 1975 }, { "epoch": 12.274143302180685, "grad_norm": 3.0625, "learning_rate": 4.9669011664424616e-05, "loss": 0.2527, "step": 1976 }, { "epoch": 12.280373831775702, "grad_norm": 3.640625, "learning_rate": 4.9668611960795683e-05, "loss": 0.436, "step": 1977 }, { "epoch": 12.286604361370717, "grad_norm": 3.34375, "learning_rate": 4.966821201758025e-05, "loss": 0.2907, "step": 1978 }, { "epoch": 12.292834890965732, "grad_norm": 2.640625, "learning_rate": 4.9667811834782224e-05, "loss": 0.1936, "step": 1979 }, { "epoch": 12.299065420560748, "grad_norm": 2.890625, "learning_rate": 4.9667411412405477e-05, "loss": 0.282, "step": 1980 }, { "epoch": 12.305295950155763, "grad_norm": 2.484375, "learning_rate": 4.9667010750453896e-05, "loss": 0.274, "step": 1981 }, { "epoch": 12.311526479750778, "grad_norm": 3.390625, "learning_rate": 4.966660984893138e-05, "loss": 0.5162, "step": 1982 }, { "epoch": 12.317757009345794, "grad_norm": 3.578125, "learning_rate": 4.966620870784181e-05, "loss": 0.4643, "step": 1983 }, { "epoch": 12.32398753894081, "grad_norm": 2.78125, "learning_rate": 4.9665807327189096e-05, "loss": 0.2721, "step": 1984 }, { "epoch": 12.330218068535826, "grad_norm": 2.21875, "learning_rate": 4.966540570697713e-05, "loss": 0.1726, "step": 1985 }, { "epoch": 12.336448598130842, "grad_norm": 2.78125, "learning_rate": 4.966500384720981e-05, "loss": 0.3056, "step": 1986 }, { "epoch": 12.342679127725857, "grad_norm": 2.578125, "learning_rate": 4.966460174789105e-05, "loss": 0.2082, "step": 1987 }, { "epoch": 12.348909657320872, "grad_norm": 4.4375, "learning_rate": 4.9664199409024745e-05, "loss": 0.3279, "step": 1988 }, { "epoch": 12.355140186915888, "grad_norm": 3.640625, "learning_rate": 4.96637968306148e-05, "loss": 0.2231, "step": 1989 }, { "epoch": 12.361370716510903, "grad_norm": 4.21875, "learning_rate": 4.9663394012665135e-05, "loss": 0.3131, "step": 1990 }, { "epoch": 12.367601246105918, "grad_norm": 3.9375, "learning_rate": 4.9662990955179656e-05, "loss": 0.3297, "step": 1991 }, { "epoch": 12.373831775700934, "grad_norm": 3.53125, "learning_rate": 4.966258765816227e-05, "loss": 0.3684, "step": 1992 }, { "epoch": 12.38006230529595, "grad_norm": 4.5, "learning_rate": 4.966218412161692e-05, "loss": 0.3489, "step": 1993 }, { "epoch": 12.386292834890966, "grad_norm": 2.140625, "learning_rate": 4.9661780345547495e-05, "loss": 0.19, "step": 1994 }, { "epoch": 12.392523364485982, "grad_norm": 4.34375, "learning_rate": 4.966137632995793e-05, "loss": 0.3512, "step": 1995 }, { "epoch": 12.398753894080997, "grad_norm": 4.03125, "learning_rate": 4.9660972074852154e-05, "loss": 0.2497, "step": 1996 }, { "epoch": 12.404984423676012, "grad_norm": 1.5703125, "learning_rate": 4.966056758023408e-05, "loss": 0.2018, "step": 1997 }, { "epoch": 12.411214953271028, "grad_norm": 3.09375, "learning_rate": 4.9660162846107654e-05, "loss": 0.4691, "step": 1998 }, { "epoch": 12.417445482866043, "grad_norm": 3.703125, "learning_rate": 4.96597578724768e-05, "loss": 0.3415, "step": 1999 }, { "epoch": 12.423676012461058, "grad_norm": 2.640625, "learning_rate": 4.965935265934544e-05, "loss": 0.2843, "step": 2000 }, { "epoch": 12.429906542056075, "grad_norm": 3.5, "learning_rate": 4.965894720671751e-05, "loss": 0.4128, "step": 2001 }, { "epoch": 12.43613707165109, "grad_norm": 2.65625, "learning_rate": 4.965854151459697e-05, "loss": 0.2927, "step": 2002 }, { "epoch": 12.442367601246106, "grad_norm": 3.15625, "learning_rate": 4.965813558298773e-05, "loss": 0.4498, "step": 2003 }, { "epoch": 12.448598130841122, "grad_norm": 3.3125, "learning_rate": 4.965772941189376e-05, "loss": 0.3049, "step": 2004 }, { "epoch": 12.454828660436137, "grad_norm": 2.96875, "learning_rate": 4.965732300131899e-05, "loss": 0.2197, "step": 2005 }, { "epoch": 12.461059190031152, "grad_norm": 4.03125, "learning_rate": 4.9656916351267375e-05, "loss": 0.3732, "step": 2006 }, { "epoch": 12.467289719626168, "grad_norm": 2.5625, "learning_rate": 4.965650946174285e-05, "loss": 0.2442, "step": 2007 }, { "epoch": 12.473520249221183, "grad_norm": 1.71875, "learning_rate": 4.965610233274939e-05, "loss": 0.19, "step": 2008 }, { "epoch": 12.4797507788162, "grad_norm": 2.6875, "learning_rate": 4.965569496429092e-05, "loss": 0.2317, "step": 2009 }, { "epoch": 12.485981308411215, "grad_norm": 3.328125, "learning_rate": 4.9655287356371416e-05, "loss": 0.4297, "step": 2010 }, { "epoch": 12.49221183800623, "grad_norm": 3.65625, "learning_rate": 4.965487950899484e-05, "loss": 0.2787, "step": 2011 }, { "epoch": 12.498442367601246, "grad_norm": 3.03125, "learning_rate": 4.9654471422165135e-05, "loss": 0.2729, "step": 2012 }, { "epoch": 12.504672897196262, "grad_norm": 3.265625, "learning_rate": 4.9654063095886286e-05, "loss": 0.2213, "step": 2013 }, { "epoch": 12.510903426791277, "grad_norm": 3.578125, "learning_rate": 4.965365453016224e-05, "loss": 0.4686, "step": 2014 }, { "epoch": 12.517133956386292, "grad_norm": 3.640625, "learning_rate": 4.965324572499698e-05, "loss": 0.3015, "step": 2015 }, { "epoch": 12.523364485981308, "grad_norm": 2.90625, "learning_rate": 4.965283668039447e-05, "loss": 0.2362, "step": 2016 }, { "epoch": 12.529595015576325, "grad_norm": 6.4375, "learning_rate": 4.965242739635867e-05, "loss": 0.3562, "step": 2017 }, { "epoch": 12.53582554517134, "grad_norm": 6.46875, "learning_rate": 4.965201787289358e-05, "loss": 0.248, "step": 2018 }, { "epoch": 12.542056074766355, "grad_norm": 4.59375, "learning_rate": 4.965160811000316e-05, "loss": 0.3131, "step": 2019 }, { "epoch": 12.54828660436137, "grad_norm": 3.828125, "learning_rate": 4.96511981076914e-05, "loss": 0.4679, "step": 2020 }, { "epoch": 12.554517133956386, "grad_norm": 5.1875, "learning_rate": 4.965078786596227e-05, "loss": 0.3069, "step": 2021 }, { "epoch": 12.560747663551401, "grad_norm": 4.5625, "learning_rate": 4.965037738481976e-05, "loss": 0.1957, "step": 2022 }, { "epoch": 12.566978193146417, "grad_norm": 2.921875, "learning_rate": 4.9649966664267856e-05, "loss": 0.1921, "step": 2023 }, { "epoch": 12.573208722741432, "grad_norm": 4.03125, "learning_rate": 4.964955570431055e-05, "loss": 0.2409, "step": 2024 }, { "epoch": 12.57943925233645, "grad_norm": 3.65625, "learning_rate": 4.964914450495183e-05, "loss": 0.2861, "step": 2025 }, { "epoch": 12.585669781931465, "grad_norm": 6.46875, "learning_rate": 4.9648733066195694e-05, "loss": 0.2647, "step": 2026 }, { "epoch": 12.59190031152648, "grad_norm": 4.71875, "learning_rate": 4.9648321388046137e-05, "loss": 0.2406, "step": 2027 }, { "epoch": 12.598130841121495, "grad_norm": 4.3125, "learning_rate": 4.964790947050715e-05, "loss": 0.4196, "step": 2028 }, { "epoch": 12.60436137071651, "grad_norm": 2.96875, "learning_rate": 4.9647497313582744e-05, "loss": 0.314, "step": 2029 }, { "epoch": 12.610591900311526, "grad_norm": 3.6875, "learning_rate": 4.964708491727692e-05, "loss": 0.2913, "step": 2030 }, { "epoch": 12.616822429906541, "grad_norm": 6.3125, "learning_rate": 4.9646672281593674e-05, "loss": 0.3647, "step": 2031 }, { "epoch": 12.623052959501557, "grad_norm": 4.9375, "learning_rate": 4.964625940653702e-05, "loss": 0.379, "step": 2032 }, { "epoch": 12.629283489096574, "grad_norm": 3.5625, "learning_rate": 4.964584629211098e-05, "loss": 0.2402, "step": 2033 }, { "epoch": 12.63551401869159, "grad_norm": 2.859375, "learning_rate": 4.9645432938319536e-05, "loss": 0.2692, "step": 2034 }, { "epoch": 12.641744548286605, "grad_norm": 5.6875, "learning_rate": 4.9645019345166734e-05, "loss": 0.3807, "step": 2035 }, { "epoch": 12.64797507788162, "grad_norm": 4.03125, "learning_rate": 4.964460551265657e-05, "loss": 0.3105, "step": 2036 }, { "epoch": 12.654205607476635, "grad_norm": 4.125, "learning_rate": 4.9644191440793076e-05, "loss": 0.2636, "step": 2037 }, { "epoch": 12.66043613707165, "grad_norm": 2.578125, "learning_rate": 4.9643777129580263e-05, "loss": 0.2434, "step": 2038 }, { "epoch": 12.666666666666666, "grad_norm": 2.765625, "learning_rate": 4.964336257902217e-05, "loss": 0.3072, "step": 2039 }, { "epoch": 12.672897196261681, "grad_norm": 3.125, "learning_rate": 4.964294778912281e-05, "loss": 0.2468, "step": 2040 }, { "epoch": 12.679127725856699, "grad_norm": 2.9375, "learning_rate": 4.9642532759886205e-05, "loss": 0.2867, "step": 2041 }, { "epoch": 12.685358255451714, "grad_norm": 4.375, "learning_rate": 4.96421174913164e-05, "loss": 0.4539, "step": 2042 }, { "epoch": 12.69158878504673, "grad_norm": 2.71875, "learning_rate": 4.9641701983417434e-05, "loss": 0.2407, "step": 2043 }, { "epoch": 12.697819314641745, "grad_norm": 2.90625, "learning_rate": 4.964128623619332e-05, "loss": 0.3282, "step": 2044 }, { "epoch": 12.70404984423676, "grad_norm": 5.375, "learning_rate": 4.964087024964812e-05, "loss": 0.3078, "step": 2045 }, { "epoch": 12.710280373831775, "grad_norm": 4.375, "learning_rate": 4.964045402378585e-05, "loss": 0.2152, "step": 2046 }, { "epoch": 12.71651090342679, "grad_norm": 4.5625, "learning_rate": 4.964003755861057e-05, "loss": 0.3074, "step": 2047 }, { "epoch": 12.722741433021806, "grad_norm": 3.59375, "learning_rate": 4.9639620854126326e-05, "loss": 0.3332, "step": 2048 }, { "epoch": 12.728971962616823, "grad_norm": 3.953125, "learning_rate": 4.963920391033715e-05, "loss": 0.3088, "step": 2049 }, { "epoch": 12.735202492211839, "grad_norm": 4.34375, "learning_rate": 4.963878672724711e-05, "loss": 0.3381, "step": 2050 }, { "epoch": 12.741433021806854, "grad_norm": 4.21875, "learning_rate": 4.963836930486025e-05, "loss": 0.3092, "step": 2051 }, { "epoch": 12.74766355140187, "grad_norm": 2.90625, "learning_rate": 4.9637951643180605e-05, "loss": 0.2462, "step": 2052 }, { "epoch": 12.753894080996885, "grad_norm": 6.46875, "learning_rate": 4.963753374221226e-05, "loss": 0.2742, "step": 2053 }, { "epoch": 12.7601246105919, "grad_norm": 5.28125, "learning_rate": 4.963711560195926e-05, "loss": 0.3235, "step": 2054 }, { "epoch": 12.766355140186915, "grad_norm": 5.6875, "learning_rate": 4.963669722242567e-05, "loss": 0.2465, "step": 2055 }, { "epoch": 12.77258566978193, "grad_norm": 4.9375, "learning_rate": 4.963627860361555e-05, "loss": 0.33, "step": 2056 }, { "epoch": 12.778816199376948, "grad_norm": 2.125, "learning_rate": 4.9635859745532974e-05, "loss": 0.2064, "step": 2057 }, { "epoch": 12.785046728971963, "grad_norm": 2.703125, "learning_rate": 4.9635440648181994e-05, "loss": 0.2536, "step": 2058 }, { "epoch": 12.791277258566979, "grad_norm": 4.96875, "learning_rate": 4.9635021311566696e-05, "loss": 0.3027, "step": 2059 }, { "epoch": 12.797507788161994, "grad_norm": 3.78125, "learning_rate": 4.963460173569114e-05, "loss": 0.4137, "step": 2060 }, { "epoch": 12.80373831775701, "grad_norm": 2.421875, "learning_rate": 4.963418192055942e-05, "loss": 0.2092, "step": 2061 }, { "epoch": 12.809968847352025, "grad_norm": 4.4375, "learning_rate": 4.9633761866175586e-05, "loss": 0.3685, "step": 2062 }, { "epoch": 12.81619937694704, "grad_norm": 5.78125, "learning_rate": 4.9633341572543745e-05, "loss": 0.4078, "step": 2063 }, { "epoch": 12.822429906542055, "grad_norm": 6.09375, "learning_rate": 4.963292103966796e-05, "loss": 0.3216, "step": 2064 }, { "epoch": 12.828660436137072, "grad_norm": 3.90625, "learning_rate": 4.963250026755232e-05, "loss": 0.2011, "step": 2065 }, { "epoch": 12.834890965732088, "grad_norm": 3.25, "learning_rate": 4.963207925620092e-05, "loss": 0.3246, "step": 2066 }, { "epoch": 12.841121495327103, "grad_norm": 5.03125, "learning_rate": 4.963165800561784e-05, "loss": 0.2769, "step": 2067 }, { "epoch": 12.847352024922118, "grad_norm": 5.0, "learning_rate": 4.963123651580717e-05, "loss": 0.4097, "step": 2068 }, { "epoch": 12.853582554517134, "grad_norm": 3.25, "learning_rate": 4.963081478677301e-05, "loss": 0.253, "step": 2069 }, { "epoch": 12.85981308411215, "grad_norm": 3.265625, "learning_rate": 4.9630392818519457e-05, "loss": 0.4105, "step": 2070 }, { "epoch": 12.866043613707165, "grad_norm": 3.8125, "learning_rate": 4.96299706110506e-05, "loss": 0.3166, "step": 2071 }, { "epoch": 12.87227414330218, "grad_norm": 3.78125, "learning_rate": 4.9629548164370546e-05, "loss": 0.3095, "step": 2072 }, { "epoch": 12.878504672897197, "grad_norm": 4.625, "learning_rate": 4.962912547848339e-05, "loss": 0.3666, "step": 2073 }, { "epoch": 12.884735202492212, "grad_norm": 4.75, "learning_rate": 4.9628702553393255e-05, "loss": 0.4749, "step": 2074 }, { "epoch": 12.890965732087228, "grad_norm": 3.109375, "learning_rate": 4.962827938910424e-05, "loss": 0.2609, "step": 2075 }, { "epoch": 12.897196261682243, "grad_norm": 3.609375, "learning_rate": 4.962785598562044e-05, "loss": 0.241, "step": 2076 }, { "epoch": 12.903426791277258, "grad_norm": 3.265625, "learning_rate": 4.962743234294599e-05, "loss": 0.2721, "step": 2077 }, { "epoch": 12.909657320872274, "grad_norm": 3.296875, "learning_rate": 4.9627008461084987e-05, "loss": 0.3123, "step": 2078 }, { "epoch": 12.91588785046729, "grad_norm": 4.125, "learning_rate": 4.962658434004156e-05, "loss": 0.4008, "step": 2079 }, { "epoch": 12.922118380062305, "grad_norm": 5.03125, "learning_rate": 4.9626159979819826e-05, "loss": 0.3552, "step": 2080 }, { "epoch": 12.928348909657322, "grad_norm": 4.46875, "learning_rate": 4.96257353804239e-05, "loss": 0.4172, "step": 2081 }, { "epoch": 12.934579439252337, "grad_norm": 2.78125, "learning_rate": 4.962531054185791e-05, "loss": 0.3121, "step": 2082 }, { "epoch": 12.940809968847352, "grad_norm": 2.703125, "learning_rate": 4.962488546412598e-05, "loss": 0.2668, "step": 2083 }, { "epoch": 12.947040498442368, "grad_norm": 4.75, "learning_rate": 4.962446014723224e-05, "loss": 0.3662, "step": 2084 }, { "epoch": 12.953271028037383, "grad_norm": 2.859375, "learning_rate": 4.9624034591180815e-05, "loss": 0.2082, "step": 2085 }, { "epoch": 12.959501557632398, "grad_norm": 3.78125, "learning_rate": 4.962360879597585e-05, "loss": 0.3149, "step": 2086 }, { "epoch": 12.965732087227414, "grad_norm": 3.390625, "learning_rate": 4.962318276162148e-05, "loss": 0.3184, "step": 2087 }, { "epoch": 12.97196261682243, "grad_norm": 3.375, "learning_rate": 4.9622756488121826e-05, "loss": 0.2869, "step": 2088 }, { "epoch": 12.978193146417446, "grad_norm": 2.453125, "learning_rate": 4.962232997548105e-05, "loss": 0.1648, "step": 2089 }, { "epoch": 12.984423676012462, "grad_norm": 4.53125, "learning_rate": 4.962190322370327e-05, "loss": 0.3891, "step": 2090 }, { "epoch": 12.990654205607477, "grad_norm": 2.375, "learning_rate": 4.9621476232792654e-05, "loss": 0.2075, "step": 2091 }, { "epoch": 12.996884735202492, "grad_norm": 2.578125, "learning_rate": 4.962104900275334e-05, "loss": 0.2254, "step": 2092 }, { "epoch": 13.0, "grad_norm": 2.28125, "learning_rate": 4.962062153358947e-05, "loss": 0.195, "step": 2093 }, { "epoch": 13.006230529595015, "grad_norm": 5.6875, "learning_rate": 4.962019382530521e-05, "loss": 0.301, "step": 2094 }, { "epoch": 13.01246105919003, "grad_norm": 3.078125, "learning_rate": 4.9619765877904696e-05, "loss": 0.248, "step": 2095 }, { "epoch": 13.018691588785046, "grad_norm": 3.4375, "learning_rate": 4.9619337691392096e-05, "loss": 0.2698, "step": 2096 }, { "epoch": 13.024922118380061, "grad_norm": 2.328125, "learning_rate": 4.9618909265771574e-05, "loss": 0.2458, "step": 2097 }, { "epoch": 13.031152647975079, "grad_norm": 5.65625, "learning_rate": 4.961848060104728e-05, "loss": 0.3919, "step": 2098 }, { "epoch": 13.037383177570094, "grad_norm": 3.296875, "learning_rate": 4.961805169722338e-05, "loss": 0.4126, "step": 2099 }, { "epoch": 13.04361370716511, "grad_norm": 2.625, "learning_rate": 4.9617622554304044e-05, "loss": 0.2848, "step": 2100 }, { "epoch": 13.049844236760125, "grad_norm": 3.84375, "learning_rate": 4.961719317229344e-05, "loss": 0.2336, "step": 2101 }, { "epoch": 13.05607476635514, "grad_norm": 2.234375, "learning_rate": 4.961676355119572e-05, "loss": 0.2096, "step": 2102 }, { "epoch": 13.062305295950155, "grad_norm": 2.4375, "learning_rate": 4.961633369101509e-05, "loss": 0.2514, "step": 2103 }, { "epoch": 13.06853582554517, "grad_norm": 3.15625, "learning_rate": 4.96159035917557e-05, "loss": 0.27, "step": 2104 }, { "epoch": 13.074766355140186, "grad_norm": 2.390625, "learning_rate": 4.9615473253421727e-05, "loss": 0.3274, "step": 2105 }, { "epoch": 13.080996884735203, "grad_norm": 2.890625, "learning_rate": 4.961504267601737e-05, "loss": 0.3066, "step": 2106 }, { "epoch": 13.087227414330219, "grad_norm": 1.984375, "learning_rate": 4.961461185954678e-05, "loss": 0.1936, "step": 2107 }, { "epoch": 13.093457943925234, "grad_norm": 2.28125, "learning_rate": 4.961418080401418e-05, "loss": 0.2279, "step": 2108 }, { "epoch": 13.09968847352025, "grad_norm": 3.3125, "learning_rate": 4.961374950942372e-05, "loss": 0.346, "step": 2109 }, { "epoch": 13.105919003115265, "grad_norm": 3.296875, "learning_rate": 4.9613317975779605e-05, "loss": 0.3768, "step": 2110 }, { "epoch": 13.11214953271028, "grad_norm": 3.171875, "learning_rate": 4.961288620308603e-05, "loss": 0.3256, "step": 2111 }, { "epoch": 13.118380062305295, "grad_norm": 2.671875, "learning_rate": 4.961245419134719e-05, "loss": 0.2556, "step": 2112 }, { "epoch": 13.12461059190031, "grad_norm": 3.640625, "learning_rate": 4.961202194056727e-05, "loss": 0.2642, "step": 2113 }, { "epoch": 13.130841121495328, "grad_norm": 4.625, "learning_rate": 4.9611589450750475e-05, "loss": 0.5115, "step": 2114 }, { "epoch": 13.137071651090343, "grad_norm": 3.5625, "learning_rate": 4.9611156721901e-05, "loss": 0.2911, "step": 2115 }, { "epoch": 13.143302180685358, "grad_norm": 3.546875, "learning_rate": 4.961072375402305e-05, "loss": 0.4232, "step": 2116 }, { "epoch": 13.149532710280374, "grad_norm": 2.3125, "learning_rate": 4.9610290547120835e-05, "loss": 0.2185, "step": 2117 }, { "epoch": 13.15576323987539, "grad_norm": 3.65625, "learning_rate": 4.960985710119855e-05, "loss": 0.4968, "step": 2118 }, { "epoch": 13.161993769470405, "grad_norm": 3.75, "learning_rate": 4.9609423416260426e-05, "loss": 0.217, "step": 2119 }, { "epoch": 13.16822429906542, "grad_norm": 2.046875, "learning_rate": 4.960898949231066e-05, "loss": 0.2573, "step": 2120 }, { "epoch": 13.174454828660435, "grad_norm": 3.296875, "learning_rate": 4.960855532935347e-05, "loss": 0.4622, "step": 2121 }, { "epoch": 13.180685358255452, "grad_norm": 3.6875, "learning_rate": 4.960812092739306e-05, "loss": 0.5526, "step": 2122 }, { "epoch": 13.186915887850468, "grad_norm": 2.78125, "learning_rate": 4.9607686286433676e-05, "loss": 0.2338, "step": 2123 }, { "epoch": 13.193146417445483, "grad_norm": 2.390625, "learning_rate": 4.960725140647952e-05, "loss": 0.293, "step": 2124 }, { "epoch": 13.199376947040498, "grad_norm": 6.4375, "learning_rate": 4.9606816287534805e-05, "loss": 0.5162, "step": 2125 }, { "epoch": 13.205607476635514, "grad_norm": 3.703125, "learning_rate": 4.9606380929603784e-05, "loss": 0.3307, "step": 2126 }, { "epoch": 13.21183800623053, "grad_norm": 2.5, "learning_rate": 4.960594533269067e-05, "loss": 0.2355, "step": 2127 }, { "epoch": 13.218068535825545, "grad_norm": 2.53125, "learning_rate": 4.960550949679969e-05, "loss": 0.2363, "step": 2128 }, { "epoch": 13.22429906542056, "grad_norm": 3.921875, "learning_rate": 4.96050734219351e-05, "loss": 0.4406, "step": 2129 }, { "epoch": 13.230529595015577, "grad_norm": 4.21875, "learning_rate": 4.960463710810111e-05, "loss": 0.444, "step": 2130 }, { "epoch": 13.236760124610592, "grad_norm": 3.359375, "learning_rate": 4.960420055530196e-05, "loss": 0.2749, "step": 2131 }, { "epoch": 13.242990654205608, "grad_norm": 3.40625, "learning_rate": 4.96037637635419e-05, "loss": 0.3246, "step": 2132 }, { "epoch": 13.249221183800623, "grad_norm": 2.328125, "learning_rate": 4.9603326732825175e-05, "loss": 0.1732, "step": 2133 }, { "epoch": 13.255451713395638, "grad_norm": 4.4375, "learning_rate": 4.960288946315601e-05, "loss": 0.5064, "step": 2134 }, { "epoch": 13.261682242990654, "grad_norm": 4.375, "learning_rate": 4.960245195453868e-05, "loss": 0.501, "step": 2135 }, { "epoch": 13.26791277258567, "grad_norm": 3.03125, "learning_rate": 4.96020142069774e-05, "loss": 0.2776, "step": 2136 }, { "epoch": 13.274143302180685, "grad_norm": 3.1875, "learning_rate": 4.960157622047645e-05, "loss": 0.3631, "step": 2137 }, { "epoch": 13.280373831775702, "grad_norm": 3.09375, "learning_rate": 4.960113799504008e-05, "loss": 0.3345, "step": 2138 }, { "epoch": 13.286604361370717, "grad_norm": 3.0, "learning_rate": 4.9600699530672536e-05, "loss": 0.3393, "step": 2139 }, { "epoch": 13.292834890965732, "grad_norm": 3.0, "learning_rate": 4.9600260827378074e-05, "loss": 0.511, "step": 2140 }, { "epoch": 13.299065420560748, "grad_norm": 1.8984375, "learning_rate": 4.9599821885160966e-05, "loss": 0.2118, "step": 2141 }, { "epoch": 13.305295950155763, "grad_norm": 2.265625, "learning_rate": 4.959938270402547e-05, "loss": 0.2322, "step": 2142 }, { "epoch": 13.311526479750778, "grad_norm": 2.953125, "learning_rate": 4.9598943283975854e-05, "loss": 0.2519, "step": 2143 }, { "epoch": 13.317757009345794, "grad_norm": 2.65625, "learning_rate": 4.959850362501638e-05, "loss": 0.2695, "step": 2144 }, { "epoch": 13.32398753894081, "grad_norm": 3.921875, "learning_rate": 4.9598063727151325e-05, "loss": 0.2793, "step": 2145 }, { "epoch": 13.330218068535826, "grad_norm": 2.5625, "learning_rate": 4.9597623590384956e-05, "loss": 0.1844, "step": 2146 }, { "epoch": 13.336448598130842, "grad_norm": 4.625, "learning_rate": 4.959718321472155e-05, "loss": 0.4477, "step": 2147 }, { "epoch": 13.342679127725857, "grad_norm": 2.90625, "learning_rate": 4.959674260016538e-05, "loss": 0.3134, "step": 2148 }, { "epoch": 13.348909657320872, "grad_norm": 3.28125, "learning_rate": 4.9596301746720734e-05, "loss": 0.3251, "step": 2149 }, { "epoch": 13.355140186915888, "grad_norm": 3.015625, "learning_rate": 4.959586065439189e-05, "loss": 0.3112, "step": 2150 }, { "epoch": 13.361370716510903, "grad_norm": 3.28125, "learning_rate": 4.9595419323183125e-05, "loss": 0.2154, "step": 2151 }, { "epoch": 13.367601246105918, "grad_norm": 3.46875, "learning_rate": 4.9594977753098734e-05, "loss": 0.3004, "step": 2152 }, { "epoch": 13.373831775700934, "grad_norm": 3.640625, "learning_rate": 4.959453594414301e-05, "loss": 0.256, "step": 2153 }, { "epoch": 13.38006230529595, "grad_norm": 2.59375, "learning_rate": 4.959409389632022e-05, "loss": 0.324, "step": 2154 }, { "epoch": 13.386292834890966, "grad_norm": 3.578125, "learning_rate": 4.9593651609634696e-05, "loss": 0.2222, "step": 2155 }, { "epoch": 13.392523364485982, "grad_norm": 2.53125, "learning_rate": 4.9593209084090696e-05, "loss": 0.2295, "step": 2156 }, { "epoch": 13.398753894080997, "grad_norm": 2.5, "learning_rate": 4.959276631969253e-05, "loss": 0.2218, "step": 2157 }, { "epoch": 13.404984423676012, "grad_norm": 4.625, "learning_rate": 4.959232331644451e-05, "loss": 0.3177, "step": 2158 }, { "epoch": 13.411214953271028, "grad_norm": 2.421875, "learning_rate": 4.9591880074350926e-05, "loss": 0.2643, "step": 2159 }, { "epoch": 13.417445482866043, "grad_norm": 3.5625, "learning_rate": 4.959143659341609e-05, "loss": 0.2342, "step": 2160 }, { "epoch": 13.423676012461058, "grad_norm": 3.6875, "learning_rate": 4.95909928736443e-05, "loss": 0.4089, "step": 2161 }, { "epoch": 13.429906542056075, "grad_norm": 2.953125, "learning_rate": 4.959054891503988e-05, "loss": 0.2234, "step": 2162 }, { "epoch": 13.43613707165109, "grad_norm": 4.0, "learning_rate": 4.9590104717607135e-05, "loss": 0.6074, "step": 2163 }, { "epoch": 13.442367601246106, "grad_norm": 2.921875, "learning_rate": 4.958966028135037e-05, "loss": 0.3639, "step": 2164 }, { "epoch": 13.448598130841122, "grad_norm": 2.734375, "learning_rate": 4.9589215606273905e-05, "loss": 0.2445, "step": 2165 }, { "epoch": 13.454828660436137, "grad_norm": 3.140625, "learning_rate": 4.958877069238208e-05, "loss": 0.2173, "step": 2166 }, { "epoch": 13.461059190031152, "grad_norm": 3.78125, "learning_rate": 4.9588325539679185e-05, "loss": 0.3397, "step": 2167 }, { "epoch": 13.467289719626168, "grad_norm": 3.265625, "learning_rate": 4.9587880148169554e-05, "loss": 0.2293, "step": 2168 }, { "epoch": 13.473520249221183, "grad_norm": 4.9375, "learning_rate": 4.9587434517857525e-05, "loss": 0.2203, "step": 2169 }, { "epoch": 13.4797507788162, "grad_norm": 3.390625, "learning_rate": 4.958698864874741e-05, "loss": 0.3538, "step": 2170 }, { "epoch": 13.485981308411215, "grad_norm": 3.625, "learning_rate": 4.958654254084355e-05, "loss": 0.3047, "step": 2171 }, { "epoch": 13.49221183800623, "grad_norm": 2.953125, "learning_rate": 4.958609619415028e-05, "loss": 0.1773, "step": 2172 }, { "epoch": 13.498442367601246, "grad_norm": 2.8125, "learning_rate": 4.958564960867192e-05, "loss": 0.2451, "step": 2173 }, { "epoch": 13.504672897196262, "grad_norm": 2.953125, "learning_rate": 4.958520278441282e-05, "loss": 0.2097, "step": 2174 }, { "epoch": 13.510903426791277, "grad_norm": 4.25, "learning_rate": 4.958475572137732e-05, "loss": 0.29, "step": 2175 }, { "epoch": 13.517133956386292, "grad_norm": 2.328125, "learning_rate": 4.9584308419569755e-05, "loss": 0.2186, "step": 2176 }, { "epoch": 13.523364485981308, "grad_norm": 2.984375, "learning_rate": 4.958386087899447e-05, "loss": 0.2766, "step": 2177 }, { "epoch": 13.529595015576325, "grad_norm": 2.53125, "learning_rate": 4.958341309965582e-05, "loss": 0.2522, "step": 2178 }, { "epoch": 13.53582554517134, "grad_norm": 1.34375, "learning_rate": 4.958296508155814e-05, "loss": 0.1498, "step": 2179 }, { "epoch": 13.542056074766355, "grad_norm": 1.8671875, "learning_rate": 4.9582516824705795e-05, "loss": 0.1581, "step": 2180 }, { "epoch": 13.54828660436137, "grad_norm": 2.0625, "learning_rate": 4.9582068329103125e-05, "loss": 0.1992, "step": 2181 }, { "epoch": 13.554517133956386, "grad_norm": 3.0, "learning_rate": 4.95816195947545e-05, "loss": 0.3419, "step": 2182 }, { "epoch": 13.560747663551401, "grad_norm": 2.203125, "learning_rate": 4.958117062166427e-05, "loss": 0.2124, "step": 2183 }, { "epoch": 13.566978193146417, "grad_norm": 4.875, "learning_rate": 4.958072140983679e-05, "loss": 0.2445, "step": 2184 }, { "epoch": 13.573208722741432, "grad_norm": 4.21875, "learning_rate": 4.9580271959276445e-05, "loss": 0.2612, "step": 2185 }, { "epoch": 13.57943925233645, "grad_norm": 2.828125, "learning_rate": 4.9579822269987574e-05, "loss": 0.2111, "step": 2186 }, { "epoch": 13.585669781931465, "grad_norm": 3.421875, "learning_rate": 4.957937234197456e-05, "loss": 0.4228, "step": 2187 }, { "epoch": 13.59190031152648, "grad_norm": 2.703125, "learning_rate": 4.957892217524177e-05, "loss": 0.2392, "step": 2188 }, { "epoch": 13.598130841121495, "grad_norm": 1.6953125, "learning_rate": 4.957847176979357e-05, "loss": 0.1894, "step": 2189 }, { "epoch": 13.60436137071651, "grad_norm": 2.25, "learning_rate": 4.957802112563434e-05, "loss": 0.3218, "step": 2190 }, { "epoch": 13.610591900311526, "grad_norm": 3.4375, "learning_rate": 4.957757024276846e-05, "loss": 0.2906, "step": 2191 }, { "epoch": 13.616822429906541, "grad_norm": 4.75, "learning_rate": 4.957711912120031e-05, "loss": 0.3414, "step": 2192 }, { "epoch": 13.623052959501557, "grad_norm": 2.3125, "learning_rate": 4.9576667760934256e-05, "loss": 0.1836, "step": 2193 }, { "epoch": 13.629283489096574, "grad_norm": 4.15625, "learning_rate": 4.957621616197469e-05, "loss": 0.3271, "step": 2194 }, { "epoch": 13.63551401869159, "grad_norm": 2.84375, "learning_rate": 4.9575764324326004e-05, "loss": 0.2161, "step": 2195 }, { "epoch": 13.641744548286605, "grad_norm": 4.78125, "learning_rate": 4.9575312247992584e-05, "loss": 0.4427, "step": 2196 }, { "epoch": 13.64797507788162, "grad_norm": 2.5625, "learning_rate": 4.957485993297882e-05, "loss": 0.2373, "step": 2197 }, { "epoch": 13.654205607476635, "grad_norm": 4.15625, "learning_rate": 4.95744073792891e-05, "loss": 0.3088, "step": 2198 }, { "epoch": 13.66043613707165, "grad_norm": 3.078125, "learning_rate": 4.957395458692782e-05, "loss": 0.215, "step": 2199 }, { "epoch": 13.666666666666666, "grad_norm": 3.4375, "learning_rate": 4.9573501555899394e-05, "loss": 0.1828, "step": 2200 }, { "epoch": 13.672897196261681, "grad_norm": 3.03125, "learning_rate": 4.95730482862082e-05, "loss": 0.2356, "step": 2201 }, { "epoch": 13.679127725856699, "grad_norm": 2.609375, "learning_rate": 4.9572594777858644e-05, "loss": 0.4203, "step": 2202 }, { "epoch": 13.685358255451714, "grad_norm": 4.78125, "learning_rate": 4.957214103085514e-05, "loss": 0.2602, "step": 2203 }, { "epoch": 13.69158878504673, "grad_norm": 4.40625, "learning_rate": 4.957168704520209e-05, "loss": 0.3767, "step": 2204 }, { "epoch": 13.697819314641745, "grad_norm": 3.265625, "learning_rate": 4.9571232820903903e-05, "loss": 0.3451, "step": 2205 }, { "epoch": 13.70404984423676, "grad_norm": 2.25, "learning_rate": 4.9570778357965e-05, "loss": 0.2051, "step": 2206 }, { "epoch": 13.710280373831775, "grad_norm": 3.0, "learning_rate": 4.9570323656389776e-05, "loss": 0.3567, "step": 2207 }, { "epoch": 13.71651090342679, "grad_norm": 3.375, "learning_rate": 4.956986871618266e-05, "loss": 0.3363, "step": 2208 }, { "epoch": 13.722741433021806, "grad_norm": 2.890625, "learning_rate": 4.956941353734807e-05, "loss": 0.234, "step": 2209 }, { "epoch": 13.728971962616823, "grad_norm": 2.703125, "learning_rate": 4.9568958119890417e-05, "loss": 0.3315, "step": 2210 }, { "epoch": 13.735202492211839, "grad_norm": 2.84375, "learning_rate": 4.956850246381414e-05, "loss": 0.2427, "step": 2211 }, { "epoch": 13.741433021806854, "grad_norm": 3.296875, "learning_rate": 4.956804656912365e-05, "loss": 0.3098, "step": 2212 }, { "epoch": 13.74766355140187, "grad_norm": 2.859375, "learning_rate": 4.9567590435823383e-05, "loss": 0.2023, "step": 2213 }, { "epoch": 13.753894080996885, "grad_norm": 4.03125, "learning_rate": 4.956713406391777e-05, "loss": 0.6036, "step": 2214 }, { "epoch": 13.7601246105919, "grad_norm": 2.703125, "learning_rate": 4.956667745341124e-05, "loss": 0.3741, "step": 2215 }, { "epoch": 13.766355140186915, "grad_norm": 2.671875, "learning_rate": 4.9566220604308224e-05, "loss": 0.2995, "step": 2216 }, { "epoch": 13.77258566978193, "grad_norm": 2.734375, "learning_rate": 4.9565763516613165e-05, "loss": 0.2605, "step": 2217 }, { "epoch": 13.778816199376948, "grad_norm": 3.28125, "learning_rate": 4.956530619033049e-05, "loss": 0.3205, "step": 2218 }, { "epoch": 13.785046728971963, "grad_norm": 2.84375, "learning_rate": 4.956484862546467e-05, "loss": 0.3675, "step": 2219 }, { "epoch": 13.791277258566979, "grad_norm": 4.15625, "learning_rate": 4.9564390822020114e-05, "loss": 0.258, "step": 2220 }, { "epoch": 13.797507788161994, "grad_norm": 3.234375, "learning_rate": 4.956393278000129e-05, "loss": 0.2464, "step": 2221 }, { "epoch": 13.80373831775701, "grad_norm": 3.03125, "learning_rate": 4.956347449941264e-05, "loss": 0.2326, "step": 2222 }, { "epoch": 13.809968847352025, "grad_norm": 3.546875, "learning_rate": 4.956301598025861e-05, "loss": 0.3667, "step": 2223 }, { "epoch": 13.81619937694704, "grad_norm": 2.859375, "learning_rate": 4.956255722254367e-05, "loss": 0.3655, "step": 2224 }, { "epoch": 13.822429906542055, "grad_norm": 5.21875, "learning_rate": 4.956209822627226e-05, "loss": 0.3301, "step": 2225 }, { "epoch": 13.828660436137072, "grad_norm": 3.09375, "learning_rate": 4.956163899144884e-05, "loss": 0.2702, "step": 2226 }, { "epoch": 13.834890965732088, "grad_norm": 2.828125, "learning_rate": 4.9561179518077875e-05, "loss": 0.2701, "step": 2227 }, { "epoch": 13.841121495327103, "grad_norm": 5.15625, "learning_rate": 4.956071980616382e-05, "loss": 0.3026, "step": 2228 }, { "epoch": 13.847352024922118, "grad_norm": 3.9375, "learning_rate": 4.956025985571115e-05, "loss": 0.2596, "step": 2229 }, { "epoch": 13.853582554517134, "grad_norm": 2.5, "learning_rate": 4.955979966672432e-05, "loss": 0.3511, "step": 2230 }, { "epoch": 13.85981308411215, "grad_norm": 3.15625, "learning_rate": 4.9559339239207815e-05, "loss": 0.3386, "step": 2231 }, { "epoch": 13.866043613707165, "grad_norm": 5.3125, "learning_rate": 4.955887857316609e-05, "loss": 0.4464, "step": 2232 }, { "epoch": 13.87227414330218, "grad_norm": 3.71875, "learning_rate": 4.955841766860363e-05, "loss": 0.2741, "step": 2233 }, { "epoch": 13.878504672897197, "grad_norm": 4.125, "learning_rate": 4.9557956525524904e-05, "loss": 0.2682, "step": 2234 }, { "epoch": 13.884735202492212, "grad_norm": 4.09375, "learning_rate": 4.9557495143934405e-05, "loss": 0.3089, "step": 2235 }, { "epoch": 13.890965732087228, "grad_norm": 2.515625, "learning_rate": 4.95570335238366e-05, "loss": 0.2121, "step": 2236 }, { "epoch": 13.897196261682243, "grad_norm": 3.015625, "learning_rate": 4.955657166523597e-05, "loss": 0.2475, "step": 2237 }, { "epoch": 13.903426791277258, "grad_norm": 4.0, "learning_rate": 4.955610956813702e-05, "loss": 0.3998, "step": 2238 }, { "epoch": 13.909657320872274, "grad_norm": 3.1875, "learning_rate": 4.9555647232544214e-05, "loss": 0.2644, "step": 2239 }, { "epoch": 13.91588785046729, "grad_norm": 2.03125, "learning_rate": 4.955518465846206e-05, "loss": 0.2186, "step": 2240 }, { "epoch": 13.922118380062305, "grad_norm": 2.109375, "learning_rate": 4.955472184589505e-05, "loss": 0.2432, "step": 2241 }, { "epoch": 13.928348909657322, "grad_norm": 2.234375, "learning_rate": 4.955425879484766e-05, "loss": 0.2431, "step": 2242 }, { "epoch": 13.934579439252337, "grad_norm": 3.5625, "learning_rate": 4.9553795505324406e-05, "loss": 0.244, "step": 2243 }, { "epoch": 13.940809968847352, "grad_norm": 5.875, "learning_rate": 4.955333197732978e-05, "loss": 0.2789, "step": 2244 }, { "epoch": 13.947040498442368, "grad_norm": 4.21875, "learning_rate": 4.955286821086829e-05, "loss": 0.4741, "step": 2245 }, { "epoch": 13.953271028037383, "grad_norm": 2.96875, "learning_rate": 4.955240420594443e-05, "loss": 0.252, "step": 2246 }, { "epoch": 13.959501557632398, "grad_norm": 3.46875, "learning_rate": 4.9551939962562714e-05, "loss": 0.2861, "step": 2247 }, { "epoch": 13.965732087227414, "grad_norm": 4.71875, "learning_rate": 4.955147548072765e-05, "loss": 0.3961, "step": 2248 }, { "epoch": 13.97196261682243, "grad_norm": 4.09375, "learning_rate": 4.955101076044375e-05, "loss": 0.4026, "step": 2249 }, { "epoch": 13.978193146417446, "grad_norm": 2.375, "learning_rate": 4.955054580171553e-05, "loss": 0.1723, "step": 2250 }, { "epoch": 13.984423676012462, "grad_norm": 2.390625, "learning_rate": 4.955008060454749e-05, "loss": 0.3364, "step": 2251 }, { "epoch": 13.990654205607477, "grad_norm": 5.4375, "learning_rate": 4.954961516894417e-05, "loss": 0.3447, "step": 2252 }, { "epoch": 13.996884735202492, "grad_norm": 5.15625, "learning_rate": 4.954914949491007e-05, "loss": 0.2855, "step": 2253 }, { "epoch": 14.0, "grad_norm": 2.015625, "learning_rate": 4.9548683582449733e-05, "loss": 0.128, "step": 2254 }, { "epoch": 14.006230529595015, "grad_norm": 3.453125, "learning_rate": 4.9548217431567665e-05, "loss": 0.2803, "step": 2255 }, { "epoch": 14.01246105919003, "grad_norm": 3.21875, "learning_rate": 4.954775104226841e-05, "loss": 0.3978, "step": 2256 }, { "epoch": 14.018691588785046, "grad_norm": 4.1875, "learning_rate": 4.9547284414556487e-05, "loss": 0.2609, "step": 2257 }, { "epoch": 14.024922118380061, "grad_norm": 4.6875, "learning_rate": 4.954681754843643e-05, "loss": 0.2188, "step": 2258 }, { "epoch": 14.031152647975079, "grad_norm": 3.890625, "learning_rate": 4.9546350443912783e-05, "loss": 0.5006, "step": 2259 }, { "epoch": 14.037383177570094, "grad_norm": 3.203125, "learning_rate": 4.954588310099007e-05, "loss": 0.546, "step": 2260 }, { "epoch": 14.04361370716511, "grad_norm": 3.359375, "learning_rate": 4.954541551967282e-05, "loss": 0.3129, "step": 2261 }, { "epoch": 14.049844236760125, "grad_norm": 5.0625, "learning_rate": 4.9544947699965594e-05, "loss": 0.2844, "step": 2262 }, { "epoch": 14.05607476635514, "grad_norm": 3.828125, "learning_rate": 4.954447964187294e-05, "loss": 0.4454, "step": 2263 }, { "epoch": 14.062305295950155, "grad_norm": 2.078125, "learning_rate": 4.9544011345399384e-05, "loss": 0.2032, "step": 2264 }, { "epoch": 14.06853582554517, "grad_norm": 3.4375, "learning_rate": 4.954354281054948e-05, "loss": 0.3154, "step": 2265 }, { "epoch": 14.074766355140186, "grad_norm": 3.078125, "learning_rate": 4.95430740373278e-05, "loss": 0.3783, "step": 2266 }, { "epoch": 14.080996884735203, "grad_norm": 2.375, "learning_rate": 4.954260502573886e-05, "loss": 0.2467, "step": 2267 }, { "epoch": 14.087227414330219, "grad_norm": 2.1875, "learning_rate": 4.954213577578724e-05, "loss": 0.2121, "step": 2268 }, { "epoch": 14.093457943925234, "grad_norm": 2.765625, "learning_rate": 4.9541666287477494e-05, "loss": 0.2579, "step": 2269 }, { "epoch": 14.09968847352025, "grad_norm": 3.046875, "learning_rate": 4.9541196560814173e-05, "loss": 0.3842, "step": 2270 }, { "epoch": 14.105919003115265, "grad_norm": 2.375, "learning_rate": 4.954072659580185e-05, "loss": 0.2964, "step": 2271 }, { "epoch": 14.11214953271028, "grad_norm": 2.171875, "learning_rate": 4.9540256392445086e-05, "loss": 0.188, "step": 2272 }, { "epoch": 14.118380062305295, "grad_norm": 2.59375, "learning_rate": 4.953978595074845e-05, "loss": 0.2089, "step": 2273 }, { "epoch": 14.12461059190031, "grad_norm": 2.75, "learning_rate": 4.95393152707165e-05, "loss": 0.3641, "step": 2274 }, { "epoch": 14.130841121495328, "grad_norm": 3.328125, "learning_rate": 4.953884435235382e-05, "loss": 0.4728, "step": 2275 }, { "epoch": 14.137071651090343, "grad_norm": 4.46875, "learning_rate": 4.953837319566497e-05, "loss": 0.3327, "step": 2276 }, { "epoch": 14.143302180685358, "grad_norm": 4.125, "learning_rate": 4.953790180065454e-05, "loss": 0.3896, "step": 2277 }, { "epoch": 14.149532710280374, "grad_norm": 3.515625, "learning_rate": 4.95374301673271e-05, "loss": 0.3846, "step": 2278 }, { "epoch": 14.15576323987539, "grad_norm": 5.5, "learning_rate": 4.9536958295687244e-05, "loss": 0.5431, "step": 2279 }, { "epoch": 14.161993769470405, "grad_norm": 5.90625, "learning_rate": 4.953648618573953e-05, "loss": 0.241, "step": 2280 }, { "epoch": 14.16822429906542, "grad_norm": 4.78125, "learning_rate": 4.953601383748857e-05, "loss": 0.3138, "step": 2281 }, { "epoch": 14.174454828660435, "grad_norm": 2.609375, "learning_rate": 4.953554125093893e-05, "loss": 0.2451, "step": 2282 }, { "epoch": 14.180685358255452, "grad_norm": 1.84375, "learning_rate": 4.953506842609521e-05, "loss": 0.1764, "step": 2283 }, { "epoch": 14.186915887850468, "grad_norm": 2.859375, "learning_rate": 4.9534595362962e-05, "loss": 0.2265, "step": 2284 }, { "epoch": 14.193146417445483, "grad_norm": 3.9375, "learning_rate": 4.95341220615439e-05, "loss": 0.292, "step": 2285 }, { "epoch": 14.199376947040498, "grad_norm": 3.671875, "learning_rate": 4.95336485218455e-05, "loss": 0.3642, "step": 2286 }, { "epoch": 14.205607476635514, "grad_norm": 3.109375, "learning_rate": 4.95331747438714e-05, "loss": 0.2656, "step": 2287 }, { "epoch": 14.21183800623053, "grad_norm": 3.78125, "learning_rate": 4.9532700727626205e-05, "loss": 0.3824, "step": 2288 }, { "epoch": 14.218068535825545, "grad_norm": 2.65625, "learning_rate": 4.953222647311452e-05, "loss": 0.3081, "step": 2289 }, { "epoch": 14.22429906542056, "grad_norm": 2.34375, "learning_rate": 4.953175198034094e-05, "loss": 0.2351, "step": 2290 }, { "epoch": 14.230529595015577, "grad_norm": 2.890625, "learning_rate": 4.953127724931008e-05, "loss": 0.2996, "step": 2291 }, { "epoch": 14.236760124610592, "grad_norm": 4.25, "learning_rate": 4.953080228002656e-05, "loss": 0.4607, "step": 2292 }, { "epoch": 14.242990654205608, "grad_norm": 3.40625, "learning_rate": 4.9530327072494985e-05, "loss": 0.2112, "step": 2293 }, { "epoch": 14.249221183800623, "grad_norm": 3.046875, "learning_rate": 4.952985162671996e-05, "loss": 0.3388, "step": 2294 }, { "epoch": 14.255451713395638, "grad_norm": 2.984375, "learning_rate": 4.9529375942706124e-05, "loss": 0.274, "step": 2295 }, { "epoch": 14.261682242990654, "grad_norm": 3.5, "learning_rate": 4.952890002045808e-05, "loss": 0.2085, "step": 2296 }, { "epoch": 14.26791277258567, "grad_norm": 4.25, "learning_rate": 4.952842385998046e-05, "loss": 0.412, "step": 2297 }, { "epoch": 14.274143302180685, "grad_norm": 2.609375, "learning_rate": 4.952794746127787e-05, "loss": 0.2311, "step": 2298 }, { "epoch": 14.280373831775702, "grad_norm": 3.40625, "learning_rate": 4.952747082435496e-05, "loss": 0.2808, "step": 2299 }, { "epoch": 14.286604361370717, "grad_norm": 2.671875, "learning_rate": 4.9526993949216355e-05, "loss": 0.3129, "step": 2300 }, { "epoch": 14.292834890965732, "grad_norm": 3.65625, "learning_rate": 4.952651683586668e-05, "loss": 0.3815, "step": 2301 }, { "epoch": 14.299065420560748, "grad_norm": 5.75, "learning_rate": 4.952603948431057e-05, "loss": 0.345, "step": 2302 }, { "epoch": 14.305295950155763, "grad_norm": 4.125, "learning_rate": 4.952556189455266e-05, "loss": 0.3423, "step": 2303 }, { "epoch": 14.311526479750778, "grad_norm": 3.546875, "learning_rate": 4.9525084066597594e-05, "loss": 0.4491, "step": 2304 }, { "epoch": 14.317757009345794, "grad_norm": 3.5, "learning_rate": 4.9524606000450005e-05, "loss": 0.2789, "step": 2305 }, { "epoch": 14.32398753894081, "grad_norm": 3.421875, "learning_rate": 4.952412769611454e-05, "loss": 0.2266, "step": 2306 }, { "epoch": 14.330218068535826, "grad_norm": 2.9375, "learning_rate": 4.952364915359584e-05, "loss": 0.2462, "step": 2307 }, { "epoch": 14.336448598130842, "grad_norm": 2.390625, "learning_rate": 4.952317037289857e-05, "loss": 0.2211, "step": 2308 }, { "epoch": 14.342679127725857, "grad_norm": 3.390625, "learning_rate": 4.952269135402736e-05, "loss": 0.276, "step": 2309 }, { "epoch": 14.348909657320872, "grad_norm": 3.5625, "learning_rate": 4.952221209698687e-05, "loss": 0.345, "step": 2310 }, { "epoch": 14.355140186915888, "grad_norm": 2.078125, "learning_rate": 4.952173260178176e-05, "loss": 0.3144, "step": 2311 }, { "epoch": 14.361370716510903, "grad_norm": 4.40625, "learning_rate": 4.9521252868416665e-05, "loss": 0.3777, "step": 2312 }, { "epoch": 14.367601246105918, "grad_norm": 2.53125, "learning_rate": 4.952077289689627e-05, "loss": 0.171, "step": 2313 }, { "epoch": 14.373831775700934, "grad_norm": 2.40625, "learning_rate": 4.952029268722523e-05, "loss": 0.2124, "step": 2314 }, { "epoch": 14.38006230529595, "grad_norm": 2.734375, "learning_rate": 4.951981223940821e-05, "loss": 0.1809, "step": 2315 }, { "epoch": 14.386292834890966, "grad_norm": 2.890625, "learning_rate": 4.951933155344987e-05, "loss": 0.3076, "step": 2316 }, { "epoch": 14.392523364485982, "grad_norm": 4.03125, "learning_rate": 4.951885062935487e-05, "loss": 0.2891, "step": 2317 }, { "epoch": 14.398753894080997, "grad_norm": 3.09375, "learning_rate": 4.95183694671279e-05, "loss": 0.2137, "step": 2318 }, { "epoch": 14.404984423676012, "grad_norm": 2.1875, "learning_rate": 4.951788806677362e-05, "loss": 0.2019, "step": 2319 }, { "epoch": 14.411214953271028, "grad_norm": 3.21875, "learning_rate": 4.951740642829671e-05, "loss": 0.2798, "step": 2320 }, { "epoch": 14.417445482866043, "grad_norm": 3.109375, "learning_rate": 4.951692455170185e-05, "loss": 0.2404, "step": 2321 }, { "epoch": 14.423676012461058, "grad_norm": 5.15625, "learning_rate": 4.9516442436993724e-05, "loss": 0.5644, "step": 2322 }, { "epoch": 14.429906542056075, "grad_norm": 5.03125, "learning_rate": 4.9515960084177005e-05, "loss": 0.3819, "step": 2323 }, { "epoch": 14.43613707165109, "grad_norm": 5.625, "learning_rate": 4.951547749325638e-05, "loss": 0.2824, "step": 2324 }, { "epoch": 14.442367601246106, "grad_norm": 3.328125, "learning_rate": 4.9514994664236535e-05, "loss": 0.3405, "step": 2325 }, { "epoch": 14.448598130841122, "grad_norm": 2.875, "learning_rate": 4.951451159712216e-05, "loss": 0.2931, "step": 2326 }, { "epoch": 14.454828660436137, "grad_norm": 3.984375, "learning_rate": 4.951402829191795e-05, "loss": 0.3152, "step": 2327 }, { "epoch": 14.461059190031152, "grad_norm": 4.71875, "learning_rate": 4.9513544748628596e-05, "loss": 0.2765, "step": 2328 }, { "epoch": 14.467289719626168, "grad_norm": 6.0625, "learning_rate": 4.9513060967258794e-05, "loss": 0.243, "step": 2329 }, { "epoch": 14.473520249221183, "grad_norm": 5.71875, "learning_rate": 4.951257694781325e-05, "loss": 0.3524, "step": 2330 }, { "epoch": 14.4797507788162, "grad_norm": 3.140625, "learning_rate": 4.951209269029665e-05, "loss": 0.2247, "step": 2331 }, { "epoch": 14.485981308411215, "grad_norm": 3.765625, "learning_rate": 4.951160819471372e-05, "loss": 0.2427, "step": 2332 }, { "epoch": 14.49221183800623, "grad_norm": 5.3125, "learning_rate": 4.951112346106914e-05, "loss": 0.3791, "step": 2333 }, { "epoch": 14.498442367601246, "grad_norm": 2.953125, "learning_rate": 4.951063848936763e-05, "loss": 0.3236, "step": 2334 }, { "epoch": 14.504672897196262, "grad_norm": 3.296875, "learning_rate": 4.95101532796139e-05, "loss": 0.401, "step": 2335 }, { "epoch": 14.510903426791277, "grad_norm": 4.0625, "learning_rate": 4.950966783181267e-05, "loss": 0.2776, "step": 2336 }, { "epoch": 14.517133956386292, "grad_norm": 4.0625, "learning_rate": 4.950918214596863e-05, "loss": 0.2809, "step": 2337 }, { "epoch": 14.523364485981308, "grad_norm": 3.4375, "learning_rate": 4.950869622208653e-05, "loss": 0.2445, "step": 2338 }, { "epoch": 14.529595015576325, "grad_norm": 2.25, "learning_rate": 4.950821006017107e-05, "loss": 0.2589, "step": 2339 }, { "epoch": 14.53582554517134, "grad_norm": 3.125, "learning_rate": 4.950772366022697e-05, "loss": 0.2652, "step": 2340 }, { "epoch": 14.542056074766355, "grad_norm": 1.34375, "learning_rate": 4.9507237022258956e-05, "loss": 0.1625, "step": 2341 }, { "epoch": 14.54828660436137, "grad_norm": 2.71875, "learning_rate": 4.950675014627176e-05, "loss": 0.4458, "step": 2342 }, { "epoch": 14.554517133956386, "grad_norm": 1.9453125, "learning_rate": 4.950626303227011e-05, "loss": 0.2235, "step": 2343 }, { "epoch": 14.560747663551401, "grad_norm": 2.671875, "learning_rate": 4.9505775680258734e-05, "loss": 0.3427, "step": 2344 }, { "epoch": 14.566978193146417, "grad_norm": 2.875, "learning_rate": 4.9505288090242364e-05, "loss": 0.2948, "step": 2345 }, { "epoch": 14.573208722741432, "grad_norm": 3.03125, "learning_rate": 4.950480026222574e-05, "loss": 0.3456, "step": 2346 }, { "epoch": 14.57943925233645, "grad_norm": 3.046875, "learning_rate": 4.9504312196213596e-05, "loss": 0.2488, "step": 2347 }, { "epoch": 14.585669781931465, "grad_norm": 3.296875, "learning_rate": 4.950382389221068e-05, "loss": 0.2333, "step": 2348 }, { "epoch": 14.59190031152648, "grad_norm": 3.125, "learning_rate": 4.9503335350221717e-05, "loss": 0.3337, "step": 2349 }, { "epoch": 14.598130841121495, "grad_norm": 2.84375, "learning_rate": 4.950284657025147e-05, "loss": 0.3991, "step": 2350 }, { "epoch": 14.60436137071651, "grad_norm": 2.484375, "learning_rate": 4.950235755230468e-05, "loss": 0.1868, "step": 2351 }, { "epoch": 14.610591900311526, "grad_norm": 2.65625, "learning_rate": 4.950186829638609e-05, "loss": 0.3405, "step": 2352 }, { "epoch": 14.616822429906541, "grad_norm": 2.28125, "learning_rate": 4.950137880250046e-05, "loss": 0.1922, "step": 2353 }, { "epoch": 14.623052959501557, "grad_norm": 2.828125, "learning_rate": 4.9500889070652545e-05, "loss": 0.3919, "step": 2354 }, { "epoch": 14.629283489096574, "grad_norm": 1.8203125, "learning_rate": 4.9500399100847094e-05, "loss": 0.1998, "step": 2355 }, { "epoch": 14.63551401869159, "grad_norm": 2.734375, "learning_rate": 4.949990889308887e-05, "loss": 0.2343, "step": 2356 }, { "epoch": 14.641744548286605, "grad_norm": 3.28125, "learning_rate": 4.949941844738263e-05, "loss": 0.3984, "step": 2357 }, { "epoch": 14.64797507788162, "grad_norm": 2.890625, "learning_rate": 4.949892776373315e-05, "loss": 0.2155, "step": 2358 }, { "epoch": 14.654205607476635, "grad_norm": 3.15625, "learning_rate": 4.949843684214518e-05, "loss": 0.3008, "step": 2359 }, { "epoch": 14.66043613707165, "grad_norm": 2.546875, "learning_rate": 4.9497945682623495e-05, "loss": 0.3047, "step": 2360 }, { "epoch": 14.666666666666666, "grad_norm": 3.875, "learning_rate": 4.9497454285172865e-05, "loss": 0.2358, "step": 2361 }, { "epoch": 14.672897196261681, "grad_norm": 5.3125, "learning_rate": 4.949696264979806e-05, "loss": 0.4388, "step": 2362 }, { "epoch": 14.679127725856699, "grad_norm": 3.078125, "learning_rate": 4.9496470776503855e-05, "loss": 0.2036, "step": 2363 }, { "epoch": 14.685358255451714, "grad_norm": 3.203125, "learning_rate": 4.949597866529503e-05, "loss": 0.2752, "step": 2364 }, { "epoch": 14.69158878504673, "grad_norm": 3.625, "learning_rate": 4.949548631617637e-05, "loss": 0.2998, "step": 2365 }, { "epoch": 14.697819314641745, "grad_norm": 3.78125, "learning_rate": 4.949499372915264e-05, "loss": 0.2853, "step": 2366 }, { "epoch": 14.70404984423676, "grad_norm": 3.46875, "learning_rate": 4.949450090422865e-05, "loss": 0.2515, "step": 2367 }, { "epoch": 14.710280373831775, "grad_norm": 2.9375, "learning_rate": 4.949400784140915e-05, "loss": 0.306, "step": 2368 }, { "epoch": 14.71651090342679, "grad_norm": 1.859375, "learning_rate": 4.9493514540698963e-05, "loss": 0.1833, "step": 2369 }, { "epoch": 14.722741433021806, "grad_norm": 4.21875, "learning_rate": 4.949302100210287e-05, "loss": 0.6816, "step": 2370 }, { "epoch": 14.728971962616823, "grad_norm": 2.828125, "learning_rate": 4.949252722562565e-05, "loss": 0.3138, "step": 2371 }, { "epoch": 14.735202492211839, "grad_norm": 3.5625, "learning_rate": 4.949203321127212e-05, "loss": 0.2012, "step": 2372 }, { "epoch": 14.741433021806854, "grad_norm": 3.890625, "learning_rate": 4.9491538959047053e-05, "loss": 0.3492, "step": 2373 }, { "epoch": 14.74766355140187, "grad_norm": 2.484375, "learning_rate": 4.949104446895528e-05, "loss": 0.4013, "step": 2374 }, { "epoch": 14.753894080996885, "grad_norm": 2.046875, "learning_rate": 4.949054974100158e-05, "loss": 0.2064, "step": 2375 }, { "epoch": 14.7601246105919, "grad_norm": 2.546875, "learning_rate": 4.949005477519077e-05, "loss": 0.2433, "step": 2376 }, { "epoch": 14.766355140186915, "grad_norm": 2.421875, "learning_rate": 4.9489559571527644e-05, "loss": 0.2379, "step": 2377 }, { "epoch": 14.77258566978193, "grad_norm": 2.140625, "learning_rate": 4.9489064130017025e-05, "loss": 0.2225, "step": 2378 }, { "epoch": 14.778816199376948, "grad_norm": 3.25, "learning_rate": 4.948856845066372e-05, "loss": 0.3306, "step": 2379 }, { "epoch": 14.785046728971963, "grad_norm": 3.015625, "learning_rate": 4.9488072533472546e-05, "loss": 0.3334, "step": 2380 }, { "epoch": 14.791277258566979, "grad_norm": 3.90625, "learning_rate": 4.948757637844831e-05, "loss": 0.2455, "step": 2381 }, { "epoch": 14.797507788161994, "grad_norm": 2.328125, "learning_rate": 4.948707998559584e-05, "loss": 0.1943, "step": 2382 }, { "epoch": 14.80373831775701, "grad_norm": 3.203125, "learning_rate": 4.948658335491996e-05, "loss": 0.316, "step": 2383 }, { "epoch": 14.809968847352025, "grad_norm": 2.515625, "learning_rate": 4.9486086486425484e-05, "loss": 0.2979, "step": 2384 }, { "epoch": 14.81619937694704, "grad_norm": 2.78125, "learning_rate": 4.948558938011724e-05, "loss": 0.3352, "step": 2385 }, { "epoch": 14.822429906542055, "grad_norm": 3.953125, "learning_rate": 4.9485092036000055e-05, "loss": 0.3413, "step": 2386 }, { "epoch": 14.828660436137072, "grad_norm": 4.125, "learning_rate": 4.9484594454078765e-05, "loss": 0.317, "step": 2387 }, { "epoch": 14.834890965732088, "grad_norm": 3.46875, "learning_rate": 4.94840966343582e-05, "loss": 0.3701, "step": 2388 }, { "epoch": 14.841121495327103, "grad_norm": 3.953125, "learning_rate": 4.9483598576843196e-05, "loss": 0.2863, "step": 2389 }, { "epoch": 14.847352024922118, "grad_norm": 4.1875, "learning_rate": 4.9483100281538573e-05, "loss": 0.2802, "step": 2390 }, { "epoch": 14.853582554517134, "grad_norm": 4.78125, "learning_rate": 4.9482601748449204e-05, "loss": 0.2723, "step": 2391 }, { "epoch": 14.85981308411215, "grad_norm": 2.8125, "learning_rate": 4.9482102977579905e-05, "loss": 0.4152, "step": 2392 }, { "epoch": 14.866043613707165, "grad_norm": 3.4375, "learning_rate": 4.948160396893553e-05, "loss": 0.6227, "step": 2393 }, { "epoch": 14.87227414330218, "grad_norm": 4.1875, "learning_rate": 4.948110472252092e-05, "loss": 0.2582, "step": 2394 }, { "epoch": 14.878504672897197, "grad_norm": 2.375, "learning_rate": 4.948060523834094e-05, "loss": 0.234, "step": 2395 }, { "epoch": 14.884735202492212, "grad_norm": 4.75, "learning_rate": 4.948010551640041e-05, "loss": 0.3117, "step": 2396 }, { "epoch": 14.890965732087228, "grad_norm": 3.46875, "learning_rate": 4.947960555670421e-05, "loss": 0.3776, "step": 2397 }, { "epoch": 14.897196261682243, "grad_norm": 2.578125, "learning_rate": 4.947910535925719e-05, "loss": 0.2565, "step": 2398 }, { "epoch": 14.903426791277258, "grad_norm": 3.921875, "learning_rate": 4.947860492406421e-05, "loss": 0.2808, "step": 2399 }, { "epoch": 14.909657320872274, "grad_norm": 2.953125, "learning_rate": 4.947810425113011e-05, "loss": 0.244, "step": 2400 }, { "epoch": 14.91588785046729, "grad_norm": 2.1875, "learning_rate": 4.947760334045978e-05, "loss": 0.2634, "step": 2401 }, { "epoch": 14.922118380062305, "grad_norm": 2.96875, "learning_rate": 4.947710219205808e-05, "loss": 0.2996, "step": 2402 }, { "epoch": 14.928348909657322, "grad_norm": 5.03125, "learning_rate": 4.947660080592986e-05, "loss": 0.3631, "step": 2403 }, { "epoch": 14.934579439252337, "grad_norm": 5.40625, "learning_rate": 4.947609918208e-05, "loss": 0.2958, "step": 2404 }, { "epoch": 14.940809968847352, "grad_norm": 3.625, "learning_rate": 4.9475597320513374e-05, "loss": 0.3368, "step": 2405 }, { "epoch": 14.947040498442368, "grad_norm": 2.9375, "learning_rate": 4.947509522123485e-05, "loss": 0.2063, "step": 2406 }, { "epoch": 14.953271028037383, "grad_norm": 4.0, "learning_rate": 4.947459288424932e-05, "loss": 0.2057, "step": 2407 }, { "epoch": 14.959501557632398, "grad_norm": 2.96875, "learning_rate": 4.9474090309561635e-05, "loss": 0.2013, "step": 2408 }, { "epoch": 14.965732087227414, "grad_norm": 3.078125, "learning_rate": 4.947358749717671e-05, "loss": 0.3805, "step": 2409 }, { "epoch": 14.97196261682243, "grad_norm": 3.375, "learning_rate": 4.9473084447099404e-05, "loss": 0.3545, "step": 2410 }, { "epoch": 14.978193146417446, "grad_norm": 3.609375, "learning_rate": 4.947258115933461e-05, "loss": 0.2234, "step": 2411 }, { "epoch": 14.984423676012462, "grad_norm": 3.59375, "learning_rate": 4.947207763388722e-05, "loss": 0.363, "step": 2412 }, { "epoch": 14.990654205607477, "grad_norm": 3.28125, "learning_rate": 4.94715738707621e-05, "loss": 0.2674, "step": 2413 }, { "epoch": 14.996884735202492, "grad_norm": 2.609375, "learning_rate": 4.947106986996418e-05, "loss": 0.2425, "step": 2414 }, { "epoch": 15.0, "grad_norm": 2.234375, "learning_rate": 4.947056563149834e-05, "loss": 0.1624, "step": 2415 }, { "epoch": 15.006230529595015, "grad_norm": 3.578125, "learning_rate": 4.947006115536947e-05, "loss": 0.4272, "step": 2416 }, { "epoch": 15.01246105919003, "grad_norm": 3.9375, "learning_rate": 4.946955644158248e-05, "loss": 0.3557, "step": 2417 }, { "epoch": 15.018691588785046, "grad_norm": 2.734375, "learning_rate": 4.946905149014226e-05, "loss": 0.2737, "step": 2418 }, { "epoch": 15.024922118380061, "grad_norm": 3.09375, "learning_rate": 4.946854630105372e-05, "loss": 0.2399, "step": 2419 }, { "epoch": 15.031152647975079, "grad_norm": 5.4375, "learning_rate": 4.946804087432177e-05, "loss": 0.4785, "step": 2420 }, { "epoch": 15.037383177570094, "grad_norm": 4.0, "learning_rate": 4.9467535209951315e-05, "loss": 0.2515, "step": 2421 }, { "epoch": 15.04361370716511, "grad_norm": 4.125, "learning_rate": 4.946702930794726e-05, "loss": 0.3757, "step": 2422 }, { "epoch": 15.049844236760125, "grad_norm": 5.0, "learning_rate": 4.9466523168314535e-05, "loss": 0.2232, "step": 2423 }, { "epoch": 15.05607476635514, "grad_norm": 3.5625, "learning_rate": 4.946601679105805e-05, "loss": 0.3595, "step": 2424 }, { "epoch": 15.062305295950155, "grad_norm": 3.546875, "learning_rate": 4.9465510176182714e-05, "loss": 0.311, "step": 2425 }, { "epoch": 15.06853582554517, "grad_norm": 2.953125, "learning_rate": 4.946500332369345e-05, "loss": 0.3118, "step": 2426 }, { "epoch": 15.074766355140186, "grad_norm": 3.46875, "learning_rate": 4.946449623359519e-05, "loss": 0.2033, "step": 2427 }, { "epoch": 15.080996884735203, "grad_norm": 4.90625, "learning_rate": 4.9463988905892847e-05, "loss": 0.3474, "step": 2428 }, { "epoch": 15.087227414330219, "grad_norm": 5.09375, "learning_rate": 4.946348134059136e-05, "loss": 0.3397, "step": 2429 }, { "epoch": 15.093457943925234, "grad_norm": 4.90625, "learning_rate": 4.946297353769564e-05, "loss": 0.1987, "step": 2430 }, { "epoch": 15.09968847352025, "grad_norm": 2.71875, "learning_rate": 4.946246549721064e-05, "loss": 0.2872, "step": 2431 }, { "epoch": 15.105919003115265, "grad_norm": 3.46875, "learning_rate": 4.946195721914128e-05, "loss": 0.2256, "step": 2432 }, { "epoch": 15.11214953271028, "grad_norm": 4.46875, "learning_rate": 4.94614487034925e-05, "loss": 0.2991, "step": 2433 }, { "epoch": 15.118380062305295, "grad_norm": 2.9375, "learning_rate": 4.946093995026925e-05, "loss": 0.3223, "step": 2434 }, { "epoch": 15.12461059190031, "grad_norm": 2.15625, "learning_rate": 4.9460430959476456e-05, "loss": 0.2035, "step": 2435 }, { "epoch": 15.130841121495328, "grad_norm": 2.046875, "learning_rate": 4.9459921731119065e-05, "loss": 0.2061, "step": 2436 }, { "epoch": 15.137071651090343, "grad_norm": 2.421875, "learning_rate": 4.945941226520203e-05, "loss": 0.1748, "step": 2437 }, { "epoch": 15.143302180685358, "grad_norm": 3.9375, "learning_rate": 4.945890256173029e-05, "loss": 0.3059, "step": 2438 }, { "epoch": 15.149532710280374, "grad_norm": 2.640625, "learning_rate": 4.945839262070879e-05, "loss": 0.2505, "step": 2439 }, { "epoch": 15.15576323987539, "grad_norm": 2.734375, "learning_rate": 4.945788244214251e-05, "loss": 0.3505, "step": 2440 }, { "epoch": 15.161993769470405, "grad_norm": 2.921875, "learning_rate": 4.945737202603637e-05, "loss": 0.1788, "step": 2441 }, { "epoch": 15.16822429906542, "grad_norm": 2.734375, "learning_rate": 4.9456861372395356e-05, "loss": 0.2579, "step": 2442 }, { "epoch": 15.174454828660435, "grad_norm": 2.75, "learning_rate": 4.9456350481224415e-05, "loss": 0.2005, "step": 2443 }, { "epoch": 15.180685358255452, "grad_norm": 3.953125, "learning_rate": 4.94558393525285e-05, "loss": 0.357, "step": 2444 }, { "epoch": 15.186915887850468, "grad_norm": 2.796875, "learning_rate": 4.9455327986312595e-05, "loss": 0.2233, "step": 2445 }, { "epoch": 15.193146417445483, "grad_norm": 2.875, "learning_rate": 4.9454816382581645e-05, "loss": 0.2841, "step": 2446 }, { "epoch": 15.199376947040498, "grad_norm": 4.625, "learning_rate": 4.945430454134064e-05, "loss": 0.3903, "step": 2447 }, { "epoch": 15.205607476635514, "grad_norm": 3.046875, "learning_rate": 4.945379246259454e-05, "loss": 0.1979, "step": 2448 }, { "epoch": 15.21183800623053, "grad_norm": 4.5625, "learning_rate": 4.9453280146348315e-05, "loss": 0.4923, "step": 2449 }, { "epoch": 15.218068535825545, "grad_norm": 2.171875, "learning_rate": 4.945276759260695e-05, "loss": 0.1888, "step": 2450 }, { "epoch": 15.22429906542056, "grad_norm": 4.03125, "learning_rate": 4.9452254801375407e-05, "loss": 0.2467, "step": 2451 }, { "epoch": 15.230529595015577, "grad_norm": 4.40625, "learning_rate": 4.945174177265869e-05, "loss": 0.3817, "step": 2452 }, { "epoch": 15.236760124610592, "grad_norm": 3.4375, "learning_rate": 4.945122850646176e-05, "loss": 0.4986, "step": 2453 }, { "epoch": 15.242990654205608, "grad_norm": 3.03125, "learning_rate": 4.945071500278962e-05, "loss": 0.2595, "step": 2454 }, { "epoch": 15.249221183800623, "grad_norm": 2.734375, "learning_rate": 4.945020126164724e-05, "loss": 0.2244, "step": 2455 }, { "epoch": 15.255451713395638, "grad_norm": 2.75, "learning_rate": 4.944968728303962e-05, "loss": 0.1899, "step": 2456 }, { "epoch": 15.261682242990654, "grad_norm": 2.1875, "learning_rate": 4.944917306697175e-05, "loss": 0.1796, "step": 2457 }, { "epoch": 15.26791277258567, "grad_norm": 3.28125, "learning_rate": 4.944865861344863e-05, "loss": 0.2832, "step": 2458 }, { "epoch": 15.274143302180685, "grad_norm": 1.7265625, "learning_rate": 4.944814392247525e-05, "loss": 0.163, "step": 2459 }, { "epoch": 15.280373831775702, "grad_norm": 3.421875, "learning_rate": 4.9447628994056604e-05, "loss": 0.3247, "step": 2460 }, { "epoch": 15.286604361370717, "grad_norm": 3.5, "learning_rate": 4.94471138281977e-05, "loss": 0.3648, "step": 2461 }, { "epoch": 15.292834890965732, "grad_norm": 2.625, "learning_rate": 4.944659842490354e-05, "loss": 0.1958, "step": 2462 }, { "epoch": 15.299065420560748, "grad_norm": 3.3125, "learning_rate": 4.944608278417912e-05, "loss": 0.3274, "step": 2463 }, { "epoch": 15.305295950155763, "grad_norm": 2.484375, "learning_rate": 4.944556690602947e-05, "loss": 0.3075, "step": 2464 }, { "epoch": 15.311526479750778, "grad_norm": 4.0, "learning_rate": 4.944505079045958e-05, "loss": 0.4914, "step": 2465 }, { "epoch": 15.317757009345794, "grad_norm": 3.53125, "learning_rate": 4.9444534437474476e-05, "loss": 0.2992, "step": 2466 }, { "epoch": 15.32398753894081, "grad_norm": 2.84375, "learning_rate": 4.944401784707916e-05, "loss": 0.2899, "step": 2467 }, { "epoch": 15.330218068535826, "grad_norm": 2.390625, "learning_rate": 4.9443501019278664e-05, "loss": 0.3159, "step": 2468 }, { "epoch": 15.336448598130842, "grad_norm": 2.15625, "learning_rate": 4.944298395407799e-05, "loss": 0.2631, "step": 2469 }, { "epoch": 15.342679127725857, "grad_norm": 2.984375, "learning_rate": 4.9442466651482184e-05, "loss": 0.2585, "step": 2470 }, { "epoch": 15.348909657320872, "grad_norm": 2.171875, "learning_rate": 4.944194911149625e-05, "loss": 0.1952, "step": 2471 }, { "epoch": 15.355140186915888, "grad_norm": 3.03125, "learning_rate": 4.944143133412521e-05, "loss": 0.3609, "step": 2472 }, { "epoch": 15.361370716510903, "grad_norm": 1.8359375, "learning_rate": 4.944091331937412e-05, "loss": 0.1489, "step": 2473 }, { "epoch": 15.367601246105918, "grad_norm": 2.5, "learning_rate": 4.944039506724798e-05, "loss": 0.2042, "step": 2474 }, { "epoch": 15.373831775700934, "grad_norm": 2.515625, "learning_rate": 4.943987657775185e-05, "loss": 0.3098, "step": 2475 }, { "epoch": 15.38006230529595, "grad_norm": 2.90625, "learning_rate": 4.9439357850890744e-05, "loss": 0.1791, "step": 2476 }, { "epoch": 15.386292834890966, "grad_norm": 2.9375, "learning_rate": 4.9438838886669714e-05, "loss": 0.2815, "step": 2477 }, { "epoch": 15.392523364485982, "grad_norm": 2.203125, "learning_rate": 4.943831968509379e-05, "loss": 0.1873, "step": 2478 }, { "epoch": 15.398753894080997, "grad_norm": 1.78125, "learning_rate": 4.9437800246168025e-05, "loss": 0.2024, "step": 2479 }, { "epoch": 15.404984423676012, "grad_norm": 3.890625, "learning_rate": 4.943728056989746e-05, "loss": 0.6106, "step": 2480 }, { "epoch": 15.411214953271028, "grad_norm": 2.625, "learning_rate": 4.943676065628714e-05, "loss": 0.2413, "step": 2481 }, { "epoch": 15.417445482866043, "grad_norm": 2.8125, "learning_rate": 4.9436240505342115e-05, "loss": 0.2337, "step": 2482 }, { "epoch": 15.423676012461058, "grad_norm": 2.78125, "learning_rate": 4.9435720117067434e-05, "loss": 0.3213, "step": 2483 }, { "epoch": 15.429906542056075, "grad_norm": 3.59375, "learning_rate": 4.943519949146816e-05, "loss": 0.2852, "step": 2484 }, { "epoch": 15.43613707165109, "grad_norm": 2.671875, "learning_rate": 4.943467862854934e-05, "loss": 0.2017, "step": 2485 }, { "epoch": 15.442367601246106, "grad_norm": 3.140625, "learning_rate": 4.9434157528316036e-05, "loss": 0.375, "step": 2486 }, { "epoch": 15.448598130841122, "grad_norm": 2.953125, "learning_rate": 4.943363619077331e-05, "loss": 0.3216, "step": 2487 }, { "epoch": 15.454828660436137, "grad_norm": 2.203125, "learning_rate": 4.943311461592623e-05, "loss": 0.3025, "step": 2488 }, { "epoch": 15.461059190031152, "grad_norm": 2.296875, "learning_rate": 4.943259280377986e-05, "loss": 0.2145, "step": 2489 }, { "epoch": 15.467289719626168, "grad_norm": 3.09375, "learning_rate": 4.9432070754339254e-05, "loss": 0.3284, "step": 2490 }, { "epoch": 15.473520249221183, "grad_norm": 4.40625, "learning_rate": 4.94315484676095e-05, "loss": 0.4496, "step": 2491 }, { "epoch": 15.4797507788162, "grad_norm": 2.5, "learning_rate": 4.943102594359566e-05, "loss": 0.3026, "step": 2492 }, { "epoch": 15.485981308411215, "grad_norm": 1.953125, "learning_rate": 4.943050318230281e-05, "loss": 0.1781, "step": 2493 }, { "epoch": 15.49221183800623, "grad_norm": 4.125, "learning_rate": 4.9429980183736034e-05, "loss": 0.5655, "step": 2494 }, { "epoch": 15.498442367601246, "grad_norm": 1.96875, "learning_rate": 4.942945694790041e-05, "loss": 0.1691, "step": 2495 }, { "epoch": 15.504672897196262, "grad_norm": 2.890625, "learning_rate": 4.942893347480101e-05, "loss": 0.2291, "step": 2496 }, { "epoch": 15.510903426791277, "grad_norm": 3.125, "learning_rate": 4.9428409764442926e-05, "loss": 0.3743, "step": 2497 }, { "epoch": 15.517133956386292, "grad_norm": 1.75, "learning_rate": 4.9427885816831256e-05, "loss": 0.2068, "step": 2498 }, { "epoch": 15.523364485981308, "grad_norm": 4.8125, "learning_rate": 4.942736163197106e-05, "loss": 0.2725, "step": 2499 }, { "epoch": 15.529595015576325, "grad_norm": 3.109375, "learning_rate": 4.942683720986745e-05, "loss": 0.2625, "step": 2500 }, { "epoch": 15.53582554517134, "grad_norm": 3.171875, "learning_rate": 4.942631255052551e-05, "loss": 0.2338, "step": 2501 }, { "epoch": 15.542056074766355, "grad_norm": 3.59375, "learning_rate": 4.942578765395035e-05, "loss": 0.3985, "step": 2502 }, { "epoch": 15.54828660436137, "grad_norm": 4.375, "learning_rate": 4.9425262520147055e-05, "loss": 0.2615, "step": 2503 }, { "epoch": 15.554517133956386, "grad_norm": 4.0625, "learning_rate": 4.942473714912072e-05, "loss": 0.3357, "step": 2504 }, { "epoch": 15.560747663551401, "grad_norm": 2.421875, "learning_rate": 4.9424211540876456e-05, "loss": 0.1877, "step": 2505 }, { "epoch": 15.566978193146417, "grad_norm": 4.125, "learning_rate": 4.9423685695419374e-05, "loss": 0.4378, "step": 2506 }, { "epoch": 15.573208722741432, "grad_norm": 2.59375, "learning_rate": 4.942315961275457e-05, "loss": 0.2436, "step": 2507 }, { "epoch": 15.57943925233645, "grad_norm": 3.875, "learning_rate": 4.942263329288716e-05, "loss": 0.356, "step": 2508 }, { "epoch": 15.585669781931465, "grad_norm": 4.53125, "learning_rate": 4.942210673582226e-05, "loss": 0.4407, "step": 2509 }, { "epoch": 15.59190031152648, "grad_norm": 3.1875, "learning_rate": 4.9421579941564965e-05, "loss": 0.2685, "step": 2510 }, { "epoch": 15.598130841121495, "grad_norm": 1.9296875, "learning_rate": 4.942105291012041e-05, "loss": 0.1673, "step": 2511 }, { "epoch": 15.60436137071651, "grad_norm": 3.53125, "learning_rate": 4.94205256414937e-05, "loss": 0.4421, "step": 2512 }, { "epoch": 15.610591900311526, "grad_norm": 3.546875, "learning_rate": 4.9419998135689974e-05, "loss": 0.2336, "step": 2513 }, { "epoch": 15.616822429906541, "grad_norm": 3.1875, "learning_rate": 4.941947039271434e-05, "loss": 0.2929, "step": 2514 }, { "epoch": 15.623052959501557, "grad_norm": 2.578125, "learning_rate": 4.941894241257193e-05, "loss": 0.2444, "step": 2515 }, { "epoch": 15.629283489096574, "grad_norm": 2.671875, "learning_rate": 4.9418414195267873e-05, "loss": 0.3333, "step": 2516 }, { "epoch": 15.63551401869159, "grad_norm": 3.96875, "learning_rate": 4.941788574080729e-05, "loss": 0.2825, "step": 2517 }, { "epoch": 15.641744548286605, "grad_norm": 3.703125, "learning_rate": 4.941735704919532e-05, "loss": 0.3375, "step": 2518 }, { "epoch": 15.64797507788162, "grad_norm": 3.703125, "learning_rate": 4.9416828120437095e-05, "loss": 0.2876, "step": 2519 }, { "epoch": 15.654205607476635, "grad_norm": 2.828125, "learning_rate": 4.9416298954537763e-05, "loss": 0.338, "step": 2520 }, { "epoch": 15.66043613707165, "grad_norm": 2.25, "learning_rate": 4.941576955150245e-05, "loss": 0.2334, "step": 2521 }, { "epoch": 15.666666666666666, "grad_norm": 3.4375, "learning_rate": 4.94152399113363e-05, "loss": 0.1925, "step": 2522 }, { "epoch": 15.672897196261681, "grad_norm": 3.171875, "learning_rate": 4.9414710034044464e-05, "loss": 0.2885, "step": 2523 }, { "epoch": 15.679127725856699, "grad_norm": 2.3125, "learning_rate": 4.941417991963208e-05, "loss": 0.2117, "step": 2524 }, { "epoch": 15.685358255451714, "grad_norm": 1.859375, "learning_rate": 4.9413649568104305e-05, "loss": 0.1759, "step": 2525 }, { "epoch": 15.69158878504673, "grad_norm": 4.1875, "learning_rate": 4.941311897946628e-05, "loss": 0.2033, "step": 2526 }, { "epoch": 15.697819314641745, "grad_norm": 2.1875, "learning_rate": 4.9412588153723174e-05, "loss": 0.2638, "step": 2527 }, { "epoch": 15.70404984423676, "grad_norm": 3.78125, "learning_rate": 4.941205709088011e-05, "loss": 0.3767, "step": 2528 }, { "epoch": 15.710280373831775, "grad_norm": 2.796875, "learning_rate": 4.941152579094229e-05, "loss": 0.2264, "step": 2529 }, { "epoch": 15.71651090342679, "grad_norm": 4.0, "learning_rate": 4.9410994253914845e-05, "loss": 0.4684, "step": 2530 }, { "epoch": 15.722741433021806, "grad_norm": 2.546875, "learning_rate": 4.9410462479802945e-05, "loss": 0.2088, "step": 2531 }, { "epoch": 15.728971962616823, "grad_norm": 2.4375, "learning_rate": 4.940993046861174e-05, "loss": 0.2815, "step": 2532 }, { "epoch": 15.735202492211839, "grad_norm": 3.15625, "learning_rate": 4.940939822034643e-05, "loss": 0.2404, "step": 2533 }, { "epoch": 15.741433021806854, "grad_norm": 2.3125, "learning_rate": 4.9408865735012154e-05, "loss": 0.2008, "step": 2534 }, { "epoch": 15.74766355140187, "grad_norm": 2.1875, "learning_rate": 4.94083330126141e-05, "loss": 0.2032, "step": 2535 }, { "epoch": 15.753894080996885, "grad_norm": 2.71875, "learning_rate": 4.940780005315744e-05, "loss": 0.2162, "step": 2536 }, { "epoch": 15.7601246105919, "grad_norm": 2.5, "learning_rate": 4.940726685664734e-05, "loss": 0.1769, "step": 2537 }, { "epoch": 15.766355140186915, "grad_norm": 3.421875, "learning_rate": 4.940673342308898e-05, "loss": 0.3786, "step": 2538 }, { "epoch": 15.77258566978193, "grad_norm": 3.453125, "learning_rate": 4.940619975248756e-05, "loss": 0.3251, "step": 2539 }, { "epoch": 15.778816199376948, "grad_norm": 3.453125, "learning_rate": 4.940566584484824e-05, "loss": 0.3536, "step": 2540 }, { "epoch": 15.785046728971963, "grad_norm": 2.984375, "learning_rate": 4.9405131700176215e-05, "loss": 0.3622, "step": 2541 }, { "epoch": 15.791277258566979, "grad_norm": 3.953125, "learning_rate": 4.940459731847667e-05, "loss": 0.4363, "step": 2542 }, { "epoch": 15.797507788161994, "grad_norm": 3.046875, "learning_rate": 4.94040626997548e-05, "loss": 0.3148, "step": 2543 }, { "epoch": 15.80373831775701, "grad_norm": 4.65625, "learning_rate": 4.9403527844015805e-05, "loss": 0.5927, "step": 2544 }, { "epoch": 15.809968847352025, "grad_norm": 3.0, "learning_rate": 4.940299275126486e-05, "loss": 0.2375, "step": 2545 }, { "epoch": 15.81619937694704, "grad_norm": 2.28125, "learning_rate": 4.940245742150718e-05, "loss": 0.2705, "step": 2546 }, { "epoch": 15.822429906542055, "grad_norm": 3.21875, "learning_rate": 4.940192185474794e-05, "loss": 0.3224, "step": 2547 }, { "epoch": 15.828660436137072, "grad_norm": 2.265625, "learning_rate": 4.9401386050992365e-05, "loss": 0.2143, "step": 2548 }, { "epoch": 15.834890965732088, "grad_norm": 2.5625, "learning_rate": 4.9400850010245656e-05, "loss": 0.2607, "step": 2549 }, { "epoch": 15.841121495327103, "grad_norm": 1.4921875, "learning_rate": 4.9400313732513006e-05, "loss": 0.1591, "step": 2550 }, { "epoch": 15.847352024922118, "grad_norm": 2.375, "learning_rate": 4.939977721779964e-05, "loss": 0.3345, "step": 2551 }, { "epoch": 15.853582554517134, "grad_norm": 2.734375, "learning_rate": 4.9399240466110754e-05, "loss": 0.2043, "step": 2552 }, { "epoch": 15.85981308411215, "grad_norm": 3.25, "learning_rate": 4.939870347745156e-05, "loss": 0.2457, "step": 2553 }, { "epoch": 15.866043613707165, "grad_norm": 1.9765625, "learning_rate": 4.939816625182729e-05, "loss": 0.2094, "step": 2554 }, { "epoch": 15.87227414330218, "grad_norm": 2.453125, "learning_rate": 4.9397628789243155e-05, "loss": 0.2692, "step": 2555 }, { "epoch": 15.878504672897197, "grad_norm": 3.25, "learning_rate": 4.9397091089704364e-05, "loss": 0.3039, "step": 2556 }, { "epoch": 15.884735202492212, "grad_norm": 3.53125, "learning_rate": 4.939655315321615e-05, "loss": 0.3357, "step": 2557 }, { "epoch": 15.890965732087228, "grad_norm": 2.359375, "learning_rate": 4.9396014979783736e-05, "loss": 0.2226, "step": 2558 }, { "epoch": 15.897196261682243, "grad_norm": 3.046875, "learning_rate": 4.9395476569412355e-05, "loss": 0.1659, "step": 2559 }, { "epoch": 15.903426791277258, "grad_norm": 4.90625, "learning_rate": 4.9394937922107224e-05, "loss": 0.3771, "step": 2560 }, { "epoch": 15.909657320872274, "grad_norm": 3.296875, "learning_rate": 4.9394399037873574e-05, "loss": 0.4929, "step": 2561 }, { "epoch": 15.91588785046729, "grad_norm": 2.90625, "learning_rate": 4.939385991671664e-05, "loss": 0.2242, "step": 2562 }, { "epoch": 15.922118380062305, "grad_norm": 3.765625, "learning_rate": 4.9393320558641676e-05, "loss": 0.3404, "step": 2563 }, { "epoch": 15.928348909657322, "grad_norm": 5.03125, "learning_rate": 4.9392780963653905e-05, "loss": 0.3927, "step": 2564 }, { "epoch": 15.934579439252337, "grad_norm": 5.40625, "learning_rate": 4.939224113175857e-05, "loss": 0.2312, "step": 2565 }, { "epoch": 15.940809968847352, "grad_norm": 3.65625, "learning_rate": 4.939170106296091e-05, "loss": 0.3743, "step": 2566 }, { "epoch": 15.947040498442368, "grad_norm": 2.578125, "learning_rate": 4.9391160757266164e-05, "loss": 0.3677, "step": 2567 }, { "epoch": 15.953271028037383, "grad_norm": 4.03125, "learning_rate": 4.9390620214679595e-05, "loss": 0.3727, "step": 2568 }, { "epoch": 15.959501557632398, "grad_norm": 4.9375, "learning_rate": 4.9390079435206446e-05, "loss": 0.3201, "step": 2569 }, { "epoch": 15.965732087227414, "grad_norm": 5.0, "learning_rate": 4.9389538418851974e-05, "loss": 0.2881, "step": 2570 }, { "epoch": 15.97196261682243, "grad_norm": 3.453125, "learning_rate": 4.938899716562143e-05, "loss": 0.3272, "step": 2571 }, { "epoch": 15.978193146417446, "grad_norm": 3.4375, "learning_rate": 4.938845567552007e-05, "loss": 0.3433, "step": 2572 }, { "epoch": 15.984423676012462, "grad_norm": 4.75, "learning_rate": 4.938791394855316e-05, "loss": 0.2934, "step": 2573 }, { "epoch": 15.990654205607477, "grad_norm": 4.03125, "learning_rate": 4.938737198472594e-05, "loss": 0.3508, "step": 2574 }, { "epoch": 15.996884735202492, "grad_norm": 2.265625, "learning_rate": 4.93868297840437e-05, "loss": 0.263, "step": 2575 }, { "epoch": 16.0, "grad_norm": 1.96875, "learning_rate": 4.938628734651168e-05, "loss": 0.1443, "step": 2576 }, { "epoch": 16.006230529595015, "grad_norm": 3.390625, "learning_rate": 4.938574467213518e-05, "loss": 0.2562, "step": 2577 }, { "epoch": 16.01246105919003, "grad_norm": 4.34375, "learning_rate": 4.938520176091944e-05, "loss": 0.3296, "step": 2578 }, { "epoch": 16.018691588785046, "grad_norm": 2.359375, "learning_rate": 4.9384658612869754e-05, "loss": 0.2099, "step": 2579 }, { "epoch": 16.02492211838006, "grad_norm": 5.65625, "learning_rate": 4.938411522799139e-05, "loss": 0.4087, "step": 2580 }, { "epoch": 16.031152647975077, "grad_norm": 2.65625, "learning_rate": 4.938357160628962e-05, "loss": 0.2312, "step": 2581 }, { "epoch": 16.037383177570092, "grad_norm": 2.78125, "learning_rate": 4.938302774776973e-05, "loss": 0.2505, "step": 2582 }, { "epoch": 16.043613707165107, "grad_norm": 2.1875, "learning_rate": 4.9382483652437e-05, "loss": 0.1954, "step": 2583 }, { "epoch": 16.049844236760123, "grad_norm": 4.0625, "learning_rate": 4.938193932029671e-05, "loss": 0.3704, "step": 2584 }, { "epoch": 16.05607476635514, "grad_norm": 3.296875, "learning_rate": 4.938139475135416e-05, "loss": 0.5484, "step": 2585 }, { "epoch": 16.062305295950157, "grad_norm": 3.640625, "learning_rate": 4.938084994561463e-05, "loss": 0.4022, "step": 2586 }, { "epoch": 16.068535825545172, "grad_norm": 4.03125, "learning_rate": 4.9380304903083404e-05, "loss": 0.3085, "step": 2587 }, { "epoch": 16.074766355140188, "grad_norm": 2.578125, "learning_rate": 4.937975962376579e-05, "loss": 0.1838, "step": 2588 }, { "epoch": 16.080996884735203, "grad_norm": 2.921875, "learning_rate": 4.937921410766707e-05, "loss": 0.3536, "step": 2589 }, { "epoch": 16.08722741433022, "grad_norm": 3.140625, "learning_rate": 4.937866835479256e-05, "loss": 0.1995, "step": 2590 }, { "epoch": 16.093457943925234, "grad_norm": 3.890625, "learning_rate": 4.9378122365147536e-05, "loss": 0.256, "step": 2591 }, { "epoch": 16.09968847352025, "grad_norm": 3.515625, "learning_rate": 4.937757613873732e-05, "loss": 0.2492, "step": 2592 }, { "epoch": 16.105919003115265, "grad_norm": 3.25, "learning_rate": 4.937702967556722e-05, "loss": 0.2563, "step": 2593 }, { "epoch": 16.11214953271028, "grad_norm": 2.5, "learning_rate": 4.937648297564252e-05, "loss": 0.1782, "step": 2594 }, { "epoch": 16.118380062305295, "grad_norm": 2.90625, "learning_rate": 4.9375936038968565e-05, "loss": 0.2522, "step": 2595 }, { "epoch": 16.12461059190031, "grad_norm": 3.171875, "learning_rate": 4.9375388865550635e-05, "loss": 0.2217, "step": 2596 }, { "epoch": 16.130841121495326, "grad_norm": 2.734375, "learning_rate": 4.937484145539405e-05, "loss": 0.2828, "step": 2597 }, { "epoch": 16.13707165109034, "grad_norm": 2.1875, "learning_rate": 4.9374293808504136e-05, "loss": 0.1846, "step": 2598 }, { "epoch": 16.143302180685357, "grad_norm": 2.671875, "learning_rate": 4.937374592488621e-05, "loss": 0.2246, "step": 2599 }, { "epoch": 16.149532710280372, "grad_norm": 3.03125, "learning_rate": 4.937319780454559e-05, "loss": 0.3297, "step": 2600 }, { "epoch": 16.15576323987539, "grad_norm": 3.015625, "learning_rate": 4.9372649447487604e-05, "loss": 0.3195, "step": 2601 }, { "epoch": 16.161993769470406, "grad_norm": 3.921875, "learning_rate": 4.9372100853717577e-05, "loss": 0.437, "step": 2602 }, { "epoch": 16.16822429906542, "grad_norm": 3.46875, "learning_rate": 4.937155202324083e-05, "loss": 0.4745, "step": 2603 }, { "epoch": 16.174454828660437, "grad_norm": 2.296875, "learning_rate": 4.9371002956062694e-05, "loss": 0.2687, "step": 2604 }, { "epoch": 16.180685358255452, "grad_norm": 3.265625, "learning_rate": 4.937045365218851e-05, "loss": 0.2725, "step": 2605 }, { "epoch": 16.186915887850468, "grad_norm": 2.5, "learning_rate": 4.936990411162361e-05, "loss": 0.2215, "step": 2606 }, { "epoch": 16.193146417445483, "grad_norm": 4.15625, "learning_rate": 4.936935433437333e-05, "loss": 0.2344, "step": 2607 }, { "epoch": 16.1993769470405, "grad_norm": 2.890625, "learning_rate": 4.9368804320443004e-05, "loss": 0.1956, "step": 2608 }, { "epoch": 16.205607476635514, "grad_norm": 2.734375, "learning_rate": 4.936825406983798e-05, "loss": 0.2581, "step": 2609 }, { "epoch": 16.21183800623053, "grad_norm": 4.28125, "learning_rate": 4.9367703582563606e-05, "loss": 0.3117, "step": 2610 }, { "epoch": 16.218068535825545, "grad_norm": 4.46875, "learning_rate": 4.936715285862523e-05, "loss": 0.3035, "step": 2611 }, { "epoch": 16.22429906542056, "grad_norm": 4.09375, "learning_rate": 4.936660189802818e-05, "loss": 0.4918, "step": 2612 }, { "epoch": 16.230529595015575, "grad_norm": 2.5625, "learning_rate": 4.936605070077783e-05, "loss": 0.206, "step": 2613 }, { "epoch": 16.23676012461059, "grad_norm": 3.078125, "learning_rate": 4.936549926687952e-05, "loss": 0.2708, "step": 2614 }, { "epoch": 16.242990654205606, "grad_norm": 2.65625, "learning_rate": 4.936494759633862e-05, "loss": 0.2371, "step": 2615 }, { "epoch": 16.24922118380062, "grad_norm": 2.640625, "learning_rate": 4.936439568916047e-05, "loss": 0.2574, "step": 2616 }, { "epoch": 16.25545171339564, "grad_norm": 3.28125, "learning_rate": 4.936384354535044e-05, "loss": 0.3581, "step": 2617 }, { "epoch": 16.261682242990656, "grad_norm": 3.203125, "learning_rate": 4.93632911649139e-05, "loss": 0.2932, "step": 2618 }, { "epoch": 16.26791277258567, "grad_norm": 5.21875, "learning_rate": 4.936273854785619e-05, "loss": 0.2479, "step": 2619 }, { "epoch": 16.274143302180686, "grad_norm": 3.421875, "learning_rate": 4.9362185694182706e-05, "loss": 0.2978, "step": 2620 }, { "epoch": 16.2803738317757, "grad_norm": 3.265625, "learning_rate": 4.93616326038988e-05, "loss": 0.2741, "step": 2621 }, { "epoch": 16.286604361370717, "grad_norm": 2.640625, "learning_rate": 4.936107927700985e-05, "loss": 0.2125, "step": 2622 }, { "epoch": 16.292834890965732, "grad_norm": 3.5, "learning_rate": 4.936052571352122e-05, "loss": 0.2716, "step": 2623 }, { "epoch": 16.299065420560748, "grad_norm": 3.28125, "learning_rate": 4.93599719134383e-05, "loss": 0.3342, "step": 2624 }, { "epoch": 16.305295950155763, "grad_norm": 4.0625, "learning_rate": 4.935941787676647e-05, "loss": 0.305, "step": 2625 }, { "epoch": 16.31152647975078, "grad_norm": 3.203125, "learning_rate": 4.9358863603511095e-05, "loss": 0.314, "step": 2626 }, { "epoch": 16.317757009345794, "grad_norm": 2.921875, "learning_rate": 4.935830909367757e-05, "loss": 0.2842, "step": 2627 }, { "epoch": 16.32398753894081, "grad_norm": 2.625, "learning_rate": 4.935775434727128e-05, "loss": 0.1879, "step": 2628 }, { "epoch": 16.330218068535824, "grad_norm": 4.40625, "learning_rate": 4.9357199364297615e-05, "loss": 0.5024, "step": 2629 }, { "epoch": 16.33644859813084, "grad_norm": 4.375, "learning_rate": 4.935664414476196e-05, "loss": 0.5419, "step": 2630 }, { "epoch": 16.342679127725855, "grad_norm": 2.734375, "learning_rate": 4.9356088688669696e-05, "loss": 0.19, "step": 2631 }, { "epoch": 16.34890965732087, "grad_norm": 3.4375, "learning_rate": 4.9355532996026246e-05, "loss": 0.4606, "step": 2632 }, { "epoch": 16.35514018691589, "grad_norm": 1.5, "learning_rate": 4.9354977066836986e-05, "loss": 0.1849, "step": 2633 }, { "epoch": 16.361370716510905, "grad_norm": 3.0625, "learning_rate": 4.935442090110731e-05, "loss": 0.2495, "step": 2634 }, { "epoch": 16.36760124610592, "grad_norm": 1.578125, "learning_rate": 4.9353864498842636e-05, "loss": 0.1501, "step": 2635 }, { "epoch": 16.373831775700936, "grad_norm": 2.75, "learning_rate": 4.935330786004837e-05, "loss": 0.2151, "step": 2636 }, { "epoch": 16.38006230529595, "grad_norm": 2.875, "learning_rate": 4.93527509847299e-05, "loss": 0.2509, "step": 2637 }, { "epoch": 16.386292834890966, "grad_norm": 3.46875, "learning_rate": 4.935219387289265e-05, "loss": 0.405, "step": 2638 }, { "epoch": 16.39252336448598, "grad_norm": 4.65625, "learning_rate": 4.935163652454202e-05, "loss": 0.3343, "step": 2639 }, { "epoch": 16.398753894080997, "grad_norm": 3.625, "learning_rate": 4.9351078939683426e-05, "loss": 0.3095, "step": 2640 }, { "epoch": 16.404984423676012, "grad_norm": 3.40625, "learning_rate": 4.935052111832229e-05, "loss": 0.2837, "step": 2641 }, { "epoch": 16.411214953271028, "grad_norm": 2.6875, "learning_rate": 4.934996306046403e-05, "loss": 0.1927, "step": 2642 }, { "epoch": 16.417445482866043, "grad_norm": 4.84375, "learning_rate": 4.9349404766114063e-05, "loss": 0.3004, "step": 2643 }, { "epoch": 16.42367601246106, "grad_norm": 3.609375, "learning_rate": 4.9348846235277804e-05, "loss": 0.4334, "step": 2644 }, { "epoch": 16.429906542056074, "grad_norm": 2.921875, "learning_rate": 4.934828746796067e-05, "loss": 0.2579, "step": 2645 }, { "epoch": 16.43613707165109, "grad_norm": 4.75, "learning_rate": 4.934772846416812e-05, "loss": 0.3587, "step": 2646 }, { "epoch": 16.442367601246104, "grad_norm": 3.03125, "learning_rate": 4.934716922390556e-05, "loss": 0.3096, "step": 2647 }, { "epoch": 16.44859813084112, "grad_norm": 3.453125, "learning_rate": 4.9346609747178426e-05, "loss": 0.24, "step": 2648 }, { "epoch": 16.45482866043614, "grad_norm": 2.703125, "learning_rate": 4.9346050033992145e-05, "loss": 0.3318, "step": 2649 }, { "epoch": 16.461059190031154, "grad_norm": 2.484375, "learning_rate": 4.9345490084352165e-05, "loss": 0.2675, "step": 2650 }, { "epoch": 16.46728971962617, "grad_norm": 3.1875, "learning_rate": 4.9344929898263914e-05, "loss": 0.3438, "step": 2651 }, { "epoch": 16.473520249221185, "grad_norm": 3.84375, "learning_rate": 4.9344369475732846e-05, "loss": 0.2118, "step": 2652 }, { "epoch": 16.4797507788162, "grad_norm": 2.390625, "learning_rate": 4.934380881676439e-05, "loss": 0.1759, "step": 2653 }, { "epoch": 16.485981308411215, "grad_norm": 2.234375, "learning_rate": 4.934324792136399e-05, "loss": 0.2005, "step": 2654 }, { "epoch": 16.49221183800623, "grad_norm": 3.5, "learning_rate": 4.934268678953711e-05, "loss": 0.36, "step": 2655 }, { "epoch": 16.498442367601246, "grad_norm": 4.625, "learning_rate": 4.934212542128919e-05, "loss": 0.2233, "step": 2656 }, { "epoch": 16.50467289719626, "grad_norm": 3.0625, "learning_rate": 4.934156381662567e-05, "loss": 0.2592, "step": 2657 }, { "epoch": 16.510903426791277, "grad_norm": 3.609375, "learning_rate": 4.9341001975552027e-05, "loss": 0.209, "step": 2658 }, { "epoch": 16.517133956386292, "grad_norm": 3.09375, "learning_rate": 4.934043989807371e-05, "loss": 0.2364, "step": 2659 }, { "epoch": 16.523364485981308, "grad_norm": 5.3125, "learning_rate": 4.933987758419617e-05, "loss": 0.2205, "step": 2660 }, { "epoch": 16.529595015576323, "grad_norm": 4.46875, "learning_rate": 4.9339315033924874e-05, "loss": 0.2359, "step": 2661 }, { "epoch": 16.53582554517134, "grad_norm": 4.625, "learning_rate": 4.933875224726529e-05, "loss": 0.3235, "step": 2662 }, { "epoch": 16.542056074766354, "grad_norm": 3.625, "learning_rate": 4.933818922422287e-05, "loss": 0.2059, "step": 2663 }, { "epoch": 16.54828660436137, "grad_norm": 3.5, "learning_rate": 4.933762596480309e-05, "loss": 0.4024, "step": 2664 }, { "epoch": 16.554517133956388, "grad_norm": 4.3125, "learning_rate": 4.933706246901143e-05, "loss": 0.2762, "step": 2665 }, { "epoch": 16.560747663551403, "grad_norm": 4.34375, "learning_rate": 4.9336498736853347e-05, "loss": 0.2363, "step": 2666 }, { "epoch": 16.56697819314642, "grad_norm": 3.859375, "learning_rate": 4.9335934768334326e-05, "loss": 0.2733, "step": 2667 }, { "epoch": 16.573208722741434, "grad_norm": 2.734375, "learning_rate": 4.933537056345985e-05, "loss": 0.2201, "step": 2668 }, { "epoch": 16.57943925233645, "grad_norm": 3.15625, "learning_rate": 4.9334806122235376e-05, "loss": 0.3443, "step": 2669 }, { "epoch": 16.585669781931465, "grad_norm": 5.59375, "learning_rate": 4.933424144466641e-05, "loss": 0.3055, "step": 2670 }, { "epoch": 16.59190031152648, "grad_norm": 6.09375, "learning_rate": 4.933367653075843e-05, "loss": 0.3328, "step": 2671 }, { "epoch": 16.598130841121495, "grad_norm": 4.3125, "learning_rate": 4.9333111380516914e-05, "loss": 0.2235, "step": 2672 }, { "epoch": 16.60436137071651, "grad_norm": 3.78125, "learning_rate": 4.9332545993947355e-05, "loss": 0.2929, "step": 2673 }, { "epoch": 16.610591900311526, "grad_norm": 2.78125, "learning_rate": 4.9331980371055244e-05, "loss": 0.2201, "step": 2674 }, { "epoch": 16.61682242990654, "grad_norm": 4.0625, "learning_rate": 4.933141451184608e-05, "loss": 0.3419, "step": 2675 }, { "epoch": 16.623052959501557, "grad_norm": 2.421875, "learning_rate": 4.933084841632535e-05, "loss": 0.2217, "step": 2676 }, { "epoch": 16.629283489096572, "grad_norm": 3.15625, "learning_rate": 4.933028208449856e-05, "loss": 0.3528, "step": 2677 }, { "epoch": 16.635514018691588, "grad_norm": 2.609375, "learning_rate": 4.932971551637121e-05, "loss": 0.3351, "step": 2678 }, { "epoch": 16.641744548286603, "grad_norm": 2.046875, "learning_rate": 4.932914871194879e-05, "loss": 0.1828, "step": 2679 }, { "epoch": 16.64797507788162, "grad_norm": 2.15625, "learning_rate": 4.9328581671236824e-05, "loss": 0.1753, "step": 2680 }, { "epoch": 16.654205607476637, "grad_norm": 4.1875, "learning_rate": 4.9328014394240804e-05, "loss": 0.3926, "step": 2681 }, { "epoch": 16.660436137071652, "grad_norm": 1.4609375, "learning_rate": 4.932744688096625e-05, "loss": 0.1832, "step": 2682 }, { "epoch": 16.666666666666668, "grad_norm": 2.375, "learning_rate": 4.9326879131418666e-05, "loss": 0.2308, "step": 2683 }, { "epoch": 16.672897196261683, "grad_norm": 2.578125, "learning_rate": 4.932631114560357e-05, "loss": 0.1615, "step": 2684 }, { "epoch": 16.6791277258567, "grad_norm": 3.21875, "learning_rate": 4.9325742923526476e-05, "loss": 0.3325, "step": 2685 }, { "epoch": 16.685358255451714, "grad_norm": 3.078125, "learning_rate": 4.93251744651929e-05, "loss": 0.308, "step": 2686 }, { "epoch": 16.69158878504673, "grad_norm": 3.453125, "learning_rate": 4.9324605770608375e-05, "loss": 0.3455, "step": 2687 }, { "epoch": 16.697819314641745, "grad_norm": 3.015625, "learning_rate": 4.932403683977842e-05, "loss": 0.2414, "step": 2688 }, { "epoch": 16.70404984423676, "grad_norm": 2.5, "learning_rate": 4.9323467672708545e-05, "loss": 0.2001, "step": 2689 }, { "epoch": 16.710280373831775, "grad_norm": 2.09375, "learning_rate": 4.9322898269404293e-05, "loss": 0.2032, "step": 2690 }, { "epoch": 16.71651090342679, "grad_norm": 2.40625, "learning_rate": 4.932232862987119e-05, "loss": 0.2223, "step": 2691 }, { "epoch": 16.722741433021806, "grad_norm": 2.765625, "learning_rate": 4.932175875411478e-05, "loss": 0.2558, "step": 2692 }, { "epoch": 16.72897196261682, "grad_norm": 2.25, "learning_rate": 4.9321188642140575e-05, "loss": 0.2414, "step": 2693 }, { "epoch": 16.735202492211837, "grad_norm": 2.828125, "learning_rate": 4.932061829395413e-05, "loss": 0.3136, "step": 2694 }, { "epoch": 16.741433021806852, "grad_norm": 2.703125, "learning_rate": 4.932004770956098e-05, "loss": 0.268, "step": 2695 }, { "epoch": 16.747663551401867, "grad_norm": 3.6875, "learning_rate": 4.931947688896666e-05, "loss": 0.3009, "step": 2696 }, { "epoch": 16.753894080996886, "grad_norm": 2.90625, "learning_rate": 4.931890583217672e-05, "loss": 0.2336, "step": 2697 }, { "epoch": 16.7601246105919, "grad_norm": 2.171875, "learning_rate": 4.931833453919671e-05, "loss": 0.1967, "step": 2698 }, { "epoch": 16.766355140186917, "grad_norm": 3.28125, "learning_rate": 4.931776301003217e-05, "loss": 0.3468, "step": 2699 }, { "epoch": 16.772585669781932, "grad_norm": 3.140625, "learning_rate": 4.931719124468865e-05, "loss": 0.1854, "step": 2700 }, { "epoch": 16.778816199376948, "grad_norm": 4.0, "learning_rate": 4.9316619243171714e-05, "loss": 0.2077, "step": 2701 }, { "epoch": 16.785046728971963, "grad_norm": 2.640625, "learning_rate": 4.931604700548691e-05, "loss": 0.2355, "step": 2702 }, { "epoch": 16.79127725856698, "grad_norm": 4.21875, "learning_rate": 4.93154745316398e-05, "loss": 0.3227, "step": 2703 }, { "epoch": 16.797507788161994, "grad_norm": 4.3125, "learning_rate": 4.931490182163593e-05, "loss": 0.3373, "step": 2704 }, { "epoch": 16.80373831775701, "grad_norm": 2.5625, "learning_rate": 4.931432887548089e-05, "loss": 0.332, "step": 2705 }, { "epoch": 16.809968847352025, "grad_norm": 4.09375, "learning_rate": 4.931375569318021e-05, "loss": 0.3916, "step": 2706 }, { "epoch": 16.81619937694704, "grad_norm": 2.59375, "learning_rate": 4.931318227473949e-05, "loss": 0.2215, "step": 2707 }, { "epoch": 16.822429906542055, "grad_norm": 1.9453125, "learning_rate": 4.931260862016427e-05, "loss": 0.2202, "step": 2708 }, { "epoch": 16.82866043613707, "grad_norm": 2.65625, "learning_rate": 4.9312034729460144e-05, "loss": 0.1828, "step": 2709 }, { "epoch": 16.834890965732086, "grad_norm": 2.5, "learning_rate": 4.931146060263268e-05, "loss": 0.1975, "step": 2710 }, { "epoch": 16.8411214953271, "grad_norm": 2.65625, "learning_rate": 4.931088623968744e-05, "loss": 0.298, "step": 2711 }, { "epoch": 16.847352024922117, "grad_norm": 3.609375, "learning_rate": 4.9310311640630017e-05, "loss": 0.3084, "step": 2712 }, { "epoch": 16.853582554517136, "grad_norm": 2.5625, "learning_rate": 4.930973680546599e-05, "loss": 0.208, "step": 2713 }, { "epoch": 16.85981308411215, "grad_norm": 2.21875, "learning_rate": 4.9309161734200934e-05, "loss": 0.2015, "step": 2714 }, { "epoch": 16.866043613707166, "grad_norm": 3.453125, "learning_rate": 4.9308586426840454e-05, "loss": 0.2851, "step": 2715 }, { "epoch": 16.87227414330218, "grad_norm": 3.71875, "learning_rate": 4.930801088339011e-05, "loss": 0.336, "step": 2716 }, { "epoch": 16.878504672897197, "grad_norm": 2.421875, "learning_rate": 4.9307435103855507e-05, "loss": 0.1921, "step": 2717 }, { "epoch": 16.884735202492212, "grad_norm": 2.71875, "learning_rate": 4.930685908824224e-05, "loss": 0.2339, "step": 2718 }, { "epoch": 16.890965732087228, "grad_norm": 2.84375, "learning_rate": 4.930628283655589e-05, "loss": 0.2511, "step": 2719 }, { "epoch": 16.897196261682243, "grad_norm": 3.34375, "learning_rate": 4.930570634880207e-05, "loss": 0.2649, "step": 2720 }, { "epoch": 16.90342679127726, "grad_norm": 2.609375, "learning_rate": 4.930512962498638e-05, "loss": 0.2109, "step": 2721 }, { "epoch": 16.909657320872274, "grad_norm": 3.015625, "learning_rate": 4.93045526651144e-05, "loss": 0.2105, "step": 2722 }, { "epoch": 16.91588785046729, "grad_norm": 3.484375, "learning_rate": 4.930397546919174e-05, "loss": 0.304, "step": 2723 }, { "epoch": 16.922118380062305, "grad_norm": 3.0, "learning_rate": 4.930339803722402e-05, "loss": 0.4596, "step": 2724 }, { "epoch": 16.92834890965732, "grad_norm": 3.828125, "learning_rate": 4.930282036921685e-05, "loss": 0.4747, "step": 2725 }, { "epoch": 16.934579439252335, "grad_norm": 1.8515625, "learning_rate": 4.9302242465175816e-05, "loss": 0.1835, "step": 2726 }, { "epoch": 16.94080996884735, "grad_norm": 4.1875, "learning_rate": 4.930166432510656e-05, "loss": 0.3725, "step": 2727 }, { "epoch": 16.947040498442366, "grad_norm": 4.375, "learning_rate": 4.930108594901467e-05, "loss": 0.2455, "step": 2728 }, { "epoch": 16.953271028037385, "grad_norm": 4.0625, "learning_rate": 4.9300507336905775e-05, "loss": 0.1543, "step": 2729 }, { "epoch": 16.9595015576324, "grad_norm": 2.21875, "learning_rate": 4.92999284887855e-05, "loss": 0.1527, "step": 2730 }, { "epoch": 16.965732087227416, "grad_norm": 4.8125, "learning_rate": 4.929934940465946e-05, "loss": 0.1851, "step": 2731 }, { "epoch": 16.97196261682243, "grad_norm": 5.5, "learning_rate": 4.9298770084533286e-05, "loss": 0.1962, "step": 2732 }, { "epoch": 16.978193146417446, "grad_norm": 5.65625, "learning_rate": 4.9298190528412594e-05, "loss": 0.2961, "step": 2733 }, { "epoch": 16.98442367601246, "grad_norm": 2.25, "learning_rate": 4.929761073630302e-05, "loss": 0.178, "step": 2734 }, { "epoch": 16.990654205607477, "grad_norm": 3.875, "learning_rate": 4.929703070821019e-05, "loss": 0.2865, "step": 2735 }, { "epoch": 16.996884735202492, "grad_norm": 5.5, "learning_rate": 4.929645044413975e-05, "loss": 0.2261, "step": 2736 }, { "epoch": 17.0, "grad_norm": 2.4375, "learning_rate": 4.929586994409732e-05, "loss": 0.1148, "step": 2737 }, { "epoch": 17.006230529595015, "grad_norm": 4.53125, "learning_rate": 4.929528920808854e-05, "loss": 0.2594, "step": 2738 }, { "epoch": 17.01246105919003, "grad_norm": 2.875, "learning_rate": 4.929470823611906e-05, "loss": 0.2087, "step": 2739 }, { "epoch": 17.018691588785046, "grad_norm": 3.265625, "learning_rate": 4.929412702819453e-05, "loss": 0.1993, "step": 2740 }, { "epoch": 17.02492211838006, "grad_norm": 3.359375, "learning_rate": 4.9293545584320564e-05, "loss": 0.3048, "step": 2741 }, { "epoch": 17.031152647975077, "grad_norm": 3.515625, "learning_rate": 4.9292963904502834e-05, "loss": 0.1861, "step": 2742 }, { "epoch": 17.037383177570092, "grad_norm": 3.390625, "learning_rate": 4.929238198874698e-05, "loss": 0.2261, "step": 2743 }, { "epoch": 17.043613707165107, "grad_norm": 3.390625, "learning_rate": 4.929179983705866e-05, "loss": 0.199, "step": 2744 }, { "epoch": 17.049844236760123, "grad_norm": 4.40625, "learning_rate": 4.9291217449443515e-05, "loss": 0.2962, "step": 2745 }, { "epoch": 17.05607476635514, "grad_norm": 1.4921875, "learning_rate": 4.929063482590722e-05, "loss": 0.1632, "step": 2746 }, { "epoch": 17.062305295950157, "grad_norm": 4.1875, "learning_rate": 4.929005196645542e-05, "loss": 0.174, "step": 2747 }, { "epoch": 17.068535825545172, "grad_norm": 6.0, "learning_rate": 4.928946887109378e-05, "loss": 0.4278, "step": 2748 }, { "epoch": 17.074766355140188, "grad_norm": 5.53125, "learning_rate": 4.928888553982797e-05, "loss": 0.3085, "step": 2749 }, { "epoch": 17.080996884735203, "grad_norm": 3.609375, "learning_rate": 4.928830197266364e-05, "loss": 0.2487, "step": 2750 }, { "epoch": 17.08722741433022, "grad_norm": 3.84375, "learning_rate": 4.9287718169606464e-05, "loss": 0.2853, "step": 2751 }, { "epoch": 17.093457943925234, "grad_norm": 2.65625, "learning_rate": 4.9287134130662114e-05, "loss": 0.2625, "step": 2752 }, { "epoch": 17.09968847352025, "grad_norm": 5.375, "learning_rate": 4.928654985583627e-05, "loss": 0.2023, "step": 2753 }, { "epoch": 17.105919003115265, "grad_norm": 3.1875, "learning_rate": 4.92859653451346e-05, "loss": 0.2105, "step": 2754 }, { "epoch": 17.11214953271028, "grad_norm": 2.234375, "learning_rate": 4.928538059856277e-05, "loss": 0.2505, "step": 2755 }, { "epoch": 17.118380062305295, "grad_norm": 4.8125, "learning_rate": 4.928479561612648e-05, "loss": 0.3854, "step": 2756 }, { "epoch": 17.12461059190031, "grad_norm": 5.46875, "learning_rate": 4.9284210397831384e-05, "loss": 0.2923, "step": 2757 }, { "epoch": 17.130841121495326, "grad_norm": 4.84375, "learning_rate": 4.9283624943683204e-05, "loss": 0.2542, "step": 2758 }, { "epoch": 17.13707165109034, "grad_norm": 4.78125, "learning_rate": 4.928303925368759e-05, "loss": 0.4462, "step": 2759 }, { "epoch": 17.143302180685357, "grad_norm": 3.09375, "learning_rate": 4.928245332785025e-05, "loss": 0.2582, "step": 2760 }, { "epoch": 17.149532710280372, "grad_norm": 3.34375, "learning_rate": 4.928186716617686e-05, "loss": 0.3571, "step": 2761 }, { "epoch": 17.15576323987539, "grad_norm": 4.09375, "learning_rate": 4.9281280768673134e-05, "loss": 0.2719, "step": 2762 }, { "epoch": 17.161993769470406, "grad_norm": 2.609375, "learning_rate": 4.928069413534475e-05, "loss": 0.2414, "step": 2763 }, { "epoch": 17.16822429906542, "grad_norm": 3.9375, "learning_rate": 4.928010726619742e-05, "loss": 0.3002, "step": 2764 }, { "epoch": 17.174454828660437, "grad_norm": 2.53125, "learning_rate": 4.9279520161236824e-05, "loss": 0.2021, "step": 2765 }, { "epoch": 17.180685358255452, "grad_norm": 4.03125, "learning_rate": 4.9278932820468674e-05, "loss": 0.3305, "step": 2766 }, { "epoch": 17.186915887850468, "grad_norm": 2.390625, "learning_rate": 4.927834524389868e-05, "loss": 0.2314, "step": 2767 }, { "epoch": 17.193146417445483, "grad_norm": 2.46875, "learning_rate": 4.927775743153254e-05, "loss": 0.2553, "step": 2768 }, { "epoch": 17.1993769470405, "grad_norm": 2.453125, "learning_rate": 4.9277169383375974e-05, "loss": 0.2552, "step": 2769 }, { "epoch": 17.205607476635514, "grad_norm": 4.125, "learning_rate": 4.9276581099434685e-05, "loss": 0.2273, "step": 2770 }, { "epoch": 17.21183800623053, "grad_norm": 3.65625, "learning_rate": 4.927599257971438e-05, "loss": 0.3656, "step": 2771 }, { "epoch": 17.218068535825545, "grad_norm": 2.875, "learning_rate": 4.927540382422079e-05, "loss": 0.219, "step": 2772 }, { "epoch": 17.22429906542056, "grad_norm": 2.75, "learning_rate": 4.9274814832959627e-05, "loss": 0.2649, "step": 2773 }, { "epoch": 17.230529595015575, "grad_norm": 2.171875, "learning_rate": 4.92742256059366e-05, "loss": 0.1836, "step": 2774 }, { "epoch": 17.23676012461059, "grad_norm": 1.7890625, "learning_rate": 4.9273636143157445e-05, "loss": 0.171, "step": 2775 }, { "epoch": 17.242990654205606, "grad_norm": 3.109375, "learning_rate": 4.927304644462789e-05, "loss": 0.2393, "step": 2776 }, { "epoch": 17.24922118380062, "grad_norm": 1.8203125, "learning_rate": 4.927245651035365e-05, "loss": 0.1812, "step": 2777 }, { "epoch": 17.25545171339564, "grad_norm": 2.78125, "learning_rate": 4.927186634034046e-05, "loss": 0.2884, "step": 2778 }, { "epoch": 17.261682242990656, "grad_norm": 3.265625, "learning_rate": 4.927127593459405e-05, "loss": 0.2654, "step": 2779 }, { "epoch": 17.26791277258567, "grad_norm": 3.609375, "learning_rate": 4.9270685293120164e-05, "loss": 0.2865, "step": 2780 }, { "epoch": 17.274143302180686, "grad_norm": 2.390625, "learning_rate": 4.927009441592453e-05, "loss": 0.201, "step": 2781 }, { "epoch": 17.2803738317757, "grad_norm": 3.984375, "learning_rate": 4.926950330301288e-05, "loss": 0.3265, "step": 2782 }, { "epoch": 17.286604361370717, "grad_norm": 2.015625, "learning_rate": 4.9268911954390974e-05, "loss": 0.2013, "step": 2783 }, { "epoch": 17.292834890965732, "grad_norm": 2.765625, "learning_rate": 4.926832037006453e-05, "loss": 0.1693, "step": 2784 }, { "epoch": 17.299065420560748, "grad_norm": 3.09375, "learning_rate": 4.9267728550039316e-05, "loss": 0.29, "step": 2785 }, { "epoch": 17.305295950155763, "grad_norm": 4.96875, "learning_rate": 4.926713649432107e-05, "loss": 0.3409, "step": 2786 }, { "epoch": 17.31152647975078, "grad_norm": 4.09375, "learning_rate": 4.9266544202915545e-05, "loss": 0.184, "step": 2787 }, { "epoch": 17.317757009345794, "grad_norm": 4.09375, "learning_rate": 4.926595167582849e-05, "loss": 0.2838, "step": 2788 }, { "epoch": 17.32398753894081, "grad_norm": 2.1875, "learning_rate": 4.926535891306566e-05, "loss": 0.2304, "step": 2789 }, { "epoch": 17.330218068535824, "grad_norm": 2.671875, "learning_rate": 4.926476591463282e-05, "loss": 0.2587, "step": 2790 }, { "epoch": 17.33644859813084, "grad_norm": 4.75, "learning_rate": 4.926417268053571e-05, "loss": 0.4267, "step": 2791 }, { "epoch": 17.342679127725855, "grad_norm": 3.453125, "learning_rate": 4.926357921078012e-05, "loss": 0.2332, "step": 2792 }, { "epoch": 17.34890965732087, "grad_norm": 2.328125, "learning_rate": 4.926298550537179e-05, "loss": 0.2214, "step": 2793 }, { "epoch": 17.35514018691589, "grad_norm": 3.640625, "learning_rate": 4.92623915643165e-05, "loss": 0.3728, "step": 2794 }, { "epoch": 17.361370716510905, "grad_norm": 3.421875, "learning_rate": 4.926179738762e-05, "loss": 0.3565, "step": 2795 }, { "epoch": 17.36760124610592, "grad_norm": 3.5, "learning_rate": 4.926120297528809e-05, "loss": 0.212, "step": 2796 }, { "epoch": 17.373831775700936, "grad_norm": 3.21875, "learning_rate": 4.926060832732652e-05, "loss": 0.2472, "step": 2797 }, { "epoch": 17.38006230529595, "grad_norm": 3.28125, "learning_rate": 4.9260013443741074e-05, "loss": 0.4059, "step": 2798 }, { "epoch": 17.386292834890966, "grad_norm": 2.40625, "learning_rate": 4.9259418324537523e-05, "loss": 0.3218, "step": 2799 }, { "epoch": 17.39252336448598, "grad_norm": 4.375, "learning_rate": 4.9258822969721655e-05, "loss": 0.3358, "step": 2800 }, { "epoch": 17.398753894080997, "grad_norm": 3.25, "learning_rate": 4.925822737929925e-05, "loss": 0.1924, "step": 2801 }, { "epoch": 17.404984423676012, "grad_norm": 3.34375, "learning_rate": 4.925763155327609e-05, "loss": 0.2721, "step": 2802 }, { "epoch": 17.411214953271028, "grad_norm": 2.421875, "learning_rate": 4.9257035491657965e-05, "loss": 0.2275, "step": 2803 }, { "epoch": 17.417445482866043, "grad_norm": 2.828125, "learning_rate": 4.925643919445067e-05, "loss": 0.1885, "step": 2804 }, { "epoch": 17.42367601246106, "grad_norm": 2.59375, "learning_rate": 4.9255842661659975e-05, "loss": 0.2534, "step": 2805 }, { "epoch": 17.429906542056074, "grad_norm": 2.90625, "learning_rate": 4.925524589329169e-05, "loss": 0.3022, "step": 2806 }, { "epoch": 17.43613707165109, "grad_norm": 3.625, "learning_rate": 4.925464888935162e-05, "loss": 0.3033, "step": 2807 }, { "epoch": 17.442367601246104, "grad_norm": 3.453125, "learning_rate": 4.925405164984553e-05, "loss": 0.4108, "step": 2808 }, { "epoch": 17.44859813084112, "grad_norm": 3.15625, "learning_rate": 4.9253454174779265e-05, "loss": 0.2479, "step": 2809 }, { "epoch": 17.45482866043614, "grad_norm": 3.921875, "learning_rate": 4.9252856464158595e-05, "loss": 0.2201, "step": 2810 }, { "epoch": 17.461059190031154, "grad_norm": 2.78125, "learning_rate": 4.925225851798934e-05, "loss": 0.2933, "step": 2811 }, { "epoch": 17.46728971962617, "grad_norm": 3.203125, "learning_rate": 4.9251660336277296e-05, "loss": 0.2286, "step": 2812 }, { "epoch": 17.473520249221185, "grad_norm": 3.984375, "learning_rate": 4.9251061919028275e-05, "loss": 0.2681, "step": 2813 }, { "epoch": 17.4797507788162, "grad_norm": 3.9375, "learning_rate": 4.92504632662481e-05, "loss": 0.2766, "step": 2814 }, { "epoch": 17.485981308411215, "grad_norm": 3.484375, "learning_rate": 4.9249864377942576e-05, "loss": 0.2323, "step": 2815 }, { "epoch": 17.49221183800623, "grad_norm": 3.390625, "learning_rate": 4.924926525411752e-05, "loss": 0.1777, "step": 2816 }, { "epoch": 17.498442367601246, "grad_norm": 3.484375, "learning_rate": 4.924866589477876e-05, "loss": 0.2737, "step": 2817 }, { "epoch": 17.50467289719626, "grad_norm": 2.765625, "learning_rate": 4.92480662999321e-05, "loss": 0.2336, "step": 2818 }, { "epoch": 17.510903426791277, "grad_norm": 2.53125, "learning_rate": 4.924746646958338e-05, "loss": 0.2695, "step": 2819 }, { "epoch": 17.517133956386292, "grad_norm": 2.90625, "learning_rate": 4.9246866403738415e-05, "loss": 0.2278, "step": 2820 }, { "epoch": 17.523364485981308, "grad_norm": 4.6875, "learning_rate": 4.924626610240304e-05, "loss": 0.2649, "step": 2821 }, { "epoch": 17.529595015576323, "grad_norm": 3.734375, "learning_rate": 4.9245665565583075e-05, "loss": 0.2589, "step": 2822 }, { "epoch": 17.53582554517134, "grad_norm": 2.640625, "learning_rate": 4.9245064793284365e-05, "loss": 0.2107, "step": 2823 }, { "epoch": 17.542056074766354, "grad_norm": 2.15625, "learning_rate": 4.924446378551274e-05, "loss": 0.2089, "step": 2824 }, { "epoch": 17.54828660436137, "grad_norm": 3.359375, "learning_rate": 4.924386254227403e-05, "loss": 0.3595, "step": 2825 }, { "epoch": 17.554517133956388, "grad_norm": 4.28125, "learning_rate": 4.924326106357409e-05, "loss": 0.2409, "step": 2826 }, { "epoch": 17.560747663551403, "grad_norm": 3.015625, "learning_rate": 4.9242659349418746e-05, "loss": 0.2148, "step": 2827 }, { "epoch": 17.56697819314642, "grad_norm": 2.484375, "learning_rate": 4.9242057399813845e-05, "loss": 0.2376, "step": 2828 }, { "epoch": 17.573208722741434, "grad_norm": 2.5, "learning_rate": 4.924145521476525e-05, "loss": 0.1874, "step": 2829 }, { "epoch": 17.57943925233645, "grad_norm": 5.25, "learning_rate": 4.924085279427879e-05, "loss": 0.2422, "step": 2830 }, { "epoch": 17.585669781931465, "grad_norm": 5.125, "learning_rate": 4.924025013836031e-05, "loss": 0.3137, "step": 2831 }, { "epoch": 17.59190031152648, "grad_norm": 4.15625, "learning_rate": 4.923964724701568e-05, "loss": 0.3996, "step": 2832 }, { "epoch": 17.598130841121495, "grad_norm": 4.5, "learning_rate": 4.923904412025077e-05, "loss": 0.3911, "step": 2833 }, { "epoch": 17.60436137071651, "grad_norm": 4.875, "learning_rate": 4.92384407580714e-05, "loss": 0.2267, "step": 2834 }, { "epoch": 17.610591900311526, "grad_norm": 4.4375, "learning_rate": 4.923783716048345e-05, "loss": 0.2573, "step": 2835 }, { "epoch": 17.61682242990654, "grad_norm": 4.375, "learning_rate": 4.923723332749278e-05, "loss": 0.2836, "step": 2836 }, { "epoch": 17.623052959501557, "grad_norm": 2.375, "learning_rate": 4.923662925910526e-05, "loss": 0.1793, "step": 2837 }, { "epoch": 17.629283489096572, "grad_norm": 3.5625, "learning_rate": 4.923602495532675e-05, "loss": 0.2995, "step": 2838 }, { "epoch": 17.635514018691588, "grad_norm": 3.234375, "learning_rate": 4.923542041616312e-05, "loss": 0.3845, "step": 2839 }, { "epoch": 17.641744548286603, "grad_norm": 2.90625, "learning_rate": 4.923481564162025e-05, "loss": 0.2516, "step": 2840 }, { "epoch": 17.64797507788162, "grad_norm": 3.015625, "learning_rate": 4.923421063170399e-05, "loss": 0.296, "step": 2841 }, { "epoch": 17.654205607476637, "grad_norm": 2.96875, "learning_rate": 4.923360538642025e-05, "loss": 0.2057, "step": 2842 }, { "epoch": 17.660436137071652, "grad_norm": 2.5, "learning_rate": 4.923299990577488e-05, "loss": 0.2092, "step": 2843 }, { "epoch": 17.666666666666668, "grad_norm": 2.953125, "learning_rate": 4.9232394189773764e-05, "loss": 0.3477, "step": 2844 }, { "epoch": 17.672897196261683, "grad_norm": 2.765625, "learning_rate": 4.9231788238422806e-05, "loss": 0.262, "step": 2845 }, { "epoch": 17.6791277258567, "grad_norm": 2.34375, "learning_rate": 4.923118205172786e-05, "loss": 0.1704, "step": 2846 }, { "epoch": 17.685358255451714, "grad_norm": 3.484375, "learning_rate": 4.9230575629694844e-05, "loss": 0.4988, "step": 2847 }, { "epoch": 17.69158878504673, "grad_norm": 3.421875, "learning_rate": 4.922996897232963e-05, "loss": 0.2841, "step": 2848 }, { "epoch": 17.697819314641745, "grad_norm": 1.671875, "learning_rate": 4.922936207963812e-05, "loss": 0.1717, "step": 2849 }, { "epoch": 17.70404984423676, "grad_norm": 2.4375, "learning_rate": 4.9228754951626195e-05, "loss": 0.2229, "step": 2850 }, { "epoch": 17.710280373831775, "grad_norm": 3.015625, "learning_rate": 4.922814758829975e-05, "loss": 0.236, "step": 2851 }, { "epoch": 17.71651090342679, "grad_norm": 4.1875, "learning_rate": 4.9227539989664706e-05, "loss": 0.4678, "step": 2852 }, { "epoch": 17.722741433021806, "grad_norm": 3.296875, "learning_rate": 4.922693215572695e-05, "loss": 0.2051, "step": 2853 }, { "epoch": 17.72897196261682, "grad_norm": 2.859375, "learning_rate": 4.922632408649238e-05, "loss": 0.3035, "step": 2854 }, { "epoch": 17.735202492211837, "grad_norm": 2.796875, "learning_rate": 4.922571578196692e-05, "loss": 0.2543, "step": 2855 }, { "epoch": 17.741433021806852, "grad_norm": 1.7578125, "learning_rate": 4.922510724215646e-05, "loss": 0.1922, "step": 2856 }, { "epoch": 17.747663551401867, "grad_norm": 2.703125, "learning_rate": 4.922449846706691e-05, "loss": 0.3394, "step": 2857 }, { "epoch": 17.753894080996886, "grad_norm": 1.984375, "learning_rate": 4.9223889456704185e-05, "loss": 0.1883, "step": 2858 }, { "epoch": 17.7601246105919, "grad_norm": 2.984375, "learning_rate": 4.9223280211074206e-05, "loss": 0.2075, "step": 2859 }, { "epoch": 17.766355140186917, "grad_norm": 3.109375, "learning_rate": 4.92226707301829e-05, "loss": 0.2581, "step": 2860 }, { "epoch": 17.772585669781932, "grad_norm": 2.203125, "learning_rate": 4.922206101403616e-05, "loss": 0.1965, "step": 2861 }, { "epoch": 17.778816199376948, "grad_norm": 3.265625, "learning_rate": 4.9221451062639925e-05, "loss": 0.4263, "step": 2862 }, { "epoch": 17.785046728971963, "grad_norm": 2.9375, "learning_rate": 4.9220840876000116e-05, "loss": 0.3165, "step": 2863 }, { "epoch": 17.79127725856698, "grad_norm": 2.734375, "learning_rate": 4.922023045412266e-05, "loss": 0.2207, "step": 2864 }, { "epoch": 17.797507788161994, "grad_norm": 4.71875, "learning_rate": 4.921961979701348e-05, "loss": 0.2642, "step": 2865 }, { "epoch": 17.80373831775701, "grad_norm": 3.109375, "learning_rate": 4.921900890467852e-05, "loss": 0.201, "step": 2866 }, { "epoch": 17.809968847352025, "grad_norm": 2.5625, "learning_rate": 4.92183977771237e-05, "loss": 0.1998, "step": 2867 }, { "epoch": 17.81619937694704, "grad_norm": 3.046875, "learning_rate": 4.921778641435496e-05, "loss": 0.3943, "step": 2868 }, { "epoch": 17.822429906542055, "grad_norm": 5.40625, "learning_rate": 4.921717481637823e-05, "loss": 0.2095, "step": 2869 }, { "epoch": 17.82866043613707, "grad_norm": 4.46875, "learning_rate": 4.9216562983199465e-05, "loss": 0.2598, "step": 2870 }, { "epoch": 17.834890965732086, "grad_norm": 4.125, "learning_rate": 4.921595091482461e-05, "loss": 0.34, "step": 2871 }, { "epoch": 17.8411214953271, "grad_norm": 2.765625, "learning_rate": 4.921533861125958e-05, "loss": 0.2603, "step": 2872 }, { "epoch": 17.847352024922117, "grad_norm": 3.84375, "learning_rate": 4.921472607251034e-05, "loss": 0.2473, "step": 2873 }, { "epoch": 17.853582554517136, "grad_norm": 5.09375, "learning_rate": 4.921411329858285e-05, "loss": 0.3327, "step": 2874 }, { "epoch": 17.85981308411215, "grad_norm": 3.046875, "learning_rate": 4.921350028948305e-05, "loss": 0.2884, "step": 2875 }, { "epoch": 17.866043613707166, "grad_norm": 2.859375, "learning_rate": 4.921288704521689e-05, "loss": 0.3332, "step": 2876 }, { "epoch": 17.87227414330218, "grad_norm": 4.125, "learning_rate": 4.9212273565790344e-05, "loss": 0.3603, "step": 2877 }, { "epoch": 17.878504672897197, "grad_norm": 4.8125, "learning_rate": 4.9211659851209344e-05, "loss": 0.4616, "step": 2878 }, { "epoch": 17.884735202492212, "grad_norm": 2.75, "learning_rate": 4.921104590147987e-05, "loss": 0.2946, "step": 2879 }, { "epoch": 17.890965732087228, "grad_norm": 2.9375, "learning_rate": 4.9210431716607875e-05, "loss": 0.223, "step": 2880 }, { "epoch": 17.897196261682243, "grad_norm": 3.25, "learning_rate": 4.920981729659933e-05, "loss": 0.3195, "step": 2881 }, { "epoch": 17.90342679127726, "grad_norm": 3.5625, "learning_rate": 4.92092026414602e-05, "loss": 0.1549, "step": 2882 }, { "epoch": 17.909657320872274, "grad_norm": 3.125, "learning_rate": 4.9208587751196454e-05, "loss": 0.2079, "step": 2883 }, { "epoch": 17.91588785046729, "grad_norm": 2.859375, "learning_rate": 4.920797262581406e-05, "loss": 0.321, "step": 2884 }, { "epoch": 17.922118380062305, "grad_norm": 2.125, "learning_rate": 4.9207357265319005e-05, "loss": 0.1685, "step": 2885 }, { "epoch": 17.92834890965732, "grad_norm": 5.21875, "learning_rate": 4.920674166971725e-05, "loss": 0.1873, "step": 2886 }, { "epoch": 17.934579439252335, "grad_norm": 4.90625, "learning_rate": 4.920612583901479e-05, "loss": 0.2968, "step": 2887 }, { "epoch": 17.94080996884735, "grad_norm": 4.0, "learning_rate": 4.920550977321759e-05, "loss": 0.2625, "step": 2888 }, { "epoch": 17.947040498442366, "grad_norm": 2.140625, "learning_rate": 4.920489347233164e-05, "loss": 0.2059, "step": 2889 }, { "epoch": 17.953271028037385, "grad_norm": 2.421875, "learning_rate": 4.9204276936362926e-05, "loss": 0.2128, "step": 2890 }, { "epoch": 17.9595015576324, "grad_norm": 3.546875, "learning_rate": 4.920366016531744e-05, "loss": 0.1785, "step": 2891 }, { "epoch": 17.965732087227416, "grad_norm": 5.0625, "learning_rate": 4.9203043159201164e-05, "loss": 0.4622, "step": 2892 }, { "epoch": 17.97196261682243, "grad_norm": 3.0625, "learning_rate": 4.92024259180201e-05, "loss": 0.2589, "step": 2893 }, { "epoch": 17.978193146417446, "grad_norm": 3.640625, "learning_rate": 4.920180844178024e-05, "loss": 0.369, "step": 2894 }, { "epoch": 17.98442367601246, "grad_norm": 2.84375, "learning_rate": 4.920119073048757e-05, "loss": 0.1833, "step": 2895 }, { "epoch": 17.990654205607477, "grad_norm": 4.0625, "learning_rate": 4.9200572784148104e-05, "loss": 0.409, "step": 2896 }, { "epoch": 17.996884735202492, "grad_norm": 1.796875, "learning_rate": 4.9199954602767826e-05, "loss": 0.1594, "step": 2897 }, { "epoch": 18.0, "grad_norm": 2.125, "learning_rate": 4.9199336186352764e-05, "loss": 0.2071, "step": 2898 }, { "epoch": 18.006230529595015, "grad_norm": 3.65625, "learning_rate": 4.919871753490891e-05, "loss": 0.3127, "step": 2899 }, { "epoch": 18.01246105919003, "grad_norm": 3.453125, "learning_rate": 4.919809864844227e-05, "loss": 0.2632, "step": 2900 }, { "epoch": 18.018691588785046, "grad_norm": 2.9375, "learning_rate": 4.919747952695886e-05, "loss": 0.1978, "step": 2901 }, { "epoch": 18.02492211838006, "grad_norm": 2.421875, "learning_rate": 4.91968601704647e-05, "loss": 0.1649, "step": 2902 }, { "epoch": 18.031152647975077, "grad_norm": 4.6875, "learning_rate": 4.919624057896579e-05, "loss": 0.2198, "step": 2903 }, { "epoch": 18.037383177570092, "grad_norm": 5.15625, "learning_rate": 4.919562075246815e-05, "loss": 0.3934, "step": 2904 }, { "epoch": 18.043613707165107, "grad_norm": 3.34375, "learning_rate": 4.919500069097781e-05, "loss": 0.2859, "step": 2905 }, { "epoch": 18.049844236760123, "grad_norm": 2.875, "learning_rate": 4.919438039450078e-05, "loss": 0.3524, "step": 2906 }, { "epoch": 18.05607476635514, "grad_norm": 4.0, "learning_rate": 4.91937598630431e-05, "loss": 0.3872, "step": 2907 }, { "epoch": 18.062305295950157, "grad_norm": 3.078125, "learning_rate": 4.9193139096610784e-05, "loss": 0.2265, "step": 2908 }, { "epoch": 18.068535825545172, "grad_norm": 2.71875, "learning_rate": 4.9192518095209874e-05, "loss": 0.268, "step": 2909 }, { "epoch": 18.074766355140188, "grad_norm": 2.03125, "learning_rate": 4.9191896858846375e-05, "loss": 0.1643, "step": 2910 }, { "epoch": 18.080996884735203, "grad_norm": 2.375, "learning_rate": 4.9191275387526345e-05, "loss": 0.3074, "step": 2911 }, { "epoch": 18.08722741433022, "grad_norm": 2.6875, "learning_rate": 4.919065368125582e-05, "loss": 0.1921, "step": 2912 }, { "epoch": 18.093457943925234, "grad_norm": 2.890625, "learning_rate": 4.919003174004083e-05, "loss": 0.2554, "step": 2913 }, { "epoch": 18.09968847352025, "grad_norm": 2.421875, "learning_rate": 4.918940956388741e-05, "loss": 0.2085, "step": 2914 }, { "epoch": 18.105919003115265, "grad_norm": 2.734375, "learning_rate": 4.918878715280161e-05, "loss": 0.2107, "step": 2915 }, { "epoch": 18.11214953271028, "grad_norm": 2.390625, "learning_rate": 4.9188164506789475e-05, "loss": 0.2349, "step": 2916 }, { "epoch": 18.118380062305295, "grad_norm": 2.578125, "learning_rate": 4.918754162585705e-05, "loss": 0.2781, "step": 2917 }, { "epoch": 18.12461059190031, "grad_norm": 3.046875, "learning_rate": 4.918691851001039e-05, "loss": 0.2659, "step": 2918 }, { "epoch": 18.130841121495326, "grad_norm": 2.71875, "learning_rate": 4.9186295159255537e-05, "loss": 0.3332, "step": 2919 }, { "epoch": 18.13707165109034, "grad_norm": 3.8125, "learning_rate": 4.9185671573598555e-05, "loss": 0.3785, "step": 2920 }, { "epoch": 18.143302180685357, "grad_norm": 2.921875, "learning_rate": 4.9185047753045486e-05, "loss": 0.2174, "step": 2921 }, { "epoch": 18.149532710280372, "grad_norm": 4.5625, "learning_rate": 4.918442369760241e-05, "loss": 0.362, "step": 2922 }, { "epoch": 18.15576323987539, "grad_norm": 3.421875, "learning_rate": 4.9183799407275375e-05, "loss": 0.271, "step": 2923 }, { "epoch": 18.161993769470406, "grad_norm": 3.25, "learning_rate": 4.918317488207045e-05, "loss": 0.3347, "step": 2924 }, { "epoch": 18.16822429906542, "grad_norm": 4.25, "learning_rate": 4.9182550121993685e-05, "loss": 0.3114, "step": 2925 }, { "epoch": 18.174454828660437, "grad_norm": 4.40625, "learning_rate": 4.918192512705117e-05, "loss": 0.2701, "step": 2926 }, { "epoch": 18.180685358255452, "grad_norm": 2.609375, "learning_rate": 4.918129989724896e-05, "loss": 0.2887, "step": 2927 }, { "epoch": 18.186915887850468, "grad_norm": 3.234375, "learning_rate": 4.918067443259313e-05, "loss": 0.1823, "step": 2928 }, { "epoch": 18.193146417445483, "grad_norm": 2.484375, "learning_rate": 4.9180048733089755e-05, "loss": 0.2061, "step": 2929 }, { "epoch": 18.1993769470405, "grad_norm": 2.703125, "learning_rate": 4.917942279874491e-05, "loss": 0.1884, "step": 2930 }, { "epoch": 18.205607476635514, "grad_norm": 2.65625, "learning_rate": 4.917879662956469e-05, "loss": 0.2231, "step": 2931 }, { "epoch": 18.21183800623053, "grad_norm": 3.0, "learning_rate": 4.917817022555515e-05, "loss": 0.2475, "step": 2932 }, { "epoch": 18.218068535825545, "grad_norm": 1.9609375, "learning_rate": 4.9177543586722395e-05, "loss": 0.2122, "step": 2933 }, { "epoch": 18.22429906542056, "grad_norm": 1.8828125, "learning_rate": 4.917691671307251e-05, "loss": 0.1879, "step": 2934 }, { "epoch": 18.230529595015575, "grad_norm": 2.5625, "learning_rate": 4.917628960461157e-05, "loss": 0.4146, "step": 2935 }, { "epoch": 18.23676012461059, "grad_norm": 2.71875, "learning_rate": 4.917566226134567e-05, "loss": 0.2922, "step": 2936 }, { "epoch": 18.242990654205606, "grad_norm": 2.484375, "learning_rate": 4.917503468328091e-05, "loss": 0.1831, "step": 2937 }, { "epoch": 18.24922118380062, "grad_norm": 5.0, "learning_rate": 4.917440687042338e-05, "loss": 0.3492, "step": 2938 }, { "epoch": 18.25545171339564, "grad_norm": 4.0, "learning_rate": 4.917377882277917e-05, "loss": 0.1795, "step": 2939 }, { "epoch": 18.261682242990656, "grad_norm": 3.796875, "learning_rate": 4.91731505403544e-05, "loss": 0.377, "step": 2940 }, { "epoch": 18.26791277258567, "grad_norm": 2.015625, "learning_rate": 4.9172522023155154e-05, "loss": 0.2424, "step": 2941 }, { "epoch": 18.274143302180686, "grad_norm": 3.234375, "learning_rate": 4.917189327118754e-05, "loss": 0.2713, "step": 2942 }, { "epoch": 18.2803738317757, "grad_norm": 2.984375, "learning_rate": 4.9171264284457675e-05, "loss": 0.3027, "step": 2943 }, { "epoch": 18.286604361370717, "grad_norm": 3.03125, "learning_rate": 4.917063506297165e-05, "loss": 0.2797, "step": 2944 }, { "epoch": 18.292834890965732, "grad_norm": 2.3125, "learning_rate": 4.9170005606735594e-05, "loss": 0.1817, "step": 2945 }, { "epoch": 18.299065420560748, "grad_norm": 3.90625, "learning_rate": 4.9169375915755606e-05, "loss": 0.4777, "step": 2946 }, { "epoch": 18.305295950155763, "grad_norm": 2.5625, "learning_rate": 4.916874599003781e-05, "loss": 0.2283, "step": 2947 }, { "epoch": 18.31152647975078, "grad_norm": 3.1875, "learning_rate": 4.9168115829588326e-05, "loss": 0.3718, "step": 2948 }, { "epoch": 18.317757009345794, "grad_norm": 2.5625, "learning_rate": 4.916748543441326e-05, "loss": 0.191, "step": 2949 }, { "epoch": 18.32398753894081, "grad_norm": 3.078125, "learning_rate": 4.916685480451875e-05, "loss": 0.3864, "step": 2950 }, { "epoch": 18.330218068535824, "grad_norm": 2.609375, "learning_rate": 4.9166223939910916e-05, "loss": 0.2551, "step": 2951 }, { "epoch": 18.33644859813084, "grad_norm": 1.8046875, "learning_rate": 4.9165592840595884e-05, "loss": 0.1738, "step": 2952 }, { "epoch": 18.342679127725855, "grad_norm": 3.140625, "learning_rate": 4.916496150657978e-05, "loss": 0.1974, "step": 2953 }, { "epoch": 18.34890965732087, "grad_norm": 2.421875, "learning_rate": 4.916432993786874e-05, "loss": 0.2419, "step": 2954 }, { "epoch": 18.35514018691589, "grad_norm": 4.0625, "learning_rate": 4.9163698134468906e-05, "loss": 0.4168, "step": 2955 }, { "epoch": 18.361370716510905, "grad_norm": 1.9375, "learning_rate": 4.91630660963864e-05, "loss": 0.1581, "step": 2956 }, { "epoch": 18.36760124610592, "grad_norm": 3.640625, "learning_rate": 4.9162433823627366e-05, "loss": 0.4956, "step": 2957 }, { "epoch": 18.373831775700936, "grad_norm": 2.90625, "learning_rate": 4.9161801316197945e-05, "loss": 0.3581, "step": 2958 }, { "epoch": 18.38006230529595, "grad_norm": 3.1875, "learning_rate": 4.916116857410427e-05, "loss": 0.2187, "step": 2959 }, { "epoch": 18.386292834890966, "grad_norm": 4.1875, "learning_rate": 4.916053559735251e-05, "loss": 0.3659, "step": 2960 }, { "epoch": 18.39252336448598, "grad_norm": 3.53125, "learning_rate": 4.915990238594879e-05, "loss": 0.3584, "step": 2961 }, { "epoch": 18.398753894080997, "grad_norm": 2.796875, "learning_rate": 4.915926893989927e-05, "loss": 0.4133, "step": 2962 }, { "epoch": 18.404984423676012, "grad_norm": 4.59375, "learning_rate": 4.91586352592101e-05, "loss": 0.2957, "step": 2963 }, { "epoch": 18.411214953271028, "grad_norm": 2.65625, "learning_rate": 4.915800134388744e-05, "loss": 0.1869, "step": 2964 }, { "epoch": 18.417445482866043, "grad_norm": 3.75, "learning_rate": 4.9157367193937444e-05, "loss": 0.2517, "step": 2965 }, { "epoch": 18.42367601246106, "grad_norm": 3.1875, "learning_rate": 4.9156732809366264e-05, "loss": 0.1954, "step": 2966 }, { "epoch": 18.429906542056074, "grad_norm": 1.9375, "learning_rate": 4.915609819018007e-05, "loss": 0.1807, "step": 2967 }, { "epoch": 18.43613707165109, "grad_norm": 2.3125, "learning_rate": 4.915546333638501e-05, "loss": 0.2569, "step": 2968 }, { "epoch": 18.442367601246104, "grad_norm": 4.125, "learning_rate": 4.9154828247987275e-05, "loss": 0.3599, "step": 2969 }, { "epoch": 18.44859813084112, "grad_norm": 1.9453125, "learning_rate": 4.9154192924993015e-05, "loss": 0.1647, "step": 2970 }, { "epoch": 18.45482866043614, "grad_norm": 2.265625, "learning_rate": 4.915355736740841e-05, "loss": 0.3072, "step": 2971 }, { "epoch": 18.461059190031154, "grad_norm": 2.53125, "learning_rate": 4.9152921575239616e-05, "loss": 0.1734, "step": 2972 }, { "epoch": 18.46728971962617, "grad_norm": 3.078125, "learning_rate": 4.915228554849283e-05, "loss": 0.2458, "step": 2973 }, { "epoch": 18.473520249221185, "grad_norm": 4.3125, "learning_rate": 4.915164928717421e-05, "loss": 0.263, "step": 2974 }, { "epoch": 18.4797507788162, "grad_norm": 2.546875, "learning_rate": 4.915101279128995e-05, "loss": 0.2047, "step": 2975 }, { "epoch": 18.485981308411215, "grad_norm": 3.0625, "learning_rate": 4.915037606084622e-05, "loss": 0.2857, "step": 2976 }, { "epoch": 18.49221183800623, "grad_norm": 3.28125, "learning_rate": 4.914973909584922e-05, "loss": 0.2937, "step": 2977 }, { "epoch": 18.498442367601246, "grad_norm": 4.90625, "learning_rate": 4.9149101896305114e-05, "loss": 0.3266, "step": 2978 }, { "epoch": 18.50467289719626, "grad_norm": 2.546875, "learning_rate": 4.914846446222012e-05, "loss": 0.3072, "step": 2979 }, { "epoch": 18.510903426791277, "grad_norm": 3.1875, "learning_rate": 4.9147826793600396e-05, "loss": 0.2843, "step": 2980 }, { "epoch": 18.517133956386292, "grad_norm": 2.828125, "learning_rate": 4.9147188890452164e-05, "loss": 0.2027, "step": 2981 }, { "epoch": 18.523364485981308, "grad_norm": 3.78125, "learning_rate": 4.91465507527816e-05, "loss": 0.2264, "step": 2982 }, { "epoch": 18.529595015576323, "grad_norm": 1.9765625, "learning_rate": 4.914591238059491e-05, "loss": 0.1737, "step": 2983 }, { "epoch": 18.53582554517134, "grad_norm": 3.421875, "learning_rate": 4.9145273773898284e-05, "loss": 0.2718, "step": 2984 }, { "epoch": 18.542056074766354, "grad_norm": 1.9140625, "learning_rate": 4.914463493269794e-05, "loss": 0.1724, "step": 2985 }, { "epoch": 18.54828660436137, "grad_norm": 3.03125, "learning_rate": 4.9143995857000074e-05, "loss": 0.248, "step": 2986 }, { "epoch": 18.554517133956388, "grad_norm": 3.40625, "learning_rate": 4.9143356546810895e-05, "loss": 0.2368, "step": 2987 }, { "epoch": 18.560747663551403, "grad_norm": 3.046875, "learning_rate": 4.914271700213661e-05, "loss": 0.2802, "step": 2988 }, { "epoch": 18.56697819314642, "grad_norm": 2.890625, "learning_rate": 4.914207722298343e-05, "loss": 0.2362, "step": 2989 }, { "epoch": 18.573208722741434, "grad_norm": 2.359375, "learning_rate": 4.9141437209357574e-05, "loss": 0.2001, "step": 2990 }, { "epoch": 18.57943925233645, "grad_norm": 2.625, "learning_rate": 4.914079696126526e-05, "loss": 0.2751, "step": 2991 }, { "epoch": 18.585669781931465, "grad_norm": 3.046875, "learning_rate": 4.9140156478712685e-05, "loss": 0.173, "step": 2992 }, { "epoch": 18.59190031152648, "grad_norm": 2.96875, "learning_rate": 4.913951576170609e-05, "loss": 0.2186, "step": 2993 }, { "epoch": 18.598130841121495, "grad_norm": 5.1875, "learning_rate": 4.91388748102517e-05, "loss": 0.4864, "step": 2994 }, { "epoch": 18.60436137071651, "grad_norm": 2.359375, "learning_rate": 4.9138233624355724e-05, "loss": 0.227, "step": 2995 }, { "epoch": 18.610591900311526, "grad_norm": 3.234375, "learning_rate": 4.91375922040244e-05, "loss": 0.3451, "step": 2996 }, { "epoch": 18.61682242990654, "grad_norm": 3.875, "learning_rate": 4.913695054926396e-05, "loss": 0.3827, "step": 2997 }, { "epoch": 18.623052959501557, "grad_norm": 3.25, "learning_rate": 4.913630866008063e-05, "loss": 0.2321, "step": 2998 }, { "epoch": 18.629283489096572, "grad_norm": 2.953125, "learning_rate": 4.913566653648064e-05, "loss": 0.287, "step": 2999 }, { "epoch": 18.635514018691588, "grad_norm": 3.390625, "learning_rate": 4.913502417847024e-05, "loss": 0.403, "step": 3000 }, { "epoch": 18.641744548286603, "grad_norm": 2.609375, "learning_rate": 4.9134381586055655e-05, "loss": 0.165, "step": 3001 }, { "epoch": 18.64797507788162, "grad_norm": 3.96875, "learning_rate": 4.913373875924313e-05, "loss": 0.2816, "step": 3002 }, { "epoch": 18.654205607476637, "grad_norm": 2.625, "learning_rate": 4.913309569803891e-05, "loss": 0.2639, "step": 3003 }, { "epoch": 18.660436137071652, "grad_norm": 4.5, "learning_rate": 4.9132452402449246e-05, "loss": 0.5255, "step": 3004 }, { "epoch": 18.666666666666668, "grad_norm": 3.59375, "learning_rate": 4.9131808872480376e-05, "loss": 0.2493, "step": 3005 }, { "epoch": 18.672897196261683, "grad_norm": 3.28125, "learning_rate": 4.9131165108138554e-05, "loss": 0.2822, "step": 3006 }, { "epoch": 18.6791277258567, "grad_norm": 4.0625, "learning_rate": 4.913052110943004e-05, "loss": 0.296, "step": 3007 }, { "epoch": 18.685358255451714, "grad_norm": 3.796875, "learning_rate": 4.9129876876361076e-05, "loss": 0.5536, "step": 3008 }, { "epoch": 18.69158878504673, "grad_norm": 2.765625, "learning_rate": 4.9129232408937917e-05, "loss": 0.2331, "step": 3009 }, { "epoch": 18.697819314641745, "grad_norm": 3.671875, "learning_rate": 4.912858770716684e-05, "loss": 0.2556, "step": 3010 }, { "epoch": 18.70404984423676, "grad_norm": 2.953125, "learning_rate": 4.9127942771054095e-05, "loss": 0.2954, "step": 3011 }, { "epoch": 18.710280373831775, "grad_norm": 2.125, "learning_rate": 4.912729760060595e-05, "loss": 0.1803, "step": 3012 }, { "epoch": 18.71651090342679, "grad_norm": 3.1875, "learning_rate": 4.912665219582866e-05, "loss": 0.2831, "step": 3013 }, { "epoch": 18.722741433021806, "grad_norm": 4.3125, "learning_rate": 4.91260065567285e-05, "loss": 0.2552, "step": 3014 }, { "epoch": 18.72897196261682, "grad_norm": 2.953125, "learning_rate": 4.9125360683311744e-05, "loss": 0.1543, "step": 3015 }, { "epoch": 18.735202492211837, "grad_norm": 4.5, "learning_rate": 4.9124714575584664e-05, "loss": 0.495, "step": 3016 }, { "epoch": 18.741433021806852, "grad_norm": 3.59375, "learning_rate": 4.912406823355353e-05, "loss": 0.3161, "step": 3017 }, { "epoch": 18.747663551401867, "grad_norm": 3.546875, "learning_rate": 4.9123421657224636e-05, "loss": 0.1965, "step": 3018 }, { "epoch": 18.753894080996886, "grad_norm": 2.578125, "learning_rate": 4.912277484660424e-05, "loss": 0.2182, "step": 3019 }, { "epoch": 18.7601246105919, "grad_norm": 2.65625, "learning_rate": 4.912212780169863e-05, "loss": 0.2618, "step": 3020 }, { "epoch": 18.766355140186917, "grad_norm": 3.09375, "learning_rate": 4.9121480522514085e-05, "loss": 0.3474, "step": 3021 }, { "epoch": 18.772585669781932, "grad_norm": 2.734375, "learning_rate": 4.912083300905691e-05, "loss": 0.2822, "step": 3022 }, { "epoch": 18.778816199376948, "grad_norm": 2.46875, "learning_rate": 4.9120185261333385e-05, "loss": 0.2252, "step": 3023 }, { "epoch": 18.785046728971963, "grad_norm": 4.65625, "learning_rate": 4.911953727934979e-05, "loss": 0.2888, "step": 3024 }, { "epoch": 18.79127725856698, "grad_norm": 2.953125, "learning_rate": 4.9118889063112437e-05, "loss": 0.2881, "step": 3025 }, { "epoch": 18.797507788161994, "grad_norm": 3.0625, "learning_rate": 4.911824061262761e-05, "loss": 0.2903, "step": 3026 }, { "epoch": 18.80373831775701, "grad_norm": 2.46875, "learning_rate": 4.911759192790161e-05, "loss": 0.1872, "step": 3027 }, { "epoch": 18.809968847352025, "grad_norm": 3.734375, "learning_rate": 4.911694300894073e-05, "loss": 0.3924, "step": 3028 }, { "epoch": 18.81619937694704, "grad_norm": 2.828125, "learning_rate": 4.9116293855751285e-05, "loss": 0.2005, "step": 3029 }, { "epoch": 18.822429906542055, "grad_norm": 4.21875, "learning_rate": 4.9115644468339575e-05, "loss": 0.3651, "step": 3030 }, { "epoch": 18.82866043613707, "grad_norm": 2.90625, "learning_rate": 4.91149948467119e-05, "loss": 0.3127, "step": 3031 }, { "epoch": 18.834890965732086, "grad_norm": 2.359375, "learning_rate": 4.911434499087457e-05, "loss": 0.1762, "step": 3032 }, { "epoch": 18.8411214953271, "grad_norm": 5.28125, "learning_rate": 4.9113694900833906e-05, "loss": 0.3746, "step": 3033 }, { "epoch": 18.847352024922117, "grad_norm": 5.96875, "learning_rate": 4.911304457659622e-05, "loss": 0.3736, "step": 3034 }, { "epoch": 18.853582554517136, "grad_norm": 3.21875, "learning_rate": 4.9112394018167825e-05, "loss": 0.2458, "step": 3035 }, { "epoch": 18.85981308411215, "grad_norm": 1.7421875, "learning_rate": 4.911174322555504e-05, "loss": 0.1357, "step": 3036 }, { "epoch": 18.866043613707166, "grad_norm": 3.625, "learning_rate": 4.911109219876417e-05, "loss": 0.3464, "step": 3037 }, { "epoch": 18.87227414330218, "grad_norm": 3.328125, "learning_rate": 4.911044093780157e-05, "loss": 0.1841, "step": 3038 }, { "epoch": 18.878504672897197, "grad_norm": 2.734375, "learning_rate": 4.9109789442673545e-05, "loss": 0.217, "step": 3039 }, { "epoch": 18.884735202492212, "grad_norm": 3.390625, "learning_rate": 4.9109137713386424e-05, "loss": 0.3025, "step": 3040 }, { "epoch": 18.890965732087228, "grad_norm": 2.734375, "learning_rate": 4.9108485749946544e-05, "loss": 0.2448, "step": 3041 }, { "epoch": 18.897196261682243, "grad_norm": 2.390625, "learning_rate": 4.9107833552360215e-05, "loss": 0.2209, "step": 3042 }, { "epoch": 18.90342679127726, "grad_norm": 2.46875, "learning_rate": 4.910718112063381e-05, "loss": 0.251, "step": 3043 }, { "epoch": 18.909657320872274, "grad_norm": 2.71875, "learning_rate": 4.910652845477363e-05, "loss": 0.3247, "step": 3044 }, { "epoch": 18.91588785046729, "grad_norm": 1.6953125, "learning_rate": 4.910587555478602e-05, "loss": 0.1416, "step": 3045 }, { "epoch": 18.922118380062305, "grad_norm": 3.359375, "learning_rate": 4.910522242067733e-05, "loss": 0.3163, "step": 3046 }, { "epoch": 18.92834890965732, "grad_norm": 3.84375, "learning_rate": 4.910456905245391e-05, "loss": 0.4872, "step": 3047 }, { "epoch": 18.934579439252335, "grad_norm": 3.5, "learning_rate": 4.9103915450122094e-05, "loss": 0.2888, "step": 3048 }, { "epoch": 18.94080996884735, "grad_norm": 3.84375, "learning_rate": 4.910326161368823e-05, "loss": 0.3063, "step": 3049 }, { "epoch": 18.947040498442366, "grad_norm": 5.0, "learning_rate": 4.910260754315868e-05, "loss": 0.2125, "step": 3050 }, { "epoch": 18.953271028037385, "grad_norm": 5.1875, "learning_rate": 4.9101953238539775e-05, "loss": 0.5048, "step": 3051 }, { "epoch": 18.9595015576324, "grad_norm": 3.75, "learning_rate": 4.910129869983789e-05, "loss": 0.2099, "step": 3052 }, { "epoch": 18.965732087227416, "grad_norm": 3.203125, "learning_rate": 4.910064392705937e-05, "loss": 0.2839, "step": 3053 }, { "epoch": 18.97196261682243, "grad_norm": 3.765625, "learning_rate": 4.9099988920210585e-05, "loss": 0.2307, "step": 3054 }, { "epoch": 18.978193146417446, "grad_norm": 4.25, "learning_rate": 4.909933367929789e-05, "loss": 0.362, "step": 3055 }, { "epoch": 18.98442367601246, "grad_norm": 4.40625, "learning_rate": 4.909867820432764e-05, "loss": 0.4904, "step": 3056 }, { "epoch": 18.990654205607477, "grad_norm": 3.59375, "learning_rate": 4.909802249530622e-05, "loss": 0.2392, "step": 3057 }, { "epoch": 18.996884735202492, "grad_norm": 4.21875, "learning_rate": 4.909736655223998e-05, "loss": 0.2383, "step": 3058 }, { "epoch": 19.0, "grad_norm": 3.125, "learning_rate": 4.90967103751353e-05, "loss": 0.2975, "step": 3059 }, { "epoch": 19.006230529595015, "grad_norm": 5.125, "learning_rate": 4.909605396399856e-05, "loss": 0.2667, "step": 3060 }, { "epoch": 19.01246105919003, "grad_norm": 2.9375, "learning_rate": 4.909539731883611e-05, "loss": 0.2142, "step": 3061 }, { "epoch": 19.018691588785046, "grad_norm": 2.71875, "learning_rate": 4.909474043965436e-05, "loss": 0.173, "step": 3062 }, { "epoch": 19.02492211838006, "grad_norm": 4.0625, "learning_rate": 4.909408332645967e-05, "loss": 0.3063, "step": 3063 }, { "epoch": 19.031152647975077, "grad_norm": 2.96875, "learning_rate": 4.909342597925843e-05, "loss": 0.1918, "step": 3064 }, { "epoch": 19.037383177570092, "grad_norm": 2.4375, "learning_rate": 4.909276839805702e-05, "loss": 0.2397, "step": 3065 }, { "epoch": 19.043613707165107, "grad_norm": 1.8828125, "learning_rate": 4.909211058286182e-05, "loss": 0.1636, "step": 3066 }, { "epoch": 19.049844236760123, "grad_norm": 1.25, "learning_rate": 4.909145253367924e-05, "loss": 0.1518, "step": 3067 }, { "epoch": 19.05607476635514, "grad_norm": 2.390625, "learning_rate": 4.9090794250515643e-05, "loss": 0.3124, "step": 3068 }, { "epoch": 19.062305295950157, "grad_norm": 3.328125, "learning_rate": 4.909013573337744e-05, "loss": 0.3117, "step": 3069 }, { "epoch": 19.068535825545172, "grad_norm": 3.265625, "learning_rate": 4.9089476982271026e-05, "loss": 0.2121, "step": 3070 }, { "epoch": 19.074766355140188, "grad_norm": 2.171875, "learning_rate": 4.90888179972028e-05, "loss": 0.1594, "step": 3071 }, { "epoch": 19.080996884735203, "grad_norm": 3.84375, "learning_rate": 4.908815877817915e-05, "loss": 0.2906, "step": 3072 }, { "epoch": 19.08722741433022, "grad_norm": 1.9140625, "learning_rate": 4.9087499325206494e-05, "loss": 0.2148, "step": 3073 }, { "epoch": 19.093457943925234, "grad_norm": 2.078125, "learning_rate": 4.9086839638291224e-05, "loss": 0.1585, "step": 3074 }, { "epoch": 19.09968847352025, "grad_norm": 2.703125, "learning_rate": 4.908617971743976e-05, "loss": 0.2525, "step": 3075 }, { "epoch": 19.105919003115265, "grad_norm": 2.390625, "learning_rate": 4.9085519562658496e-05, "loss": 0.1505, "step": 3076 }, { "epoch": 19.11214953271028, "grad_norm": 3.078125, "learning_rate": 4.908485917395385e-05, "loss": 0.2155, "step": 3077 }, { "epoch": 19.118380062305295, "grad_norm": 3.578125, "learning_rate": 4.9084198551332236e-05, "loss": 0.3814, "step": 3078 }, { "epoch": 19.12461059190031, "grad_norm": 3.875, "learning_rate": 4.9083537694800086e-05, "loss": 0.3142, "step": 3079 }, { "epoch": 19.130841121495326, "grad_norm": 2.796875, "learning_rate": 4.908287660436379e-05, "loss": 0.2386, "step": 3080 }, { "epoch": 19.13707165109034, "grad_norm": 3.046875, "learning_rate": 4.908221528002979e-05, "loss": 0.3296, "step": 3081 }, { "epoch": 19.143302180685357, "grad_norm": 3.03125, "learning_rate": 4.90815537218045e-05, "loss": 0.3576, "step": 3082 }, { "epoch": 19.149532710280372, "grad_norm": 2.140625, "learning_rate": 4.908089192969434e-05, "loss": 0.2233, "step": 3083 }, { "epoch": 19.15576323987539, "grad_norm": 3.703125, "learning_rate": 4.908022990370575e-05, "loss": 0.4476, "step": 3084 }, { "epoch": 19.161993769470406, "grad_norm": 3.453125, "learning_rate": 4.9079567643845146e-05, "loss": 0.2888, "step": 3085 }, { "epoch": 19.16822429906542, "grad_norm": 3.359375, "learning_rate": 4.907890515011898e-05, "loss": 0.2332, "step": 3086 }, { "epoch": 19.174454828660437, "grad_norm": 3.765625, "learning_rate": 4.907824242253367e-05, "loss": 0.3619, "step": 3087 }, { "epoch": 19.180685358255452, "grad_norm": 2.953125, "learning_rate": 4.907757946109565e-05, "loss": 0.3148, "step": 3088 }, { "epoch": 19.186915887850468, "grad_norm": 2.953125, "learning_rate": 4.907691626581137e-05, "loss": 0.3961, "step": 3089 }, { "epoch": 19.193146417445483, "grad_norm": 3.875, "learning_rate": 4.9076252836687265e-05, "loss": 0.4918, "step": 3090 }, { "epoch": 19.1993769470405, "grad_norm": 2.859375, "learning_rate": 4.907558917372978e-05, "loss": 0.276, "step": 3091 }, { "epoch": 19.205607476635514, "grad_norm": 3.140625, "learning_rate": 4.907492527694536e-05, "loss": 0.2792, "step": 3092 }, { "epoch": 19.21183800623053, "grad_norm": 2.96875, "learning_rate": 4.907426114634046e-05, "loss": 0.2226, "step": 3093 }, { "epoch": 19.218068535825545, "grad_norm": 2.671875, "learning_rate": 4.907359678192152e-05, "loss": 0.3387, "step": 3094 }, { "epoch": 19.22429906542056, "grad_norm": 3.84375, "learning_rate": 4.907293218369499e-05, "loss": 0.3505, "step": 3095 }, { "epoch": 19.230529595015575, "grad_norm": 3.1875, "learning_rate": 4.907226735166733e-05, "loss": 0.2482, "step": 3096 }, { "epoch": 19.23676012461059, "grad_norm": 1.8671875, "learning_rate": 4.9071602285844995e-05, "loss": 0.1964, "step": 3097 }, { "epoch": 19.242990654205606, "grad_norm": 3.390625, "learning_rate": 4.907093698623446e-05, "loss": 0.1602, "step": 3098 }, { "epoch": 19.24922118380062, "grad_norm": 3.0, "learning_rate": 4.907027145284217e-05, "loss": 0.3094, "step": 3099 }, { "epoch": 19.25545171339564, "grad_norm": 2.78125, "learning_rate": 4.906960568567458e-05, "loss": 0.1831, "step": 3100 }, { "epoch": 19.261682242990656, "grad_norm": 3.28125, "learning_rate": 4.906893968473818e-05, "loss": 0.3772, "step": 3101 }, { "epoch": 19.26791277258567, "grad_norm": 2.734375, "learning_rate": 4.906827345003942e-05, "loss": 0.2361, "step": 3102 }, { "epoch": 19.274143302180686, "grad_norm": 3.390625, "learning_rate": 4.906760698158478e-05, "loss": 0.3612, "step": 3103 }, { "epoch": 19.2803738317757, "grad_norm": 3.84375, "learning_rate": 4.906694027938074e-05, "loss": 0.3152, "step": 3104 }, { "epoch": 19.286604361370717, "grad_norm": 2.34375, "learning_rate": 4.9066273343433745e-05, "loss": 0.2179, "step": 3105 }, { "epoch": 19.292834890965732, "grad_norm": 2.875, "learning_rate": 4.90656061737503e-05, "loss": 0.2598, "step": 3106 }, { "epoch": 19.299065420560748, "grad_norm": 2.328125, "learning_rate": 4.906493877033689e-05, "loss": 0.2962, "step": 3107 }, { "epoch": 19.305295950155763, "grad_norm": 2.140625, "learning_rate": 4.906427113319997e-05, "loss": 0.1802, "step": 3108 }, { "epoch": 19.31152647975078, "grad_norm": 2.78125, "learning_rate": 4.9063603262346043e-05, "loss": 0.1506, "step": 3109 }, { "epoch": 19.317757009345794, "grad_norm": 2.78125, "learning_rate": 4.906293515778159e-05, "loss": 0.1762, "step": 3110 }, { "epoch": 19.32398753894081, "grad_norm": 4.25, "learning_rate": 4.9062266819513104e-05, "loss": 0.3501, "step": 3111 }, { "epoch": 19.330218068535824, "grad_norm": 2.859375, "learning_rate": 4.9061598247547065e-05, "loss": 0.1877, "step": 3112 }, { "epoch": 19.33644859813084, "grad_norm": 3.359375, "learning_rate": 4.906092944188998e-05, "loss": 0.3526, "step": 3113 }, { "epoch": 19.342679127725855, "grad_norm": 3.28125, "learning_rate": 4.9060260402548335e-05, "loss": 0.2614, "step": 3114 }, { "epoch": 19.34890965732087, "grad_norm": 4.15625, "learning_rate": 4.905959112952864e-05, "loss": 0.3874, "step": 3115 }, { "epoch": 19.35514018691589, "grad_norm": 2.25, "learning_rate": 4.905892162283737e-05, "loss": 0.1889, "step": 3116 }, { "epoch": 19.361370716510905, "grad_norm": 3.203125, "learning_rate": 4.905825188248106e-05, "loss": 0.2606, "step": 3117 }, { "epoch": 19.36760124610592, "grad_norm": 2.09375, "learning_rate": 4.905758190846619e-05, "loss": 0.1775, "step": 3118 }, { "epoch": 19.373831775700936, "grad_norm": 3.046875, "learning_rate": 4.905691170079928e-05, "loss": 0.1741, "step": 3119 }, { "epoch": 19.38006230529595, "grad_norm": 3.609375, "learning_rate": 4.9056241259486834e-05, "loss": 0.2542, "step": 3120 }, { "epoch": 19.386292834890966, "grad_norm": 3.9375, "learning_rate": 4.905557058453536e-05, "loss": 0.4012, "step": 3121 }, { "epoch": 19.39252336448598, "grad_norm": 3.8125, "learning_rate": 4.905489967595138e-05, "loss": 0.36, "step": 3122 }, { "epoch": 19.398753894080997, "grad_norm": 2.515625, "learning_rate": 4.905422853374141e-05, "loss": 0.2138, "step": 3123 }, { "epoch": 19.404984423676012, "grad_norm": 4.25, "learning_rate": 4.9053557157911956e-05, "loss": 0.2211, "step": 3124 }, { "epoch": 19.411214953271028, "grad_norm": 2.5, "learning_rate": 4.905288554846955e-05, "loss": 0.1589, "step": 3125 }, { "epoch": 19.417445482866043, "grad_norm": 3.3125, "learning_rate": 4.9052213705420714e-05, "loss": 0.2433, "step": 3126 }, { "epoch": 19.42367601246106, "grad_norm": 3.359375, "learning_rate": 4.9051541628771966e-05, "loss": 0.3605, "step": 3127 }, { "epoch": 19.429906542056074, "grad_norm": 4.21875, "learning_rate": 4.905086931852983e-05, "loss": 0.1968, "step": 3128 }, { "epoch": 19.43613707165109, "grad_norm": 5.40625, "learning_rate": 4.905019677470086e-05, "loss": 0.2688, "step": 3129 }, { "epoch": 19.442367601246104, "grad_norm": 3.546875, "learning_rate": 4.9049523997291566e-05, "loss": 0.4281, "step": 3130 }, { "epoch": 19.44859813084112, "grad_norm": 2.375, "learning_rate": 4.904885098630848e-05, "loss": 0.2377, "step": 3131 }, { "epoch": 19.45482866043614, "grad_norm": 3.328125, "learning_rate": 4.9048177741758156e-05, "loss": 0.2117, "step": 3132 }, { "epoch": 19.461059190031154, "grad_norm": 4.75, "learning_rate": 4.904750426364712e-05, "loss": 0.3542, "step": 3133 }, { "epoch": 19.46728971962617, "grad_norm": 4.21875, "learning_rate": 4.904683055198192e-05, "loss": 0.2113, "step": 3134 }, { "epoch": 19.473520249221185, "grad_norm": 2.34375, "learning_rate": 4.904615660676908e-05, "loss": 0.1973, "step": 3135 }, { "epoch": 19.4797507788162, "grad_norm": 4.0625, "learning_rate": 4.9045482428015175e-05, "loss": 0.203, "step": 3136 }, { "epoch": 19.485981308411215, "grad_norm": 2.15625, "learning_rate": 4.904480801572673e-05, "loss": 0.1692, "step": 3137 }, { "epoch": 19.49221183800623, "grad_norm": 2.6875, "learning_rate": 4.9044133369910303e-05, "loss": 0.238, "step": 3138 }, { "epoch": 19.498442367601246, "grad_norm": 2.625, "learning_rate": 4.9043458490572444e-05, "loss": 0.2145, "step": 3139 }, { "epoch": 19.50467289719626, "grad_norm": 2.484375, "learning_rate": 4.904278337771972e-05, "loss": 0.1962, "step": 3140 }, { "epoch": 19.510903426791277, "grad_norm": 2.8125, "learning_rate": 4.9042108031358675e-05, "loss": 0.2008, "step": 3141 }, { "epoch": 19.517133956386292, "grad_norm": 3.515625, "learning_rate": 4.9041432451495864e-05, "loss": 0.2027, "step": 3142 }, { "epoch": 19.523364485981308, "grad_norm": 3.515625, "learning_rate": 4.9040756638137864e-05, "loss": 0.3571, "step": 3143 }, { "epoch": 19.529595015576323, "grad_norm": 1.8515625, "learning_rate": 4.904008059129123e-05, "loss": 0.1644, "step": 3144 }, { "epoch": 19.53582554517134, "grad_norm": 2.109375, "learning_rate": 4.903940431096252e-05, "loss": 0.1999, "step": 3145 }, { "epoch": 19.542056074766354, "grad_norm": 2.65625, "learning_rate": 4.9038727797158315e-05, "loss": 0.2853, "step": 3146 }, { "epoch": 19.54828660436137, "grad_norm": 2.75, "learning_rate": 4.903805104988518e-05, "loss": 0.2212, "step": 3147 }, { "epoch": 19.554517133956388, "grad_norm": 3.015625, "learning_rate": 4.903737406914968e-05, "loss": 0.3264, "step": 3148 }, { "epoch": 19.560747663551403, "grad_norm": 2.65625, "learning_rate": 4.903669685495841e-05, "loss": 0.2419, "step": 3149 }, { "epoch": 19.56697819314642, "grad_norm": 2.1875, "learning_rate": 4.9036019407317925e-05, "loss": 0.2386, "step": 3150 }, { "epoch": 19.573208722741434, "grad_norm": 2.296875, "learning_rate": 4.903534172623482e-05, "loss": 0.2133, "step": 3151 }, { "epoch": 19.57943925233645, "grad_norm": 3.1875, "learning_rate": 4.903466381171568e-05, "loss": 0.2937, "step": 3152 }, { "epoch": 19.585669781931465, "grad_norm": 2.609375, "learning_rate": 4.9033985663767067e-05, "loss": 0.347, "step": 3153 }, { "epoch": 19.59190031152648, "grad_norm": 2.234375, "learning_rate": 4.903330728239558e-05, "loss": 0.2167, "step": 3154 }, { "epoch": 19.598130841121495, "grad_norm": 2.0, "learning_rate": 4.9032628667607813e-05, "loss": 0.1958, "step": 3155 }, { "epoch": 19.60436137071651, "grad_norm": 1.9765625, "learning_rate": 4.9031949819410364e-05, "loss": 0.1614, "step": 3156 }, { "epoch": 19.610591900311526, "grad_norm": 1.6796875, "learning_rate": 4.90312707378098e-05, "loss": 0.2898, "step": 3157 }, { "epoch": 19.61682242990654, "grad_norm": 2.75, "learning_rate": 4.903059142281273e-05, "loss": 0.2484, "step": 3158 }, { "epoch": 19.623052959501557, "grad_norm": 2.125, "learning_rate": 4.902991187442576e-05, "loss": 0.1861, "step": 3159 }, { "epoch": 19.629283489096572, "grad_norm": 2.078125, "learning_rate": 4.9029232092655486e-05, "loss": 0.1869, "step": 3160 }, { "epoch": 19.635514018691588, "grad_norm": 2.234375, "learning_rate": 4.90285520775085e-05, "loss": 0.2283, "step": 3161 }, { "epoch": 19.641744548286603, "grad_norm": 2.453125, "learning_rate": 4.902787182899141e-05, "loss": 0.2545, "step": 3162 }, { "epoch": 19.64797507788162, "grad_norm": 2.25, "learning_rate": 4.9027191347110835e-05, "loss": 0.2364, "step": 3163 }, { "epoch": 19.654205607476637, "grad_norm": 2.453125, "learning_rate": 4.902651063187337e-05, "loss": 0.188, "step": 3164 }, { "epoch": 19.660436137071652, "grad_norm": 3.5625, "learning_rate": 4.902582968328563e-05, "loss": 0.2918, "step": 3165 }, { "epoch": 19.666666666666668, "grad_norm": 3.375, "learning_rate": 4.9025148501354225e-05, "loss": 0.2463, "step": 3166 }, { "epoch": 19.672897196261683, "grad_norm": 4.125, "learning_rate": 4.902446708608578e-05, "loss": 0.371, "step": 3167 }, { "epoch": 19.6791277258567, "grad_norm": 2.40625, "learning_rate": 4.902378543748691e-05, "loss": 0.2177, "step": 3168 }, { "epoch": 19.685358255451714, "grad_norm": 2.0625, "learning_rate": 4.902310355556423e-05, "loss": 0.2102, "step": 3169 }, { "epoch": 19.69158878504673, "grad_norm": 4.9375, "learning_rate": 4.902242144032437e-05, "loss": 0.4494, "step": 3170 }, { "epoch": 19.697819314641745, "grad_norm": 4.40625, "learning_rate": 4.9021739091773945e-05, "loss": 0.2002, "step": 3171 }, { "epoch": 19.70404984423676, "grad_norm": 2.15625, "learning_rate": 4.902105650991959e-05, "loss": 0.2199, "step": 3172 }, { "epoch": 19.710280373831775, "grad_norm": 2.578125, "learning_rate": 4.902037369476794e-05, "loss": 0.1912, "step": 3173 }, { "epoch": 19.71651090342679, "grad_norm": 2.421875, "learning_rate": 4.901969064632561e-05, "loss": 0.1454, "step": 3174 }, { "epoch": 19.722741433021806, "grad_norm": 3.15625, "learning_rate": 4.9019007364599246e-05, "loss": 0.2423, "step": 3175 }, { "epoch": 19.72897196261682, "grad_norm": 3.46875, "learning_rate": 4.901832384959548e-05, "loss": 0.4535, "step": 3176 }, { "epoch": 19.735202492211837, "grad_norm": 3.578125, "learning_rate": 4.9017640101320955e-05, "loss": 0.187, "step": 3177 }, { "epoch": 19.741433021806852, "grad_norm": 3.203125, "learning_rate": 4.901695611978231e-05, "loss": 0.1921, "step": 3178 }, { "epoch": 19.747663551401867, "grad_norm": 1.515625, "learning_rate": 4.901627190498618e-05, "loss": 0.1572, "step": 3179 }, { "epoch": 19.753894080996886, "grad_norm": 2.359375, "learning_rate": 4.901558745693922e-05, "loss": 0.2943, "step": 3180 }, { "epoch": 19.7601246105919, "grad_norm": 3.875, "learning_rate": 4.9014902775648076e-05, "loss": 0.3035, "step": 3181 }, { "epoch": 19.766355140186917, "grad_norm": 2.671875, "learning_rate": 4.9014217861119385e-05, "loss": 0.2826, "step": 3182 }, { "epoch": 19.772585669781932, "grad_norm": 2.625, "learning_rate": 4.901353271335982e-05, "loss": 0.2041, "step": 3183 }, { "epoch": 19.778816199376948, "grad_norm": 2.5, "learning_rate": 4.901284733237602e-05, "loss": 0.2349, "step": 3184 }, { "epoch": 19.785046728971963, "grad_norm": 3.390625, "learning_rate": 4.9012161718174657e-05, "loss": 0.1876, "step": 3185 }, { "epoch": 19.79127725856698, "grad_norm": 3.5, "learning_rate": 4.901147587076237e-05, "loss": 0.4627, "step": 3186 }, { "epoch": 19.797507788161994, "grad_norm": 3.8125, "learning_rate": 4.9010789790145827e-05, "loss": 0.2834, "step": 3187 }, { "epoch": 19.80373831775701, "grad_norm": 1.59375, "learning_rate": 4.9010103476331704e-05, "loss": 0.1371, "step": 3188 }, { "epoch": 19.809968847352025, "grad_norm": 2.34375, "learning_rate": 4.9009416929326646e-05, "loss": 0.2568, "step": 3189 }, { "epoch": 19.81619937694704, "grad_norm": 3.0625, "learning_rate": 4.900873014913734e-05, "loss": 0.232, "step": 3190 }, { "epoch": 19.822429906542055, "grad_norm": 2.171875, "learning_rate": 4.9008043135770444e-05, "loss": 0.1609, "step": 3191 }, { "epoch": 19.82866043613707, "grad_norm": 2.015625, "learning_rate": 4.900735588923263e-05, "loss": 0.1803, "step": 3192 }, { "epoch": 19.834890965732086, "grad_norm": 1.859375, "learning_rate": 4.900666840953059e-05, "loss": 0.1417, "step": 3193 }, { "epoch": 19.8411214953271, "grad_norm": 2.890625, "learning_rate": 4.9005980696670975e-05, "loss": 0.3864, "step": 3194 }, { "epoch": 19.847352024922117, "grad_norm": 3.21875, "learning_rate": 4.9005292750660474e-05, "loss": 0.3011, "step": 3195 }, { "epoch": 19.853582554517136, "grad_norm": 1.84375, "learning_rate": 4.900460457150578e-05, "loss": 0.1852, "step": 3196 }, { "epoch": 19.85981308411215, "grad_norm": 2.5, "learning_rate": 4.900391615921356e-05, "loss": 0.2146, "step": 3197 }, { "epoch": 19.866043613707166, "grad_norm": 2.640625, "learning_rate": 4.900322751379052e-05, "loss": 0.2921, "step": 3198 }, { "epoch": 19.87227414330218, "grad_norm": 2.28125, "learning_rate": 4.9002538635243324e-05, "loss": 0.2239, "step": 3199 }, { "epoch": 19.878504672897197, "grad_norm": 3.28125, "learning_rate": 4.900184952357868e-05, "loss": 0.388, "step": 3200 }, { "epoch": 19.884735202492212, "grad_norm": 3.125, "learning_rate": 4.900116017880328e-05, "loss": 0.3077, "step": 3201 }, { "epoch": 19.890965732087228, "grad_norm": 3.78125, "learning_rate": 4.900047060092381e-05, "loss": 0.2593, "step": 3202 }, { "epoch": 19.897196261682243, "grad_norm": 3.875, "learning_rate": 4.899978078994697e-05, "loss": 0.3309, "step": 3203 }, { "epoch": 19.90342679127726, "grad_norm": 2.953125, "learning_rate": 4.899909074587946e-05, "loss": 0.1719, "step": 3204 }, { "epoch": 19.909657320872274, "grad_norm": 3.046875, "learning_rate": 4.899840046872799e-05, "loss": 0.343, "step": 3205 }, { "epoch": 19.91588785046729, "grad_norm": 3.75, "learning_rate": 4.8997709958499256e-05, "loss": 0.249, "step": 3206 }, { "epoch": 19.922118380062305, "grad_norm": 3.453125, "learning_rate": 4.8997019215199966e-05, "loss": 0.1886, "step": 3207 }, { "epoch": 19.92834890965732, "grad_norm": 3.3125, "learning_rate": 4.8996328238836823e-05, "loss": 0.3185, "step": 3208 }, { "epoch": 19.934579439252335, "grad_norm": 4.15625, "learning_rate": 4.899563702941655e-05, "loss": 0.4896, "step": 3209 }, { "epoch": 19.94080996884735, "grad_norm": 2.625, "learning_rate": 4.8994945586945854e-05, "loss": 0.2065, "step": 3210 }, { "epoch": 19.947040498442366, "grad_norm": 3.046875, "learning_rate": 4.8994253911431444e-05, "loss": 0.1521, "step": 3211 }, { "epoch": 19.953271028037385, "grad_norm": 5.0625, "learning_rate": 4.899356200288004e-05, "loss": 0.1821, "step": 3212 }, { "epoch": 19.9595015576324, "grad_norm": 3.515625, "learning_rate": 4.8992869861298376e-05, "loss": 0.1795, "step": 3213 }, { "epoch": 19.965732087227416, "grad_norm": 3.046875, "learning_rate": 4.8992177486693157e-05, "loss": 0.2941, "step": 3214 }, { "epoch": 19.97196261682243, "grad_norm": 2.546875, "learning_rate": 4.899148487907112e-05, "loss": 0.2657, "step": 3215 }, { "epoch": 19.978193146417446, "grad_norm": 4.40625, "learning_rate": 4.899079203843898e-05, "loss": 0.386, "step": 3216 }, { "epoch": 19.98442367601246, "grad_norm": 5.21875, "learning_rate": 4.899009896480348e-05, "loss": 0.251, "step": 3217 }, { "epoch": 19.990654205607477, "grad_norm": 2.546875, "learning_rate": 4.898940565817134e-05, "loss": 0.1601, "step": 3218 }, { "epoch": 19.996884735202492, "grad_norm": 2.296875, "learning_rate": 4.898871211854929e-05, "loss": 0.2104, "step": 3219 }, { "epoch": 20.0, "grad_norm": 1.703125, "learning_rate": 4.898801834594407e-05, "loss": 0.1144, "step": 3220 }, { "epoch": 20.006230529595015, "grad_norm": 5.0625, "learning_rate": 4.898732434036244e-05, "loss": 0.2472, "step": 3221 }, { "epoch": 20.01246105919003, "grad_norm": 5.40625, "learning_rate": 4.89866301018111e-05, "loss": 0.2146, "step": 3222 }, { "epoch": 20.018691588785046, "grad_norm": 5.1875, "learning_rate": 4.898593563029683e-05, "loss": 0.3083, "step": 3223 }, { "epoch": 20.02492211838006, "grad_norm": 2.890625, "learning_rate": 4.898524092582635e-05, "loss": 0.2753, "step": 3224 }, { "epoch": 20.031152647975077, "grad_norm": 4.84375, "learning_rate": 4.8984545988406414e-05, "loss": 0.1724, "step": 3225 }, { "epoch": 20.037383177570092, "grad_norm": 6.65625, "learning_rate": 4.898385081804378e-05, "loss": 0.2902, "step": 3226 }, { "epoch": 20.043613707165107, "grad_norm": 6.03125, "learning_rate": 4.898315541474519e-05, "loss": 0.1972, "step": 3227 }, { "epoch": 20.049844236760123, "grad_norm": 4.28125, "learning_rate": 4.8982459778517396e-05, "loss": 0.3022, "step": 3228 }, { "epoch": 20.05607476635514, "grad_norm": 3.875, "learning_rate": 4.8981763909367164e-05, "loss": 0.265, "step": 3229 }, { "epoch": 20.062305295950157, "grad_norm": 4.78125, "learning_rate": 4.898106780730125e-05, "loss": 0.2067, "step": 3230 }, { "epoch": 20.068535825545172, "grad_norm": 4.40625, "learning_rate": 4.8980371472326405e-05, "loss": 0.163, "step": 3231 }, { "epoch": 20.074766355140188, "grad_norm": 5.75, "learning_rate": 4.8979674904449404e-05, "loss": 0.2827, "step": 3232 }, { "epoch": 20.080996884735203, "grad_norm": 4.96875, "learning_rate": 4.8978978103676995e-05, "loss": 0.2162, "step": 3233 }, { "epoch": 20.08722741433022, "grad_norm": 3.21875, "learning_rate": 4.897828107001597e-05, "loss": 0.2788, "step": 3234 }, { "epoch": 20.093457943925234, "grad_norm": 4.5625, "learning_rate": 4.897758380347308e-05, "loss": 0.2857, "step": 3235 }, { "epoch": 20.09968847352025, "grad_norm": 5.15625, "learning_rate": 4.8976886304055104e-05, "loss": 0.3114, "step": 3236 }, { "epoch": 20.105919003115265, "grad_norm": 6.84375, "learning_rate": 4.897618857176882e-05, "loss": 0.2392, "step": 3237 }, { "epoch": 20.11214953271028, "grad_norm": 3.484375, "learning_rate": 4.8975490606620994e-05, "loss": 0.204, "step": 3238 }, { "epoch": 20.118380062305295, "grad_norm": 2.828125, "learning_rate": 4.897479240861841e-05, "loss": 0.2827, "step": 3239 }, { "epoch": 20.12461059190031, "grad_norm": 1.3125, "learning_rate": 4.897409397776785e-05, "loss": 0.1486, "step": 3240 }, { "epoch": 20.130841121495326, "grad_norm": 3.59375, "learning_rate": 4.8973395314076104e-05, "loss": 0.2473, "step": 3241 }, { "epoch": 20.13707165109034, "grad_norm": 3.609375, "learning_rate": 4.897269641754995e-05, "loss": 0.2595, "step": 3242 }, { "epoch": 20.143302180685357, "grad_norm": 2.6875, "learning_rate": 4.897199728819617e-05, "loss": 0.349, "step": 3243 }, { "epoch": 20.149532710280372, "grad_norm": 2.40625, "learning_rate": 4.897129792602156e-05, "loss": 0.2063, "step": 3244 }, { "epoch": 20.15576323987539, "grad_norm": 3.75, "learning_rate": 4.897059833103291e-05, "loss": 0.2153, "step": 3245 }, { "epoch": 20.161993769470406, "grad_norm": 2.078125, "learning_rate": 4.8969898503237034e-05, "loss": 0.1996, "step": 3246 }, { "epoch": 20.16822429906542, "grad_norm": 3.546875, "learning_rate": 4.8969198442640695e-05, "loss": 0.2079, "step": 3247 }, { "epoch": 20.174454828660437, "grad_norm": 2.828125, "learning_rate": 4.896849814925072e-05, "loss": 0.2401, "step": 3248 }, { "epoch": 20.180685358255452, "grad_norm": 2.796875, "learning_rate": 4.896779762307389e-05, "loss": 0.2154, "step": 3249 }, { "epoch": 20.186915887850468, "grad_norm": 3.453125, "learning_rate": 4.896709686411702e-05, "loss": 0.1862, "step": 3250 }, { "epoch": 20.193146417445483, "grad_norm": 1.984375, "learning_rate": 4.896639587238692e-05, "loss": 0.1939, "step": 3251 }, { "epoch": 20.1993769470405, "grad_norm": 2.484375, "learning_rate": 4.8965694647890393e-05, "loss": 0.2555, "step": 3252 }, { "epoch": 20.205607476635514, "grad_norm": 3.765625, "learning_rate": 4.896499319063424e-05, "loss": 0.5106, "step": 3253 }, { "epoch": 20.21183800623053, "grad_norm": 4.0, "learning_rate": 4.8964291500625295e-05, "loss": 0.3443, "step": 3254 }, { "epoch": 20.218068535825545, "grad_norm": 3.390625, "learning_rate": 4.8963589577870356e-05, "loss": 0.2906, "step": 3255 }, { "epoch": 20.22429906542056, "grad_norm": 2.46875, "learning_rate": 4.896288742237624e-05, "loss": 0.1946, "step": 3256 }, { "epoch": 20.230529595015575, "grad_norm": 3.234375, "learning_rate": 4.8962185034149776e-05, "loss": 0.2539, "step": 3257 }, { "epoch": 20.23676012461059, "grad_norm": 2.53125, "learning_rate": 4.896148241319778e-05, "loss": 0.1938, "step": 3258 }, { "epoch": 20.242990654205606, "grad_norm": 4.4375, "learning_rate": 4.896077955952708e-05, "loss": 0.2428, "step": 3259 }, { "epoch": 20.24922118380062, "grad_norm": 2.109375, "learning_rate": 4.8960076473144486e-05, "loss": 0.1606, "step": 3260 }, { "epoch": 20.25545171339564, "grad_norm": 2.5, "learning_rate": 4.895937315405685e-05, "loss": 0.2111, "step": 3261 }, { "epoch": 20.261682242990656, "grad_norm": 1.5, "learning_rate": 4.8958669602271e-05, "loss": 0.1508, "step": 3262 }, { "epoch": 20.26791277258567, "grad_norm": 3.359375, "learning_rate": 4.895796581779375e-05, "loss": 0.2634, "step": 3263 }, { "epoch": 20.274143302180686, "grad_norm": 3.203125, "learning_rate": 4.8957261800631953e-05, "loss": 0.1663, "step": 3264 }, { "epoch": 20.2803738317757, "grad_norm": 3.125, "learning_rate": 4.895655755079244e-05, "loss": 0.1659, "step": 3265 }, { "epoch": 20.286604361370717, "grad_norm": 4.15625, "learning_rate": 4.895585306828205e-05, "loss": 0.3859, "step": 3266 }, { "epoch": 20.292834890965732, "grad_norm": 2.296875, "learning_rate": 4.8955148353107625e-05, "loss": 0.1836, "step": 3267 }, { "epoch": 20.299065420560748, "grad_norm": 2.609375, "learning_rate": 4.895444340527601e-05, "loss": 0.2099, "step": 3268 }, { "epoch": 20.305295950155763, "grad_norm": 2.46875, "learning_rate": 4.8953738224794056e-05, "loss": 0.1798, "step": 3269 }, { "epoch": 20.31152647975078, "grad_norm": 2.515625, "learning_rate": 4.89530328116686e-05, "loss": 0.2437, "step": 3270 }, { "epoch": 20.317757009345794, "grad_norm": 3.90625, "learning_rate": 4.895232716590651e-05, "loss": 0.2852, "step": 3271 }, { "epoch": 20.32398753894081, "grad_norm": 4.5625, "learning_rate": 4.8951621287514626e-05, "loss": 0.229, "step": 3272 }, { "epoch": 20.330218068535824, "grad_norm": 2.859375, "learning_rate": 4.895091517649981e-05, "loss": 0.2869, "step": 3273 }, { "epoch": 20.33644859813084, "grad_norm": 4.0, "learning_rate": 4.8950208832868916e-05, "loss": 0.3576, "step": 3274 }, { "epoch": 20.342679127725855, "grad_norm": 3.125, "learning_rate": 4.894950225662881e-05, "loss": 0.1831, "step": 3275 }, { "epoch": 20.34890965732087, "grad_norm": 2.40625, "learning_rate": 4.894879544778635e-05, "loss": 0.3379, "step": 3276 }, { "epoch": 20.35514018691589, "grad_norm": 2.546875, "learning_rate": 4.8948088406348404e-05, "loss": 0.2342, "step": 3277 }, { "epoch": 20.361370716510905, "grad_norm": 2.015625, "learning_rate": 4.8947381132321826e-05, "loss": 0.2011, "step": 3278 }, { "epoch": 20.36760124610592, "grad_norm": 3.8125, "learning_rate": 4.89466736257135e-05, "loss": 0.3793, "step": 3279 }, { "epoch": 20.373831775700936, "grad_norm": 2.34375, "learning_rate": 4.8945965886530296e-05, "loss": 0.2084, "step": 3280 }, { "epoch": 20.38006230529595, "grad_norm": 3.234375, "learning_rate": 4.8945257914779086e-05, "loss": 0.2007, "step": 3281 }, { "epoch": 20.386292834890966, "grad_norm": 2.421875, "learning_rate": 4.894454971046674e-05, "loss": 0.1939, "step": 3282 }, { "epoch": 20.39252336448598, "grad_norm": 1.9140625, "learning_rate": 4.894384127360014e-05, "loss": 0.1757, "step": 3283 }, { "epoch": 20.398753894080997, "grad_norm": 1.96875, "learning_rate": 4.894313260418617e-05, "loss": 0.1628, "step": 3284 }, { "epoch": 20.404984423676012, "grad_norm": 2.46875, "learning_rate": 4.894242370223171e-05, "loss": 0.3531, "step": 3285 }, { "epoch": 20.411214953271028, "grad_norm": 2.734375, "learning_rate": 4.8941714567743644e-05, "loss": 0.2213, "step": 3286 }, { "epoch": 20.417445482866043, "grad_norm": 2.53125, "learning_rate": 4.894100520072886e-05, "loss": 0.2321, "step": 3287 }, { "epoch": 20.42367601246106, "grad_norm": 3.390625, "learning_rate": 4.8940295601194254e-05, "loss": 0.3302, "step": 3288 }, { "epoch": 20.429906542056074, "grad_norm": 2.6875, "learning_rate": 4.893958576914671e-05, "loss": 0.3206, "step": 3289 }, { "epoch": 20.43613707165109, "grad_norm": 3.265625, "learning_rate": 4.893887570459312e-05, "loss": 0.3167, "step": 3290 }, { "epoch": 20.442367601246104, "grad_norm": 2.671875, "learning_rate": 4.8938165407540385e-05, "loss": 0.2373, "step": 3291 }, { "epoch": 20.44859813084112, "grad_norm": 2.140625, "learning_rate": 4.89374548779954e-05, "loss": 0.1927, "step": 3292 }, { "epoch": 20.45482866043614, "grad_norm": 3.546875, "learning_rate": 4.893674411596507e-05, "loss": 0.2746, "step": 3293 }, { "epoch": 20.461059190031154, "grad_norm": 3.140625, "learning_rate": 4.8936033121456296e-05, "loss": 0.2338, "step": 3294 }, { "epoch": 20.46728971962617, "grad_norm": 4.34375, "learning_rate": 4.8935321894475984e-05, "loss": 0.4454, "step": 3295 }, { "epoch": 20.473520249221185, "grad_norm": 3.25, "learning_rate": 4.893461043503104e-05, "loss": 0.172, "step": 3296 }, { "epoch": 20.4797507788162, "grad_norm": 3.09375, "learning_rate": 4.893389874312837e-05, "loss": 0.2372, "step": 3297 }, { "epoch": 20.485981308411215, "grad_norm": 3.453125, "learning_rate": 4.89331868187749e-05, "loss": 0.1694, "step": 3298 }, { "epoch": 20.49221183800623, "grad_norm": 2.859375, "learning_rate": 4.8932474661977534e-05, "loss": 0.2115, "step": 3299 }, { "epoch": 20.498442367601246, "grad_norm": 2.765625, "learning_rate": 4.893176227274319e-05, "loss": 0.2714, "step": 3300 }, { "epoch": 20.50467289719626, "grad_norm": 2.578125, "learning_rate": 4.8931049651078784e-05, "loss": 0.2863, "step": 3301 }, { "epoch": 20.510903426791277, "grad_norm": 2.828125, "learning_rate": 4.893033679699124e-05, "loss": 0.2755, "step": 3302 }, { "epoch": 20.517133956386292, "grad_norm": 3.390625, "learning_rate": 4.8929623710487485e-05, "loss": 0.3349, "step": 3303 }, { "epoch": 20.523364485981308, "grad_norm": 4.34375, "learning_rate": 4.892891039157443e-05, "loss": 0.369, "step": 3304 }, { "epoch": 20.529595015576323, "grad_norm": 4.15625, "learning_rate": 4.8928196840259034e-05, "loss": 0.3344, "step": 3305 }, { "epoch": 20.53582554517134, "grad_norm": 2.8125, "learning_rate": 4.892748305654819e-05, "loss": 0.2282, "step": 3306 }, { "epoch": 20.542056074766354, "grad_norm": 3.484375, "learning_rate": 4.8926769040448855e-05, "loss": 0.3164, "step": 3307 }, { "epoch": 20.54828660436137, "grad_norm": 3.640625, "learning_rate": 4.892605479196796e-05, "loss": 0.4667, "step": 3308 }, { "epoch": 20.554517133956388, "grad_norm": 5.15625, "learning_rate": 4.892534031111243e-05, "loss": 0.3724, "step": 3309 }, { "epoch": 20.560747663551403, "grad_norm": 1.546875, "learning_rate": 4.8924625597889215e-05, "loss": 0.1778, "step": 3310 }, { "epoch": 20.56697819314642, "grad_norm": 3.4375, "learning_rate": 4.892391065230525e-05, "loss": 0.2197, "step": 3311 }, { "epoch": 20.573208722741434, "grad_norm": 3.328125, "learning_rate": 4.8923195474367486e-05, "loss": 0.1972, "step": 3312 }, { "epoch": 20.57943925233645, "grad_norm": 2.078125, "learning_rate": 4.8922480064082864e-05, "loss": 0.1505, "step": 3313 }, { "epoch": 20.585669781931465, "grad_norm": 2.75, "learning_rate": 4.892176442145834e-05, "loss": 0.2019, "step": 3314 }, { "epoch": 20.59190031152648, "grad_norm": 2.96875, "learning_rate": 4.892104854650085e-05, "loss": 0.2899, "step": 3315 }, { "epoch": 20.598130841121495, "grad_norm": 3.953125, "learning_rate": 4.892033243921736e-05, "loss": 0.3131, "step": 3316 }, { "epoch": 20.60436137071651, "grad_norm": 3.484375, "learning_rate": 4.891961609961482e-05, "loss": 0.2424, "step": 3317 }, { "epoch": 20.610591900311526, "grad_norm": 2.5, "learning_rate": 4.8918899527700185e-05, "loss": 0.153, "step": 3318 }, { "epoch": 20.61682242990654, "grad_norm": 3.1875, "learning_rate": 4.891818272348042e-05, "loss": 0.2149, "step": 3319 }, { "epoch": 20.623052959501557, "grad_norm": 2.234375, "learning_rate": 4.8917465686962475e-05, "loss": 0.1649, "step": 3320 }, { "epoch": 20.629283489096572, "grad_norm": 3.96875, "learning_rate": 4.891674841815333e-05, "loss": 0.2948, "step": 3321 }, { "epoch": 20.635514018691588, "grad_norm": 3.296875, "learning_rate": 4.891603091705994e-05, "loss": 0.2146, "step": 3322 }, { "epoch": 20.641744548286603, "grad_norm": 3.015625, "learning_rate": 4.891531318368928e-05, "loss": 0.1852, "step": 3323 }, { "epoch": 20.64797507788162, "grad_norm": 4.96875, "learning_rate": 4.891459521804831e-05, "loss": 0.2451, "step": 3324 }, { "epoch": 20.654205607476637, "grad_norm": 3.484375, "learning_rate": 4.891387702014402e-05, "loss": 0.2591, "step": 3325 }, { "epoch": 20.660436137071652, "grad_norm": 2.9375, "learning_rate": 4.8913158589983374e-05, "loss": 0.2098, "step": 3326 }, { "epoch": 20.666666666666668, "grad_norm": 3.078125, "learning_rate": 4.891243992757335e-05, "loss": 0.1971, "step": 3327 }, { "epoch": 20.672897196261683, "grad_norm": 4.84375, "learning_rate": 4.8911721032920924e-05, "loss": 0.2352, "step": 3328 }, { "epoch": 20.6791277258567, "grad_norm": 1.6953125, "learning_rate": 4.891100190603309e-05, "loss": 0.1536, "step": 3329 }, { "epoch": 20.685358255451714, "grad_norm": 2.140625, "learning_rate": 4.891028254691682e-05, "loss": 0.2018, "step": 3330 }, { "epoch": 20.69158878504673, "grad_norm": 3.0625, "learning_rate": 4.8909562955579114e-05, "loss": 0.216, "step": 3331 }, { "epoch": 20.697819314641745, "grad_norm": 3.609375, "learning_rate": 4.890884313202695e-05, "loss": 0.3257, "step": 3332 }, { "epoch": 20.70404984423676, "grad_norm": 3.15625, "learning_rate": 4.8908123076267324e-05, "loss": 0.2069, "step": 3333 }, { "epoch": 20.710280373831775, "grad_norm": 3.703125, "learning_rate": 4.890740278830723e-05, "loss": 0.5516, "step": 3334 }, { "epoch": 20.71651090342679, "grad_norm": 2.453125, "learning_rate": 4.890668226815365e-05, "loss": 0.2724, "step": 3335 }, { "epoch": 20.722741433021806, "grad_norm": 3.375, "learning_rate": 4.8905961515813604e-05, "loss": 0.3129, "step": 3336 }, { "epoch": 20.72897196261682, "grad_norm": 2.046875, "learning_rate": 4.890524053129408e-05, "loss": 0.1899, "step": 3337 }, { "epoch": 20.735202492211837, "grad_norm": 1.546875, "learning_rate": 4.8904519314602075e-05, "loss": 0.1667, "step": 3338 }, { "epoch": 20.741433021806852, "grad_norm": 1.5546875, "learning_rate": 4.8903797865744606e-05, "loss": 0.1681, "step": 3339 }, { "epoch": 20.747663551401867, "grad_norm": 3.359375, "learning_rate": 4.890307618472867e-05, "loss": 0.21, "step": 3340 }, { "epoch": 20.753894080996886, "grad_norm": 2.5625, "learning_rate": 4.890235427156128e-05, "loss": 0.288, "step": 3341 }, { "epoch": 20.7601246105919, "grad_norm": 2.65625, "learning_rate": 4.8901632126249455e-05, "loss": 0.1873, "step": 3342 }, { "epoch": 20.766355140186917, "grad_norm": 2.125, "learning_rate": 4.890090974880019e-05, "loss": 0.2048, "step": 3343 }, { "epoch": 20.772585669781932, "grad_norm": 2.140625, "learning_rate": 4.890018713922052e-05, "loss": 0.1677, "step": 3344 }, { "epoch": 20.778816199376948, "grad_norm": 2.046875, "learning_rate": 4.8899464297517446e-05, "loss": 0.1247, "step": 3345 }, { "epoch": 20.785046728971963, "grad_norm": 3.375, "learning_rate": 4.8898741223698005e-05, "loss": 0.2857, "step": 3346 }, { "epoch": 20.79127725856698, "grad_norm": 3.671875, "learning_rate": 4.889801791776921e-05, "loss": 0.4177, "step": 3347 }, { "epoch": 20.797507788161994, "grad_norm": 3.0, "learning_rate": 4.8897294379738084e-05, "loss": 0.2315, "step": 3348 }, { "epoch": 20.80373831775701, "grad_norm": 3.46875, "learning_rate": 4.889657060961166e-05, "loss": 0.338, "step": 3349 }, { "epoch": 20.809968847352025, "grad_norm": 2.8125, "learning_rate": 4.8895846607396965e-05, "loss": 0.1961, "step": 3350 }, { "epoch": 20.81619937694704, "grad_norm": 4.09375, "learning_rate": 4.889512237310104e-05, "loss": 0.3245, "step": 3351 }, { "epoch": 20.822429906542055, "grad_norm": 2.546875, "learning_rate": 4.88943979067309e-05, "loss": 0.2706, "step": 3352 }, { "epoch": 20.82866043613707, "grad_norm": 2.234375, "learning_rate": 4.8893673208293596e-05, "loss": 0.1789, "step": 3353 }, { "epoch": 20.834890965732086, "grad_norm": 2.625, "learning_rate": 4.8892948277796156e-05, "loss": 0.196, "step": 3354 }, { "epoch": 20.8411214953271, "grad_norm": 3.6875, "learning_rate": 4.8892223115245624e-05, "loss": 0.3247, "step": 3355 }, { "epoch": 20.847352024922117, "grad_norm": 1.953125, "learning_rate": 4.889149772064906e-05, "loss": 0.1582, "step": 3356 }, { "epoch": 20.853582554517136, "grad_norm": 3.375, "learning_rate": 4.889077209401348e-05, "loss": 0.3276, "step": 3357 }, { "epoch": 20.85981308411215, "grad_norm": 3.375, "learning_rate": 4.889004623534596e-05, "loss": 0.3103, "step": 3358 }, { "epoch": 20.866043613707166, "grad_norm": 2.515625, "learning_rate": 4.888932014465352e-05, "loss": 0.2409, "step": 3359 }, { "epoch": 20.87227414330218, "grad_norm": 2.484375, "learning_rate": 4.8888593821943234e-05, "loss": 0.2077, "step": 3360 }, { "epoch": 20.878504672897197, "grad_norm": 2.015625, "learning_rate": 4.888786726722216e-05, "loss": 0.1958, "step": 3361 }, { "epoch": 20.884735202492212, "grad_norm": 2.640625, "learning_rate": 4.8887140480497326e-05, "loss": 0.2112, "step": 3362 }, { "epoch": 20.890965732087228, "grad_norm": 2.875, "learning_rate": 4.8886413461775816e-05, "loss": 0.433, "step": 3363 }, { "epoch": 20.897196261682243, "grad_norm": 3.328125, "learning_rate": 4.8885686211064686e-05, "loss": 0.2487, "step": 3364 }, { "epoch": 20.90342679127726, "grad_norm": 2.75, "learning_rate": 4.8884958728370986e-05, "loss": 0.2748, "step": 3365 }, { "epoch": 20.909657320872274, "grad_norm": 3.03125, "learning_rate": 4.888423101370181e-05, "loss": 0.3471, "step": 3366 }, { "epoch": 20.91588785046729, "grad_norm": 3.046875, "learning_rate": 4.8883503067064195e-05, "loss": 0.3551, "step": 3367 }, { "epoch": 20.922118380062305, "grad_norm": 3.0625, "learning_rate": 4.8882774888465226e-05, "loss": 0.2384, "step": 3368 }, { "epoch": 20.92834890965732, "grad_norm": 3.625, "learning_rate": 4.888204647791198e-05, "loss": 0.3785, "step": 3369 }, { "epoch": 20.934579439252335, "grad_norm": 1.9453125, "learning_rate": 4.8881317835411515e-05, "loss": 0.1391, "step": 3370 }, { "epoch": 20.94080996884735, "grad_norm": 3.75, "learning_rate": 4.888058896097092e-05, "loss": 0.2241, "step": 3371 }, { "epoch": 20.947040498442366, "grad_norm": 3.484375, "learning_rate": 4.887985985459728e-05, "loss": 0.2494, "step": 3372 }, { "epoch": 20.953271028037385, "grad_norm": 4.4375, "learning_rate": 4.887913051629766e-05, "loss": 0.3109, "step": 3373 }, { "epoch": 20.9595015576324, "grad_norm": 2.25, "learning_rate": 4.887840094607915e-05, "loss": 0.3076, "step": 3374 }, { "epoch": 20.965732087227416, "grad_norm": 3.765625, "learning_rate": 4.887767114394884e-05, "loss": 0.1931, "step": 3375 }, { "epoch": 20.97196261682243, "grad_norm": 2.078125, "learning_rate": 4.887694110991381e-05, "loss": 0.1919, "step": 3376 }, { "epoch": 20.978193146417446, "grad_norm": 2.609375, "learning_rate": 4.887621084398116e-05, "loss": 0.2654, "step": 3377 }, { "epoch": 20.98442367601246, "grad_norm": 3.546875, "learning_rate": 4.887548034615797e-05, "loss": 0.5526, "step": 3378 }, { "epoch": 20.990654205607477, "grad_norm": 3.390625, "learning_rate": 4.887474961645136e-05, "loss": 0.1927, "step": 3379 }, { "epoch": 20.996884735202492, "grad_norm": 2.953125, "learning_rate": 4.8874018654868394e-05, "loss": 0.3601, "step": 3380 }, { "epoch": 21.0, "grad_norm": 2.484375, "learning_rate": 4.887328746141619e-05, "loss": 0.1453, "step": 3381 }, { "epoch": 21.006230529595015, "grad_norm": 2.90625, "learning_rate": 4.887255603610185e-05, "loss": 0.2682, "step": 3382 }, { "epoch": 21.01246105919003, "grad_norm": 2.109375, "learning_rate": 4.8871824378932475e-05, "loss": 0.2105, "step": 3383 }, { "epoch": 21.018691588785046, "grad_norm": 1.8671875, "learning_rate": 4.887109248991516e-05, "loss": 0.1946, "step": 3384 }, { "epoch": 21.02492211838006, "grad_norm": 2.671875, "learning_rate": 4.887036036905703e-05, "loss": 0.2618, "step": 3385 }, { "epoch": 21.031152647975077, "grad_norm": 3.484375, "learning_rate": 4.8869628016365185e-05, "loss": 0.2958, "step": 3386 }, { "epoch": 21.037383177570092, "grad_norm": 2.734375, "learning_rate": 4.886889543184674e-05, "loss": 0.1742, "step": 3387 }, { "epoch": 21.043613707165107, "grad_norm": 2.546875, "learning_rate": 4.8868162615508814e-05, "loss": 0.2203, "step": 3388 }, { "epoch": 21.049844236760123, "grad_norm": 2.40625, "learning_rate": 4.886742956735852e-05, "loss": 0.1996, "step": 3389 }, { "epoch": 21.05607476635514, "grad_norm": 2.71875, "learning_rate": 4.886669628740299e-05, "loss": 0.2903, "step": 3390 }, { "epoch": 21.062305295950157, "grad_norm": 1.8046875, "learning_rate": 4.886596277564932e-05, "loss": 0.1989, "step": 3391 }, { "epoch": 21.068535825545172, "grad_norm": 3.15625, "learning_rate": 4.8865229032104656e-05, "loss": 0.2042, "step": 3392 }, { "epoch": 21.074766355140188, "grad_norm": 2.6875, "learning_rate": 4.8864495056776116e-05, "loss": 0.1988, "step": 3393 }, { "epoch": 21.080996884735203, "grad_norm": 2.375, "learning_rate": 4.8863760849670827e-05, "loss": 0.1612, "step": 3394 }, { "epoch": 21.08722741433022, "grad_norm": 3.234375, "learning_rate": 4.886302641079592e-05, "loss": 0.2878, "step": 3395 }, { "epoch": 21.093457943925234, "grad_norm": 2.515625, "learning_rate": 4.886229174015854e-05, "loss": 0.2407, "step": 3396 }, { "epoch": 21.09968847352025, "grad_norm": 3.0625, "learning_rate": 4.886155683776581e-05, "loss": 0.259, "step": 3397 }, { "epoch": 21.105919003115265, "grad_norm": 2.859375, "learning_rate": 4.8860821703624864e-05, "loss": 0.2667, "step": 3398 }, { "epoch": 21.11214953271028, "grad_norm": 1.890625, "learning_rate": 4.886008633774285e-05, "loss": 0.1763, "step": 3399 }, { "epoch": 21.118380062305295, "grad_norm": 4.0625, "learning_rate": 4.8859350740126905e-05, "loss": 0.4643, "step": 3400 }, { "epoch": 21.12461059190031, "grad_norm": 2.21875, "learning_rate": 4.885861491078418e-05, "loss": 0.2414, "step": 3401 }, { "epoch": 21.130841121495326, "grad_norm": 2.828125, "learning_rate": 4.885787884972183e-05, "loss": 0.1778, "step": 3402 }, { "epoch": 21.13707165109034, "grad_norm": 3.25, "learning_rate": 4.885714255694698e-05, "loss": 0.2682, "step": 3403 }, { "epoch": 21.143302180685357, "grad_norm": 2.34375, "learning_rate": 4.8856406032466785e-05, "loss": 0.2395, "step": 3404 }, { "epoch": 21.149532710280372, "grad_norm": 2.953125, "learning_rate": 4.885566927628842e-05, "loss": 0.1749, "step": 3405 }, { "epoch": 21.15576323987539, "grad_norm": 3.078125, "learning_rate": 4.8854932288419014e-05, "loss": 0.2494, "step": 3406 }, { "epoch": 21.161993769470406, "grad_norm": 2.25, "learning_rate": 4.885419506886574e-05, "loss": 0.1303, "step": 3407 }, { "epoch": 21.16822429906542, "grad_norm": 1.6953125, "learning_rate": 4.885345761763576e-05, "loss": 0.1647, "step": 3408 }, { "epoch": 21.174454828660437, "grad_norm": 4.1875, "learning_rate": 4.885271993473623e-05, "loss": 0.4889, "step": 3409 }, { "epoch": 21.180685358255452, "grad_norm": 2.734375, "learning_rate": 4.8851982020174316e-05, "loss": 0.3084, "step": 3410 }, { "epoch": 21.186915887850468, "grad_norm": 2.140625, "learning_rate": 4.8851243873957186e-05, "loss": 0.2094, "step": 3411 }, { "epoch": 21.193146417445483, "grad_norm": 2.71875, "learning_rate": 4.8850505496092e-05, "loss": 0.2649, "step": 3412 }, { "epoch": 21.1993769470405, "grad_norm": 1.8203125, "learning_rate": 4.8849766886585945e-05, "loss": 0.1392, "step": 3413 }, { "epoch": 21.205607476635514, "grad_norm": 2.515625, "learning_rate": 4.884902804544619e-05, "loss": 0.1694, "step": 3414 }, { "epoch": 21.21183800623053, "grad_norm": 2.5625, "learning_rate": 4.8848288972679895e-05, "loss": 0.1583, "step": 3415 }, { "epoch": 21.218068535825545, "grad_norm": 2.703125, "learning_rate": 4.8847549668294254e-05, "loss": 0.2253, "step": 3416 }, { "epoch": 21.22429906542056, "grad_norm": 2.140625, "learning_rate": 4.884681013229645e-05, "loss": 0.2105, "step": 3417 }, { "epoch": 21.230529595015575, "grad_norm": 5.65625, "learning_rate": 4.884607036469365e-05, "loss": 0.2703, "step": 3418 }, { "epoch": 21.23676012461059, "grad_norm": 5.34375, "learning_rate": 4.884533036549306e-05, "loss": 0.3291, "step": 3419 }, { "epoch": 21.242990654205606, "grad_norm": 5.71875, "learning_rate": 4.884459013470184e-05, "loss": 0.2593, "step": 3420 }, { "epoch": 21.24922118380062, "grad_norm": 2.359375, "learning_rate": 4.8843849672327213e-05, "loss": 0.2439, "step": 3421 }, { "epoch": 21.25545171339564, "grad_norm": 3.5625, "learning_rate": 4.884310897837634e-05, "loss": 0.3727, "step": 3422 }, { "epoch": 21.261682242990656, "grad_norm": 4.09375, "learning_rate": 4.8842368052856426e-05, "loss": 0.1833, "step": 3423 }, { "epoch": 21.26791277258567, "grad_norm": 5.625, "learning_rate": 4.8841626895774667e-05, "loss": 0.2189, "step": 3424 }, { "epoch": 21.274143302180686, "grad_norm": 4.59375, "learning_rate": 4.884088550713827e-05, "loss": 0.3129, "step": 3425 }, { "epoch": 21.2803738317757, "grad_norm": 5.6875, "learning_rate": 4.8840143886954414e-05, "loss": 0.4351, "step": 3426 }, { "epoch": 21.286604361370717, "grad_norm": 2.828125, "learning_rate": 4.883940203523032e-05, "loss": 0.2953, "step": 3427 }, { "epoch": 21.292834890965732, "grad_norm": 3.109375, "learning_rate": 4.883865995197319e-05, "loss": 0.3219, "step": 3428 }, { "epoch": 21.299065420560748, "grad_norm": 4.0625, "learning_rate": 4.883791763719023e-05, "loss": 0.2208, "step": 3429 }, { "epoch": 21.305295950155763, "grad_norm": 3.4375, "learning_rate": 4.8837175090888653e-05, "loss": 0.2127, "step": 3430 }, { "epoch": 21.31152647975078, "grad_norm": 2.625, "learning_rate": 4.8836432313075656e-05, "loss": 0.2484, "step": 3431 }, { "epoch": 21.317757009345794, "grad_norm": 1.859375, "learning_rate": 4.883568930375847e-05, "loss": 0.1723, "step": 3432 }, { "epoch": 21.32398753894081, "grad_norm": 4.21875, "learning_rate": 4.883494606294431e-05, "loss": 0.362, "step": 3433 }, { "epoch": 21.330218068535824, "grad_norm": 4.03125, "learning_rate": 4.883420259064038e-05, "loss": 0.1742, "step": 3434 }, { "epoch": 21.33644859813084, "grad_norm": 4.25, "learning_rate": 4.883345888685391e-05, "loss": 0.3653, "step": 3435 }, { "epoch": 21.342679127725855, "grad_norm": 2.1875, "learning_rate": 4.8832714951592127e-05, "loss": 0.1835, "step": 3436 }, { "epoch": 21.34890965732087, "grad_norm": 2.921875, "learning_rate": 4.883197078486226e-05, "loss": 0.207, "step": 3437 }, { "epoch": 21.35514018691589, "grad_norm": 3.125, "learning_rate": 4.8831226386671515e-05, "loss": 0.3358, "step": 3438 }, { "epoch": 21.361370716510905, "grad_norm": 2.359375, "learning_rate": 4.883048175702714e-05, "loss": 0.1781, "step": 3439 }, { "epoch": 21.36760124610592, "grad_norm": 2.515625, "learning_rate": 4.8829736895936364e-05, "loss": 0.1761, "step": 3440 }, { "epoch": 21.373831775700936, "grad_norm": 2.828125, "learning_rate": 4.882899180340642e-05, "loss": 0.3051, "step": 3441 }, { "epoch": 21.38006230529595, "grad_norm": 3.625, "learning_rate": 4.882824647944454e-05, "loss": 0.3534, "step": 3442 }, { "epoch": 21.386292834890966, "grad_norm": 4.625, "learning_rate": 4.8827500924057976e-05, "loss": 0.3004, "step": 3443 }, { "epoch": 21.39252336448598, "grad_norm": 4.65625, "learning_rate": 4.882675513725395e-05, "loss": 0.2649, "step": 3444 }, { "epoch": 21.398753894080997, "grad_norm": 4.78125, "learning_rate": 4.882600911903973e-05, "loss": 0.2398, "step": 3445 }, { "epoch": 21.404984423676012, "grad_norm": 4.53125, "learning_rate": 4.882526286942253e-05, "loss": 0.3462, "step": 3446 }, { "epoch": 21.411214953271028, "grad_norm": 4.40625, "learning_rate": 4.8824516388409625e-05, "loss": 0.2988, "step": 3447 }, { "epoch": 21.417445482866043, "grad_norm": 2.546875, "learning_rate": 4.882376967600825e-05, "loss": 0.2204, "step": 3448 }, { "epoch": 21.42367601246106, "grad_norm": 3.6875, "learning_rate": 4.8823022732225665e-05, "loss": 0.4311, "step": 3449 }, { "epoch": 21.429906542056074, "grad_norm": 3.15625, "learning_rate": 4.882227555706912e-05, "loss": 0.2644, "step": 3450 }, { "epoch": 21.43613707165109, "grad_norm": 3.4375, "learning_rate": 4.882152815054587e-05, "loss": 0.2084, "step": 3451 }, { "epoch": 21.442367601246104, "grad_norm": 3.0, "learning_rate": 4.882078051266318e-05, "loss": 0.2291, "step": 3452 }, { "epoch": 21.44859813084112, "grad_norm": 3.046875, "learning_rate": 4.88200326434283e-05, "loss": 0.3013, "step": 3453 }, { "epoch": 21.45482866043614, "grad_norm": 2.984375, "learning_rate": 4.8819284542848506e-05, "loss": 0.2878, "step": 3454 }, { "epoch": 21.461059190031154, "grad_norm": 3.6875, "learning_rate": 4.8818536210931064e-05, "loss": 0.3104, "step": 3455 }, { "epoch": 21.46728971962617, "grad_norm": 3.1875, "learning_rate": 4.881778764768323e-05, "loss": 0.2899, "step": 3456 }, { "epoch": 21.473520249221185, "grad_norm": 2.625, "learning_rate": 4.881703885311229e-05, "loss": 0.223, "step": 3457 }, { "epoch": 21.4797507788162, "grad_norm": 3.71875, "learning_rate": 4.88162898272255e-05, "loss": 0.2086, "step": 3458 }, { "epoch": 21.485981308411215, "grad_norm": 4.375, "learning_rate": 4.881554057003015e-05, "loss": 0.3687, "step": 3459 }, { "epoch": 21.49221183800623, "grad_norm": 2.828125, "learning_rate": 4.88147910815335e-05, "loss": 0.2305, "step": 3460 }, { "epoch": 21.498442367601246, "grad_norm": 3.171875, "learning_rate": 4.8814041361742845e-05, "loss": 0.2501, "step": 3461 }, { "epoch": 21.50467289719626, "grad_norm": 3.09375, "learning_rate": 4.881329141066546e-05, "loss": 0.3099, "step": 3462 }, { "epoch": 21.510903426791277, "grad_norm": 2.875, "learning_rate": 4.881254122830862e-05, "loss": 0.142, "step": 3463 }, { "epoch": 21.517133956386292, "grad_norm": 3.625, "learning_rate": 4.881179081467963e-05, "loss": 0.2883, "step": 3464 }, { "epoch": 21.523364485981308, "grad_norm": 2.8125, "learning_rate": 4.881104016978576e-05, "loss": 0.2491, "step": 3465 }, { "epoch": 21.529595015576323, "grad_norm": 2.96875, "learning_rate": 4.881028929363432e-05, "loss": 0.1992, "step": 3466 }, { "epoch": 21.53582554517134, "grad_norm": 3.796875, "learning_rate": 4.880953818623258e-05, "loss": 0.2001, "step": 3467 }, { "epoch": 21.542056074766354, "grad_norm": 5.28125, "learning_rate": 4.880878684758785e-05, "loss": 0.3121, "step": 3468 }, { "epoch": 21.54828660436137, "grad_norm": 5.71875, "learning_rate": 4.880803527770742e-05, "loss": 0.2717, "step": 3469 }, { "epoch": 21.554517133956388, "grad_norm": 4.78125, "learning_rate": 4.8807283476598595e-05, "loss": 0.4163, "step": 3470 }, { "epoch": 21.560747663551403, "grad_norm": 2.53125, "learning_rate": 4.8806531444268665e-05, "loss": 0.2333, "step": 3471 }, { "epoch": 21.56697819314642, "grad_norm": 2.0625, "learning_rate": 4.880577918072495e-05, "loss": 0.223, "step": 3472 }, { "epoch": 21.573208722741434, "grad_norm": 4.28125, "learning_rate": 4.880502668597475e-05, "loss": 0.3529, "step": 3473 }, { "epoch": 21.57943925233645, "grad_norm": 4.46875, "learning_rate": 4.8804273960025376e-05, "loss": 0.2861, "step": 3474 }, { "epoch": 21.585669781931465, "grad_norm": 3.515625, "learning_rate": 4.880352100288413e-05, "loss": 0.3058, "step": 3475 }, { "epoch": 21.59190031152648, "grad_norm": 3.0, "learning_rate": 4.880276781455833e-05, "loss": 0.2321, "step": 3476 }, { "epoch": 21.598130841121495, "grad_norm": 2.765625, "learning_rate": 4.880201439505529e-05, "loss": 0.2138, "step": 3477 }, { "epoch": 21.60436137071651, "grad_norm": 4.6875, "learning_rate": 4.8801260744382326e-05, "loss": 0.2472, "step": 3478 }, { "epoch": 21.610591900311526, "grad_norm": 3.8125, "learning_rate": 4.880050686254677e-05, "loss": 0.1485, "step": 3479 }, { "epoch": 21.61682242990654, "grad_norm": 2.296875, "learning_rate": 4.879975274955593e-05, "loss": 0.1532, "step": 3480 }, { "epoch": 21.623052959501557, "grad_norm": 3.8125, "learning_rate": 4.879899840541713e-05, "loss": 0.3092, "step": 3481 }, { "epoch": 21.629283489096572, "grad_norm": 3.5625, "learning_rate": 4.87982438301377e-05, "loss": 0.211, "step": 3482 }, { "epoch": 21.635514018691588, "grad_norm": 3.625, "learning_rate": 4.8797489023724964e-05, "loss": 0.2103, "step": 3483 }, { "epoch": 21.641744548286603, "grad_norm": 3.421875, "learning_rate": 4.879673398618627e-05, "loss": 0.2241, "step": 3484 }, { "epoch": 21.64797507788162, "grad_norm": 2.890625, "learning_rate": 4.8795978717528926e-05, "loss": 0.2175, "step": 3485 }, { "epoch": 21.654205607476637, "grad_norm": 3.703125, "learning_rate": 4.879522321776029e-05, "loss": 0.2334, "step": 3486 }, { "epoch": 21.660436137071652, "grad_norm": 2.0625, "learning_rate": 4.879446748688768e-05, "loss": 0.2306, "step": 3487 }, { "epoch": 21.666666666666668, "grad_norm": 2.484375, "learning_rate": 4.879371152491845e-05, "loss": 0.249, "step": 3488 }, { "epoch": 21.672897196261683, "grad_norm": 3.359375, "learning_rate": 4.879295533185995e-05, "loss": 0.2909, "step": 3489 }, { "epoch": 21.6791277258567, "grad_norm": 1.703125, "learning_rate": 4.879219890771949e-05, "loss": 0.1505, "step": 3490 }, { "epoch": 21.685358255451714, "grad_norm": 2.875, "learning_rate": 4.879144225250445e-05, "loss": 0.2685, "step": 3491 }, { "epoch": 21.69158878504673, "grad_norm": 3.453125, "learning_rate": 4.879068536622217e-05, "loss": 0.3771, "step": 3492 }, { "epoch": 21.697819314641745, "grad_norm": 3.28125, "learning_rate": 4.878992824887999e-05, "loss": 0.2577, "step": 3493 }, { "epoch": 21.70404984423676, "grad_norm": 2.90625, "learning_rate": 4.878917090048527e-05, "loss": 0.3102, "step": 3494 }, { "epoch": 21.710280373831775, "grad_norm": 3.34375, "learning_rate": 4.878841332104538e-05, "loss": 0.3254, "step": 3495 }, { "epoch": 21.71651090342679, "grad_norm": 1.5859375, "learning_rate": 4.878765551056765e-05, "loss": 0.1326, "step": 3496 }, { "epoch": 21.722741433021806, "grad_norm": 2.65625, "learning_rate": 4.878689746905946e-05, "loss": 0.1742, "step": 3497 }, { "epoch": 21.72897196261682, "grad_norm": 2.609375, "learning_rate": 4.878613919652817e-05, "loss": 0.256, "step": 3498 }, { "epoch": 21.735202492211837, "grad_norm": 2.609375, "learning_rate": 4.878538069298114e-05, "loss": 0.2801, "step": 3499 }, { "epoch": 21.741433021806852, "grad_norm": 3.09375, "learning_rate": 4.878462195842573e-05, "loss": 0.3501, "step": 3500 }, { "epoch": 21.747663551401867, "grad_norm": 2.640625, "learning_rate": 4.8783862992869336e-05, "loss": 0.219, "step": 3501 }, { "epoch": 21.753894080996886, "grad_norm": 2.140625, "learning_rate": 4.8783103796319294e-05, "loss": 0.1994, "step": 3502 }, { "epoch": 21.7601246105919, "grad_norm": 2.078125, "learning_rate": 4.8782344368782996e-05, "loss": 0.1456, "step": 3503 }, { "epoch": 21.766355140186917, "grad_norm": 2.734375, "learning_rate": 4.878158471026782e-05, "loss": 0.2526, "step": 3504 }, { "epoch": 21.772585669781932, "grad_norm": 2.3125, "learning_rate": 4.878082482078114e-05, "loss": 0.1971, "step": 3505 }, { "epoch": 21.778816199376948, "grad_norm": 2.390625, "learning_rate": 4.878006470033033e-05, "loss": 0.2872, "step": 3506 }, { "epoch": 21.785046728971963, "grad_norm": 2.3125, "learning_rate": 4.877930434892278e-05, "loss": 0.3292, "step": 3507 }, { "epoch": 21.79127725856698, "grad_norm": 2.40625, "learning_rate": 4.877854376656588e-05, "loss": 0.2431, "step": 3508 }, { "epoch": 21.797507788161994, "grad_norm": 2.59375, "learning_rate": 4.8777782953266996e-05, "loss": 0.2154, "step": 3509 }, { "epoch": 21.80373831775701, "grad_norm": 2.40625, "learning_rate": 4.877702190903354e-05, "loss": 0.2681, "step": 3510 }, { "epoch": 21.809968847352025, "grad_norm": 2.90625, "learning_rate": 4.87762606338729e-05, "loss": 0.2655, "step": 3511 }, { "epoch": 21.81619937694704, "grad_norm": 3.75, "learning_rate": 4.877549912779245e-05, "loss": 0.3904, "step": 3512 }, { "epoch": 21.822429906542055, "grad_norm": 2.015625, "learning_rate": 4.877473739079961e-05, "loss": 0.186, "step": 3513 }, { "epoch": 21.82866043613707, "grad_norm": 2.21875, "learning_rate": 4.877397542290176e-05, "loss": 0.3096, "step": 3514 }, { "epoch": 21.834890965732086, "grad_norm": 2.3125, "learning_rate": 4.8773213224106315e-05, "loss": 0.1414, "step": 3515 }, { "epoch": 21.8411214953271, "grad_norm": 1.890625, "learning_rate": 4.8772450794420666e-05, "loss": 0.1811, "step": 3516 }, { "epoch": 21.847352024922117, "grad_norm": 3.375, "learning_rate": 4.8771688133852225e-05, "loss": 0.2144, "step": 3517 }, { "epoch": 21.853582554517136, "grad_norm": 2.921875, "learning_rate": 4.8770925242408394e-05, "loss": 0.302, "step": 3518 }, { "epoch": 21.85981308411215, "grad_norm": 2.296875, "learning_rate": 4.87701621200966e-05, "loss": 0.2359, "step": 3519 }, { "epoch": 21.866043613707166, "grad_norm": 3.203125, "learning_rate": 4.8769398766924226e-05, "loss": 0.2048, "step": 3520 }, { "epoch": 21.87227414330218, "grad_norm": 2.78125, "learning_rate": 4.8768635182898694e-05, "loss": 0.2858, "step": 3521 }, { "epoch": 21.878504672897197, "grad_norm": 2.46875, "learning_rate": 4.876787136802744e-05, "loss": 0.218, "step": 3522 }, { "epoch": 21.884735202492212, "grad_norm": 3.796875, "learning_rate": 4.8767107322317865e-05, "loss": 0.498, "step": 3523 }, { "epoch": 21.890965732087228, "grad_norm": 1.71875, "learning_rate": 4.8766343045777386e-05, "loss": 0.1787, "step": 3524 }, { "epoch": 21.897196261682243, "grad_norm": 3.15625, "learning_rate": 4.8765578538413436e-05, "loss": 0.3539, "step": 3525 }, { "epoch": 21.90342679127726, "grad_norm": 3.171875, "learning_rate": 4.876481380023344e-05, "loss": 0.3104, "step": 3526 }, { "epoch": 21.909657320872274, "grad_norm": 1.9921875, "learning_rate": 4.8764048831244816e-05, "loss": 0.1992, "step": 3527 }, { "epoch": 21.91588785046729, "grad_norm": 2.5625, "learning_rate": 4.8763283631455007e-05, "loss": 0.2171, "step": 3528 }, { "epoch": 21.922118380062305, "grad_norm": 2.5, "learning_rate": 4.876251820087143e-05, "loss": 0.216, "step": 3529 }, { "epoch": 21.92834890965732, "grad_norm": 2.71875, "learning_rate": 4.876175253950153e-05, "loss": 0.3522, "step": 3530 }, { "epoch": 21.934579439252335, "grad_norm": 2.03125, "learning_rate": 4.8760986647352734e-05, "loss": 0.1933, "step": 3531 }, { "epoch": 21.94080996884735, "grad_norm": 4.03125, "learning_rate": 4.8760220524432497e-05, "loss": 0.2116, "step": 3532 }, { "epoch": 21.947040498442366, "grad_norm": 1.9765625, "learning_rate": 4.875945417074824e-05, "loss": 0.1903, "step": 3533 }, { "epoch": 21.953271028037385, "grad_norm": 2.921875, "learning_rate": 4.875868758630742e-05, "loss": 0.1981, "step": 3534 }, { "epoch": 21.9595015576324, "grad_norm": 4.8125, "learning_rate": 4.8757920771117475e-05, "loss": 0.2132, "step": 3535 }, { "epoch": 21.965732087227416, "grad_norm": 3.171875, "learning_rate": 4.875715372518585e-05, "loss": 0.2941, "step": 3536 }, { "epoch": 21.97196261682243, "grad_norm": 2.40625, "learning_rate": 4.875638644852e-05, "loss": 0.1911, "step": 3537 }, { "epoch": 21.978193146417446, "grad_norm": 4.34375, "learning_rate": 4.875561894112738e-05, "loss": 0.2409, "step": 3538 }, { "epoch": 21.98442367601246, "grad_norm": 4.1875, "learning_rate": 4.8754851203015436e-05, "loss": 0.2879, "step": 3539 }, { "epoch": 21.990654205607477, "grad_norm": 3.734375, "learning_rate": 4.8754083234191635e-05, "loss": 0.2563, "step": 3540 }, { "epoch": 21.996884735202492, "grad_norm": 2.28125, "learning_rate": 4.875331503466343e-05, "loss": 0.1747, "step": 3541 }, { "epoch": 22.0, "grad_norm": 2.125, "learning_rate": 4.875254660443827e-05, "loss": 0.2122, "step": 3542 }, { "epoch": 22.006230529595015, "grad_norm": 3.484375, "learning_rate": 4.8751777943523634e-05, "loss": 0.3462, "step": 3543 }, { "epoch": 22.01246105919003, "grad_norm": 2.34375, "learning_rate": 4.875100905192699e-05, "loss": 0.1499, "step": 3544 }, { "epoch": 22.018691588785046, "grad_norm": 3.375, "learning_rate": 4.875023992965579e-05, "loss": 0.3474, "step": 3545 }, { "epoch": 22.02492211838006, "grad_norm": 2.375, "learning_rate": 4.8749470576717516e-05, "loss": 0.2257, "step": 3546 }, { "epoch": 22.031152647975077, "grad_norm": 3.296875, "learning_rate": 4.874870099311963e-05, "loss": 0.3402, "step": 3547 }, { "epoch": 22.037383177570092, "grad_norm": 2.609375, "learning_rate": 4.874793117886962e-05, "loss": 0.2454, "step": 3548 }, { "epoch": 22.043613707165107, "grad_norm": 3.671875, "learning_rate": 4.874716113397496e-05, "loss": 0.2666, "step": 3549 }, { "epoch": 22.049844236760123, "grad_norm": 3.5625, "learning_rate": 4.8746390858443115e-05, "loss": 0.2864, "step": 3550 }, { "epoch": 22.05607476635514, "grad_norm": 2.65625, "learning_rate": 4.8745620352281576e-05, "loss": 0.3414, "step": 3551 }, { "epoch": 22.062305295950157, "grad_norm": 1.4765625, "learning_rate": 4.874484961549783e-05, "loss": 0.1446, "step": 3552 }, { "epoch": 22.068535825545172, "grad_norm": 2.78125, "learning_rate": 4.874407864809936e-05, "loss": 0.2686, "step": 3553 }, { "epoch": 22.074766355140188, "grad_norm": 2.8125, "learning_rate": 4.874330745009365e-05, "loss": 0.286, "step": 3554 }, { "epoch": 22.080996884735203, "grad_norm": 3.171875, "learning_rate": 4.8742536021488184e-05, "loss": 0.2962, "step": 3555 }, { "epoch": 22.08722741433022, "grad_norm": 3.0625, "learning_rate": 4.874176436229046e-05, "loss": 0.3718, "step": 3556 }, { "epoch": 22.093457943925234, "grad_norm": 1.46875, "learning_rate": 4.874099247250798e-05, "loss": 0.1427, "step": 3557 }, { "epoch": 22.09968847352025, "grad_norm": 1.359375, "learning_rate": 4.874022035214824e-05, "loss": 0.1463, "step": 3558 }, { "epoch": 22.105919003115265, "grad_norm": 2.859375, "learning_rate": 4.873944800121873e-05, "loss": 0.23, "step": 3559 }, { "epoch": 22.11214953271028, "grad_norm": 2.546875, "learning_rate": 4.8738675419726956e-05, "loss": 0.1764, "step": 3560 }, { "epoch": 22.118380062305295, "grad_norm": 3.53125, "learning_rate": 4.873790260768043e-05, "loss": 0.3899, "step": 3561 }, { "epoch": 22.12461059190031, "grad_norm": 2.59375, "learning_rate": 4.873712956508664e-05, "loss": 0.3126, "step": 3562 }, { "epoch": 22.130841121495326, "grad_norm": 3.640625, "learning_rate": 4.87363562919531e-05, "loss": 0.384, "step": 3563 }, { "epoch": 22.13707165109034, "grad_norm": 3.75, "learning_rate": 4.873558278828733e-05, "loss": 0.3091, "step": 3564 }, { "epoch": 22.143302180685357, "grad_norm": 3.84375, "learning_rate": 4.873480905409683e-05, "loss": 0.3469, "step": 3565 }, { "epoch": 22.149532710280372, "grad_norm": 2.0625, "learning_rate": 4.8734035089389115e-05, "loss": 0.1427, "step": 3566 }, { "epoch": 22.15576323987539, "grad_norm": 2.046875, "learning_rate": 4.873326089417172e-05, "loss": 0.2012, "step": 3567 }, { "epoch": 22.161993769470406, "grad_norm": 3.375, "learning_rate": 4.8732486468452144e-05, "loss": 0.2379, "step": 3568 }, { "epoch": 22.16822429906542, "grad_norm": 2.09375, "learning_rate": 4.8731711812237914e-05, "loss": 0.2165, "step": 3569 }, { "epoch": 22.174454828660437, "grad_norm": 2.375, "learning_rate": 4.873093692553656e-05, "loss": 0.239, "step": 3570 }, { "epoch": 22.180685358255452, "grad_norm": 2.6875, "learning_rate": 4.87301618083556e-05, "loss": 0.1541, "step": 3571 }, { "epoch": 22.186915887850468, "grad_norm": 2.859375, "learning_rate": 4.8729386460702565e-05, "loss": 0.2441, "step": 3572 }, { "epoch": 22.193146417445483, "grad_norm": 2.9375, "learning_rate": 4.872861088258499e-05, "loss": 0.1504, "step": 3573 }, { "epoch": 22.1993769470405, "grad_norm": 2.515625, "learning_rate": 4.872783507401039e-05, "loss": 0.1943, "step": 3574 }, { "epoch": 22.205607476635514, "grad_norm": 3.25, "learning_rate": 4.872705903498632e-05, "loss": 0.3805, "step": 3575 }, { "epoch": 22.21183800623053, "grad_norm": 1.7421875, "learning_rate": 4.8726282765520316e-05, "loss": 0.1553, "step": 3576 }, { "epoch": 22.218068535825545, "grad_norm": 2.328125, "learning_rate": 4.87255062656199e-05, "loss": 0.2942, "step": 3577 }, { "epoch": 22.22429906542056, "grad_norm": 2.359375, "learning_rate": 4.872472953529263e-05, "loss": 0.2356, "step": 3578 }, { "epoch": 22.230529595015575, "grad_norm": 3.234375, "learning_rate": 4.872395257454605e-05, "loss": 0.4242, "step": 3579 }, { "epoch": 22.23676012461059, "grad_norm": 1.4921875, "learning_rate": 4.8723175383387684e-05, "loss": 0.1518, "step": 3580 }, { "epoch": 22.242990654205606, "grad_norm": 3.46875, "learning_rate": 4.872239796182512e-05, "loss": 0.1994, "step": 3581 }, { "epoch": 22.24922118380062, "grad_norm": 4.5625, "learning_rate": 4.872162030986587e-05, "loss": 0.3901, "step": 3582 }, { "epoch": 22.25545171339564, "grad_norm": 3.96875, "learning_rate": 4.8720842427517496e-05, "loss": 0.3584, "step": 3583 }, { "epoch": 22.261682242990656, "grad_norm": 1.9765625, "learning_rate": 4.872006431478757e-05, "loss": 0.2951, "step": 3584 }, { "epoch": 22.26791277258567, "grad_norm": 3.0625, "learning_rate": 4.8719285971683636e-05, "loss": 0.2558, "step": 3585 }, { "epoch": 22.274143302180686, "grad_norm": 4.25, "learning_rate": 4.871850739821325e-05, "loss": 0.1748, "step": 3586 }, { "epoch": 22.2803738317757, "grad_norm": 3.046875, "learning_rate": 4.8717728594383984e-05, "loss": 0.2365, "step": 3587 }, { "epoch": 22.286604361370717, "grad_norm": 2.03125, "learning_rate": 4.8716949560203395e-05, "loss": 0.2217, "step": 3588 }, { "epoch": 22.292834890965732, "grad_norm": 2.625, "learning_rate": 4.8716170295679053e-05, "loss": 0.3566, "step": 3589 }, { "epoch": 22.299065420560748, "grad_norm": 3.0, "learning_rate": 4.8715390800818515e-05, "loss": 0.1684, "step": 3590 }, { "epoch": 22.305295950155763, "grad_norm": 3.484375, "learning_rate": 4.8714611075629376e-05, "loss": 0.3894, "step": 3591 }, { "epoch": 22.31152647975078, "grad_norm": 1.6328125, "learning_rate": 4.871383112011918e-05, "loss": 0.1529, "step": 3592 }, { "epoch": 22.317757009345794, "grad_norm": 2.265625, "learning_rate": 4.8713050934295524e-05, "loss": 0.2974, "step": 3593 }, { "epoch": 22.32398753894081, "grad_norm": 1.9921875, "learning_rate": 4.871227051816598e-05, "loss": 0.2858, "step": 3594 }, { "epoch": 22.330218068535824, "grad_norm": 3.59375, "learning_rate": 4.871148987173811e-05, "loss": 0.2649, "step": 3595 }, { "epoch": 22.33644859813084, "grad_norm": 2.96875, "learning_rate": 4.8710708995019527e-05, "loss": 0.2126, "step": 3596 }, { "epoch": 22.342679127725855, "grad_norm": 2.171875, "learning_rate": 4.8709927888017795e-05, "loss": 0.1857, "step": 3597 }, { "epoch": 22.34890965732087, "grad_norm": 2.671875, "learning_rate": 4.87091465507405e-05, "loss": 0.2149, "step": 3598 }, { "epoch": 22.35514018691589, "grad_norm": 4.09375, "learning_rate": 4.870836498319523e-05, "loss": 0.2945, "step": 3599 }, { "epoch": 22.361370716510905, "grad_norm": 2.53125, "learning_rate": 4.870758318538959e-05, "loss": 0.1729, "step": 3600 }, { "epoch": 22.36760124610592, "grad_norm": 3.0, "learning_rate": 4.8706801157331154e-05, "loss": 0.3302, "step": 3601 }, { "epoch": 22.373831775700936, "grad_norm": 4.34375, "learning_rate": 4.870601889902753e-05, "loss": 0.441, "step": 3602 }, { "epoch": 22.38006230529595, "grad_norm": 2.734375, "learning_rate": 4.870523641048631e-05, "loss": 0.2198, "step": 3603 }, { "epoch": 22.386292834890966, "grad_norm": 2.84375, "learning_rate": 4.87044536917151e-05, "loss": 0.3091, "step": 3604 }, { "epoch": 22.39252336448598, "grad_norm": 2.875, "learning_rate": 4.8703670742721494e-05, "loss": 0.2488, "step": 3605 }, { "epoch": 22.398753894080997, "grad_norm": 3.0625, "learning_rate": 4.87028875635131e-05, "loss": 0.1841, "step": 3606 }, { "epoch": 22.404984423676012, "grad_norm": 5.09375, "learning_rate": 4.8702104154097526e-05, "loss": 0.3635, "step": 3607 }, { "epoch": 22.411214953271028, "grad_norm": 2.515625, "learning_rate": 4.870132051448237e-05, "loss": 0.1285, "step": 3608 }, { "epoch": 22.417445482866043, "grad_norm": 3.296875, "learning_rate": 4.870053664467526e-05, "loss": 0.2058, "step": 3609 }, { "epoch": 22.42367601246106, "grad_norm": 3.625, "learning_rate": 4.8699752544683804e-05, "loss": 0.1513, "step": 3610 }, { "epoch": 22.429906542056074, "grad_norm": 2.265625, "learning_rate": 4.869896821451561e-05, "loss": 0.2658, "step": 3611 }, { "epoch": 22.43613707165109, "grad_norm": 1.828125, "learning_rate": 4.86981836541783e-05, "loss": 0.1673, "step": 3612 }, { "epoch": 22.442367601246104, "grad_norm": 3.859375, "learning_rate": 4.869739886367949e-05, "loss": 0.1427, "step": 3613 }, { "epoch": 22.44859813084112, "grad_norm": 4.125, "learning_rate": 4.8696613843026806e-05, "loss": 0.1896, "step": 3614 }, { "epoch": 22.45482866043614, "grad_norm": 3.5, "learning_rate": 4.8695828592227865e-05, "loss": 0.4158, "step": 3615 }, { "epoch": 22.461059190031154, "grad_norm": 2.203125, "learning_rate": 4.869504311129031e-05, "loss": 0.2116, "step": 3616 }, { "epoch": 22.46728971962617, "grad_norm": 4.40625, "learning_rate": 4.8694257400221754e-05, "loss": 0.215, "step": 3617 }, { "epoch": 22.473520249221185, "grad_norm": 5.1875, "learning_rate": 4.869347145902984e-05, "loss": 0.2393, "step": 3618 }, { "epoch": 22.4797507788162, "grad_norm": 4.0625, "learning_rate": 4.869268528772219e-05, "loss": 0.3052, "step": 3619 }, { "epoch": 22.485981308411215, "grad_norm": 2.546875, "learning_rate": 4.8691898886306445e-05, "loss": 0.1894, "step": 3620 }, { "epoch": 22.49221183800623, "grad_norm": 5.9375, "learning_rate": 4.8691112254790246e-05, "loss": 0.4056, "step": 3621 }, { "epoch": 22.498442367601246, "grad_norm": 4.8125, "learning_rate": 4.869032539318123e-05, "loss": 0.2178, "step": 3622 }, { "epoch": 22.50467289719626, "grad_norm": 4.84375, "learning_rate": 4.8689538301487025e-05, "loss": 0.16, "step": 3623 }, { "epoch": 22.510903426791277, "grad_norm": 3.296875, "learning_rate": 4.86887509797153e-05, "loss": 0.2684, "step": 3624 }, { "epoch": 22.517133956386292, "grad_norm": 2.53125, "learning_rate": 4.868796342787368e-05, "loss": 0.26, "step": 3625 }, { "epoch": 22.523364485981308, "grad_norm": 2.734375, "learning_rate": 4.868717564596984e-05, "loss": 0.2388, "step": 3626 }, { "epoch": 22.529595015576323, "grad_norm": 4.125, "learning_rate": 4.8686387634011405e-05, "loss": 0.285, "step": 3627 }, { "epoch": 22.53582554517134, "grad_norm": 5.125, "learning_rate": 4.8685599392006046e-05, "loss": 0.232, "step": 3628 }, { "epoch": 22.542056074766354, "grad_norm": 3.515625, "learning_rate": 4.86848109199614e-05, "loss": 0.265, "step": 3629 }, { "epoch": 22.54828660436137, "grad_norm": 3.328125, "learning_rate": 4.868402221788514e-05, "loss": 0.2908, "step": 3630 }, { "epoch": 22.554517133956388, "grad_norm": 3.40625, "learning_rate": 4.8683233285784924e-05, "loss": 0.2741, "step": 3631 }, { "epoch": 22.560747663551403, "grad_norm": 4.96875, "learning_rate": 4.8682444123668417e-05, "loss": 0.2705, "step": 3632 }, { "epoch": 22.56697819314642, "grad_norm": 3.75, "learning_rate": 4.868165473154327e-05, "loss": 0.3651, "step": 3633 }, { "epoch": 22.573208722741434, "grad_norm": 3.515625, "learning_rate": 4.868086510941716e-05, "loss": 0.3328, "step": 3634 }, { "epoch": 22.57943925233645, "grad_norm": 2.265625, "learning_rate": 4.868007525729775e-05, "loss": 0.1943, "step": 3635 }, { "epoch": 22.585669781931465, "grad_norm": 2.453125, "learning_rate": 4.8679285175192725e-05, "loss": 0.1857, "step": 3636 }, { "epoch": 22.59190031152648, "grad_norm": 2.8125, "learning_rate": 4.867849486310975e-05, "loss": 0.2408, "step": 3637 }, { "epoch": 22.598130841121495, "grad_norm": 2.34375, "learning_rate": 4.8677704321056496e-05, "loss": 0.1564, "step": 3638 }, { "epoch": 22.60436137071651, "grad_norm": 3.234375, "learning_rate": 4.8676913549040634e-05, "loss": 0.2964, "step": 3639 }, { "epoch": 22.610591900311526, "grad_norm": 2.296875, "learning_rate": 4.867612254706987e-05, "loss": 0.1851, "step": 3640 }, { "epoch": 22.61682242990654, "grad_norm": 2.390625, "learning_rate": 4.8675331315151865e-05, "loss": 0.1748, "step": 3641 }, { "epoch": 22.623052959501557, "grad_norm": 3.203125, "learning_rate": 4.867453985329431e-05, "loss": 0.2627, "step": 3642 }, { "epoch": 22.629283489096572, "grad_norm": 2.140625, "learning_rate": 4.867374816150489e-05, "loss": 0.1774, "step": 3643 }, { "epoch": 22.635514018691588, "grad_norm": 2.453125, "learning_rate": 4.867295623979129e-05, "loss": 0.1535, "step": 3644 }, { "epoch": 22.641744548286603, "grad_norm": 2.40625, "learning_rate": 4.867216408816122e-05, "loss": 0.1751, "step": 3645 }, { "epoch": 22.64797507788162, "grad_norm": 3.15625, "learning_rate": 4.867137170662235e-05, "loss": 0.1859, "step": 3646 }, { "epoch": 22.654205607476637, "grad_norm": 4.0625, "learning_rate": 4.86705790951824e-05, "loss": 0.2256, "step": 3647 }, { "epoch": 22.660436137071652, "grad_norm": 3.25, "learning_rate": 4.866978625384904e-05, "loss": 0.2348, "step": 3648 }, { "epoch": 22.666666666666668, "grad_norm": 2.3125, "learning_rate": 4.8668993182629985e-05, "loss": 0.1554, "step": 3649 }, { "epoch": 22.672897196261683, "grad_norm": 3.15625, "learning_rate": 4.866819988153294e-05, "loss": 0.1808, "step": 3650 }, { "epoch": 22.6791277258567, "grad_norm": 3.0625, "learning_rate": 4.8667406350565616e-05, "loss": 0.3271, "step": 3651 }, { "epoch": 22.685358255451714, "grad_norm": 2.9375, "learning_rate": 4.86666125897357e-05, "loss": 0.3034, "step": 3652 }, { "epoch": 22.69158878504673, "grad_norm": 2.8125, "learning_rate": 4.8665818599050906e-05, "loss": 0.2493, "step": 3653 }, { "epoch": 22.697819314641745, "grad_norm": 4.09375, "learning_rate": 4.866502437851896e-05, "loss": 0.3043, "step": 3654 }, { "epoch": 22.70404984423676, "grad_norm": 2.171875, "learning_rate": 4.866422992814757e-05, "loss": 0.2363, "step": 3655 }, { "epoch": 22.710280373831775, "grad_norm": 2.71875, "learning_rate": 4.866343524794444e-05, "loss": 0.2641, "step": 3656 }, { "epoch": 22.71651090342679, "grad_norm": 2.4375, "learning_rate": 4.8662640337917296e-05, "loss": 0.2174, "step": 3657 }, { "epoch": 22.722741433021806, "grad_norm": 5.5, "learning_rate": 4.866184519807387e-05, "loss": 0.2634, "step": 3658 }, { "epoch": 22.72897196261682, "grad_norm": 4.15625, "learning_rate": 4.8661049828421866e-05, "loss": 0.3661, "step": 3659 }, { "epoch": 22.735202492211837, "grad_norm": 3.1875, "learning_rate": 4.866025422896902e-05, "loss": 0.2599, "step": 3660 }, { "epoch": 22.741433021806852, "grad_norm": 2.4375, "learning_rate": 4.865945839972304e-05, "loss": 0.2155, "step": 3661 }, { "epoch": 22.747663551401867, "grad_norm": 3.15625, "learning_rate": 4.865866234069169e-05, "loss": 0.2839, "step": 3662 }, { "epoch": 22.753894080996886, "grad_norm": 3.578125, "learning_rate": 4.865786605188268e-05, "loss": 0.3206, "step": 3663 }, { "epoch": 22.7601246105919, "grad_norm": 5.4375, "learning_rate": 4.865706953330374e-05, "loss": 0.3327, "step": 3664 }, { "epoch": 22.766355140186917, "grad_norm": 2.609375, "learning_rate": 4.865627278496261e-05, "loss": 0.1987, "step": 3665 }, { "epoch": 22.772585669781932, "grad_norm": 2.09375, "learning_rate": 4.865547580686703e-05, "loss": 0.2249, "step": 3666 }, { "epoch": 22.778816199376948, "grad_norm": 2.765625, "learning_rate": 4.865467859902475e-05, "loss": 0.2251, "step": 3667 }, { "epoch": 22.785046728971963, "grad_norm": 2.65625, "learning_rate": 4.865388116144349e-05, "loss": 0.1761, "step": 3668 }, { "epoch": 22.79127725856698, "grad_norm": 1.921875, "learning_rate": 4.865308349413101e-05, "loss": 0.1524, "step": 3669 }, { "epoch": 22.797507788161994, "grad_norm": 2.90625, "learning_rate": 4.865228559709507e-05, "loss": 0.2382, "step": 3670 }, { "epoch": 22.80373831775701, "grad_norm": 2.609375, "learning_rate": 4.865148747034339e-05, "loss": 0.2263, "step": 3671 }, { "epoch": 22.809968847352025, "grad_norm": 2.984375, "learning_rate": 4.8650689113883744e-05, "loss": 0.3107, "step": 3672 }, { "epoch": 22.81619937694704, "grad_norm": 2.015625, "learning_rate": 4.864989052772386e-05, "loss": 0.2019, "step": 3673 }, { "epoch": 22.822429906542055, "grad_norm": 2.625, "learning_rate": 4.864909171187153e-05, "loss": 0.241, "step": 3674 }, { "epoch": 22.82866043613707, "grad_norm": 3.46875, "learning_rate": 4.864829266633449e-05, "loss": 0.2874, "step": 3675 }, { "epoch": 22.834890965732086, "grad_norm": 1.953125, "learning_rate": 4.8647493391120505e-05, "loss": 0.191, "step": 3676 }, { "epoch": 22.8411214953271, "grad_norm": 2.71875, "learning_rate": 4.864669388623733e-05, "loss": 0.2333, "step": 3677 }, { "epoch": 22.847352024922117, "grad_norm": 4.34375, "learning_rate": 4.864589415169274e-05, "loss": 0.2381, "step": 3678 }, { "epoch": 22.853582554517136, "grad_norm": 3.6875, "learning_rate": 4.86450941874945e-05, "loss": 0.2298, "step": 3679 }, { "epoch": 22.85981308411215, "grad_norm": 1.875, "learning_rate": 4.864429399365038e-05, "loss": 0.167, "step": 3680 }, { "epoch": 22.866043613707166, "grad_norm": 3.640625, "learning_rate": 4.864349357016815e-05, "loss": 0.3189, "step": 3681 }, { "epoch": 22.87227414330218, "grad_norm": 2.84375, "learning_rate": 4.864269291705559e-05, "loss": 0.212, "step": 3682 }, { "epoch": 22.878504672897197, "grad_norm": 2.71875, "learning_rate": 4.8641892034320465e-05, "loss": 0.2046, "step": 3683 }, { "epoch": 22.884735202492212, "grad_norm": 2.3125, "learning_rate": 4.864109092197056e-05, "loss": 0.1351, "step": 3684 }, { "epoch": 22.890965732087228, "grad_norm": 2.953125, "learning_rate": 4.864028958001365e-05, "loss": 0.21, "step": 3685 }, { "epoch": 22.897196261682243, "grad_norm": 4.03125, "learning_rate": 4.8639488008457524e-05, "loss": 0.2771, "step": 3686 }, { "epoch": 22.90342679127726, "grad_norm": 3.828125, "learning_rate": 4.8638686207309966e-05, "loss": 0.2772, "step": 3687 }, { "epoch": 22.909657320872274, "grad_norm": 5.40625, "learning_rate": 4.863788417657876e-05, "loss": 0.2687, "step": 3688 }, { "epoch": 22.91588785046729, "grad_norm": 2.71875, "learning_rate": 4.86370819162717e-05, "loss": 0.1836, "step": 3689 }, { "epoch": 22.922118380062305, "grad_norm": 3.265625, "learning_rate": 4.863627942639657e-05, "loss": 0.2225, "step": 3690 }, { "epoch": 22.92834890965732, "grad_norm": 3.90625, "learning_rate": 4.863547670696118e-05, "loss": 0.2703, "step": 3691 }, { "epoch": 22.934579439252335, "grad_norm": 3.890625, "learning_rate": 4.863467375797331e-05, "loss": 0.223, "step": 3692 }, { "epoch": 22.94080996884735, "grad_norm": 7.78125, "learning_rate": 4.8633870579440765e-05, "loss": 0.3165, "step": 3693 }, { "epoch": 22.947040498442366, "grad_norm": 4.125, "learning_rate": 4.8633067171371337e-05, "loss": 0.2271, "step": 3694 }, { "epoch": 22.953271028037385, "grad_norm": 3.0625, "learning_rate": 4.8632263533772846e-05, "loss": 0.3557, "step": 3695 }, { "epoch": 22.9595015576324, "grad_norm": 3.953125, "learning_rate": 4.863145966665309e-05, "loss": 0.2372, "step": 3696 }, { "epoch": 22.965732087227416, "grad_norm": 4.875, "learning_rate": 4.863065557001987e-05, "loss": 0.2318, "step": 3697 }, { "epoch": 22.97196261682243, "grad_norm": 5.875, "learning_rate": 4.8629851243881e-05, "loss": 0.2397, "step": 3698 }, { "epoch": 22.978193146417446, "grad_norm": 3.734375, "learning_rate": 4.8629046688244286e-05, "loss": 0.1747, "step": 3699 }, { "epoch": 22.98442367601246, "grad_norm": 3.84375, "learning_rate": 4.862824190311754e-05, "loss": 0.256, "step": 3700 }, { "epoch": 22.990654205607477, "grad_norm": 4.28125, "learning_rate": 4.8627436888508606e-05, "loss": 0.2206, "step": 3701 }, { "epoch": 22.996884735202492, "grad_norm": 5.25, "learning_rate": 4.8626631644425266e-05, "loss": 0.2593, "step": 3702 }, { "epoch": 23.0, "grad_norm": 2.34375, "learning_rate": 4.862582617087537e-05, "loss": 0.2306, "step": 3703 }, { "epoch": 23.006230529595015, "grad_norm": 1.8515625, "learning_rate": 4.862502046786671e-05, "loss": 0.1898, "step": 3704 }, { "epoch": 23.01246105919003, "grad_norm": 2.234375, "learning_rate": 4.862421453540714e-05, "loss": 0.2238, "step": 3705 }, { "epoch": 23.018691588785046, "grad_norm": 2.34375, "learning_rate": 4.862340837350448e-05, "loss": 0.225, "step": 3706 }, { "epoch": 23.02492211838006, "grad_norm": 4.53125, "learning_rate": 4.862260198216655e-05, "loss": 0.2938, "step": 3707 }, { "epoch": 23.031152647975077, "grad_norm": 2.609375, "learning_rate": 4.8621795361401185e-05, "loss": 0.1911, "step": 3708 }, { "epoch": 23.037383177570092, "grad_norm": 2.25, "learning_rate": 4.862098851121623e-05, "loss": 0.2496, "step": 3709 }, { "epoch": 23.043613707165107, "grad_norm": 2.40625, "learning_rate": 4.86201814316195e-05, "loss": 0.1773, "step": 3710 }, { "epoch": 23.049844236760123, "grad_norm": 3.328125, "learning_rate": 4.8619374122618854e-05, "loss": 0.3133, "step": 3711 }, { "epoch": 23.05607476635514, "grad_norm": 2.5625, "learning_rate": 4.8618566584222127e-05, "loss": 0.3374, "step": 3712 }, { "epoch": 23.062305295950157, "grad_norm": 2.171875, "learning_rate": 4.8617758816437154e-05, "loss": 0.189, "step": 3713 }, { "epoch": 23.068535825545172, "grad_norm": 2.578125, "learning_rate": 4.861695081927179e-05, "loss": 0.1568, "step": 3714 }, { "epoch": 23.074766355140188, "grad_norm": 3.328125, "learning_rate": 4.861614259273388e-05, "loss": 0.2423, "step": 3715 }, { "epoch": 23.080996884735203, "grad_norm": 3.125, "learning_rate": 4.861533413683127e-05, "loss": 0.2039, "step": 3716 }, { "epoch": 23.08722741433022, "grad_norm": 3.28125, "learning_rate": 4.861452545157182e-05, "loss": 0.3428, "step": 3717 }, { "epoch": 23.093457943925234, "grad_norm": 3.0, "learning_rate": 4.861371653696337e-05, "loss": 0.2328, "step": 3718 }, { "epoch": 23.09968847352025, "grad_norm": 2.9375, "learning_rate": 4.861290739301379e-05, "loss": 0.2251, "step": 3719 }, { "epoch": 23.105919003115265, "grad_norm": 4.0625, "learning_rate": 4.861209801973093e-05, "loss": 0.4592, "step": 3720 }, { "epoch": 23.11214953271028, "grad_norm": 2.921875, "learning_rate": 4.8611288417122656e-05, "loss": 0.2358, "step": 3721 }, { "epoch": 23.118380062305295, "grad_norm": 3.265625, "learning_rate": 4.8610478585196826e-05, "loss": 0.4039, "step": 3722 }, { "epoch": 23.12461059190031, "grad_norm": 4.5625, "learning_rate": 4.8609668523961316e-05, "loss": 0.3638, "step": 3723 }, { "epoch": 23.130841121495326, "grad_norm": 3.84375, "learning_rate": 4.8608858233423984e-05, "loss": 0.3132, "step": 3724 }, { "epoch": 23.13707165109034, "grad_norm": 3.78125, "learning_rate": 4.86080477135927e-05, "loss": 0.2967, "step": 3725 }, { "epoch": 23.143302180685357, "grad_norm": 3.15625, "learning_rate": 4.8607236964475335e-05, "loss": 0.2699, "step": 3726 }, { "epoch": 23.149532710280372, "grad_norm": 2.453125, "learning_rate": 4.860642598607976e-05, "loss": 0.2804, "step": 3727 }, { "epoch": 23.15576323987539, "grad_norm": 2.859375, "learning_rate": 4.8605614778413866e-05, "loss": 0.2181, "step": 3728 }, { "epoch": 23.161993769470406, "grad_norm": 3.5, "learning_rate": 4.8604803341485514e-05, "loss": 0.2902, "step": 3729 }, { "epoch": 23.16822429906542, "grad_norm": 2.921875, "learning_rate": 4.860399167530261e-05, "loss": 0.222, "step": 3730 }, { "epoch": 23.174454828660437, "grad_norm": 2.703125, "learning_rate": 4.8603179779873004e-05, "loss": 0.2197, "step": 3731 }, { "epoch": 23.180685358255452, "grad_norm": 4.25, "learning_rate": 4.860236765520461e-05, "loss": 0.3739, "step": 3732 }, { "epoch": 23.186915887850468, "grad_norm": 2.828125, "learning_rate": 4.860155530130529e-05, "loss": 0.2083, "step": 3733 }, { "epoch": 23.193146417445483, "grad_norm": 2.890625, "learning_rate": 4.860074271818296e-05, "loss": 0.2522, "step": 3734 }, { "epoch": 23.1993769470405, "grad_norm": 2.390625, "learning_rate": 4.859992990584549e-05, "loss": 0.1941, "step": 3735 }, { "epoch": 23.205607476635514, "grad_norm": 2.4375, "learning_rate": 4.859911686430079e-05, "loss": 0.1888, "step": 3736 }, { "epoch": 23.21183800623053, "grad_norm": 3.53125, "learning_rate": 4.859830359355674e-05, "loss": 0.3428, "step": 3737 }, { "epoch": 23.218068535825545, "grad_norm": 2.078125, "learning_rate": 4.8597490093621253e-05, "loss": 0.1771, "step": 3738 }, { "epoch": 23.22429906542056, "grad_norm": 3.234375, "learning_rate": 4.859667636450223e-05, "loss": 0.4278, "step": 3739 }, { "epoch": 23.230529595015575, "grad_norm": 3.96875, "learning_rate": 4.859586240620757e-05, "loss": 0.2676, "step": 3740 }, { "epoch": 23.23676012461059, "grad_norm": 2.203125, "learning_rate": 4.8595048218745164e-05, "loss": 0.1552, "step": 3741 }, { "epoch": 23.242990654205606, "grad_norm": 2.953125, "learning_rate": 4.859423380212295e-05, "loss": 0.3059, "step": 3742 }, { "epoch": 23.24922118380062, "grad_norm": 1.984375, "learning_rate": 4.8593419156348804e-05, "loss": 0.1425, "step": 3743 }, { "epoch": 23.25545171339564, "grad_norm": 2.390625, "learning_rate": 4.859260428143067e-05, "loss": 0.1683, "step": 3744 }, { "epoch": 23.261682242990656, "grad_norm": 3.640625, "learning_rate": 4.8591789177376446e-05, "loss": 0.2518, "step": 3745 }, { "epoch": 23.26791277258567, "grad_norm": 4.3125, "learning_rate": 4.859097384419404e-05, "loss": 0.2936, "step": 3746 }, { "epoch": 23.274143302180686, "grad_norm": 3.90625, "learning_rate": 4.8590158281891385e-05, "loss": 0.3042, "step": 3747 }, { "epoch": 23.2803738317757, "grad_norm": 2.875, "learning_rate": 4.8589342490476394e-05, "loss": 0.2371, "step": 3748 }, { "epoch": 23.286604361370717, "grad_norm": 4.75, "learning_rate": 4.8588526469957e-05, "loss": 0.3499, "step": 3749 }, { "epoch": 23.292834890965732, "grad_norm": 2.21875, "learning_rate": 4.858771022034112e-05, "loss": 0.1623, "step": 3750 }, { "epoch": 23.299065420560748, "grad_norm": 3.3125, "learning_rate": 4.8586893741636687e-05, "loss": 0.2338, "step": 3751 }, { "epoch": 23.305295950155763, "grad_norm": 2.390625, "learning_rate": 4.858607703385162e-05, "loss": 0.2798, "step": 3752 }, { "epoch": 23.31152647975078, "grad_norm": 3.25, "learning_rate": 4.858526009699386e-05, "loss": 0.4853, "step": 3753 }, { "epoch": 23.317757009345794, "grad_norm": 3.796875, "learning_rate": 4.858444293107134e-05, "loss": 0.2095, "step": 3754 }, { "epoch": 23.32398753894081, "grad_norm": 4.0625, "learning_rate": 4.858362553609199e-05, "loss": 0.2923, "step": 3755 }, { "epoch": 23.330218068535824, "grad_norm": 3.75, "learning_rate": 4.8582807912063764e-05, "loss": 0.1661, "step": 3756 }, { "epoch": 23.33644859813084, "grad_norm": 3.453125, "learning_rate": 4.858199005899459e-05, "loss": 0.2667, "step": 3757 }, { "epoch": 23.342679127725855, "grad_norm": 2.421875, "learning_rate": 4.858117197689242e-05, "loss": 0.1511, "step": 3758 }, { "epoch": 23.34890965732087, "grad_norm": 5.4375, "learning_rate": 4.858035366576518e-05, "loss": 0.1955, "step": 3759 }, { "epoch": 23.35514018691589, "grad_norm": 5.5625, "learning_rate": 4.857953512562084e-05, "loss": 0.3204, "step": 3760 }, { "epoch": 23.361370716510905, "grad_norm": 4.21875, "learning_rate": 4.857871635646734e-05, "loss": 0.2818, "step": 3761 }, { "epoch": 23.36760124610592, "grad_norm": 3.53125, "learning_rate": 4.857789735831263e-05, "loss": 0.1795, "step": 3762 }, { "epoch": 23.373831775700936, "grad_norm": 3.015625, "learning_rate": 4.857707813116468e-05, "loss": 0.182, "step": 3763 }, { "epoch": 23.38006230529595, "grad_norm": 4.0625, "learning_rate": 4.8576258675031424e-05, "loss": 0.2389, "step": 3764 }, { "epoch": 23.386292834890966, "grad_norm": 4.34375, "learning_rate": 4.857543898992083e-05, "loss": 0.2335, "step": 3765 }, { "epoch": 23.39252336448598, "grad_norm": 3.375, "learning_rate": 4.857461907584086e-05, "loss": 0.2177, "step": 3766 }, { "epoch": 23.398753894080997, "grad_norm": 2.5, "learning_rate": 4.8573798932799485e-05, "loss": 0.1798, "step": 3767 }, { "epoch": 23.404984423676012, "grad_norm": 3.15625, "learning_rate": 4.8572978560804655e-05, "loss": 0.2615, "step": 3768 }, { "epoch": 23.411214953271028, "grad_norm": 3.8125, "learning_rate": 4.857215795986435e-05, "loss": 0.2684, "step": 3769 }, { "epoch": 23.417445482866043, "grad_norm": 2.625, "learning_rate": 4.8571337129986525e-05, "loss": 0.1769, "step": 3770 }, { "epoch": 23.42367601246106, "grad_norm": 3.1875, "learning_rate": 4.8570516071179176e-05, "loss": 0.2048, "step": 3771 }, { "epoch": 23.429906542056074, "grad_norm": 2.1875, "learning_rate": 4.856969478345026e-05, "loss": 0.179, "step": 3772 }, { "epoch": 23.43613707165109, "grad_norm": 1.3984375, "learning_rate": 4.856887326680774e-05, "loss": 0.1612, "step": 3773 }, { "epoch": 23.442367601246104, "grad_norm": 3.515625, "learning_rate": 4.856805152125963e-05, "loss": 0.2223, "step": 3774 }, { "epoch": 23.44859813084112, "grad_norm": 1.546875, "learning_rate": 4.8567229546813894e-05, "loss": 0.1551, "step": 3775 }, { "epoch": 23.45482866043614, "grad_norm": 3.53125, "learning_rate": 4.85664073434785e-05, "loss": 0.2864, "step": 3776 }, { "epoch": 23.461059190031154, "grad_norm": 4.96875, "learning_rate": 4.856558491126146e-05, "loss": 0.1646, "step": 3777 }, { "epoch": 23.46728971962617, "grad_norm": 2.453125, "learning_rate": 4.856476225017074e-05, "loss": 0.1534, "step": 3778 }, { "epoch": 23.473520249221185, "grad_norm": 3.015625, "learning_rate": 4.8563939360214345e-05, "loss": 0.2323, "step": 3779 }, { "epoch": 23.4797507788162, "grad_norm": 2.359375, "learning_rate": 4.856311624140025e-05, "loss": 0.2755, "step": 3780 }, { "epoch": 23.485981308411215, "grad_norm": 4.15625, "learning_rate": 4.856229289373647e-05, "loss": 0.2613, "step": 3781 }, { "epoch": 23.49221183800623, "grad_norm": 2.625, "learning_rate": 4.856146931723099e-05, "loss": 0.1457, "step": 3782 }, { "epoch": 23.498442367601246, "grad_norm": 1.9296875, "learning_rate": 4.856064551189181e-05, "loss": 0.1526, "step": 3783 }, { "epoch": 23.50467289719626, "grad_norm": 2.453125, "learning_rate": 4.855982147772693e-05, "loss": 0.258, "step": 3784 }, { "epoch": 23.510903426791277, "grad_norm": 3.375, "learning_rate": 4.855899721474435e-05, "loss": 0.3029, "step": 3785 }, { "epoch": 23.517133956386292, "grad_norm": 1.7890625, "learning_rate": 4.855817272295209e-05, "loss": 0.1822, "step": 3786 }, { "epoch": 23.523364485981308, "grad_norm": 4.1875, "learning_rate": 4.855734800235814e-05, "loss": 0.2484, "step": 3787 }, { "epoch": 23.529595015576323, "grad_norm": 2.5, "learning_rate": 4.855652305297052e-05, "loss": 0.2392, "step": 3788 }, { "epoch": 23.53582554517134, "grad_norm": 2.09375, "learning_rate": 4.8555697874797236e-05, "loss": 0.1552, "step": 3789 }, { "epoch": 23.542056074766354, "grad_norm": 2.671875, "learning_rate": 4.8554872467846304e-05, "loss": 0.2612, "step": 3790 }, { "epoch": 23.54828660436137, "grad_norm": 2.015625, "learning_rate": 4.8554046832125746e-05, "loss": 0.1257, "step": 3791 }, { "epoch": 23.554517133956388, "grad_norm": 2.28125, "learning_rate": 4.855322096764358e-05, "loss": 0.172, "step": 3792 }, { "epoch": 23.560747663551403, "grad_norm": 2.46875, "learning_rate": 4.8552394874407815e-05, "loss": 0.3605, "step": 3793 }, { "epoch": 23.56697819314642, "grad_norm": 3.1875, "learning_rate": 4.855156855242648e-05, "loss": 0.246, "step": 3794 }, { "epoch": 23.573208722741434, "grad_norm": 3.09375, "learning_rate": 4.8550742001707616e-05, "loss": 0.2805, "step": 3795 }, { "epoch": 23.57943925233645, "grad_norm": 2.390625, "learning_rate": 4.854991522225923e-05, "loss": 0.1631, "step": 3796 }, { "epoch": 23.585669781931465, "grad_norm": 3.109375, "learning_rate": 4.8549088214089366e-05, "loss": 0.1756, "step": 3797 }, { "epoch": 23.59190031152648, "grad_norm": 3.625, "learning_rate": 4.854826097720604e-05, "loss": 0.2168, "step": 3798 }, { "epoch": 23.598130841121495, "grad_norm": 3.40625, "learning_rate": 4.8547433511617305e-05, "loss": 0.1934, "step": 3799 }, { "epoch": 23.60436137071651, "grad_norm": 4.09375, "learning_rate": 4.8546605817331196e-05, "loss": 0.4304, "step": 3800 }, { "epoch": 23.610591900311526, "grad_norm": 2.9375, "learning_rate": 4.8545777894355725e-05, "loss": 0.33, "step": 3801 }, { "epoch": 23.61682242990654, "grad_norm": 2.796875, "learning_rate": 4.854494974269896e-05, "loss": 0.2995, "step": 3802 }, { "epoch": 23.623052959501557, "grad_norm": 2.5, "learning_rate": 4.854412136236894e-05, "loss": 0.2689, "step": 3803 }, { "epoch": 23.629283489096572, "grad_norm": 3.859375, "learning_rate": 4.854329275337371e-05, "loss": 0.3302, "step": 3804 }, { "epoch": 23.635514018691588, "grad_norm": 3.65625, "learning_rate": 4.854246391572131e-05, "loss": 0.2732, "step": 3805 }, { "epoch": 23.641744548286603, "grad_norm": 2.15625, "learning_rate": 4.8541634849419794e-05, "loss": 0.1856, "step": 3806 }, { "epoch": 23.64797507788162, "grad_norm": 4.1875, "learning_rate": 4.854080555447722e-05, "loss": 0.3511, "step": 3807 }, { "epoch": 23.654205607476637, "grad_norm": 5.71875, "learning_rate": 4.853997603090163e-05, "loss": 0.2646, "step": 3808 }, { "epoch": 23.660436137071652, "grad_norm": 2.65625, "learning_rate": 4.8539146278701084e-05, "loss": 0.3009, "step": 3809 }, { "epoch": 23.666666666666668, "grad_norm": 3.34375, "learning_rate": 4.853831629788365e-05, "loss": 0.2525, "step": 3810 }, { "epoch": 23.672897196261683, "grad_norm": 2.421875, "learning_rate": 4.853748608845739e-05, "loss": 0.2716, "step": 3811 }, { "epoch": 23.6791277258567, "grad_norm": 4.46875, "learning_rate": 4.853665565043035e-05, "loss": 0.25, "step": 3812 }, { "epoch": 23.685358255451714, "grad_norm": 5.34375, "learning_rate": 4.8535824983810604e-05, "loss": 0.2683, "step": 3813 }, { "epoch": 23.69158878504673, "grad_norm": 3.703125, "learning_rate": 4.8534994088606225e-05, "loss": 0.4103, "step": 3814 }, { "epoch": 23.697819314641745, "grad_norm": 2.8125, "learning_rate": 4.853416296482528e-05, "loss": 0.2152, "step": 3815 }, { "epoch": 23.70404984423676, "grad_norm": 3.171875, "learning_rate": 4.853333161247583e-05, "loss": 0.2008, "step": 3816 }, { "epoch": 23.710280373831775, "grad_norm": 2.5625, "learning_rate": 4.8532500031565974e-05, "loss": 0.2102, "step": 3817 }, { "epoch": 23.71651090342679, "grad_norm": 3.625, "learning_rate": 4.853166822210376e-05, "loss": 0.2111, "step": 3818 }, { "epoch": 23.722741433021806, "grad_norm": 2.703125, "learning_rate": 4.8530836184097297e-05, "loss": 0.3231, "step": 3819 }, { "epoch": 23.72897196261682, "grad_norm": 2.09375, "learning_rate": 4.853000391755463e-05, "loss": 0.1897, "step": 3820 }, { "epoch": 23.735202492211837, "grad_norm": 1.40625, "learning_rate": 4.8529171422483876e-05, "loss": 0.1588, "step": 3821 }, { "epoch": 23.741433021806852, "grad_norm": 4.1875, "learning_rate": 4.8528338698893104e-05, "loss": 0.2871, "step": 3822 }, { "epoch": 23.747663551401867, "grad_norm": 3.875, "learning_rate": 4.85275057467904e-05, "loss": 0.4092, "step": 3823 }, { "epoch": 23.753894080996886, "grad_norm": 3.96875, "learning_rate": 4.852667256618385e-05, "loss": 0.2064, "step": 3824 }, { "epoch": 23.7601246105919, "grad_norm": 4.03125, "learning_rate": 4.8525839157081564e-05, "loss": 0.2107, "step": 3825 }, { "epoch": 23.766355140186917, "grad_norm": 4.40625, "learning_rate": 4.852500551949162e-05, "loss": 0.2808, "step": 3826 }, { "epoch": 23.772585669781932, "grad_norm": 2.625, "learning_rate": 4.8524171653422126e-05, "loss": 0.1808, "step": 3827 }, { "epoch": 23.778816199376948, "grad_norm": 3.453125, "learning_rate": 4.8523337558881166e-05, "loss": 0.2127, "step": 3828 }, { "epoch": 23.785046728971963, "grad_norm": 1.9296875, "learning_rate": 4.852250323587686e-05, "loss": 0.1839, "step": 3829 }, { "epoch": 23.79127725856698, "grad_norm": 3.078125, "learning_rate": 4.8521668684417284e-05, "loss": 0.2187, "step": 3830 }, { "epoch": 23.797507788161994, "grad_norm": 3.8125, "learning_rate": 4.852083390451058e-05, "loss": 0.3456, "step": 3831 }, { "epoch": 23.80373831775701, "grad_norm": 3.171875, "learning_rate": 4.851999889616482e-05, "loss": 0.284, "step": 3832 }, { "epoch": 23.809968847352025, "grad_norm": 2.9375, "learning_rate": 4.851916365938813e-05, "loss": 0.2092, "step": 3833 }, { "epoch": 23.81619937694704, "grad_norm": 2.734375, "learning_rate": 4.8518328194188635e-05, "loss": 0.2003, "step": 3834 }, { "epoch": 23.822429906542055, "grad_norm": 3.265625, "learning_rate": 4.8517492500574424e-05, "loss": 0.3109, "step": 3835 }, { "epoch": 23.82866043613707, "grad_norm": 1.9296875, "learning_rate": 4.8516656578553626e-05, "loss": 0.1779, "step": 3836 }, { "epoch": 23.834890965732086, "grad_norm": 3.4375, "learning_rate": 4.8515820428134364e-05, "loss": 0.1558, "step": 3837 }, { "epoch": 23.8411214953271, "grad_norm": 4.96875, "learning_rate": 4.851498404932474e-05, "loss": 0.2321, "step": 3838 }, { "epoch": 23.847352024922117, "grad_norm": 3.265625, "learning_rate": 4.851414744213291e-05, "loss": 0.2968, "step": 3839 }, { "epoch": 23.853582554517136, "grad_norm": 3.734375, "learning_rate": 4.8513310606566964e-05, "loss": 0.3211, "step": 3840 }, { "epoch": 23.85981308411215, "grad_norm": 1.6328125, "learning_rate": 4.851247354263505e-05, "loss": 0.1527, "step": 3841 }, { "epoch": 23.866043613707166, "grad_norm": 2.921875, "learning_rate": 4.8511636250345294e-05, "loss": 0.3131, "step": 3842 }, { "epoch": 23.87227414330218, "grad_norm": 2.84375, "learning_rate": 4.851079872970582e-05, "loss": 0.2452, "step": 3843 }, { "epoch": 23.878504672897197, "grad_norm": 2.859375, "learning_rate": 4.850996098072478e-05, "loss": 0.2507, "step": 3844 }, { "epoch": 23.884735202492212, "grad_norm": 2.296875, "learning_rate": 4.850912300341029e-05, "loss": 0.1646, "step": 3845 }, { "epoch": 23.890965732087228, "grad_norm": 1.890625, "learning_rate": 4.85082847977705e-05, "loss": 0.2812, "step": 3846 }, { "epoch": 23.897196261682243, "grad_norm": 2.8125, "learning_rate": 4.8507446363813554e-05, "loss": 0.2463, "step": 3847 }, { "epoch": 23.90342679127726, "grad_norm": 3.046875, "learning_rate": 4.8506607701547585e-05, "loss": 0.3616, "step": 3848 }, { "epoch": 23.909657320872274, "grad_norm": 2.6875, "learning_rate": 4.850576881098074e-05, "loss": 0.1669, "step": 3849 }, { "epoch": 23.91588785046729, "grad_norm": 3.375, "learning_rate": 4.850492969212118e-05, "loss": 0.4007, "step": 3850 }, { "epoch": 23.922118380062305, "grad_norm": 2.71875, "learning_rate": 4.850409034497704e-05, "loss": 0.1948, "step": 3851 }, { "epoch": 23.92834890965732, "grad_norm": 2.671875, "learning_rate": 4.8503250769556473e-05, "loss": 0.2905, "step": 3852 }, { "epoch": 23.934579439252335, "grad_norm": 3.765625, "learning_rate": 4.850241096586764e-05, "loss": 0.3883, "step": 3853 }, { "epoch": 23.94080996884735, "grad_norm": 2.71875, "learning_rate": 4.850157093391869e-05, "loss": 0.2125, "step": 3854 }, { "epoch": 23.947040498442366, "grad_norm": 2.328125, "learning_rate": 4.8500730673717785e-05, "loss": 0.2045, "step": 3855 }, { "epoch": 23.953271028037385, "grad_norm": 2.28125, "learning_rate": 4.849989018527309e-05, "loss": 0.191, "step": 3856 }, { "epoch": 23.9595015576324, "grad_norm": 2.859375, "learning_rate": 4.849904946859276e-05, "loss": 0.3389, "step": 3857 }, { "epoch": 23.965732087227416, "grad_norm": 2.28125, "learning_rate": 4.849820852368497e-05, "loss": 0.1709, "step": 3858 }, { "epoch": 23.97196261682243, "grad_norm": 2.453125, "learning_rate": 4.849736735055788e-05, "loss": 0.1963, "step": 3859 }, { "epoch": 23.978193146417446, "grad_norm": 2.3125, "learning_rate": 4.849652594921965e-05, "loss": 0.2104, "step": 3860 }, { "epoch": 23.98442367601246, "grad_norm": 3.140625, "learning_rate": 4.849568431967848e-05, "loss": 0.1514, "step": 3861 }, { "epoch": 23.990654205607477, "grad_norm": 3.34375, "learning_rate": 4.849484246194251e-05, "loss": 0.2347, "step": 3862 }, { "epoch": 23.996884735202492, "grad_norm": 2.515625, "learning_rate": 4.849400037601995e-05, "loss": 0.194, "step": 3863 }, { "epoch": 24.0, "grad_norm": 1.640625, "learning_rate": 4.849315806191895e-05, "loss": 0.0925, "step": 3864 }, { "epoch": 24.006230529595015, "grad_norm": 2.234375, "learning_rate": 4.849231551964771e-05, "loss": 0.1896, "step": 3865 }, { "epoch": 24.01246105919003, "grad_norm": 2.484375, "learning_rate": 4.849147274921441e-05, "loss": 0.2694, "step": 3866 }, { "epoch": 24.018691588785046, "grad_norm": 3.109375, "learning_rate": 4.849062975062722e-05, "loss": 0.3584, "step": 3867 }, { "epoch": 24.02492211838006, "grad_norm": 2.65625, "learning_rate": 4.848978652389434e-05, "loss": 0.3251, "step": 3868 }, { "epoch": 24.031152647975077, "grad_norm": 2.59375, "learning_rate": 4.848894306902396e-05, "loss": 0.2847, "step": 3869 }, { "epoch": 24.037383177570092, "grad_norm": 2.03125, "learning_rate": 4.848809938602427e-05, "loss": 0.1496, "step": 3870 }, { "epoch": 24.043613707165107, "grad_norm": 2.625, "learning_rate": 4.8487255474903465e-05, "loss": 0.3154, "step": 3871 }, { "epoch": 24.049844236760123, "grad_norm": 3.03125, "learning_rate": 4.848641133566973e-05, "loss": 0.246, "step": 3872 }, { "epoch": 24.05607476635514, "grad_norm": 3.1875, "learning_rate": 4.8485566968331286e-05, "loss": 0.2166, "step": 3873 }, { "epoch": 24.062305295950157, "grad_norm": 3.40625, "learning_rate": 4.848472237289632e-05, "loss": 0.3476, "step": 3874 }, { "epoch": 24.068535825545172, "grad_norm": 2.859375, "learning_rate": 4.848387754937303e-05, "loss": 0.2274, "step": 3875 }, { "epoch": 24.074766355140188, "grad_norm": 3.3125, "learning_rate": 4.848303249776963e-05, "loss": 0.1921, "step": 3876 }, { "epoch": 24.080996884735203, "grad_norm": 2.796875, "learning_rate": 4.848218721809432e-05, "loss": 0.3016, "step": 3877 }, { "epoch": 24.08722741433022, "grad_norm": 2.921875, "learning_rate": 4.848134171035532e-05, "loss": 0.1777, "step": 3878 }, { "epoch": 24.093457943925234, "grad_norm": 2.84375, "learning_rate": 4.8480495974560836e-05, "loss": 0.1864, "step": 3879 }, { "epoch": 24.09968847352025, "grad_norm": 2.640625, "learning_rate": 4.847965001071907e-05, "loss": 0.2472, "step": 3880 }, { "epoch": 24.105919003115265, "grad_norm": 3.703125, "learning_rate": 4.847880381883826e-05, "loss": 0.499, "step": 3881 }, { "epoch": 24.11214953271028, "grad_norm": 2.5625, "learning_rate": 4.847795739892661e-05, "loss": 0.2393, "step": 3882 }, { "epoch": 24.118380062305295, "grad_norm": 2.140625, "learning_rate": 4.847711075099235e-05, "loss": 0.1484, "step": 3883 }, { "epoch": 24.12461059190031, "grad_norm": 2.53125, "learning_rate": 4.847626387504369e-05, "loss": 0.1669, "step": 3884 }, { "epoch": 24.130841121495326, "grad_norm": 3.0625, "learning_rate": 4.8475416771088865e-05, "loss": 0.3511, "step": 3885 }, { "epoch": 24.13707165109034, "grad_norm": 2.296875, "learning_rate": 4.84745694391361e-05, "loss": 0.2196, "step": 3886 }, { "epoch": 24.143302180685357, "grad_norm": 2.078125, "learning_rate": 4.847372187919362e-05, "loss": 0.167, "step": 3887 }, { "epoch": 24.149532710280372, "grad_norm": 2.34375, "learning_rate": 4.8472874091269674e-05, "loss": 0.1962, "step": 3888 }, { "epoch": 24.15576323987539, "grad_norm": 3.546875, "learning_rate": 4.847202607537247e-05, "loss": 0.3097, "step": 3889 }, { "epoch": 24.161993769470406, "grad_norm": 4.28125, "learning_rate": 4.8471177831510264e-05, "loss": 0.3306, "step": 3890 }, { "epoch": 24.16822429906542, "grad_norm": 5.0, "learning_rate": 4.847032935969128e-05, "loss": 0.3557, "step": 3891 }, { "epoch": 24.174454828660437, "grad_norm": 2.375, "learning_rate": 4.846948065992377e-05, "loss": 0.1706, "step": 3892 }, { "epoch": 24.180685358255452, "grad_norm": 3.75, "learning_rate": 4.8468631732215975e-05, "loss": 0.198, "step": 3893 }, { "epoch": 24.186915887850468, "grad_norm": 2.265625, "learning_rate": 4.8467782576576135e-05, "loss": 0.2766, "step": 3894 }, { "epoch": 24.193146417445483, "grad_norm": 1.9375, "learning_rate": 4.8466933193012495e-05, "loss": 0.1591, "step": 3895 }, { "epoch": 24.1993769470405, "grad_norm": 2.03125, "learning_rate": 4.8466083581533316e-05, "loss": 0.1551, "step": 3896 }, { "epoch": 24.205607476635514, "grad_norm": 4.15625, "learning_rate": 4.8465233742146845e-05, "loss": 0.3925, "step": 3897 }, { "epoch": 24.21183800623053, "grad_norm": 4.34375, "learning_rate": 4.846438367486133e-05, "loss": 0.3048, "step": 3898 }, { "epoch": 24.218068535825545, "grad_norm": 2.890625, "learning_rate": 4.8463533379685024e-05, "loss": 0.1903, "step": 3899 }, { "epoch": 24.22429906542056, "grad_norm": 4.09375, "learning_rate": 4.846268285662619e-05, "loss": 0.3602, "step": 3900 }, { "epoch": 24.230529595015575, "grad_norm": 3.5625, "learning_rate": 4.8461832105693095e-05, "loss": 0.3212, "step": 3901 }, { "epoch": 24.23676012461059, "grad_norm": 2.953125, "learning_rate": 4.8460981126894e-05, "loss": 0.2745, "step": 3902 }, { "epoch": 24.242990654205606, "grad_norm": 3.96875, "learning_rate": 4.846012992023716e-05, "loss": 0.3367, "step": 3903 }, { "epoch": 24.24922118380062, "grad_norm": 2.296875, "learning_rate": 4.845927848573085e-05, "loss": 0.1477, "step": 3904 }, { "epoch": 24.25545171339564, "grad_norm": 2.625, "learning_rate": 4.845842682338334e-05, "loss": 0.2273, "step": 3905 }, { "epoch": 24.261682242990656, "grad_norm": 2.109375, "learning_rate": 4.845757493320291e-05, "loss": 0.2107, "step": 3906 }, { "epoch": 24.26791277258567, "grad_norm": 3.84375, "learning_rate": 4.845672281519781e-05, "loss": 0.3259, "step": 3907 }, { "epoch": 24.274143302180686, "grad_norm": 1.859375, "learning_rate": 4.845587046937633e-05, "loss": 0.2482, "step": 3908 }, { "epoch": 24.2803738317757, "grad_norm": 2.015625, "learning_rate": 4.845501789574674e-05, "loss": 0.1329, "step": 3909 }, { "epoch": 24.286604361370717, "grad_norm": 3.109375, "learning_rate": 4.845416509431734e-05, "loss": 0.2781, "step": 3910 }, { "epoch": 24.292834890965732, "grad_norm": 3.578125, "learning_rate": 4.84533120650964e-05, "loss": 0.2474, "step": 3911 }, { "epoch": 24.299065420560748, "grad_norm": 3.234375, "learning_rate": 4.84524588080922e-05, "loss": 0.2363, "step": 3912 }, { "epoch": 24.305295950155763, "grad_norm": 3.09375, "learning_rate": 4.8451605323313034e-05, "loss": 0.219, "step": 3913 }, { "epoch": 24.31152647975078, "grad_norm": 2.890625, "learning_rate": 4.8450751610767194e-05, "loss": 0.2897, "step": 3914 }, { "epoch": 24.317757009345794, "grad_norm": 2.28125, "learning_rate": 4.844989767046296e-05, "loss": 0.3018, "step": 3915 }, { "epoch": 24.32398753894081, "grad_norm": 2.171875, "learning_rate": 4.844904350240863e-05, "loss": 0.2067, "step": 3916 }, { "epoch": 24.330218068535824, "grad_norm": 3.125, "learning_rate": 4.8448189106612505e-05, "loss": 0.2313, "step": 3917 }, { "epoch": 24.33644859813084, "grad_norm": 1.7890625, "learning_rate": 4.8447334483082886e-05, "loss": 0.1433, "step": 3918 }, { "epoch": 24.342679127725855, "grad_norm": 2.5, "learning_rate": 4.8446479631828066e-05, "loss": 0.2056, "step": 3919 }, { "epoch": 24.34890965732087, "grad_norm": 2.9375, "learning_rate": 4.8445624552856346e-05, "loss": 0.1983, "step": 3920 }, { "epoch": 24.35514018691589, "grad_norm": 1.609375, "learning_rate": 4.844476924617603e-05, "loss": 0.1328, "step": 3921 }, { "epoch": 24.361370716510905, "grad_norm": 2.09375, "learning_rate": 4.844391371179543e-05, "loss": 0.1728, "step": 3922 }, { "epoch": 24.36760124610592, "grad_norm": 1.2890625, "learning_rate": 4.844305794972286e-05, "loss": 0.146, "step": 3923 }, { "epoch": 24.373831775700936, "grad_norm": 3.71875, "learning_rate": 4.844220195996662e-05, "loss": 0.3091, "step": 3924 }, { "epoch": 24.38006230529595, "grad_norm": 3.390625, "learning_rate": 4.8441345742535024e-05, "loss": 0.2899, "step": 3925 }, { "epoch": 24.386292834890966, "grad_norm": 2.53125, "learning_rate": 4.84404892974364e-05, "loss": 0.1446, "step": 3926 }, { "epoch": 24.39252336448598, "grad_norm": 3.484375, "learning_rate": 4.843963262467906e-05, "loss": 0.3171, "step": 3927 }, { "epoch": 24.398753894080997, "grad_norm": 1.4296875, "learning_rate": 4.8438775724271316e-05, "loss": 0.147, "step": 3928 }, { "epoch": 24.404984423676012, "grad_norm": 3.9375, "learning_rate": 4.84379185962215e-05, "loss": 0.3188, "step": 3929 }, { "epoch": 24.411214953271028, "grad_norm": 2.53125, "learning_rate": 4.8437061240537934e-05, "loss": 0.1362, "step": 3930 }, { "epoch": 24.417445482866043, "grad_norm": 1.5234375, "learning_rate": 4.843620365722894e-05, "loss": 0.1576, "step": 3931 }, { "epoch": 24.42367601246106, "grad_norm": 2.734375, "learning_rate": 4.843534584630286e-05, "loss": 0.2844, "step": 3932 }, { "epoch": 24.429906542056074, "grad_norm": 3.296875, "learning_rate": 4.8434487807768006e-05, "loss": 0.2565, "step": 3933 }, { "epoch": 24.43613707165109, "grad_norm": 4.40625, "learning_rate": 4.843362954163273e-05, "loss": 0.3394, "step": 3934 }, { "epoch": 24.442367601246104, "grad_norm": 3.1875, "learning_rate": 4.843277104790535e-05, "loss": 0.2114, "step": 3935 }, { "epoch": 24.44859813084112, "grad_norm": 2.5625, "learning_rate": 4.8431912326594226e-05, "loss": 0.2112, "step": 3936 }, { "epoch": 24.45482866043614, "grad_norm": 1.828125, "learning_rate": 4.843105337770768e-05, "loss": 0.196, "step": 3937 }, { "epoch": 24.461059190031154, "grad_norm": 5.59375, "learning_rate": 4.843019420125405e-05, "loss": 0.2452, "step": 3938 }, { "epoch": 24.46728971962617, "grad_norm": 5.40625, "learning_rate": 4.842933479724171e-05, "loss": 0.3382, "step": 3939 }, { "epoch": 24.473520249221185, "grad_norm": 4.6875, "learning_rate": 4.842847516567898e-05, "loss": 0.375, "step": 3940 }, { "epoch": 24.4797507788162, "grad_norm": 4.28125, "learning_rate": 4.84276153065742e-05, "loss": 0.3069, "step": 3941 }, { "epoch": 24.485981308411215, "grad_norm": 4.375, "learning_rate": 4.8426755219935756e-05, "loss": 0.2721, "step": 3942 }, { "epoch": 24.49221183800623, "grad_norm": 4.75, "learning_rate": 4.8425894905771975e-05, "loss": 0.3004, "step": 3943 }, { "epoch": 24.498442367601246, "grad_norm": 5.875, "learning_rate": 4.842503436409122e-05, "loss": 0.2394, "step": 3944 }, { "epoch": 24.50467289719626, "grad_norm": 3.203125, "learning_rate": 4.842417359490185e-05, "loss": 0.4061, "step": 3945 }, { "epoch": 24.510903426791277, "grad_norm": 2.234375, "learning_rate": 4.842331259821222e-05, "loss": 0.2039, "step": 3946 }, { "epoch": 24.517133956386292, "grad_norm": 2.609375, "learning_rate": 4.84224513740307e-05, "loss": 0.3308, "step": 3947 }, { "epoch": 24.523364485981308, "grad_norm": 3.640625, "learning_rate": 4.842158992236565e-05, "loss": 0.3125, "step": 3948 }, { "epoch": 24.529595015576323, "grad_norm": 3.828125, "learning_rate": 4.8420728243225435e-05, "loss": 0.4212, "step": 3949 }, { "epoch": 24.53582554517134, "grad_norm": 2.015625, "learning_rate": 4.841986633661842e-05, "loss": 0.1752, "step": 3950 }, { "epoch": 24.542056074766354, "grad_norm": 2.34375, "learning_rate": 4.8419004202553e-05, "loss": 0.1787, "step": 3951 }, { "epoch": 24.54828660436137, "grad_norm": 2.625, "learning_rate": 4.8418141841037515e-05, "loss": 0.22, "step": 3952 }, { "epoch": 24.554517133956388, "grad_norm": 2.890625, "learning_rate": 4.8417279252080363e-05, "loss": 0.2873, "step": 3953 }, { "epoch": 24.560747663551403, "grad_norm": 1.296875, "learning_rate": 4.8416416435689906e-05, "loss": 0.1223, "step": 3954 }, { "epoch": 24.56697819314642, "grad_norm": 3.890625, "learning_rate": 4.841555339187454e-05, "loss": 0.2535, "step": 3955 }, { "epoch": 24.573208722741434, "grad_norm": 2.515625, "learning_rate": 4.841469012064263e-05, "loss": 0.2551, "step": 3956 }, { "epoch": 24.57943925233645, "grad_norm": 2.234375, "learning_rate": 4.841382662200257e-05, "loss": 0.1908, "step": 3957 }, { "epoch": 24.585669781931465, "grad_norm": 2.375, "learning_rate": 4.841296289596274e-05, "loss": 0.1757, "step": 3958 }, { "epoch": 24.59190031152648, "grad_norm": 3.125, "learning_rate": 4.8412098942531546e-05, "loss": 0.3824, "step": 3959 }, { "epoch": 24.598130841121495, "grad_norm": 1.625, "learning_rate": 4.8411234761717364e-05, "loss": 0.1576, "step": 3960 }, { "epoch": 24.60436137071651, "grad_norm": 3.046875, "learning_rate": 4.841037035352859e-05, "loss": 0.3452, "step": 3961 }, { "epoch": 24.610591900311526, "grad_norm": 2.96875, "learning_rate": 4.8409505717973625e-05, "loss": 0.2098, "step": 3962 }, { "epoch": 24.61682242990654, "grad_norm": 3.34375, "learning_rate": 4.8408640855060846e-05, "loss": 0.3904, "step": 3963 }, { "epoch": 24.623052959501557, "grad_norm": 2.375, "learning_rate": 4.840777576479868e-05, "loss": 0.2154, "step": 3964 }, { "epoch": 24.629283489096572, "grad_norm": 2.109375, "learning_rate": 4.8406910447195516e-05, "loss": 0.1874, "step": 3965 }, { "epoch": 24.635514018691588, "grad_norm": 3.34375, "learning_rate": 4.840604490225976e-05, "loss": 0.3661, "step": 3966 }, { "epoch": 24.641744548286603, "grad_norm": 2.234375, "learning_rate": 4.840517912999981e-05, "loss": 0.1584, "step": 3967 }, { "epoch": 24.64797507788162, "grad_norm": 3.515625, "learning_rate": 4.840431313042408e-05, "loss": 0.4264, "step": 3968 }, { "epoch": 24.654205607476637, "grad_norm": 2.953125, "learning_rate": 4.840344690354099e-05, "loss": 0.2394, "step": 3969 }, { "epoch": 24.660436137071652, "grad_norm": 2.9375, "learning_rate": 4.840258044935895e-05, "loss": 0.2335, "step": 3970 }, { "epoch": 24.666666666666668, "grad_norm": 3.046875, "learning_rate": 4.840171376788636e-05, "loss": 0.5393, "step": 3971 }, { "epoch": 24.672897196261683, "grad_norm": 2.84375, "learning_rate": 4.8400846859131646e-05, "loss": 0.2247, "step": 3972 }, { "epoch": 24.6791277258567, "grad_norm": 1.9375, "learning_rate": 4.839997972310324e-05, "loss": 0.1655, "step": 3973 }, { "epoch": 24.685358255451714, "grad_norm": 4.53125, "learning_rate": 4.839911235980955e-05, "loss": 0.362, "step": 3974 }, { "epoch": 24.69158878504673, "grad_norm": 1.7265625, "learning_rate": 4.8398244769259e-05, "loss": 0.1694, "step": 3975 }, { "epoch": 24.697819314641745, "grad_norm": 2.203125, "learning_rate": 4.839737695146002e-05, "loss": 0.232, "step": 3976 }, { "epoch": 24.70404984423676, "grad_norm": 3.015625, "learning_rate": 4.839650890642104e-05, "loss": 0.2655, "step": 3977 }, { "epoch": 24.710280373831775, "grad_norm": 2.9375, "learning_rate": 4.8395640634150485e-05, "loss": 0.265, "step": 3978 }, { "epoch": 24.71651090342679, "grad_norm": 2.6875, "learning_rate": 4.8394772134656795e-05, "loss": 0.2032, "step": 3979 }, { "epoch": 24.722741433021806, "grad_norm": 3.171875, "learning_rate": 4.839390340794841e-05, "loss": 0.2854, "step": 3980 }, { "epoch": 24.72897196261682, "grad_norm": 2.53125, "learning_rate": 4.8393034454033745e-05, "loss": 0.1904, "step": 3981 }, { "epoch": 24.735202492211837, "grad_norm": 3.828125, "learning_rate": 4.839216527292126e-05, "loss": 0.3314, "step": 3982 }, { "epoch": 24.741433021806852, "grad_norm": 5.125, "learning_rate": 4.839129586461938e-05, "loss": 0.2799, "step": 3983 }, { "epoch": 24.747663551401867, "grad_norm": 5.71875, "learning_rate": 4.839042622913657e-05, "loss": 0.2543, "step": 3984 }, { "epoch": 24.753894080996886, "grad_norm": 2.6875, "learning_rate": 4.838955636648126e-05, "loss": 0.1551, "step": 3985 }, { "epoch": 24.7601246105919, "grad_norm": 1.765625, "learning_rate": 4.838868627666191e-05, "loss": 0.1722, "step": 3986 }, { "epoch": 24.766355140186917, "grad_norm": 4.46875, "learning_rate": 4.838781595968695e-05, "loss": 0.2786, "step": 3987 }, { "epoch": 24.772585669781932, "grad_norm": 4.71875, "learning_rate": 4.838694541556486e-05, "loss": 0.1866, "step": 3988 }, { "epoch": 24.778816199376948, "grad_norm": 5.03125, "learning_rate": 4.838607464430408e-05, "loss": 0.2253, "step": 3989 }, { "epoch": 24.785046728971963, "grad_norm": 2.796875, "learning_rate": 4.838520364591306e-05, "loss": 0.1394, "step": 3990 }, { "epoch": 24.79127725856698, "grad_norm": 3.578125, "learning_rate": 4.838433242040027e-05, "loss": 0.3372, "step": 3991 }, { "epoch": 24.797507788161994, "grad_norm": 4.90625, "learning_rate": 4.8383460967774166e-05, "loss": 0.2044, "step": 3992 }, { "epoch": 24.80373831775701, "grad_norm": 3.96875, "learning_rate": 4.838258928804322e-05, "loss": 0.2155, "step": 3993 }, { "epoch": 24.809968847352025, "grad_norm": 4.28125, "learning_rate": 4.83817173812159e-05, "loss": 0.1546, "step": 3994 }, { "epoch": 24.81619937694704, "grad_norm": 3.375, "learning_rate": 4.838084524730065e-05, "loss": 0.2863, "step": 3995 }, { "epoch": 24.822429906542055, "grad_norm": 3.265625, "learning_rate": 4.8379972886305966e-05, "loss": 0.227, "step": 3996 }, { "epoch": 24.82866043613707, "grad_norm": 3.171875, "learning_rate": 4.837910029824031e-05, "loss": 0.2454, "step": 3997 }, { "epoch": 24.834890965732086, "grad_norm": 3.6875, "learning_rate": 4.837822748311217e-05, "loss": 0.3773, "step": 3998 }, { "epoch": 24.8411214953271, "grad_norm": 1.953125, "learning_rate": 4.8377354440930003e-05, "loss": 0.1471, "step": 3999 }, { "epoch": 24.847352024922117, "grad_norm": 2.125, "learning_rate": 4.8376481171702294e-05, "loss": 0.1515, "step": 4000 }, { "epoch": 24.853582554517136, "grad_norm": 2.21875, "learning_rate": 4.837560767543753e-05, "loss": 0.206, "step": 4001 }, { "epoch": 24.85981308411215, "grad_norm": 3.59375, "learning_rate": 4.8374733952144186e-05, "loss": 0.2463, "step": 4002 }, { "epoch": 24.866043613707166, "grad_norm": 2.8125, "learning_rate": 4.8373860001830755e-05, "loss": 0.1872, "step": 4003 }, { "epoch": 24.87227414330218, "grad_norm": 3.8125, "learning_rate": 4.837298582450573e-05, "loss": 0.1736, "step": 4004 }, { "epoch": 24.878504672897197, "grad_norm": 2.25, "learning_rate": 4.837211142017759e-05, "loss": 0.193, "step": 4005 }, { "epoch": 24.884735202492212, "grad_norm": 3.484375, "learning_rate": 4.8371236788854835e-05, "loss": 0.3578, "step": 4006 }, { "epoch": 24.890965732087228, "grad_norm": 4.6875, "learning_rate": 4.837036193054595e-05, "loss": 0.1488, "step": 4007 }, { "epoch": 24.897196261682243, "grad_norm": 3.171875, "learning_rate": 4.8369486845259436e-05, "loss": 0.2902, "step": 4008 }, { "epoch": 24.90342679127726, "grad_norm": 2.921875, "learning_rate": 4.83686115330038e-05, "loss": 0.1912, "step": 4009 }, { "epoch": 24.909657320872274, "grad_norm": 2.34375, "learning_rate": 4.836773599378754e-05, "loss": 0.2035, "step": 4010 }, { "epoch": 24.91588785046729, "grad_norm": 2.96875, "learning_rate": 4.836686022761916e-05, "loss": 0.1494, "step": 4011 }, { "epoch": 24.922118380062305, "grad_norm": 3.828125, "learning_rate": 4.836598423450716e-05, "loss": 0.1863, "step": 4012 }, { "epoch": 24.92834890965732, "grad_norm": 4.6875, "learning_rate": 4.836510801446004e-05, "loss": 0.2902, "step": 4013 }, { "epoch": 24.934579439252335, "grad_norm": 2.5625, "learning_rate": 4.836423156748633e-05, "loss": 0.203, "step": 4014 }, { "epoch": 24.94080996884735, "grad_norm": 2.171875, "learning_rate": 4.836335489359453e-05, "loss": 0.1933, "step": 4015 }, { "epoch": 24.947040498442366, "grad_norm": 2.296875, "learning_rate": 4.8362477992793155e-05, "loss": 0.2052, "step": 4016 }, { "epoch": 24.953271028037385, "grad_norm": 3.578125, "learning_rate": 4.836160086509073e-05, "loss": 0.2079, "step": 4017 }, { "epoch": 24.9595015576324, "grad_norm": 4.4375, "learning_rate": 4.836072351049576e-05, "loss": 0.2795, "step": 4018 }, { "epoch": 24.965732087227416, "grad_norm": 4.0625, "learning_rate": 4.835984592901678e-05, "loss": 0.3119, "step": 4019 }, { "epoch": 24.97196261682243, "grad_norm": 2.828125, "learning_rate": 4.8358968120662305e-05, "loss": 0.2265, "step": 4020 }, { "epoch": 24.978193146417446, "grad_norm": 2.484375, "learning_rate": 4.8358090085440864e-05, "loss": 0.1799, "step": 4021 }, { "epoch": 24.98442367601246, "grad_norm": 2.90625, "learning_rate": 4.8357211823360974e-05, "loss": 0.2099, "step": 4022 }, { "epoch": 24.990654205607477, "grad_norm": 1.7734375, "learning_rate": 4.835633333443118e-05, "loss": 0.1996, "step": 4023 }, { "epoch": 24.996884735202492, "grad_norm": 2.140625, "learning_rate": 4.835545461866001e-05, "loss": 0.1692, "step": 4024 }, { "epoch": 25.0, "grad_norm": 1.7890625, "learning_rate": 4.835457567605599e-05, "loss": 0.1461, "step": 4025 }, { "epoch": 25.006230529595015, "grad_norm": 2.1875, "learning_rate": 4.835369650662767e-05, "loss": 0.2209, "step": 4026 }, { "epoch": 25.01246105919003, "grad_norm": 3.90625, "learning_rate": 4.8352817110383574e-05, "loss": 0.307, "step": 4027 }, { "epoch": 25.018691588785046, "grad_norm": 2.03125, "learning_rate": 4.8351937487332256e-05, "loss": 0.1428, "step": 4028 }, { "epoch": 25.02492211838006, "grad_norm": 3.03125, "learning_rate": 4.8351057637482255e-05, "loss": 0.2877, "step": 4029 }, { "epoch": 25.031152647975077, "grad_norm": 2.0625, "learning_rate": 4.835017756084211e-05, "loss": 0.241, "step": 4030 }, { "epoch": 25.037383177570092, "grad_norm": 1.8984375, "learning_rate": 4.8349297257420367e-05, "loss": 0.1917, "step": 4031 }, { "epoch": 25.043613707165107, "grad_norm": 1.9453125, "learning_rate": 4.8348416727225595e-05, "loss": 0.1747, "step": 4032 }, { "epoch": 25.049844236760123, "grad_norm": 3.265625, "learning_rate": 4.834753597026632e-05, "loss": 0.3358, "step": 4033 }, { "epoch": 25.05607476635514, "grad_norm": 3.296875, "learning_rate": 4.8346654986551115e-05, "loss": 0.2664, "step": 4034 }, { "epoch": 25.062305295950157, "grad_norm": 2.640625, "learning_rate": 4.834577377608853e-05, "loss": 0.2513, "step": 4035 }, { "epoch": 25.068535825545172, "grad_norm": 2.65625, "learning_rate": 4.834489233888712e-05, "loss": 0.2228, "step": 4036 }, { "epoch": 25.074766355140188, "grad_norm": 3.3125, "learning_rate": 4.834401067495545e-05, "loss": 0.3459, "step": 4037 }, { "epoch": 25.080996884735203, "grad_norm": 2.484375, "learning_rate": 4.8343128784302085e-05, "loss": 0.2211, "step": 4038 }, { "epoch": 25.08722741433022, "grad_norm": 2.171875, "learning_rate": 4.834224666693559e-05, "loss": 0.216, "step": 4039 }, { "epoch": 25.093457943925234, "grad_norm": 2.546875, "learning_rate": 4.8341364322864523e-05, "loss": 0.2223, "step": 4040 }, { "epoch": 25.09968847352025, "grad_norm": 2.46875, "learning_rate": 4.834048175209746e-05, "loss": 0.2144, "step": 4041 }, { "epoch": 25.105919003115265, "grad_norm": 1.671875, "learning_rate": 4.8339598954642976e-05, "loss": 0.1499, "step": 4042 }, { "epoch": 25.11214953271028, "grad_norm": 2.265625, "learning_rate": 4.8338715930509636e-05, "loss": 0.175, "step": 4043 }, { "epoch": 25.118380062305295, "grad_norm": 2.609375, "learning_rate": 4.8337832679706024e-05, "loss": 0.231, "step": 4044 }, { "epoch": 25.12461059190031, "grad_norm": 3.09375, "learning_rate": 4.833694920224072e-05, "loss": 0.3172, "step": 4045 }, { "epoch": 25.130841121495326, "grad_norm": 2.828125, "learning_rate": 4.83360654981223e-05, "loss": 0.2139, "step": 4046 }, { "epoch": 25.13707165109034, "grad_norm": 1.9921875, "learning_rate": 4.833518156735934e-05, "loss": 0.1284, "step": 4047 }, { "epoch": 25.143302180685357, "grad_norm": 3.046875, "learning_rate": 4.833429740996044e-05, "loss": 0.3139, "step": 4048 }, { "epoch": 25.149532710280372, "grad_norm": 3.125, "learning_rate": 4.833341302593417e-05, "loss": 0.2919, "step": 4049 }, { "epoch": 25.15576323987539, "grad_norm": 2.71875, "learning_rate": 4.8332528415289136e-05, "loss": 0.2635, "step": 4050 }, { "epoch": 25.161993769470406, "grad_norm": 1.796875, "learning_rate": 4.833164357803392e-05, "loss": 0.1756, "step": 4051 }, { "epoch": 25.16822429906542, "grad_norm": 2.84375, "learning_rate": 4.8330758514177114e-05, "loss": 0.2422, "step": 4052 }, { "epoch": 25.174454828660437, "grad_norm": 4.0625, "learning_rate": 4.8329873223727326e-05, "loss": 0.2887, "step": 4053 }, { "epoch": 25.180685358255452, "grad_norm": 2.34375, "learning_rate": 4.832898770669314e-05, "loss": 0.2898, "step": 4054 }, { "epoch": 25.186915887850468, "grad_norm": 3.109375, "learning_rate": 4.832810196308316e-05, "loss": 0.3248, "step": 4055 }, { "epoch": 25.193146417445483, "grad_norm": 2.234375, "learning_rate": 4.8327215992905995e-05, "loss": 0.1969, "step": 4056 }, { "epoch": 25.1993769470405, "grad_norm": 1.9609375, "learning_rate": 4.832632979617024e-05, "loss": 0.1933, "step": 4057 }, { "epoch": 25.205607476635514, "grad_norm": 2.65625, "learning_rate": 4.832544337288452e-05, "loss": 0.2108, "step": 4058 }, { "epoch": 25.21183800623053, "grad_norm": 2.96875, "learning_rate": 4.832455672305741e-05, "loss": 0.3195, "step": 4059 }, { "epoch": 25.218068535825545, "grad_norm": 2.078125, "learning_rate": 4.8323669846697564e-05, "loss": 0.2648, "step": 4060 }, { "epoch": 25.22429906542056, "grad_norm": 2.625, "learning_rate": 4.832278274381356e-05, "loss": 0.2628, "step": 4061 }, { "epoch": 25.230529595015575, "grad_norm": 2.90625, "learning_rate": 4.832189541441403e-05, "loss": 0.1908, "step": 4062 }, { "epoch": 25.23676012461059, "grad_norm": 3.21875, "learning_rate": 4.8321007858507594e-05, "loss": 0.3498, "step": 4063 }, { "epoch": 25.242990654205606, "grad_norm": 3.609375, "learning_rate": 4.832012007610287e-05, "loss": 0.2644, "step": 4064 }, { "epoch": 25.24922118380062, "grad_norm": 1.5390625, "learning_rate": 4.831923206720848e-05, "loss": 0.1374, "step": 4065 }, { "epoch": 25.25545171339564, "grad_norm": 2.8125, "learning_rate": 4.8318343831833035e-05, "loss": 0.204, "step": 4066 }, { "epoch": 25.261682242990656, "grad_norm": 1.5546875, "learning_rate": 4.8317455369985174e-05, "loss": 0.1468, "step": 4067 }, { "epoch": 25.26791277258567, "grad_norm": 3.328125, "learning_rate": 4.831656668167353e-05, "loss": 0.2338, "step": 4068 }, { "epoch": 25.274143302180686, "grad_norm": 4.0625, "learning_rate": 4.831567776690674e-05, "loss": 0.3726, "step": 4069 }, { "epoch": 25.2803738317757, "grad_norm": 3.21875, "learning_rate": 4.831478862569342e-05, "loss": 0.2763, "step": 4070 }, { "epoch": 25.286604361370717, "grad_norm": 1.9765625, "learning_rate": 4.8313899258042206e-05, "loss": 0.2347, "step": 4071 }, { "epoch": 25.292834890965732, "grad_norm": 2.640625, "learning_rate": 4.8313009663961746e-05, "loss": 0.1761, "step": 4072 }, { "epoch": 25.299065420560748, "grad_norm": 2.9375, "learning_rate": 4.831211984346067e-05, "loss": 0.2241, "step": 4073 }, { "epoch": 25.305295950155763, "grad_norm": 2.828125, "learning_rate": 4.8311229796547634e-05, "loss": 0.224, "step": 4074 }, { "epoch": 25.31152647975078, "grad_norm": 3.515625, "learning_rate": 4.8310339523231276e-05, "loss": 0.189, "step": 4075 }, { "epoch": 25.317757009345794, "grad_norm": 1.9765625, "learning_rate": 4.830944902352024e-05, "loss": 0.2463, "step": 4076 }, { "epoch": 25.32398753894081, "grad_norm": 3.53125, "learning_rate": 4.830855829742317e-05, "loss": 0.3409, "step": 4077 }, { "epoch": 25.330218068535824, "grad_norm": 2.734375, "learning_rate": 4.830766734494873e-05, "loss": 0.2703, "step": 4078 }, { "epoch": 25.33644859813084, "grad_norm": 4.6875, "learning_rate": 4.830677616610556e-05, "loss": 0.303, "step": 4079 }, { "epoch": 25.342679127725855, "grad_norm": 5.34375, "learning_rate": 4.830588476090232e-05, "loss": 0.4538, "step": 4080 }, { "epoch": 25.34890965732087, "grad_norm": 1.25, "learning_rate": 4.8304993129347664e-05, "loss": 0.1393, "step": 4081 }, { "epoch": 25.35514018691589, "grad_norm": 1.7734375, "learning_rate": 4.830410127145026e-05, "loss": 0.2075, "step": 4082 }, { "epoch": 25.361370716510905, "grad_norm": 4.375, "learning_rate": 4.830320918721878e-05, "loss": 0.1837, "step": 4083 }, { "epoch": 25.36760124610592, "grad_norm": 3.40625, "learning_rate": 4.830231687666186e-05, "loss": 0.1819, "step": 4084 }, { "epoch": 25.373831775700936, "grad_norm": 2.84375, "learning_rate": 4.830142433978818e-05, "loss": 0.1953, "step": 4085 }, { "epoch": 25.38006230529595, "grad_norm": 3.453125, "learning_rate": 4.830053157660641e-05, "loss": 0.3821, "step": 4086 }, { "epoch": 25.386292834890966, "grad_norm": 4.25, "learning_rate": 4.829963858712523e-05, "loss": 0.2432, "step": 4087 }, { "epoch": 25.39252336448598, "grad_norm": 3.28125, "learning_rate": 4.829874537135329e-05, "loss": 0.2306, "step": 4088 }, { "epoch": 25.398753894080997, "grad_norm": 2.953125, "learning_rate": 4.8297851929299284e-05, "loss": 0.3973, "step": 4089 }, { "epoch": 25.404984423676012, "grad_norm": 2.421875, "learning_rate": 4.829695826097188e-05, "loss": 0.28, "step": 4090 }, { "epoch": 25.411214953271028, "grad_norm": 1.765625, "learning_rate": 4.829606436637976e-05, "loss": 0.1505, "step": 4091 }, { "epoch": 25.417445482866043, "grad_norm": 3.28125, "learning_rate": 4.8295170245531605e-05, "loss": 0.2812, "step": 4092 }, { "epoch": 25.42367601246106, "grad_norm": 3.359375, "learning_rate": 4.8294275898436105e-05, "loss": 0.303, "step": 4093 }, { "epoch": 25.429906542056074, "grad_norm": 3.6875, "learning_rate": 4.829338132510194e-05, "loss": 0.3126, "step": 4094 }, { "epoch": 25.43613707165109, "grad_norm": 2.515625, "learning_rate": 4.829248652553779e-05, "loss": 0.1786, "step": 4095 }, { "epoch": 25.442367601246104, "grad_norm": 2.96875, "learning_rate": 4.8291591499752365e-05, "loss": 0.2912, "step": 4096 }, { "epoch": 25.44859813084112, "grad_norm": 2.234375, "learning_rate": 4.829069624775435e-05, "loss": 0.2125, "step": 4097 }, { "epoch": 25.45482866043614, "grad_norm": 2.9375, "learning_rate": 4.828980076955243e-05, "loss": 0.1926, "step": 4098 }, { "epoch": 25.461059190031154, "grad_norm": 2.265625, "learning_rate": 4.828890506515531e-05, "loss": 0.1635, "step": 4099 }, { "epoch": 25.46728971962617, "grad_norm": 2.703125, "learning_rate": 4.828800913457169e-05, "loss": 0.2471, "step": 4100 }, { "epoch": 25.473520249221185, "grad_norm": 2.90625, "learning_rate": 4.8287112977810274e-05, "loss": 0.2444, "step": 4101 }, { "epoch": 25.4797507788162, "grad_norm": 3.265625, "learning_rate": 4.828621659487976e-05, "loss": 0.2892, "step": 4102 }, { "epoch": 25.485981308411215, "grad_norm": 2.71875, "learning_rate": 4.828531998578885e-05, "loss": 0.1782, "step": 4103 }, { "epoch": 25.49221183800623, "grad_norm": 2.3125, "learning_rate": 4.828442315054627e-05, "loss": 0.1786, "step": 4104 }, { "epoch": 25.498442367601246, "grad_norm": 1.3203125, "learning_rate": 4.8283526089160705e-05, "loss": 0.1305, "step": 4105 }, { "epoch": 25.50467289719626, "grad_norm": 4.3125, "learning_rate": 4.8282628801640886e-05, "loss": 0.2982, "step": 4106 }, { "epoch": 25.510903426791277, "grad_norm": 3.84375, "learning_rate": 4.828173128799551e-05, "loss": 0.4291, "step": 4107 }, { "epoch": 25.517133956386292, "grad_norm": 3.046875, "learning_rate": 4.8280833548233324e-05, "loss": 0.2148, "step": 4108 }, { "epoch": 25.523364485981308, "grad_norm": 3.671875, "learning_rate": 4.827993558236302e-05, "loss": 0.2714, "step": 4109 }, { "epoch": 25.529595015576323, "grad_norm": 2.03125, "learning_rate": 4.827903739039333e-05, "loss": 0.211, "step": 4110 }, { "epoch": 25.53582554517134, "grad_norm": 2.34375, "learning_rate": 4.827813897233298e-05, "loss": 0.1616, "step": 4111 }, { "epoch": 25.542056074766354, "grad_norm": 4.3125, "learning_rate": 4.8277240328190676e-05, "loss": 0.3285, "step": 4112 }, { "epoch": 25.54828660436137, "grad_norm": 3.671875, "learning_rate": 4.827634145797517e-05, "loss": 0.1585, "step": 4113 }, { "epoch": 25.554517133956388, "grad_norm": 2.515625, "learning_rate": 4.827544236169519e-05, "loss": 0.1751, "step": 4114 }, { "epoch": 25.560747663551403, "grad_norm": 3.875, "learning_rate": 4.827454303935946e-05, "loss": 0.1748, "step": 4115 }, { "epoch": 25.56697819314642, "grad_norm": 3.828125, "learning_rate": 4.827364349097671e-05, "loss": 0.2258, "step": 4116 }, { "epoch": 25.573208722741434, "grad_norm": 2.828125, "learning_rate": 4.8272743716555674e-05, "loss": 0.2166, "step": 4117 }, { "epoch": 25.57943925233645, "grad_norm": 2.4375, "learning_rate": 4.827184371610511e-05, "loss": 0.1433, "step": 4118 }, { "epoch": 25.585669781931465, "grad_norm": 3.484375, "learning_rate": 4.827094348963375e-05, "loss": 0.2704, "step": 4119 }, { "epoch": 25.59190031152648, "grad_norm": 4.375, "learning_rate": 4.8270043037150324e-05, "loss": 0.29, "step": 4120 }, { "epoch": 25.598130841121495, "grad_norm": 2.484375, "learning_rate": 4.8269142358663596e-05, "loss": 0.2731, "step": 4121 }, { "epoch": 25.60436137071651, "grad_norm": 1.5546875, "learning_rate": 4.8268241454182304e-05, "loss": 0.1227, "step": 4122 }, { "epoch": 25.610591900311526, "grad_norm": 3.75, "learning_rate": 4.8267340323715203e-05, "loss": 0.2936, "step": 4123 }, { "epoch": 25.61682242990654, "grad_norm": 2.578125, "learning_rate": 4.8266438967271036e-05, "loss": 0.1934, "step": 4124 }, { "epoch": 25.623052959501557, "grad_norm": 2.71875, "learning_rate": 4.826553738485857e-05, "loss": 0.176, "step": 4125 }, { "epoch": 25.629283489096572, "grad_norm": 3.15625, "learning_rate": 4.826463557648655e-05, "loss": 0.2092, "step": 4126 }, { "epoch": 25.635514018691588, "grad_norm": 2.6875, "learning_rate": 4.826373354216373e-05, "loss": 0.2399, "step": 4127 }, { "epoch": 25.641744548286603, "grad_norm": 2.359375, "learning_rate": 4.8262831281898893e-05, "loss": 0.2838, "step": 4128 }, { "epoch": 25.64797507788162, "grad_norm": 2.375, "learning_rate": 4.826192879570078e-05, "loss": 0.1749, "step": 4129 }, { "epoch": 25.654205607476637, "grad_norm": 3.65625, "learning_rate": 4.826102608357817e-05, "loss": 0.2197, "step": 4130 }, { "epoch": 25.660436137071652, "grad_norm": 2.765625, "learning_rate": 4.826012314553983e-05, "loss": 0.2354, "step": 4131 }, { "epoch": 25.666666666666668, "grad_norm": 2.65625, "learning_rate": 4.825921998159452e-05, "loss": 0.2259, "step": 4132 }, { "epoch": 25.672897196261683, "grad_norm": 2.390625, "learning_rate": 4.825831659175101e-05, "loss": 0.2219, "step": 4133 }, { "epoch": 25.6791277258567, "grad_norm": 3.265625, "learning_rate": 4.8257412976018084e-05, "loss": 0.2159, "step": 4134 }, { "epoch": 25.685358255451714, "grad_norm": 2.5625, "learning_rate": 4.825650913440452e-05, "loss": 0.2088, "step": 4135 }, { "epoch": 25.69158878504673, "grad_norm": 2.75, "learning_rate": 4.825560506691908e-05, "loss": 0.2283, "step": 4136 }, { "epoch": 25.697819314641745, "grad_norm": 3.25, "learning_rate": 4.825470077357056e-05, "loss": 0.2716, "step": 4137 }, { "epoch": 25.70404984423676, "grad_norm": 1.984375, "learning_rate": 4.825379625436773e-05, "loss": 0.1515, "step": 4138 }, { "epoch": 25.710280373831775, "grad_norm": 2.140625, "learning_rate": 4.8252891509319395e-05, "loss": 0.1506, "step": 4139 }, { "epoch": 25.71651090342679, "grad_norm": 1.890625, "learning_rate": 4.825198653843432e-05, "loss": 0.1216, "step": 4140 }, { "epoch": 25.722741433021806, "grad_norm": 3.03125, "learning_rate": 4.825108134172131e-05, "loss": 0.1963, "step": 4141 }, { "epoch": 25.72897196261682, "grad_norm": 2.53125, "learning_rate": 4.825017591918915e-05, "loss": 0.2236, "step": 4142 }, { "epoch": 25.735202492211837, "grad_norm": 2.15625, "learning_rate": 4.824927027084663e-05, "loss": 0.2192, "step": 4143 }, { "epoch": 25.741433021806852, "grad_norm": 2.046875, "learning_rate": 4.824836439670255e-05, "loss": 0.227, "step": 4144 }, { "epoch": 25.747663551401867, "grad_norm": 2.34375, "learning_rate": 4.824745829676571e-05, "loss": 0.1908, "step": 4145 }, { "epoch": 25.753894080996886, "grad_norm": 1.8125, "learning_rate": 4.8246551971044906e-05, "loss": 0.1968, "step": 4146 }, { "epoch": 25.7601246105919, "grad_norm": 2.140625, "learning_rate": 4.8245645419548945e-05, "loss": 0.1974, "step": 4147 }, { "epoch": 25.766355140186917, "grad_norm": 3.0625, "learning_rate": 4.824473864228663e-05, "loss": 0.3448, "step": 4148 }, { "epoch": 25.772585669781932, "grad_norm": 3.0, "learning_rate": 4.824383163926676e-05, "loss": 0.1822, "step": 4149 }, { "epoch": 25.778816199376948, "grad_norm": 3.25, "learning_rate": 4.824292441049816e-05, "loss": 0.1543, "step": 4150 }, { "epoch": 25.785046728971963, "grad_norm": 2.1875, "learning_rate": 4.824201695598963e-05, "loss": 0.2074, "step": 4151 }, { "epoch": 25.79127725856698, "grad_norm": 5.21875, "learning_rate": 4.8241109275749976e-05, "loss": 0.2024, "step": 4152 }, { "epoch": 25.797507788161994, "grad_norm": 3.53125, "learning_rate": 4.824020136978803e-05, "loss": 0.2855, "step": 4153 }, { "epoch": 25.80373831775701, "grad_norm": 3.59375, "learning_rate": 4.82392932381126e-05, "loss": 0.2518, "step": 4154 }, { "epoch": 25.809968847352025, "grad_norm": 2.25, "learning_rate": 4.823838488073251e-05, "loss": 0.1656, "step": 4155 }, { "epoch": 25.81619937694704, "grad_norm": 2.390625, "learning_rate": 4.823747629765659e-05, "loss": 0.1913, "step": 4156 }, { "epoch": 25.822429906542055, "grad_norm": 2.15625, "learning_rate": 4.823656748889364e-05, "loss": 0.1622, "step": 4157 }, { "epoch": 25.82866043613707, "grad_norm": 4.15625, "learning_rate": 4.823565845445251e-05, "loss": 0.2439, "step": 4158 }, { "epoch": 25.834890965732086, "grad_norm": 3.125, "learning_rate": 4.823474919434202e-05, "loss": 0.2527, "step": 4159 }, { "epoch": 25.8411214953271, "grad_norm": 2.65625, "learning_rate": 4.8233839708571e-05, "loss": 0.2993, "step": 4160 }, { "epoch": 25.847352024922117, "grad_norm": 2.578125, "learning_rate": 4.8232929997148276e-05, "loss": 0.2843, "step": 4161 }, { "epoch": 25.853582554517136, "grad_norm": 3.21875, "learning_rate": 4.8232020060082706e-05, "loss": 0.3185, "step": 4162 }, { "epoch": 25.85981308411215, "grad_norm": 2.40625, "learning_rate": 4.823110989738311e-05, "loss": 0.1663, "step": 4163 }, { "epoch": 25.866043613707166, "grad_norm": 2.359375, "learning_rate": 4.8230199509058326e-05, "loss": 0.1881, "step": 4164 }, { "epoch": 25.87227414330218, "grad_norm": 2.25, "learning_rate": 4.82292888951172e-05, "loss": 0.1935, "step": 4165 }, { "epoch": 25.878504672897197, "grad_norm": 3.15625, "learning_rate": 4.822837805556858e-05, "loss": 0.1472, "step": 4166 }, { "epoch": 25.884735202492212, "grad_norm": 3.8125, "learning_rate": 4.822746699042131e-05, "loss": 0.348, "step": 4167 }, { "epoch": 25.890965732087228, "grad_norm": 2.984375, "learning_rate": 4.822655569968424e-05, "loss": 0.228, "step": 4168 }, { "epoch": 25.897196261682243, "grad_norm": 2.5625, "learning_rate": 4.8225644183366216e-05, "loss": 0.1748, "step": 4169 }, { "epoch": 25.90342679127726, "grad_norm": 4.4375, "learning_rate": 4.8224732441476086e-05, "loss": 0.2482, "step": 4170 }, { "epoch": 25.909657320872274, "grad_norm": 1.9921875, "learning_rate": 4.8223820474022716e-05, "loss": 0.1526, "step": 4171 }, { "epoch": 25.91588785046729, "grad_norm": 2.03125, "learning_rate": 4.822290828101496e-05, "loss": 0.1428, "step": 4172 }, { "epoch": 25.922118380062305, "grad_norm": 3.265625, "learning_rate": 4.822199586246168e-05, "loss": 0.1863, "step": 4173 }, { "epoch": 25.92834890965732, "grad_norm": 2.0625, "learning_rate": 4.822108321837173e-05, "loss": 0.2521, "step": 4174 }, { "epoch": 25.934579439252335, "grad_norm": 3.765625, "learning_rate": 4.822017034875398e-05, "loss": 0.1658, "step": 4175 }, { "epoch": 25.94080996884735, "grad_norm": 3.296875, "learning_rate": 4.821925725361729e-05, "loss": 0.3191, "step": 4176 }, { "epoch": 25.947040498442366, "grad_norm": 3.390625, "learning_rate": 4.8218343932970535e-05, "loss": 0.2976, "step": 4177 }, { "epoch": 25.953271028037385, "grad_norm": 2.671875, "learning_rate": 4.8217430386822585e-05, "loss": 0.1234, "step": 4178 }, { "epoch": 25.9595015576324, "grad_norm": 3.671875, "learning_rate": 4.821651661518231e-05, "loss": 0.2349, "step": 4179 }, { "epoch": 25.965732087227416, "grad_norm": 1.4296875, "learning_rate": 4.821560261805858e-05, "loss": 0.1155, "step": 4180 }, { "epoch": 25.97196261682243, "grad_norm": 1.6640625, "learning_rate": 4.821468839546028e-05, "loss": 0.1745, "step": 4181 }, { "epoch": 25.978193146417446, "grad_norm": 4.3125, "learning_rate": 4.8213773947396284e-05, "loss": 0.2627, "step": 4182 }, { "epoch": 25.98442367601246, "grad_norm": 4.53125, "learning_rate": 4.8212859273875474e-05, "loss": 0.4228, "step": 4183 }, { "epoch": 25.990654205607477, "grad_norm": 4.5, "learning_rate": 4.8211944374906735e-05, "loss": 0.2879, "step": 4184 }, { "epoch": 25.996884735202492, "grad_norm": 2.125, "learning_rate": 4.821102925049896e-05, "loss": 0.2256, "step": 4185 }, { "epoch": 26.0, "grad_norm": 1.453125, "learning_rate": 4.821011390066102e-05, "loss": 0.1276, "step": 4186 }, { "epoch": 26.006230529595015, "grad_norm": 3.71875, "learning_rate": 4.8209198325401815e-05, "loss": 0.211, "step": 4187 }, { "epoch": 26.01246105919003, "grad_norm": 3.046875, "learning_rate": 4.8208282524730244e-05, "loss": 0.1594, "step": 4188 }, { "epoch": 26.018691588785046, "grad_norm": 3.828125, "learning_rate": 4.820736649865518e-05, "loss": 0.2208, "step": 4189 }, { "epoch": 26.02492211838006, "grad_norm": 3.125, "learning_rate": 4.820645024718555e-05, "loss": 0.2754, "step": 4190 }, { "epoch": 26.031152647975077, "grad_norm": 2.703125, "learning_rate": 4.8205533770330226e-05, "loss": 0.1459, "step": 4191 }, { "epoch": 26.037383177570092, "grad_norm": 2.109375, "learning_rate": 4.8204617068098124e-05, "loss": 0.1536, "step": 4192 }, { "epoch": 26.043613707165107, "grad_norm": 3.28125, "learning_rate": 4.8203700140498135e-05, "loss": 0.2202, "step": 4193 }, { "epoch": 26.049844236760123, "grad_norm": 2.75, "learning_rate": 4.820278298753918e-05, "loss": 0.1792, "step": 4194 }, { "epoch": 26.05607476635514, "grad_norm": 1.59375, "learning_rate": 4.8201865609230156e-05, "loss": 0.1266, "step": 4195 }, { "epoch": 26.062305295950157, "grad_norm": 1.2265625, "learning_rate": 4.8200948005579974e-05, "loss": 0.1295, "step": 4196 }, { "epoch": 26.068535825545172, "grad_norm": 3.765625, "learning_rate": 4.820003017659755e-05, "loss": 0.3194, "step": 4197 }, { "epoch": 26.074766355140188, "grad_norm": 3.0, "learning_rate": 4.819911212229179e-05, "loss": 0.2123, "step": 4198 }, { "epoch": 26.080996884735203, "grad_norm": 2.46875, "learning_rate": 4.8198193842671625e-05, "loss": 0.2313, "step": 4199 }, { "epoch": 26.08722741433022, "grad_norm": 3.6875, "learning_rate": 4.819727533774595e-05, "loss": 0.3561, "step": 4200 }, { "epoch": 26.093457943925234, "grad_norm": 1.8984375, "learning_rate": 4.8196356607523713e-05, "loss": 0.1722, "step": 4201 }, { "epoch": 26.09968847352025, "grad_norm": 2.703125, "learning_rate": 4.819543765201382e-05, "loss": 0.2561, "step": 4202 }, { "epoch": 26.105919003115265, "grad_norm": 2.96875, "learning_rate": 4.8194518471225194e-05, "loss": 0.185, "step": 4203 }, { "epoch": 26.11214953271028, "grad_norm": 2.640625, "learning_rate": 4.8193599065166774e-05, "loss": 0.1259, "step": 4204 }, { "epoch": 26.118380062305295, "grad_norm": 2.0625, "learning_rate": 4.819267943384748e-05, "loss": 0.22, "step": 4205 }, { "epoch": 26.12461059190031, "grad_norm": 2.953125, "learning_rate": 4.819175957727625e-05, "loss": 0.2954, "step": 4206 }, { "epoch": 26.130841121495326, "grad_norm": 3.765625, "learning_rate": 4.8190839495462015e-05, "loss": 0.2288, "step": 4207 }, { "epoch": 26.13707165109034, "grad_norm": 2.4375, "learning_rate": 4.8189919188413714e-05, "loss": 0.1484, "step": 4208 }, { "epoch": 26.143302180685357, "grad_norm": 1.9921875, "learning_rate": 4.818899865614028e-05, "loss": 0.1493, "step": 4209 }, { "epoch": 26.149532710280372, "grad_norm": 3.46875, "learning_rate": 4.818807789865065e-05, "loss": 0.2403, "step": 4210 }, { "epoch": 26.15576323987539, "grad_norm": 3.59375, "learning_rate": 4.818715691595378e-05, "loss": 0.2679, "step": 4211 }, { "epoch": 26.161993769470406, "grad_norm": 2.34375, "learning_rate": 4.8186235708058604e-05, "loss": 0.1703, "step": 4212 }, { "epoch": 26.16822429906542, "grad_norm": 2.8125, "learning_rate": 4.818531427497407e-05, "loss": 0.1923, "step": 4213 }, { "epoch": 26.174454828660437, "grad_norm": 3.171875, "learning_rate": 4.818439261670913e-05, "loss": 0.2383, "step": 4214 }, { "epoch": 26.180685358255452, "grad_norm": 2.25, "learning_rate": 4.818347073327274e-05, "loss": 0.2848, "step": 4215 }, { "epoch": 26.186915887850468, "grad_norm": 3.1875, "learning_rate": 4.818254862467384e-05, "loss": 0.1999, "step": 4216 }, { "epoch": 26.193146417445483, "grad_norm": 2.234375, "learning_rate": 4.81816262909214e-05, "loss": 0.1686, "step": 4217 }, { "epoch": 26.1993769470405, "grad_norm": 3.109375, "learning_rate": 4.818070373202437e-05, "loss": 0.238, "step": 4218 }, { "epoch": 26.205607476635514, "grad_norm": 3.171875, "learning_rate": 4.8179780947991707e-05, "loss": 0.1783, "step": 4219 }, { "epoch": 26.21183800623053, "grad_norm": 2.828125, "learning_rate": 4.8178857938832375e-05, "loss": 0.2703, "step": 4220 }, { "epoch": 26.218068535825545, "grad_norm": 2.171875, "learning_rate": 4.817793470455535e-05, "loss": 0.174, "step": 4221 }, { "epoch": 26.22429906542056, "grad_norm": 2.15625, "learning_rate": 4.817701124516959e-05, "loss": 0.2456, "step": 4222 }, { "epoch": 26.230529595015575, "grad_norm": 2.015625, "learning_rate": 4.817608756068406e-05, "loss": 0.1523, "step": 4223 }, { "epoch": 26.23676012461059, "grad_norm": 3.03125, "learning_rate": 4.817516365110773e-05, "loss": 0.2035, "step": 4224 }, { "epoch": 26.242990654205606, "grad_norm": 2.75, "learning_rate": 4.817423951644959e-05, "loss": 0.2002, "step": 4225 }, { "epoch": 26.24922118380062, "grad_norm": 2.65625, "learning_rate": 4.81733151567186e-05, "loss": 0.2111, "step": 4226 }, { "epoch": 26.25545171339564, "grad_norm": 3.171875, "learning_rate": 4.817239057192374e-05, "loss": 0.3706, "step": 4227 }, { "epoch": 26.261682242990656, "grad_norm": 2.125, "learning_rate": 4.8171465762073986e-05, "loss": 0.2067, "step": 4228 }, { "epoch": 26.26791277258567, "grad_norm": 1.7734375, "learning_rate": 4.8170540727178326e-05, "loss": 0.1449, "step": 4229 }, { "epoch": 26.274143302180686, "grad_norm": 2.53125, "learning_rate": 4.8169615467245744e-05, "loss": 0.3467, "step": 4230 }, { "epoch": 26.2803738317757, "grad_norm": 2.359375, "learning_rate": 4.816868998228523e-05, "loss": 0.1806, "step": 4231 }, { "epoch": 26.286604361370717, "grad_norm": 2.03125, "learning_rate": 4.816776427230576e-05, "loss": 0.1768, "step": 4232 }, { "epoch": 26.292834890965732, "grad_norm": 2.453125, "learning_rate": 4.8166838337316334e-05, "loss": 0.1829, "step": 4233 }, { "epoch": 26.299065420560748, "grad_norm": 2.515625, "learning_rate": 4.8165912177325946e-05, "loss": 0.1738, "step": 4234 }, { "epoch": 26.305295950155763, "grad_norm": 2.875, "learning_rate": 4.8164985792343583e-05, "loss": 0.3271, "step": 4235 }, { "epoch": 26.31152647975078, "grad_norm": 2.90625, "learning_rate": 4.816405918237825e-05, "loss": 0.2049, "step": 4236 }, { "epoch": 26.317757009345794, "grad_norm": 1.6328125, "learning_rate": 4.816313234743895e-05, "loss": 0.1204, "step": 4237 }, { "epoch": 26.32398753894081, "grad_norm": 3.46875, "learning_rate": 4.816220528753467e-05, "loss": 0.2743, "step": 4238 }, { "epoch": 26.330218068535824, "grad_norm": 2.46875, "learning_rate": 4.8161278002674424e-05, "loss": 0.1604, "step": 4239 }, { "epoch": 26.33644859813084, "grad_norm": 3.046875, "learning_rate": 4.8160350492867225e-05, "loss": 0.2139, "step": 4240 }, { "epoch": 26.342679127725855, "grad_norm": 4.15625, "learning_rate": 4.8159422758122066e-05, "loss": 0.2545, "step": 4241 }, { "epoch": 26.34890965732087, "grad_norm": 3.359375, "learning_rate": 4.815849479844796e-05, "loss": 0.2656, "step": 4242 }, { "epoch": 26.35514018691589, "grad_norm": 2.015625, "learning_rate": 4.815756661385392e-05, "loss": 0.1708, "step": 4243 }, { "epoch": 26.361370716510905, "grad_norm": 1.5078125, "learning_rate": 4.815663820434897e-05, "loss": 0.1422, "step": 4244 }, { "epoch": 26.36760124610592, "grad_norm": 3.09375, "learning_rate": 4.815570956994212e-05, "loss": 0.2563, "step": 4245 }, { "epoch": 26.373831775700936, "grad_norm": 2.875, "learning_rate": 4.8154780710642395e-05, "loss": 0.2519, "step": 4246 }, { "epoch": 26.38006230529595, "grad_norm": 2.59375, "learning_rate": 4.8153851626458804e-05, "loss": 0.2022, "step": 4247 }, { "epoch": 26.386292834890966, "grad_norm": 2.421875, "learning_rate": 4.815292231740039e-05, "loss": 0.171, "step": 4248 }, { "epoch": 26.39252336448598, "grad_norm": 2.078125, "learning_rate": 4.815199278347615e-05, "loss": 0.1298, "step": 4249 }, { "epoch": 26.398753894080997, "grad_norm": 2.875, "learning_rate": 4.815106302469513e-05, "loss": 0.2098, "step": 4250 }, { "epoch": 26.404984423676012, "grad_norm": 2.640625, "learning_rate": 4.815013304106637e-05, "loss": 0.2, "step": 4251 }, { "epoch": 26.411214953271028, "grad_norm": 2.625, "learning_rate": 4.8149202832598884e-05, "loss": 0.1445, "step": 4252 }, { "epoch": 26.417445482866043, "grad_norm": 3.640625, "learning_rate": 4.814827239930171e-05, "loss": 0.3074, "step": 4253 }, { "epoch": 26.42367601246106, "grad_norm": 3.171875, "learning_rate": 4.814734174118388e-05, "loss": 0.2151, "step": 4254 }, { "epoch": 26.429906542056074, "grad_norm": 3.0625, "learning_rate": 4.814641085825445e-05, "loss": 0.2834, "step": 4255 }, { "epoch": 26.43613707165109, "grad_norm": 2.703125, "learning_rate": 4.814547975052245e-05, "loss": 0.1894, "step": 4256 }, { "epoch": 26.442367601246104, "grad_norm": 1.828125, "learning_rate": 4.814454841799693e-05, "loss": 0.144, "step": 4257 }, { "epoch": 26.44859813084112, "grad_norm": 2.984375, "learning_rate": 4.814361686068692e-05, "loss": 0.3853, "step": 4258 }, { "epoch": 26.45482866043614, "grad_norm": 3.453125, "learning_rate": 4.814268507860148e-05, "loss": 0.3309, "step": 4259 }, { "epoch": 26.461059190031154, "grad_norm": 3.5, "learning_rate": 4.814175307174966e-05, "loss": 0.2561, "step": 4260 }, { "epoch": 26.46728971962617, "grad_norm": 2.296875, "learning_rate": 4.81408208401405e-05, "loss": 0.124, "step": 4261 }, { "epoch": 26.473520249221185, "grad_norm": 4.5, "learning_rate": 4.813988838378307e-05, "loss": 0.4667, "step": 4262 }, { "epoch": 26.4797507788162, "grad_norm": 4.0625, "learning_rate": 4.8138955702686415e-05, "loss": 0.2996, "step": 4263 }, { "epoch": 26.485981308411215, "grad_norm": 3.296875, "learning_rate": 4.8138022796859593e-05, "loss": 0.2158, "step": 4264 }, { "epoch": 26.49221183800623, "grad_norm": 3.8125, "learning_rate": 4.813708966631167e-05, "loss": 0.2842, "step": 4265 }, { "epoch": 26.498442367601246, "grad_norm": 2.875, "learning_rate": 4.8136156311051716e-05, "loss": 0.2934, "step": 4266 }, { "epoch": 26.50467289719626, "grad_norm": 1.953125, "learning_rate": 4.813522273108878e-05, "loss": 0.1566, "step": 4267 }, { "epoch": 26.510903426791277, "grad_norm": 2.375, "learning_rate": 4.813428892643194e-05, "loss": 0.1717, "step": 4268 }, { "epoch": 26.517133956386292, "grad_norm": 3.0, "learning_rate": 4.813335489709025e-05, "loss": 0.2037, "step": 4269 }, { "epoch": 26.523364485981308, "grad_norm": 2.421875, "learning_rate": 4.813242064307281e-05, "loss": 0.164, "step": 4270 }, { "epoch": 26.529595015576323, "grad_norm": 2.34375, "learning_rate": 4.8131486164388665e-05, "loss": 0.1383, "step": 4271 }, { "epoch": 26.53582554517134, "grad_norm": 2.078125, "learning_rate": 4.81305514610469e-05, "loss": 0.2159, "step": 4272 }, { "epoch": 26.542056074766354, "grad_norm": 1.921875, "learning_rate": 4.81296165330566e-05, "loss": 0.146, "step": 4273 }, { "epoch": 26.54828660436137, "grad_norm": 2.890625, "learning_rate": 4.8128681380426844e-05, "loss": 0.2169, "step": 4274 }, { "epoch": 26.554517133956388, "grad_norm": 2.875, "learning_rate": 4.8127746003166706e-05, "loss": 0.1691, "step": 4275 }, { "epoch": 26.560747663551403, "grad_norm": 3.1875, "learning_rate": 4.812681040128528e-05, "loss": 0.1981, "step": 4276 }, { "epoch": 26.56697819314642, "grad_norm": 4.09375, "learning_rate": 4.8125874574791654e-05, "loss": 0.3833, "step": 4277 }, { "epoch": 26.573208722741434, "grad_norm": 4.34375, "learning_rate": 4.81249385236949e-05, "loss": 0.3071, "step": 4278 }, { "epoch": 26.57943925233645, "grad_norm": 2.59375, "learning_rate": 4.8124002248004126e-05, "loss": 0.1824, "step": 4279 }, { "epoch": 26.585669781931465, "grad_norm": 3.0, "learning_rate": 4.8123065747728415e-05, "loss": 0.2542, "step": 4280 }, { "epoch": 26.59190031152648, "grad_norm": 2.9375, "learning_rate": 4.812212902287687e-05, "loss": 0.24, "step": 4281 }, { "epoch": 26.598130841121495, "grad_norm": 3.578125, "learning_rate": 4.812119207345859e-05, "loss": 0.2912, "step": 4282 }, { "epoch": 26.60436137071651, "grad_norm": 2.625, "learning_rate": 4.8120254899482665e-05, "loss": 0.2295, "step": 4283 }, { "epoch": 26.610591900311526, "grad_norm": 2.078125, "learning_rate": 4.811931750095821e-05, "loss": 0.1601, "step": 4284 }, { "epoch": 26.61682242990654, "grad_norm": 2.578125, "learning_rate": 4.8118379877894314e-05, "loss": 0.1568, "step": 4285 }, { "epoch": 26.623052959501557, "grad_norm": 2.71875, "learning_rate": 4.811744203030009e-05, "loss": 0.331, "step": 4286 }, { "epoch": 26.629283489096572, "grad_norm": 2.40625, "learning_rate": 4.811650395818466e-05, "loss": 0.1562, "step": 4287 }, { "epoch": 26.635514018691588, "grad_norm": 3.09375, "learning_rate": 4.811556566155711e-05, "loss": 0.2542, "step": 4288 }, { "epoch": 26.641744548286603, "grad_norm": 3.375, "learning_rate": 4.811462714042657e-05, "loss": 0.2265, "step": 4289 }, { "epoch": 26.64797507788162, "grad_norm": 2.125, "learning_rate": 4.8113688394802145e-05, "loss": 0.2257, "step": 4290 }, { "epoch": 26.654205607476637, "grad_norm": 2.734375, "learning_rate": 4.8112749424692974e-05, "loss": 0.2476, "step": 4291 }, { "epoch": 26.660436137071652, "grad_norm": 2.046875, "learning_rate": 4.8111810230108145e-05, "loss": 0.1926, "step": 4292 }, { "epoch": 26.666666666666668, "grad_norm": 2.734375, "learning_rate": 4.81108708110568e-05, "loss": 0.2884, "step": 4293 }, { "epoch": 26.672897196261683, "grad_norm": 2.84375, "learning_rate": 4.810993116754806e-05, "loss": 0.3984, "step": 4294 }, { "epoch": 26.6791277258567, "grad_norm": 3.5625, "learning_rate": 4.810899129959105e-05, "loss": 0.3946, "step": 4295 }, { "epoch": 26.685358255451714, "grad_norm": 3.578125, "learning_rate": 4.810805120719489e-05, "loss": 0.348, "step": 4296 }, { "epoch": 26.69158878504673, "grad_norm": 2.65625, "learning_rate": 4.8107110890368725e-05, "loss": 0.1985, "step": 4297 }, { "epoch": 26.697819314641745, "grad_norm": 3.15625, "learning_rate": 4.810617034912168e-05, "loss": 0.3615, "step": 4298 }, { "epoch": 26.70404984423676, "grad_norm": 2.671875, "learning_rate": 4.8105229583462885e-05, "loss": 0.3404, "step": 4299 }, { "epoch": 26.710280373831775, "grad_norm": 3.421875, "learning_rate": 4.810428859340148e-05, "loss": 0.2046, "step": 4300 }, { "epoch": 26.71651090342679, "grad_norm": 1.7890625, "learning_rate": 4.810334737894661e-05, "loss": 0.1383, "step": 4301 }, { "epoch": 26.722741433021806, "grad_norm": 3.46875, "learning_rate": 4.810240594010742e-05, "loss": 0.3612, "step": 4302 }, { "epoch": 26.72897196261682, "grad_norm": 3.765625, "learning_rate": 4.8101464276893026e-05, "loss": 0.3771, "step": 4303 }, { "epoch": 26.735202492211837, "grad_norm": 1.53125, "learning_rate": 4.810052238931261e-05, "loss": 0.1533, "step": 4304 }, { "epoch": 26.741433021806852, "grad_norm": 2.265625, "learning_rate": 4.809958027737529e-05, "loss": 0.2191, "step": 4305 }, { "epoch": 26.747663551401867, "grad_norm": 3.703125, "learning_rate": 4.809863794109024e-05, "loss": 0.3429, "step": 4306 }, { "epoch": 26.753894080996886, "grad_norm": 3.375, "learning_rate": 4.80976953804666e-05, "loss": 0.2708, "step": 4307 }, { "epoch": 26.7601246105919, "grad_norm": 1.5234375, "learning_rate": 4.809675259551352e-05, "loss": 0.15, "step": 4308 }, { "epoch": 26.766355140186917, "grad_norm": 4.40625, "learning_rate": 4.809580958624016e-05, "loss": 0.1531, "step": 4309 }, { "epoch": 26.772585669781932, "grad_norm": 2.359375, "learning_rate": 4.809486635265569e-05, "loss": 0.1345, "step": 4310 }, { "epoch": 26.778816199376948, "grad_norm": 2.453125, "learning_rate": 4.809392289476925e-05, "loss": 0.1728, "step": 4311 }, { "epoch": 26.785046728971963, "grad_norm": 4.03125, "learning_rate": 4.809297921259002e-05, "loss": 0.1882, "step": 4312 }, { "epoch": 26.79127725856698, "grad_norm": 2.671875, "learning_rate": 4.809203530612716e-05, "loss": 0.2086, "step": 4313 }, { "epoch": 26.797507788161994, "grad_norm": 2.578125, "learning_rate": 4.8091091175389833e-05, "loss": 0.2326, "step": 4314 }, { "epoch": 26.80373831775701, "grad_norm": 2.015625, "learning_rate": 4.8090146820387223e-05, "loss": 0.177, "step": 4315 }, { "epoch": 26.809968847352025, "grad_norm": 1.5234375, "learning_rate": 4.808920224112849e-05, "loss": 0.1189, "step": 4316 }, { "epoch": 26.81619937694704, "grad_norm": 1.9296875, "learning_rate": 4.80882574376228e-05, "loss": 0.138, "step": 4317 }, { "epoch": 26.822429906542055, "grad_norm": 4.71875, "learning_rate": 4.808731240987934e-05, "loss": 0.3662, "step": 4318 }, { "epoch": 26.82866043613707, "grad_norm": 4.46875, "learning_rate": 4.808636715790729e-05, "loss": 0.1757, "step": 4319 }, { "epoch": 26.834890965732086, "grad_norm": 3.578125, "learning_rate": 4.808542168171582e-05, "loss": 0.1778, "step": 4320 }, { "epoch": 26.8411214953271, "grad_norm": 3.046875, "learning_rate": 4.808447598131414e-05, "loss": 0.3024, "step": 4321 }, { "epoch": 26.847352024922117, "grad_norm": 3.265625, "learning_rate": 4.8083530056711394e-05, "loss": 0.1967, "step": 4322 }, { "epoch": 26.853582554517136, "grad_norm": 4.4375, "learning_rate": 4.80825839079168e-05, "loss": 0.4271, "step": 4323 }, { "epoch": 26.85981308411215, "grad_norm": 3.734375, "learning_rate": 4.8081637534939536e-05, "loss": 0.2163, "step": 4324 }, { "epoch": 26.866043613707166, "grad_norm": 2.796875, "learning_rate": 4.808069093778879e-05, "loss": 0.1776, "step": 4325 }, { "epoch": 26.87227414330218, "grad_norm": 3.078125, "learning_rate": 4.8079744116473766e-05, "loss": 0.2281, "step": 4326 }, { "epoch": 26.878504672897197, "grad_norm": 2.421875, "learning_rate": 4.8078797071003644e-05, "loss": 0.2057, "step": 4327 }, { "epoch": 26.884735202492212, "grad_norm": 2.5, "learning_rate": 4.807784980138764e-05, "loss": 0.2007, "step": 4328 }, { "epoch": 26.890965732087228, "grad_norm": 3.046875, "learning_rate": 4.807690230763494e-05, "loss": 0.2103, "step": 4329 }, { "epoch": 26.897196261682243, "grad_norm": 5.09375, "learning_rate": 4.807595458975476e-05, "loss": 0.2703, "step": 4330 }, { "epoch": 26.90342679127726, "grad_norm": 2.03125, "learning_rate": 4.8075006647756296e-05, "loss": 0.171, "step": 4331 }, { "epoch": 26.909657320872274, "grad_norm": 2.15625, "learning_rate": 4.8074058481648744e-05, "loss": 0.1626, "step": 4332 }, { "epoch": 26.91588785046729, "grad_norm": 3.84375, "learning_rate": 4.807311009144133e-05, "loss": 0.4179, "step": 4333 }, { "epoch": 26.922118380062305, "grad_norm": 4.875, "learning_rate": 4.807216147714326e-05, "loss": 0.382, "step": 4334 }, { "epoch": 26.92834890965732, "grad_norm": 2.40625, "learning_rate": 4.807121263876374e-05, "loss": 0.1648, "step": 4335 }, { "epoch": 26.934579439252335, "grad_norm": 2.5, "learning_rate": 4.8070263576312e-05, "loss": 0.1578, "step": 4336 }, { "epoch": 26.94080996884735, "grad_norm": 2.171875, "learning_rate": 4.806931428979724e-05, "loss": 0.1729, "step": 4337 }, { "epoch": 26.947040498442366, "grad_norm": 3.03125, "learning_rate": 4.806836477922869e-05, "loss": 0.3472, "step": 4338 }, { "epoch": 26.953271028037385, "grad_norm": 3.859375, "learning_rate": 4.806741504461557e-05, "loss": 0.131, "step": 4339 }, { "epoch": 26.9595015576324, "grad_norm": 3.90625, "learning_rate": 4.8066465085967104e-05, "loss": 0.3222, "step": 4340 }, { "epoch": 26.965732087227416, "grad_norm": 1.8984375, "learning_rate": 4.8065514903292524e-05, "loss": 0.1606, "step": 4341 }, { "epoch": 26.97196261682243, "grad_norm": 2.0, "learning_rate": 4.806456449660105e-05, "loss": 0.1784, "step": 4342 }, { "epoch": 26.978193146417446, "grad_norm": 2.8125, "learning_rate": 4.8063613865901915e-05, "loss": 0.275, "step": 4343 }, { "epoch": 26.98442367601246, "grad_norm": 1.8828125, "learning_rate": 4.8062663011204344e-05, "loss": 0.1473, "step": 4344 }, { "epoch": 26.990654205607477, "grad_norm": 2.1875, "learning_rate": 4.806171193251759e-05, "loss": 0.2095, "step": 4345 }, { "epoch": 26.996884735202492, "grad_norm": 2.171875, "learning_rate": 4.806076062985088e-05, "loss": 0.1583, "step": 4346 }, { "epoch": 27.0, "grad_norm": 2.125, "learning_rate": 4.8059809103213446e-05, "loss": 0.1393, "step": 4347 } ], "logging_steps": 1, "max_steps": 32200, "num_input_tokens_seen": 0, "num_train_epochs": 200, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }