{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 690, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004347826086956522, "learning_rate": 0, "loss": 2.1156, "step": 1 }, { "epoch": 0.008695652173913044, "learning_rate": 0, "loss": 2.1067, "step": 2 }, { "epoch": 0.013043478260869565, "learning_rate": 0, "loss": 2.1898, "step": 3 }, { "epoch": 0.017391304347826087, "grad_norm": 2.296112298965454, "learning_rate": 0.0, "loss": 2.1665, "step": 4 }, { "epoch": 0.021739130434782608, "grad_norm": 2.296112298965454, "learning_rate": 0.0, "loss": 2.1879, "step": 5 }, { "epoch": 0.02608695652173913, "grad_norm": 2.296112298965454, "learning_rate": 0.0, "loss": 2.1444, "step": 6 }, { "epoch": 0.030434782608695653, "grad_norm": 2.296112298965454, "learning_rate": 0.0, "loss": 2.152, "step": 7 }, { "epoch": 0.034782608695652174, "grad_norm": 2.2356226444244385, "learning_rate": 3.2741131089043125e-06, "loss": 2.1847, "step": 8 }, { "epoch": 0.0391304347826087, "grad_norm": 2.2356226444244385, "learning_rate": 3.2741131089043125e-06, "loss": 2.169, "step": 9 }, { "epoch": 0.043478260869565216, "grad_norm": 2.2356226444244385, "learning_rate": 3.2741131089043125e-06, "loss": 2.2589, "step": 10 }, { "epoch": 0.04782608695652174, "grad_norm": 2.2356226444244385, "learning_rate": 3.2741131089043125e-06, "loss": 2.1925, "step": 11 }, { "epoch": 0.05217391304347826, "grad_norm": 2.2073488235473633, "learning_rate": 5.189346500732899e-06, "loss": 2.2475, "step": 12 }, { "epoch": 0.05652173913043478, "grad_norm": 2.2073488235473633, "learning_rate": 5.189346500732899e-06, "loss": 2.1839, "step": 13 }, { "epoch": 0.06086956521739131, "grad_norm": 2.2073488235473633, "learning_rate": 5.189346500732899e-06, "loss": 2.2041, "step": 14 }, { "epoch": 0.06521739130434782, "grad_norm": 2.2073488235473633, "learning_rate": 5.189346500732899e-06, "loss": 2.1681, "step": 15 }, { "epoch": 0.06956521739130435, "grad_norm": 2.202537775039673, "learning_rate": 6.548226217808625e-06, "loss": 2.1546, "step": 16 }, { "epoch": 0.07391304347826087, "grad_norm": 2.202537775039673, "learning_rate": 6.548226217808625e-06, "loss": 2.2019, "step": 17 }, { "epoch": 0.0782608695652174, "grad_norm": 2.202537775039673, "learning_rate": 6.548226217808625e-06, "loss": 2.1666, "step": 18 }, { "epoch": 0.08260869565217391, "grad_norm": 2.202537775039673, "learning_rate": 6.548226217808625e-06, "loss": 2.1445, "step": 19 }, { "epoch": 0.08695652173913043, "grad_norm": 2.1643476486206055, "learning_rate": 7.60225521340393e-06, "loss": 2.0461, "step": 20 }, { "epoch": 0.09130434782608696, "grad_norm": 2.1643476486206055, "learning_rate": 7.60225521340393e-06, "loss": 2.139, "step": 21 }, { "epoch": 0.09565217391304348, "grad_norm": 2.1643476486206055, "learning_rate": 7.60225521340393e-06, "loss": 2.253, "step": 22 }, { "epoch": 0.1, "grad_norm": 2.1643476486206055, "learning_rate": 7.60225521340393e-06, "loss": 2.1876, "step": 23 }, { "epoch": 0.10434782608695652, "grad_norm": 2.2119083404541016, "learning_rate": 8.463459609637211e-06, "loss": 2.2883, "step": 24 }, { "epoch": 0.10869565217391304, "grad_norm": 2.2119083404541016, "learning_rate": 8.463459609637211e-06, "loss": 2.2083, "step": 25 }, { "epoch": 0.11304347826086956, "grad_norm": 2.2119083404541016, "learning_rate": 8.463459609637211e-06, "loss": 2.1793, "step": 26 }, { "epoch": 0.11739130434782609, "grad_norm": 2.2119083404541016, "learning_rate": 8.463459609637211e-06, "loss": 2.1871, "step": 27 }, { "epoch": 0.12173913043478261, "grad_norm": 2.064499855041504, "learning_rate": 9.191597551655847e-06, "loss": 2.24, "step": 28 }, { "epoch": 0.12608695652173912, "grad_norm": 2.064499855041504, "learning_rate": 9.191597551655847e-06, "loss": 2.1223, "step": 29 }, { "epoch": 0.13043478260869565, "grad_norm": 2.064499855041504, "learning_rate": 9.191597551655847e-06, "loss": 2.0987, "step": 30 }, { "epoch": 0.13478260869565217, "grad_norm": 2.064499855041504, "learning_rate": 9.191597551655847e-06, "loss": 1.9908, "step": 31 }, { "epoch": 0.1391304347826087, "grad_norm": 1.9148058891296387, "learning_rate": 9.822339326712938e-06, "loss": 2.1102, "step": 32 }, { "epoch": 0.14347826086956522, "grad_norm": 1.9148058891296387, "learning_rate": 9.822339326712938e-06, "loss": 1.9757, "step": 33 }, { "epoch": 0.14782608695652175, "grad_norm": 1.9148058891296387, "learning_rate": 9.822339326712938e-06, "loss": 2.0812, "step": 34 }, { "epoch": 0.15217391304347827, "grad_norm": 1.9148058891296387, "learning_rate": 9.822339326712938e-06, "loss": 2.0167, "step": 35 }, { "epoch": 0.1565217391304348, "grad_norm": 1.9139795303344727, "learning_rate": 1.0378693001465798e-05, "loss": 2.0972, "step": 36 }, { "epoch": 0.1608695652173913, "grad_norm": 1.9139795303344727, "learning_rate": 1.0378693001465798e-05, "loss": 2.0143, "step": 37 }, { "epoch": 0.16521739130434782, "grad_norm": 1.9139795303344727, "learning_rate": 1.0378693001465798e-05, "loss": 2.0257, "step": 38 }, { "epoch": 0.16956521739130434, "grad_norm": 1.9139795303344727, "learning_rate": 1.0378693001465798e-05, "loss": 2.1066, "step": 39 }, { "epoch": 0.17391304347826086, "grad_norm": 1.8336902856826782, "learning_rate": 1.0876368322308244e-05, "loss": 1.9792, "step": 40 }, { "epoch": 0.1782608695652174, "grad_norm": 1.8336902856826782, "learning_rate": 1.0876368322308244e-05, "loss": 1.9625, "step": 41 }, { "epoch": 0.1826086956521739, "grad_norm": 1.8336902856826782, "learning_rate": 1.0876368322308244e-05, "loss": 2.0084, "step": 42 }, { "epoch": 0.18695652173913044, "grad_norm": 1.8336902856826782, "learning_rate": 1.0876368322308244e-05, "loss": 2.0717, "step": 43 }, { "epoch": 0.19130434782608696, "grad_norm": 1.9207242727279663, "learning_rate": 1.1326570411938442e-05, "loss": 2.0022, "step": 44 }, { "epoch": 0.1956521739130435, "grad_norm": 1.9207242727279663, "learning_rate": 1.1326570411938442e-05, "loss": 1.9515, "step": 45 }, { "epoch": 0.2, "grad_norm": 1.9207242727279663, "learning_rate": 1.1326570411938442e-05, "loss": 1.8031, "step": 46 }, { "epoch": 0.20434782608695654, "grad_norm": 1.9207242727279663, "learning_rate": 1.1326570411938442e-05, "loss": 1.8205, "step": 47 }, { "epoch": 0.20869565217391303, "grad_norm": 1.2533749341964722, "learning_rate": 1.1737572718541526e-05, "loss": 1.9172, "step": 48 }, { "epoch": 0.21304347826086956, "grad_norm": 1.2533749341964722, "learning_rate": 1.1737572718541526e-05, "loss": 1.8282, "step": 49 }, { "epoch": 0.21739130434782608, "grad_norm": 1.2533749341964722, "learning_rate": 1.1737572718541526e-05, "loss": 1.8121, "step": 50 }, { "epoch": 0.2217391304347826, "grad_norm": 1.2533749341964722, "learning_rate": 1.1737572718541526e-05, "loss": 1.9039, "step": 51 }, { "epoch": 0.22608695652173913, "grad_norm": 1.106286644935608, "learning_rate": 1.2115658189875932e-05, "loss": 1.8711, "step": 52 }, { "epoch": 0.23043478260869565, "grad_norm": 1.106286644935608, "learning_rate": 1.2115658189875932e-05, "loss": 1.9492, "step": 53 }, { "epoch": 0.23478260869565218, "grad_norm": 1.106286644935608, "learning_rate": 1.2115658189875932e-05, "loss": 1.8475, "step": 54 }, { "epoch": 0.2391304347826087, "grad_norm": 1.106286644935608, "learning_rate": 1.2115658189875932e-05, "loss": 1.8875, "step": 55 }, { "epoch": 0.24347826086956523, "grad_norm": 1.0397955179214478, "learning_rate": 1.2465710660560159e-05, "loss": 1.8635, "step": 56 }, { "epoch": 0.24782608695652175, "grad_norm": 1.0397955179214478, "learning_rate": 1.2465710660560159e-05, "loss": 1.8428, "step": 57 }, { "epoch": 0.25217391304347825, "grad_norm": 1.0397955179214478, "learning_rate": 1.2465710660560159e-05, "loss": 1.9786, "step": 58 }, { "epoch": 0.2565217391304348, "grad_norm": 1.0397955179214478, "learning_rate": 1.2465710660560159e-05, "loss": 1.8905, "step": 59 }, { "epoch": 0.2608695652173913, "grad_norm": 0.9489036202430725, "learning_rate": 1.279160171413683e-05, "loss": 1.8285, "step": 60 }, { "epoch": 0.26521739130434785, "grad_norm": 0.9489036202430725, "learning_rate": 1.279160171413683e-05, "loss": 1.887, "step": 61 }, { "epoch": 0.26956521739130435, "grad_norm": 0.9489036202430725, "learning_rate": 1.279160171413683e-05, "loss": 1.8579, "step": 62 }, { "epoch": 0.27391304347826084, "grad_norm": 0.9489036202430725, "learning_rate": 1.279160171413683e-05, "loss": 1.8302, "step": 63 }, { "epoch": 0.2782608695652174, "grad_norm": 0.884759247303009, "learning_rate": 1.309645243561725e-05, "loss": 1.9077, "step": 64 }, { "epoch": 0.2826086956521739, "grad_norm": 0.884759247303009, "learning_rate": 1.309645243561725e-05, "loss": 1.778, "step": 65 }, { "epoch": 0.28695652173913044, "grad_norm": 0.884759247303009, "learning_rate": 1.309645243561725e-05, "loss": 1.7314, "step": 66 }, { "epoch": 0.29130434782608694, "grad_norm": 0.884759247303009, "learning_rate": 1.309645243561725e-05, "loss": 1.8035, "step": 67 }, { "epoch": 0.2956521739130435, "grad_norm": 0.9762704968452454, "learning_rate": 1.3382815670697004e-05, "loss": 1.7613, "step": 68 }, { "epoch": 0.3, "grad_norm": 0.9762704968452454, "learning_rate": 1.3382815670697004e-05, "loss": 1.7655, "step": 69 }, { "epoch": 0.30434782608695654, "grad_norm": 0.9762704968452454, "learning_rate": 1.3382815670697004e-05, "loss": 1.6689, "step": 70 }, { "epoch": 0.30869565217391304, "grad_norm": 0.9762704968452454, "learning_rate": 1.3382815670697004e-05, "loss": 1.7682, "step": 71 }, { "epoch": 0.3130434782608696, "grad_norm": 0.8275449872016907, "learning_rate": 1.365280611037011e-05, "loss": 1.8038, "step": 72 }, { "epoch": 0.3173913043478261, "grad_norm": 0.8275449872016907, "learning_rate": 1.365280611037011e-05, "loss": 1.7022, "step": 73 }, { "epoch": 0.3217391304347826, "grad_norm": 0.8275449872016907, "learning_rate": 1.365280611037011e-05, "loss": 1.7887, "step": 74 }, { "epoch": 0.32608695652173914, "grad_norm": 0.8275449872016907, "learning_rate": 1.365280611037011e-05, "loss": 1.7613, "step": 75 }, { "epoch": 0.33043478260869563, "grad_norm": 0.7482348680496216, "learning_rate": 1.3908195157440944e-05, "loss": 1.7867, "step": 76 }, { "epoch": 0.3347826086956522, "grad_norm": 0.7482348680496216, "learning_rate": 1.3908195157440944e-05, "loss": 1.825, "step": 77 }, { "epoch": 0.3391304347826087, "grad_norm": 0.7482348680496216, "learning_rate": 1.3908195157440944e-05, "loss": 1.7778, "step": 78 }, { "epoch": 0.34347826086956523, "grad_norm": 0.7482348680496216, "learning_rate": 1.3908195157440944e-05, "loss": 1.7115, "step": 79 }, { "epoch": 0.34782608695652173, "grad_norm": 0.6351860761642456, "learning_rate": 1.4150481431212555e-05, "loss": 1.7619, "step": 80 }, { "epoch": 0.3521739130434783, "grad_norm": 0.6351860761642456, "learning_rate": 1.4150481431212555e-05, "loss": 1.7002, "step": 81 }, { "epoch": 0.3565217391304348, "grad_norm": 0.6351860761642456, "learning_rate": 1.4150481431212555e-05, "loss": 1.7352, "step": 82 }, { "epoch": 0.36086956521739133, "grad_norm": 0.6351860761642456, "learning_rate": 1.4150481431212555e-05, "loss": 1.7489, "step": 83 }, { "epoch": 0.3652173913043478, "grad_norm": 0.5535622835159302, "learning_rate": 1.4380944052388746e-05, "loss": 1.7328, "step": 84 }, { "epoch": 0.3695652173913043, "grad_norm": 0.5535622835159302, "learning_rate": 1.4380944052388746e-05, "loss": 1.7426, "step": 85 }, { "epoch": 0.3739130434782609, "grad_norm": 0.5535622835159302, "learning_rate": 1.4380944052388746e-05, "loss": 1.742, "step": 86 }, { "epoch": 0.3782608695652174, "grad_norm": 0.5535622835159302, "learning_rate": 1.4380944052388746e-05, "loss": 1.7249, "step": 87 }, { "epoch": 0.3826086956521739, "grad_norm": 0.5413718819618225, "learning_rate": 1.4600683520842756e-05, "loss": 1.6879, "step": 88 }, { "epoch": 0.3869565217391304, "grad_norm": 0.5413718819618225, "learning_rate": 1.4600683520842756e-05, "loss": 1.7437, "step": 89 }, { "epoch": 0.391304347826087, "grad_norm": 0.5413718819618225, "learning_rate": 1.4600683520842756e-05, "loss": 1.7177, "step": 90 }, { "epoch": 0.39565217391304347, "grad_norm": 0.5413718819618225, "learning_rate": 1.4600683520842756e-05, "loss": 1.6507, "step": 91 }, { "epoch": 0.4, "grad_norm": 0.5403290390968323, "learning_rate": 1.48106534992671e-05, "loss": 1.7619, "step": 92 }, { "epoch": 0.4043478260869565, "grad_norm": 0.5403290390968323, "learning_rate": 1.48106534992671e-05, "loss": 1.7386, "step": 93 }, { "epoch": 0.40869565217391307, "grad_norm": 0.5403290390968323, "learning_rate": 1.48106534992671e-05, "loss": 1.6804, "step": 94 }, { "epoch": 0.41304347826086957, "grad_norm": 0.5403290390968323, "learning_rate": 1.48106534992671e-05, "loss": 1.6704, "step": 95 }, { "epoch": 0.41739130434782606, "grad_norm": 0.48830699920654297, "learning_rate": 1.5011685827445838e-05, "loss": 1.7522, "step": 96 }, { "epoch": 0.4217391304347826, "grad_norm": 0.48830699920654297, "learning_rate": 1.5011685827445838e-05, "loss": 1.6867, "step": 97 }, { "epoch": 0.4260869565217391, "grad_norm": 0.48830699920654297, "learning_rate": 1.5011685827445838e-05, "loss": 1.6539, "step": 98 }, { "epoch": 0.43043478260869567, "grad_norm": 0.48830699920654297, "learning_rate": 1.5011685827445838e-05, "loss": 1.7037, "step": 99 }, { "epoch": 0.43478260869565216, "grad_norm": 0.45016181468963623, "learning_rate": 1.520451042680786e-05, "loss": 1.634, "step": 100 }, { "epoch": 0.4391304347826087, "grad_norm": 0.45016181468963623, "learning_rate": 1.520451042680786e-05, "loss": 1.7027, "step": 101 }, { "epoch": 0.4434782608695652, "grad_norm": 0.45016181468963623, "learning_rate": 1.520451042680786e-05, "loss": 1.6883, "step": 102 }, { "epoch": 0.44782608695652176, "grad_norm": 0.45016181468963623, "learning_rate": 1.520451042680786e-05, "loss": 1.6648, "step": 103 }, { "epoch": 0.45217391304347826, "grad_norm": 0.44704490900039673, "learning_rate": 1.5389771298780244e-05, "loss": 1.7269, "step": 104 }, { "epoch": 0.45652173913043476, "grad_norm": 0.44704490900039673, "learning_rate": 1.5389771298780244e-05, "loss": 1.7, "step": 105 }, { "epoch": 0.4608695652173913, "grad_norm": 0.44704490900039673, "learning_rate": 1.5389771298780244e-05, "loss": 1.7126, "step": 106 }, { "epoch": 0.4652173913043478, "grad_norm": 0.44704490900039673, "learning_rate": 1.5389771298780244e-05, "loss": 1.6477, "step": 107 }, { "epoch": 0.46956521739130436, "grad_norm": 0.4181145429611206, "learning_rate": 1.5568039502198696e-05, "loss": 1.6808, "step": 108 }, { "epoch": 0.47391304347826085, "grad_norm": 0.4181145429611206, "learning_rate": 1.5568039502198696e-05, "loss": 1.6378, "step": 109 }, { "epoch": 0.4782608695652174, "grad_norm": 0.4181145429611206, "learning_rate": 1.5568039502198696e-05, "loss": 1.6749, "step": 110 }, { "epoch": 0.4826086956521739, "grad_norm": 0.4181145429611206, "learning_rate": 1.5568039502198696e-05, "loss": 1.6727, "step": 111 }, { "epoch": 0.48695652173913045, "grad_norm": 0.47532176971435547, "learning_rate": 1.5739823769464473e-05, "loss": 1.6508, "step": 112 }, { "epoch": 0.49130434782608695, "grad_norm": 0.47532176971435547, "learning_rate": 1.5739823769464473e-05, "loss": 1.6771, "step": 113 }, { "epoch": 0.4956521739130435, "grad_norm": 0.47532176971435547, "learning_rate": 1.5739823769464473e-05, "loss": 1.6639, "step": 114 }, { "epoch": 0.5, "grad_norm": 0.47532176971435547, "learning_rate": 1.5739823769464473e-05, "loss": 1.6892, "step": 115 }, { "epoch": 0.5043478260869565, "grad_norm": 0.44671422243118286, "learning_rate": 1.5905579258955202e-05, "loss": 1.6251, "step": 116 }, { "epoch": 0.508695652173913, "grad_norm": 0.44671422243118286, "learning_rate": 1.5905579258955202e-05, "loss": 1.6916, "step": 117 }, { "epoch": 0.5130434782608696, "grad_norm": 0.44671422243118286, "learning_rate": 1.5905579258955202e-05, "loss": 1.6385, "step": 118 }, { "epoch": 0.5173913043478261, "grad_norm": 0.44671422243118286, "learning_rate": 1.5905579258955202e-05, "loss": 1.6745, "step": 119 }, { "epoch": 0.5217391304347826, "grad_norm": 0.43519601225852966, "learning_rate": 1.606571482304114e-05, "loss": 1.6723, "step": 120 }, { "epoch": 0.5260869565217391, "grad_norm": 0.43519601225852966, "learning_rate": 1.606571482304114e-05, "loss": 1.7032, "step": 121 }, { "epoch": 0.5304347826086957, "grad_norm": 0.43519601225852966, "learning_rate": 1.606571482304114e-05, "loss": 1.649, "step": 122 }, { "epoch": 0.5347826086956522, "grad_norm": 0.43519601225852966, "learning_rate": 1.606571482304114e-05, "loss": 1.5973, "step": 123 }, { "epoch": 0.5391304347826087, "grad_norm": 0.4354279041290283, "learning_rate": 1.6220599083923048e-05, "loss": 1.6496, "step": 124 }, { "epoch": 0.5434782608695652, "grad_norm": 0.4354279041290283, "learning_rate": 1.6220599083923048e-05, "loss": 1.6142, "step": 125 }, { "epoch": 0.5478260869565217, "grad_norm": 0.4354279041290283, "learning_rate": 1.6220599083923048e-05, "loss": 1.6788, "step": 126 }, { "epoch": 0.5521739130434783, "grad_norm": 0.4354279041290283, "learning_rate": 1.6220599083923048e-05, "loss": 1.6921, "step": 127 }, { "epoch": 0.5565217391304348, "grad_norm": 0.4076354205608368, "learning_rate": 1.6370565544521564e-05, "loss": 1.7186, "step": 128 }, { "epoch": 0.5608695652173913, "grad_norm": 0.4076354205608368, "learning_rate": 1.6370565544521564e-05, "loss": 1.5906, "step": 129 }, { "epoch": 0.5652173913043478, "grad_norm": 0.4076354205608368, "learning_rate": 1.6370565544521564e-05, "loss": 1.5466, "step": 130 }, { "epoch": 0.5695652173913044, "grad_norm": 0.4076354205608368, "learning_rate": 1.6370565544521564e-05, "loss": 1.6492, "step": 131 }, { "epoch": 0.5739130434782609, "grad_norm": 0.3979344964027405, "learning_rate": 1.651591691267134e-05, "loss": 1.6666, "step": 132 }, { "epoch": 0.5782608695652174, "grad_norm": 0.3979344964027405, "learning_rate": 1.651591691267134e-05, "loss": 1.6911, "step": 133 }, { "epoch": 0.5826086956521739, "grad_norm": 0.3979344964027405, "learning_rate": 1.651591691267134e-05, "loss": 1.5961, "step": 134 }, { "epoch": 0.5869565217391305, "grad_norm": 0.3979344964027405, "learning_rate": 1.651591691267134e-05, "loss": 1.6683, "step": 135 }, { "epoch": 0.591304347826087, "grad_norm": 0.3939136862754822, "learning_rate": 1.6656928779601318e-05, "loss": 1.6509, "step": 136 }, { "epoch": 0.5956521739130435, "grad_norm": 0.3939136862754822, "learning_rate": 1.6656928779601318e-05, "loss": 1.6463, "step": 137 }, { "epoch": 0.6, "grad_norm": 0.3939136862754822, "learning_rate": 1.6656928779601318e-05, "loss": 1.714, "step": 138 }, { "epoch": 0.6043478260869565, "grad_norm": 0.3939136862754822, "learning_rate": 1.6656928779601318e-05, "loss": 1.6111, "step": 139 }, { "epoch": 0.6086956521739131, "grad_norm": 0.3857693374156952, "learning_rate": 1.6793852765059776e-05, "loss": 1.6036, "step": 140 }, { "epoch": 0.6130434782608696, "grad_norm": 0.3857693374156952, "learning_rate": 1.6793852765059776e-05, "loss": 1.5713, "step": 141 }, { "epoch": 0.6173913043478261, "grad_norm": 0.3857693374156952, "learning_rate": 1.6793852765059776e-05, "loss": 1.6435, "step": 142 }, { "epoch": 0.6217391304347826, "grad_norm": 0.3857693374156952, "learning_rate": 1.6793852765059776e-05, "loss": 1.6701, "step": 143 }, { "epoch": 0.6260869565217392, "grad_norm": 0.35507267713546753, "learning_rate": 1.6926919219274422e-05, "loss": 1.6717, "step": 144 }, { "epoch": 0.6304347826086957, "grad_norm": 0.35507267713546753, "learning_rate": 1.6926919219274422e-05, "loss": 1.6163, "step": 145 }, { "epoch": 0.6347826086956522, "grad_norm": 0.35507267713546753, "learning_rate": 1.6926919219274422e-05, "loss": 1.5982, "step": 146 }, { "epoch": 0.6391304347826087, "grad_norm": 0.35507267713546753, "learning_rate": 1.6926919219274422e-05, "loss": 1.6509, "step": 147 }, { "epoch": 0.6434782608695652, "grad_norm": 0.35680633783340454, "learning_rate": 1.7056339554631436e-05, "loss": 1.6332, "step": 148 }, { "epoch": 0.6478260869565218, "grad_norm": 0.35680633783340454, "learning_rate": 1.7056339554631436e-05, "loss": 1.5773, "step": 149 }, { "epoch": 0.6521739130434783, "grad_norm": 0.35680633783340454, "learning_rate": 1.7056339554631436e-05, "loss": 1.562, "step": 150 }, { "epoch": 0.6565217391304348, "grad_norm": 0.35680633783340454, "learning_rate": 1.7056339554631436e-05, "loss": 1.5869, "step": 151 }, { "epoch": 0.6608695652173913, "grad_norm": 0.34213584661483765, "learning_rate": 1.7182308266345256e-05, "loss": 1.637, "step": 152 }, { "epoch": 0.6652173913043479, "grad_norm": 0.34213584661483765, "learning_rate": 1.7182308266345256e-05, "loss": 1.6367, "step": 153 }, { "epoch": 0.6695652173913044, "grad_norm": 0.34213584661483765, "learning_rate": 1.7182308266345256e-05, "loss": 1.6062, "step": 154 }, { "epoch": 0.6739130434782609, "grad_norm": 0.34213584661483765, "learning_rate": 1.7182308266345256e-05, "loss": 1.6147, "step": 155 }, { "epoch": 0.6782608695652174, "grad_norm": 0.3420649766921997, "learning_rate": 1.7305004690608827e-05, "loss": 1.5698, "step": 156 }, { "epoch": 0.6826086956521739, "grad_norm": 0.3420649766921997, "learning_rate": 1.7305004690608827e-05, "loss": 1.5857, "step": 157 }, { "epoch": 0.6869565217391305, "grad_norm": 0.3420649766921997, "learning_rate": 1.7305004690608827e-05, "loss": 1.5172, "step": 158 }, { "epoch": 0.691304347826087, "grad_norm": 0.3420649766921997, "learning_rate": 1.7305004690608827e-05, "loss": 1.5799, "step": 159 }, { "epoch": 0.6956521739130435, "grad_norm": 0.3389074504375458, "learning_rate": 1.7424594540116867e-05, "loss": 1.5744, "step": 160 }, { "epoch": 0.7, "grad_norm": 0.3389074504375458, "learning_rate": 1.7424594540116867e-05, "loss": 1.6075, "step": 161 }, { "epoch": 0.7043478260869566, "grad_norm": 0.3389074504375458, "learning_rate": 1.7424594540116867e-05, "loss": 1.5119, "step": 162 }, { "epoch": 0.7086956521739131, "grad_norm": 0.3389074504375458, "learning_rate": 1.7424594540116867e-05, "loss": 1.6134, "step": 163 }, { "epoch": 0.7130434782608696, "grad_norm": 0.3438399136066437, "learning_rate": 1.754123124995665e-05, "loss": 1.565, "step": 164 }, { "epoch": 0.717391304347826, "grad_norm": 0.3438399136066437, "learning_rate": 1.754123124995665e-05, "loss": 1.6115, "step": 165 }, { "epoch": 0.7217391304347827, "grad_norm": 0.3438399136066437, "learning_rate": 1.754123124995665e-05, "loss": 1.6062, "step": 166 }, { "epoch": 0.7260869565217392, "grad_norm": 0.3438399136066437, "learning_rate": 1.754123124995665e-05, "loss": 1.5927, "step": 167 }, { "epoch": 0.7304347826086957, "grad_norm": 0.3270600438117981, "learning_rate": 1.765505716129306e-05, "loss": 1.5475, "step": 168 }, { "epoch": 0.7347826086956522, "grad_norm": 0.3270600438117981, "learning_rate": 1.765505716129306e-05, "loss": 1.5137, "step": 169 }, { "epoch": 0.7391304347826086, "grad_norm": 0.3270600438117981, "learning_rate": 1.765505716129306e-05, "loss": 1.6577, "step": 170 }, { "epoch": 0.7434782608695653, "grad_norm": 0.3270600438117981, "learning_rate": 1.765505716129306e-05, "loss": 1.6101, "step": 171 }, { "epoch": 0.7478260869565218, "grad_norm": 0.3228694796562195, "learning_rate": 1.7766204565755586e-05, "loss": 1.5702, "step": 172 }, { "epoch": 0.7521739130434782, "grad_norm": 0.3228694796562195, "learning_rate": 1.7766204565755586e-05, "loss": 1.6037, "step": 173 }, { "epoch": 0.7565217391304347, "grad_norm": 0.3228694796562195, "learning_rate": 1.7766204565755586e-05, "loss": 1.5628, "step": 174 }, { "epoch": 0.7608695652173914, "grad_norm": 0.3228694796562195, "learning_rate": 1.7766204565755586e-05, "loss": 1.5571, "step": 175 }, { "epoch": 0.7652173913043478, "grad_norm": 0.3404123783111572, "learning_rate": 1.7874796629747068e-05, "loss": 1.6112, "step": 176 }, { "epoch": 0.7695652173913043, "grad_norm": 0.3404123783111572, "learning_rate": 1.7874796629747068e-05, "loss": 1.5407, "step": 177 }, { "epoch": 0.7739130434782608, "grad_norm": 0.3404123783111572, "learning_rate": 1.7874796629747068e-05, "loss": 1.5466, "step": 178 }, { "epoch": 0.7782608695652173, "grad_norm": 0.3404123783111572, "learning_rate": 1.7874796629747068e-05, "loss": 1.6148, "step": 179 }, { "epoch": 0.782608695652174, "grad_norm": 0.31963732838630676, "learning_rate": 1.7980948214869728e-05, "loss": 1.5987, "step": 180 }, { "epoch": 0.7869565217391304, "grad_norm": 0.31963732838630676, "learning_rate": 1.7980948214869728e-05, "loss": 1.5906, "step": 181 }, { "epoch": 0.7913043478260869, "grad_norm": 0.31963732838630676, "learning_rate": 1.7980948214869728e-05, "loss": 1.6133, "step": 182 }, { "epoch": 0.7956521739130434, "grad_norm": 0.31963732838630676, "learning_rate": 1.7980948214869728e-05, "loss": 1.5955, "step": 183 }, { "epoch": 0.8, "grad_norm": 0.3301166296005249, "learning_rate": 1.8084766608171415e-05, "loss": 1.5776, "step": 184 }, { "epoch": 0.8043478260869565, "grad_norm": 0.3301166296005249, "learning_rate": 1.8084766608171415e-05, "loss": 1.6057, "step": 185 }, { "epoch": 0.808695652173913, "grad_norm": 0.3301166296005249, "learning_rate": 1.8084766608171415e-05, "loss": 1.553, "step": 186 }, { "epoch": 0.8130434782608695, "grad_norm": 0.3301166296005249, "learning_rate": 1.8084766608171415e-05, "loss": 1.5757, "step": 187 }, { "epoch": 0.8173913043478261, "grad_norm": 0.3264283537864685, "learning_rate": 1.8186352173851508e-05, "loss": 1.6806, "step": 188 }, { "epoch": 0.8217391304347826, "grad_norm": 0.3264283537864685, "learning_rate": 1.8186352173851508e-05, "loss": 1.5758, "step": 189 }, { "epoch": 0.8260869565217391, "grad_norm": 0.3264283537864685, "learning_rate": 1.8186352173851508e-05, "loss": 1.5834, "step": 190 }, { "epoch": 0.8304347826086956, "grad_norm": 0.3264283537864685, "learning_rate": 1.8186352173851508e-05, "loss": 1.5889, "step": 191 }, { "epoch": 0.8347826086956521, "grad_norm": 0.3216494917869568, "learning_rate": 1.828579893635015e-05, "loss": 1.5547, "step": 192 }, { "epoch": 0.8391304347826087, "grad_norm": 0.3216494917869568, "learning_rate": 1.828579893635015e-05, "loss": 1.5952, "step": 193 }, { "epoch": 0.8434782608695652, "grad_norm": 0.3216494917869568, "learning_rate": 1.828579893635015e-05, "loss": 1.5456, "step": 194 }, { "epoch": 0.8478260869565217, "grad_norm": 0.3216494917869568, "learning_rate": 1.828579893635015e-05, "loss": 1.5244, "step": 195 }, { "epoch": 0.8521739130434782, "grad_norm": 0.3138068914413452, "learning_rate": 1.8383195103311694e-05, "loss": 1.6763, "step": 196 }, { "epoch": 0.8565217391304348, "grad_norm": 0.3138068914413452, "learning_rate": 1.8383195103311694e-05, "loss": 1.5649, "step": 197 }, { "epoch": 0.8608695652173913, "grad_norm": 0.3138068914413452, "learning_rate": 1.8383195103311694e-05, "loss": 1.5598, "step": 198 }, { "epoch": 0.8652173913043478, "grad_norm": 0.3138068914413452, "learning_rate": 1.8383195103311694e-05, "loss": 1.6227, "step": 199 }, { "epoch": 0.8695652173913043, "grad_norm": 0.31041961908340454, "learning_rate": 1.8478623535712173e-05, "loss": 1.5523, "step": 200 }, { "epoch": 0.8739130434782608, "grad_norm": 0.31041961908340454, "learning_rate": 1.8478623535712173e-05, "loss": 1.6512, "step": 201 }, { "epoch": 0.8782608695652174, "grad_norm": 0.31041961908340454, "learning_rate": 1.8478623535712173e-05, "loss": 1.5482, "step": 202 }, { "epoch": 0.8826086956521739, "grad_norm": 0.31041961908340454, "learning_rate": 1.8478623535712173e-05, "loss": 1.5776, "step": 203 }, { "epoch": 0.8869565217391304, "grad_norm": 0.3067266643047333, "learning_rate": 1.8572162171429905e-05, "loss": 1.5401, "step": 204 }, { "epoch": 0.8913043478260869, "grad_norm": 0.3067266643047333, "learning_rate": 1.8572162171429905e-05, "loss": 1.5391, "step": 205 }, { "epoch": 0.8956521739130435, "grad_norm": 0.3067266643047333, "learning_rate": 1.8572162171429905e-05, "loss": 1.6426, "step": 206 }, { "epoch": 0.9, "grad_norm": 0.3067266643047333, "learning_rate": 1.8572162171429905e-05, "loss": 1.5383, "step": 207 }, { "epoch": 0.9043478260869565, "grad_norm": 0.3177148103713989, "learning_rate": 1.866388440768456e-05, "loss": 1.5663, "step": 208 }, { "epoch": 0.908695652173913, "grad_norm": 0.3177148103713989, "learning_rate": 1.866388440768456e-05, "loss": 1.4794, "step": 209 }, { "epoch": 0.9130434782608695, "grad_norm": 0.3177148103713989, "learning_rate": 1.866388440768456e-05, "loss": 1.6449, "step": 210 }, { "epoch": 0.9173913043478261, "grad_norm": 0.3177148103713989, "learning_rate": 1.866388440768456e-05, "loss": 1.5473, "step": 211 }, { "epoch": 0.9217391304347826, "grad_norm": 0.32579195499420166, "learning_rate": 1.875385944704652e-05, "loss": 1.5343, "step": 212 }, { "epoch": 0.9260869565217391, "grad_norm": 0.32579195499420166, "learning_rate": 1.875385944704652e-05, "loss": 1.5602, "step": 213 }, { "epoch": 0.9304347826086956, "grad_norm": 0.32579195499420166, "learning_rate": 1.875385944704652e-05, "loss": 1.5161, "step": 214 }, { "epoch": 0.9347826086956522, "grad_norm": 0.32579195499420166, "learning_rate": 1.875385944704652e-05, "loss": 1.5444, "step": 215 }, { "epoch": 0.9391304347826087, "grad_norm": 0.301691472530365, "learning_rate": 1.8842152611103012e-05, "loss": 1.5774, "step": 216 }, { "epoch": 0.9434782608695652, "grad_norm": 0.301691472530365, "learning_rate": 1.8842152611103012e-05, "loss": 1.5541, "step": 217 }, { "epoch": 0.9478260869565217, "grad_norm": 0.301691472530365, "learning_rate": 1.8842152611103012e-05, "loss": 1.5717, "step": 218 }, { "epoch": 0.9521739130434783, "grad_norm": 0.301691472530365, "learning_rate": 1.8842152611103012e-05, "loss": 1.6241, "step": 219 }, { "epoch": 0.9565217391304348, "grad_norm": 0.3248462975025177, "learning_rate": 1.8928825625342374e-05, "loss": 1.5698, "step": 220 }, { "epoch": 0.9608695652173913, "grad_norm": 0.3248462975025177, "learning_rate": 1.8928825625342374e-05, "loss": 1.5808, "step": 221 }, { "epoch": 0.9652173913043478, "grad_norm": 0.3248462975025177, "learning_rate": 1.8928825625342374e-05, "loss": 1.5969, "step": 222 }, { "epoch": 0.9695652173913043, "grad_norm": 0.3248462975025177, "learning_rate": 1.8928825625342374e-05, "loss": 1.5211, "step": 223 }, { "epoch": 0.9739130434782609, "grad_norm": 0.32285231351852417, "learning_rate": 1.901393687836879e-05, "loss": 1.5536, "step": 224 }, { "epoch": 0.9782608695652174, "grad_norm": 0.32285231351852417, "learning_rate": 1.901393687836879e-05, "loss": 1.5651, "step": 225 }, { "epoch": 0.9826086956521739, "grad_norm": 0.32285231351852417, "learning_rate": 1.901393687836879e-05, "loss": 1.5113, "step": 226 }, { "epoch": 0.9869565217391304, "grad_norm": 0.32285231351852417, "learning_rate": 1.901393687836879e-05, "loss": 1.6101, "step": 227 }, { "epoch": 0.991304347826087, "grad_norm": 0.3327453136444092, "learning_rate": 1.9097541658173843e-05, "loss": 1.5708, "step": 228 }, { "epoch": 0.9956521739130435, "grad_norm": 0.3327453136444092, "learning_rate": 1.9097541658173843e-05, "loss": 1.516, "step": 229 }, { "epoch": 1.0, "grad_norm": 0.3327453136444092, "learning_rate": 1.9097541658173843e-05, "loss": 1.6279, "step": 230 }, { "epoch": 1.0043478260869565, "grad_norm": 0.3327453136444092, "learning_rate": 1.9097541658173843e-05, "loss": 1.4676, "step": 231 }, { "epoch": 1.008695652173913, "grad_norm": 0.3148215413093567, "learning_rate": 1.9179692367859514e-05, "loss": 1.5175, "step": 232 }, { "epoch": 1.0130434782608695, "grad_norm": 0.3148215413093567, "learning_rate": 1.9179692367859514e-05, "loss": 1.5148, "step": 233 }, { "epoch": 1.017391304347826, "grad_norm": 0.3148215413093567, "learning_rate": 1.9179692367859514e-05, "loss": 1.505, "step": 234 }, { "epoch": 1.0217391304347827, "grad_norm": 0.3148215413093567, "learning_rate": 1.9179692367859514e-05, "loss": 1.4426, "step": 235 }, { "epoch": 1.0260869565217392, "grad_norm": 0.3161942958831787, "learning_rate": 1.926043872292045e-05, "loss": 1.5709, "step": 236 }, { "epoch": 1.0304347826086957, "grad_norm": 0.3161942958831787, "learning_rate": 1.926043872292045e-05, "loss": 1.5708, "step": 237 }, { "epoch": 1.0347826086956522, "grad_norm": 0.3161942958831787, "learning_rate": 1.926043872292045e-05, "loss": 1.4882, "step": 238 }, { "epoch": 1.0391304347826087, "grad_norm": 0.3161942958831787, "learning_rate": 1.926043872292045e-05, "loss": 1.5182, "step": 239 }, { "epoch": 1.0434782608695652, "grad_norm": 0.3121671676635742, "learning_rate": 1.9339827931945454e-05, "loss": 1.59, "step": 240 }, { "epoch": 1.0478260869565217, "grad_norm": 0.3121671676635742, "learning_rate": 1.9339827931945454e-05, "loss": 1.5309, "step": 241 }, { "epoch": 1.0521739130434782, "grad_norm": 0.3121671676635742, "learning_rate": 1.9339827931945454e-05, "loss": 1.4626, "step": 242 }, { "epoch": 1.0565217391304347, "grad_norm": 0.3121671676635742, "learning_rate": 1.9339827931945454e-05, "loss": 1.5466, "step": 243 }, { "epoch": 1.0608695652173914, "grad_norm": 0.3044740557670593, "learning_rate": 1.941790486238291e-05, "loss": 1.5257, "step": 244 }, { "epoch": 1.065217391304348, "grad_norm": 0.3044740557670593, "learning_rate": 1.941790486238291e-05, "loss": 1.6118, "step": 245 }, { "epoch": 1.0695652173913044, "grad_norm": 0.3044740557670593, "learning_rate": 1.941790486238291e-05, "loss": 1.5306, "step": 246 }, { "epoch": 1.0739130434782609, "grad_norm": 0.3044740557670593, "learning_rate": 1.941790486238291e-05, "loss": 1.4952, "step": 247 }, { "epoch": 1.0782608695652174, "grad_norm": 0.3415665626525879, "learning_rate": 1.949471219282736e-05, "loss": 1.5271, "step": 248 }, { "epoch": 1.0826086956521739, "grad_norm": 0.3415665626525879, "learning_rate": 1.949471219282736e-05, "loss": 1.4936, "step": 249 }, { "epoch": 1.0869565217391304, "grad_norm": 0.3415665626525879, "learning_rate": 1.949471219282736e-05, "loss": 1.49, "step": 250 }, { "epoch": 1.0913043478260869, "grad_norm": 0.3415665626525879, "learning_rate": 1.949471219282736e-05, "loss": 1.5655, "step": 251 }, { "epoch": 1.0956521739130434, "grad_norm": 0.3097412586212158, "learning_rate": 1.9570290553121646e-05, "loss": 1.4897, "step": 252 }, { "epoch": 1.1, "grad_norm": 0.3097412586212158, "learning_rate": 1.9570290553121646e-05, "loss": 1.4897, "step": 253 }, { "epoch": 1.1043478260869566, "grad_norm": 0.3097412586212158, "learning_rate": 1.9570290553121646e-05, "loss": 1.5251, "step": 254 }, { "epoch": 1.108695652173913, "grad_norm": 0.3097412586212158, "learning_rate": 1.9570290553121646e-05, "loss": 1.451, "step": 255 }, { "epoch": 1.1130434782608696, "grad_norm": 0.3289230763912201, "learning_rate": 1.9644678653425876e-05, "loss": 1.4828, "step": 256 }, { "epoch": 1.117391304347826, "grad_norm": 0.3289230763912201, "learning_rate": 1.9644678653425876e-05, "loss": 1.5472, "step": 257 }, { "epoch": 1.1217391304347826, "grad_norm": 0.3289230763912201, "learning_rate": 1.9644678653425876e-05, "loss": 1.5391, "step": 258 }, { "epoch": 1.126086956521739, "grad_norm": 0.3289230763912201, "learning_rate": 1.9644678653425876e-05, "loss": 1.5493, "step": 259 }, { "epoch": 1.1304347826086956, "grad_norm": 0.3122900128364563, "learning_rate": 1.971791340327986e-05, "loss": 1.4891, "step": 260 }, { "epoch": 1.134782608695652, "grad_norm": 0.3122900128364563, "learning_rate": 1.971791340327986e-05, "loss": 1.4754, "step": 261 }, { "epoch": 1.1391304347826088, "grad_norm": 0.3122900128364563, "learning_rate": 1.971791340327986e-05, "loss": 1.4819, "step": 262 }, { "epoch": 1.1434782608695653, "grad_norm": 0.3122900128364563, "learning_rate": 1.971791340327986e-05, "loss": 1.5694, "step": 263 }, { "epoch": 1.1478260869565218, "grad_norm": 0.3180365562438965, "learning_rate": 1.979003002157565e-05, "loss": 1.4982, "step": 264 }, { "epoch": 1.1521739130434783, "grad_norm": 0.3180365562438965, "learning_rate": 1.979003002157565e-05, "loss": 1.5122, "step": 265 }, { "epoch": 1.1565217391304348, "grad_norm": 0.3180365562438965, "learning_rate": 1.979003002157565e-05, "loss": 1.5615, "step": 266 }, { "epoch": 1.1608695652173913, "grad_norm": 0.3180365562438965, "learning_rate": 1.979003002157565e-05, "loss": 1.5346, "step": 267 }, { "epoch": 1.1652173913043478, "grad_norm": 0.29976481199264526, "learning_rate": 1.9861062138260542e-05, "loss": 1.5567, "step": 268 }, { "epoch": 1.1695652173913043, "grad_norm": 0.29976481199264526, "learning_rate": 1.9861062138260542e-05, "loss": 1.5182, "step": 269 }, { "epoch": 1.1739130434782608, "grad_norm": 0.29976481199264526, "learning_rate": 1.9861062138260542e-05, "loss": 1.4794, "step": 270 }, { "epoch": 1.1782608695652175, "grad_norm": 0.29976481199264526, "learning_rate": 1.9861062138260542e-05, "loss": 1.5981, "step": 271 }, { "epoch": 1.182608695652174, "grad_norm": 0.3091888129711151, "learning_rate": 1.993104188850563e-05, "loss": 1.4554, "step": 272 }, { "epoch": 1.1869565217391305, "grad_norm": 0.3091888129711151, "learning_rate": 1.993104188850563e-05, "loss": 1.5417, "step": 273 }, { "epoch": 1.191304347826087, "grad_norm": 0.3091888129711151, "learning_rate": 1.993104188850563e-05, "loss": 1.4955, "step": 274 }, { "epoch": 1.1956521739130435, "grad_norm": 0.3091888129711151, "learning_rate": 1.993104188850563e-05, "loss": 1.4146, "step": 275 }, { "epoch": 1.2, "grad_norm": 0.3073773682117462, "learning_rate": 2e-05, "loss": 1.5044, "step": 276 }, { "epoch": 1.2043478260869565, "grad_norm": 0.3073773682117462, "learning_rate": 2e-05, "loss": 1.5285, "step": 277 }, { "epoch": 1.208695652173913, "grad_norm": 0.3073773682117462, "learning_rate": 2e-05, "loss": 1.5259, "step": 278 }, { "epoch": 1.2130434782608694, "grad_norm": 0.3073773682117462, "learning_rate": 2e-05, "loss": 1.5399, "step": 279 }, { "epoch": 1.2173913043478262, "grad_norm": 0.31173551082611084, "learning_rate": 2e-05, "loss": 1.4887, "step": 280 }, { "epoch": 1.2217391304347827, "grad_norm": 0.31173551082611084, "learning_rate": 2e-05, "loss": 1.4436, "step": 281 }, { "epoch": 1.2260869565217392, "grad_norm": 0.31173551082611084, "learning_rate": 2e-05, "loss": 1.412, "step": 282 }, { "epoch": 1.2304347826086957, "grad_norm": 0.31173551082611084, "learning_rate": 2e-05, "loss": 1.489, "step": 283 }, { "epoch": 1.2347826086956522, "grad_norm": 0.28800562024116516, "learning_rate": 2e-05, "loss": 1.5448, "step": 284 }, { "epoch": 1.2391304347826086, "grad_norm": 0.28800562024116516, "learning_rate": 2e-05, "loss": 1.4644, "step": 285 }, { "epoch": 1.2434782608695651, "grad_norm": 0.28800562024116516, "learning_rate": 2e-05, "loss": 1.5625, "step": 286 }, { "epoch": 1.2478260869565219, "grad_norm": 0.28800562024116516, "learning_rate": 2e-05, "loss": 1.5496, "step": 287 }, { "epoch": 1.2521739130434781, "grad_norm": 0.3107075095176697, "learning_rate": 2e-05, "loss": 1.4766, "step": 288 }, { "epoch": 1.2565217391304349, "grad_norm": 0.3107075095176697, "learning_rate": 2e-05, "loss": 1.5138, "step": 289 }, { "epoch": 1.2608695652173914, "grad_norm": 0.3107075095176697, "learning_rate": 2e-05, "loss": 1.4492, "step": 290 }, { "epoch": 1.2652173913043478, "grad_norm": 0.3107075095176697, "learning_rate": 2e-05, "loss": 1.4896, "step": 291 }, { "epoch": 1.2695652173913043, "grad_norm": 0.29269176721572876, "learning_rate": 2e-05, "loss": 1.5382, "step": 292 }, { "epoch": 1.2739130434782608, "grad_norm": 0.29269176721572876, "learning_rate": 2e-05, "loss": 1.563, "step": 293 }, { "epoch": 1.2782608695652173, "grad_norm": 0.29269176721572876, "learning_rate": 2e-05, "loss": 1.5334, "step": 294 }, { "epoch": 1.2826086956521738, "grad_norm": 0.29269176721572876, "learning_rate": 2e-05, "loss": 1.4961, "step": 295 }, { "epoch": 1.2869565217391306, "grad_norm": 0.32013219594955444, "learning_rate": 2e-05, "loss": 1.5529, "step": 296 }, { "epoch": 1.2913043478260868, "grad_norm": 0.32013219594955444, "learning_rate": 2e-05, "loss": 1.4677, "step": 297 }, { "epoch": 1.2956521739130435, "grad_norm": 0.32013219594955444, "learning_rate": 2e-05, "loss": 1.4208, "step": 298 }, { "epoch": 1.3, "grad_norm": 0.32013219594955444, "learning_rate": 2e-05, "loss": 1.5618, "step": 299 }, { "epoch": 1.3043478260869565, "grad_norm": 0.31480494141578674, "learning_rate": 2e-05, "loss": 1.4929, "step": 300 }, { "epoch": 1.308695652173913, "grad_norm": 0.31480494141578674, "learning_rate": 2e-05, "loss": 1.4795, "step": 301 }, { "epoch": 1.3130434782608695, "grad_norm": 0.31480494141578674, "learning_rate": 2e-05, "loss": 1.5061, "step": 302 }, { "epoch": 1.317391304347826, "grad_norm": 0.31480494141578674, "learning_rate": 2e-05, "loss": 1.4551, "step": 303 }, { "epoch": 1.3217391304347825, "grad_norm": 0.3061749339103699, "learning_rate": 2e-05, "loss": 1.5821, "step": 304 }, { "epoch": 1.3260869565217392, "grad_norm": 0.3061749339103699, "learning_rate": 2e-05, "loss": 1.5486, "step": 305 }, { "epoch": 1.3304347826086955, "grad_norm": 0.3061749339103699, "learning_rate": 2e-05, "loss": 1.5276, "step": 306 }, { "epoch": 1.3347826086956522, "grad_norm": 0.3061749339103699, "learning_rate": 2e-05, "loss": 1.5527, "step": 307 }, { "epoch": 1.3391304347826087, "grad_norm": 0.30773481726646423, "learning_rate": 2e-05, "loss": 1.5047, "step": 308 }, { "epoch": 1.3434782608695652, "grad_norm": 0.30773481726646423, "learning_rate": 2e-05, "loss": 1.4873, "step": 309 }, { "epoch": 1.3478260869565217, "grad_norm": 0.30773481726646423, "learning_rate": 2e-05, "loss": 1.4933, "step": 310 }, { "epoch": 1.3521739130434782, "grad_norm": 0.30773481726646423, "learning_rate": 2e-05, "loss": 1.481, "step": 311 }, { "epoch": 1.3565217391304347, "grad_norm": 0.3024744987487793, "learning_rate": 2e-05, "loss": 1.5306, "step": 312 }, { "epoch": 1.3608695652173912, "grad_norm": 0.3024744987487793, "learning_rate": 2e-05, "loss": 1.4178, "step": 313 }, { "epoch": 1.365217391304348, "grad_norm": 0.3024744987487793, "learning_rate": 2e-05, "loss": 1.4804, "step": 314 }, { "epoch": 1.3695652173913042, "grad_norm": 0.3024744987487793, "learning_rate": 2e-05, "loss": 1.4536, "step": 315 }, { "epoch": 1.373913043478261, "grad_norm": 0.28332576155662537, "learning_rate": 2e-05, "loss": 1.4416, "step": 316 }, { "epoch": 1.3782608695652174, "grad_norm": 0.28332576155662537, "learning_rate": 2e-05, "loss": 1.4684, "step": 317 }, { "epoch": 1.382608695652174, "grad_norm": 0.28332576155662537, "learning_rate": 2e-05, "loss": 1.46, "step": 318 }, { "epoch": 1.3869565217391304, "grad_norm": 0.28332576155662537, "learning_rate": 2e-05, "loss": 1.5102, "step": 319 }, { "epoch": 1.391304347826087, "grad_norm": 0.3170158863067627, "learning_rate": 2e-05, "loss": 1.5092, "step": 320 }, { "epoch": 1.3956521739130434, "grad_norm": 0.3170158863067627, "learning_rate": 2e-05, "loss": 1.4435, "step": 321 }, { "epoch": 1.4, "grad_norm": 0.3170158863067627, "learning_rate": 2e-05, "loss": 1.4784, "step": 322 }, { "epoch": 1.4043478260869566, "grad_norm": 0.3170158863067627, "learning_rate": 2e-05, "loss": 1.5239, "step": 323 }, { "epoch": 1.4086956521739131, "grad_norm": 0.32871973514556885, "learning_rate": 2e-05, "loss": 1.5463, "step": 324 }, { "epoch": 1.4130434782608696, "grad_norm": 0.32871973514556885, "learning_rate": 2e-05, "loss": 1.5111, "step": 325 }, { "epoch": 1.4173913043478261, "grad_norm": 0.32871973514556885, "learning_rate": 2e-05, "loss": 1.5124, "step": 326 }, { "epoch": 1.4217391304347826, "grad_norm": 0.32871973514556885, "learning_rate": 2e-05, "loss": 1.5062, "step": 327 }, { "epoch": 1.4260869565217391, "grad_norm": 0.3206692636013031, "learning_rate": 2e-05, "loss": 1.4279, "step": 328 }, { "epoch": 1.4304347826086956, "grad_norm": 0.3206692636013031, "learning_rate": 2e-05, "loss": 1.5357, "step": 329 }, { "epoch": 1.434782608695652, "grad_norm": 0.3206692636013031, "learning_rate": 2e-05, "loss": 1.4907, "step": 330 }, { "epoch": 1.4391304347826086, "grad_norm": 0.3206692636013031, "learning_rate": 2e-05, "loss": 1.5184, "step": 331 }, { "epoch": 1.4434782608695653, "grad_norm": 0.31712856888771057, "learning_rate": 2e-05, "loss": 1.4562, "step": 332 }, { "epoch": 1.4478260869565218, "grad_norm": 0.31712856888771057, "learning_rate": 2e-05, "loss": 1.4774, "step": 333 }, { "epoch": 1.4521739130434783, "grad_norm": 0.31712856888771057, "learning_rate": 2e-05, "loss": 1.5188, "step": 334 }, { "epoch": 1.4565217391304348, "grad_norm": 0.31712856888771057, "learning_rate": 2e-05, "loss": 1.4372, "step": 335 }, { "epoch": 1.4608695652173913, "grad_norm": 0.3141573965549469, "learning_rate": 2e-05, "loss": 1.5092, "step": 336 }, { "epoch": 1.4652173913043478, "grad_norm": 0.3141573965549469, "learning_rate": 2e-05, "loss": 1.4997, "step": 337 }, { "epoch": 1.4695652173913043, "grad_norm": 0.3141573965549469, "learning_rate": 2e-05, "loss": 1.513, "step": 338 }, { "epoch": 1.4739130434782608, "grad_norm": 0.3141573965549469, "learning_rate": 2e-05, "loss": 1.4096, "step": 339 }, { "epoch": 1.4782608695652173, "grad_norm": 0.31141531467437744, "learning_rate": 2e-05, "loss": 1.4981, "step": 340 }, { "epoch": 1.482608695652174, "grad_norm": 0.31141531467437744, "learning_rate": 2e-05, "loss": 1.4774, "step": 341 }, { "epoch": 1.4869565217391305, "grad_norm": 0.31141531467437744, "learning_rate": 2e-05, "loss": 1.4529, "step": 342 }, { "epoch": 1.491304347826087, "grad_norm": 0.31141531467437744, "learning_rate": 2e-05, "loss": 1.4369, "step": 343 }, { "epoch": 1.4956521739130435, "grad_norm": 0.32201042771339417, "learning_rate": 2e-05, "loss": 1.5313, "step": 344 }, { "epoch": 1.5, "grad_norm": 0.32201042771339417, "learning_rate": 2e-05, "loss": 1.508, "step": 345 }, { "epoch": 1.5043478260869565, "grad_norm": 0.32201042771339417, "learning_rate": 2e-05, "loss": 1.5375, "step": 346 }, { "epoch": 1.508695652173913, "grad_norm": 0.32201042771339417, "learning_rate": 2e-05, "loss": 1.5526, "step": 347 }, { "epoch": 1.5130434782608697, "grad_norm": 0.3498378098011017, "learning_rate": 2e-05, "loss": 1.464, "step": 348 }, { "epoch": 1.517391304347826, "grad_norm": 0.3498378098011017, "learning_rate": 2e-05, "loss": 1.498, "step": 349 }, { "epoch": 1.5217391304347827, "grad_norm": 0.3498378098011017, "learning_rate": 2e-05, "loss": 1.4936, "step": 350 }, { "epoch": 1.526086956521739, "grad_norm": 0.3498378098011017, "learning_rate": 2e-05, "loss": 1.5393, "step": 351 }, { "epoch": 1.5304347826086957, "grad_norm": 0.31443101167678833, "learning_rate": 2e-05, "loss": 1.3899, "step": 352 }, { "epoch": 1.5347826086956522, "grad_norm": 0.31443101167678833, "learning_rate": 2e-05, "loss": 1.4724, "step": 353 }, { "epoch": 1.5391304347826087, "grad_norm": 0.31443101167678833, "learning_rate": 2e-05, "loss": 1.5721, "step": 354 }, { "epoch": 1.5434782608695652, "grad_norm": 0.31443101167678833, "learning_rate": 2e-05, "loss": 1.4112, "step": 355 }, { "epoch": 1.5478260869565217, "grad_norm": 0.30996161699295044, "learning_rate": 2e-05, "loss": 1.4969, "step": 356 }, { "epoch": 1.5521739130434784, "grad_norm": 0.30996161699295044, "learning_rate": 2e-05, "loss": 1.442, "step": 357 }, { "epoch": 1.5565217391304347, "grad_norm": 0.30996161699295044, "learning_rate": 2e-05, "loss": 1.4836, "step": 358 }, { "epoch": 1.5608695652173914, "grad_norm": 0.30996161699295044, "learning_rate": 2e-05, "loss": 1.4341, "step": 359 }, { "epoch": 1.5652173913043477, "grad_norm": 0.31205815076828003, "learning_rate": 2e-05, "loss": 1.5243, "step": 360 }, { "epoch": 1.5695652173913044, "grad_norm": 0.31205815076828003, "learning_rate": 2e-05, "loss": 1.5261, "step": 361 }, { "epoch": 1.5739130434782609, "grad_norm": 0.31205815076828003, "learning_rate": 2e-05, "loss": 1.5295, "step": 362 }, { "epoch": 1.5782608695652174, "grad_norm": 0.31205815076828003, "learning_rate": 2e-05, "loss": 1.4583, "step": 363 }, { "epoch": 1.5826086956521739, "grad_norm": 0.31103307008743286, "learning_rate": 2e-05, "loss": 1.4848, "step": 364 }, { "epoch": 1.5869565217391304, "grad_norm": 0.31103307008743286, "learning_rate": 2e-05, "loss": 1.4382, "step": 365 }, { "epoch": 1.591304347826087, "grad_norm": 0.31103307008743286, "learning_rate": 2e-05, "loss": 1.4936, "step": 366 }, { "epoch": 1.5956521739130434, "grad_norm": 0.31103307008743286, "learning_rate": 2e-05, "loss": 1.4764, "step": 367 }, { "epoch": 1.6, "grad_norm": 0.3369222581386566, "learning_rate": 2e-05, "loss": 1.5201, "step": 368 }, { "epoch": 1.6043478260869564, "grad_norm": 0.3369222581386566, "learning_rate": 2e-05, "loss": 1.4769, "step": 369 }, { "epoch": 1.608695652173913, "grad_norm": 0.3369222581386566, "learning_rate": 2e-05, "loss": 1.459, "step": 370 }, { "epoch": 1.6130434782608696, "grad_norm": 0.3369222581386566, "learning_rate": 2e-05, "loss": 1.4849, "step": 371 }, { "epoch": 1.617391304347826, "grad_norm": 0.32731279730796814, "learning_rate": 2e-05, "loss": 1.4742, "step": 372 }, { "epoch": 1.6217391304347826, "grad_norm": 0.32731279730796814, "learning_rate": 2e-05, "loss": 1.4444, "step": 373 }, { "epoch": 1.626086956521739, "grad_norm": 0.32731279730796814, "learning_rate": 2e-05, "loss": 1.4716, "step": 374 }, { "epoch": 1.6304347826086958, "grad_norm": 0.32731279730796814, "learning_rate": 2e-05, "loss": 1.474, "step": 375 }, { "epoch": 1.634782608695652, "grad_norm": 0.3295840620994568, "learning_rate": 2e-05, "loss": 1.4419, "step": 376 }, { "epoch": 1.6391304347826088, "grad_norm": 0.3295840620994568, "learning_rate": 2e-05, "loss": 1.4744, "step": 377 }, { "epoch": 1.643478260869565, "grad_norm": 0.3295840620994568, "learning_rate": 2e-05, "loss": 1.4461, "step": 378 }, { "epoch": 1.6478260869565218, "grad_norm": 0.3295840620994568, "learning_rate": 2e-05, "loss": 1.4591, "step": 379 }, { "epoch": 1.6521739130434783, "grad_norm": 0.326766699552536, "learning_rate": 2e-05, "loss": 1.4586, "step": 380 }, { "epoch": 1.6565217391304348, "grad_norm": 0.326766699552536, "learning_rate": 2e-05, "loss": 1.4196, "step": 381 }, { "epoch": 1.6608695652173913, "grad_norm": 0.326766699552536, "learning_rate": 2e-05, "loss": 1.4268, "step": 382 }, { "epoch": 1.6652173913043478, "grad_norm": 0.326766699552536, "learning_rate": 2e-05, "loss": 1.4626, "step": 383 }, { "epoch": 1.6695652173913045, "grad_norm": 0.31557193398475647, "learning_rate": 2e-05, "loss": 1.5136, "step": 384 }, { "epoch": 1.6739130434782608, "grad_norm": 0.31557193398475647, "learning_rate": 2e-05, "loss": 1.5269, "step": 385 }, { "epoch": 1.6782608695652175, "grad_norm": 0.31557193398475647, "learning_rate": 2e-05, "loss": 1.4681, "step": 386 }, { "epoch": 1.6826086956521737, "grad_norm": 0.31557193398475647, "learning_rate": 2e-05, "loss": 1.4613, "step": 387 }, { "epoch": 1.6869565217391305, "grad_norm": 0.30831843614578247, "learning_rate": 2e-05, "loss": 1.5108, "step": 388 }, { "epoch": 1.691304347826087, "grad_norm": 0.30831843614578247, "learning_rate": 2e-05, "loss": 1.4337, "step": 389 }, { "epoch": 1.6956521739130435, "grad_norm": 0.30831843614578247, "learning_rate": 2e-05, "loss": 1.4706, "step": 390 }, { "epoch": 1.7, "grad_norm": 0.30831843614578247, "learning_rate": 2e-05, "loss": 1.4333, "step": 391 }, { "epoch": 1.7043478260869565, "grad_norm": 0.3329159915447235, "learning_rate": 2e-05, "loss": 1.4945, "step": 392 }, { "epoch": 1.7086956521739132, "grad_norm": 0.3329159915447235, "learning_rate": 2e-05, "loss": 1.4734, "step": 393 }, { "epoch": 1.7130434782608694, "grad_norm": 0.3329159915447235, "learning_rate": 2e-05, "loss": 1.4894, "step": 394 }, { "epoch": 1.7173913043478262, "grad_norm": 0.3329159915447235, "learning_rate": 2e-05, "loss": 1.5237, "step": 395 }, { "epoch": 1.7217391304347827, "grad_norm": 0.33224427700042725, "learning_rate": 2e-05, "loss": 1.4447, "step": 396 }, { "epoch": 1.7260869565217392, "grad_norm": 0.33224427700042725, "learning_rate": 2e-05, "loss": 1.4854, "step": 397 }, { "epoch": 1.7304347826086957, "grad_norm": 0.33224427700042725, "learning_rate": 2e-05, "loss": 1.4955, "step": 398 }, { "epoch": 1.7347826086956522, "grad_norm": 0.33224427700042725, "learning_rate": 2e-05, "loss": 1.5115, "step": 399 }, { "epoch": 1.7391304347826086, "grad_norm": 0.31770649552345276, "learning_rate": 2e-05, "loss": 1.53, "step": 400 }, { "epoch": 1.7434782608695651, "grad_norm": 0.31770649552345276, "learning_rate": 2e-05, "loss": 1.49, "step": 401 }, { "epoch": 1.7478260869565219, "grad_norm": 0.31770649552345276, "learning_rate": 2e-05, "loss": 1.4991, "step": 402 }, { "epoch": 1.7521739130434781, "grad_norm": 0.31770649552345276, "learning_rate": 2e-05, "loss": 1.5023, "step": 403 }, { "epoch": 1.7565217391304349, "grad_norm": 0.3262755870819092, "learning_rate": 2e-05, "loss": 1.4715, "step": 404 }, { "epoch": 1.7608695652173914, "grad_norm": 0.3262755870819092, "learning_rate": 2e-05, "loss": 1.4738, "step": 405 }, { "epoch": 1.7652173913043478, "grad_norm": 0.3262755870819092, "learning_rate": 2e-05, "loss": 1.4988, "step": 406 }, { "epoch": 1.7695652173913043, "grad_norm": 0.3262755870819092, "learning_rate": 2e-05, "loss": 1.4644, "step": 407 }, { "epoch": 1.7739130434782608, "grad_norm": 0.3169100880622864, "learning_rate": 2e-05, "loss": 1.4858, "step": 408 }, { "epoch": 1.7782608695652173, "grad_norm": 0.3169100880622864, "learning_rate": 2e-05, "loss": 1.5205, "step": 409 }, { "epoch": 1.7826086956521738, "grad_norm": 0.3169100880622864, "learning_rate": 2e-05, "loss": 1.4483, "step": 410 }, { "epoch": 1.7869565217391306, "grad_norm": 0.3169100880622864, "learning_rate": 2e-05, "loss": 1.3957, "step": 411 }, { "epoch": 1.7913043478260868, "grad_norm": 0.3154841661453247, "learning_rate": 2e-05, "loss": 1.4797, "step": 412 }, { "epoch": 1.7956521739130435, "grad_norm": 0.3154841661453247, "learning_rate": 2e-05, "loss": 1.4251, "step": 413 }, { "epoch": 1.8, "grad_norm": 0.3154841661453247, "learning_rate": 2e-05, "loss": 1.4896, "step": 414 }, { "epoch": 1.8043478260869565, "grad_norm": 0.3154841661453247, "learning_rate": 2e-05, "loss": 1.4793, "step": 415 }, { "epoch": 1.808695652173913, "grad_norm": 0.33579424023628235, "learning_rate": 2e-05, "loss": 1.4919, "step": 416 }, { "epoch": 1.8130434782608695, "grad_norm": 0.33579424023628235, "learning_rate": 2e-05, "loss": 1.5001, "step": 417 }, { "epoch": 1.8173913043478263, "grad_norm": 0.33579424023628235, "learning_rate": 2e-05, "loss": 1.5017, "step": 418 }, { "epoch": 1.8217391304347825, "grad_norm": 0.33579424023628235, "learning_rate": 2e-05, "loss": 1.4573, "step": 419 }, { "epoch": 1.8260869565217392, "grad_norm": 0.3174879550933838, "learning_rate": 2e-05, "loss": 1.5409, "step": 420 }, { "epoch": 1.8304347826086955, "grad_norm": 0.3174879550933838, "learning_rate": 2e-05, "loss": 1.4664, "step": 421 }, { "epoch": 1.8347826086956522, "grad_norm": 0.3174879550933838, "learning_rate": 2e-05, "loss": 1.4139, "step": 422 }, { "epoch": 1.8391304347826087, "grad_norm": 0.3174879550933838, "learning_rate": 2e-05, "loss": 1.5241, "step": 423 }, { "epoch": 1.8434782608695652, "grad_norm": 0.32846924662590027, "learning_rate": 2e-05, "loss": 1.4632, "step": 424 }, { "epoch": 1.8478260869565217, "grad_norm": 0.32846924662590027, "learning_rate": 2e-05, "loss": 1.4625, "step": 425 }, { "epoch": 1.8521739130434782, "grad_norm": 0.32846924662590027, "learning_rate": 2e-05, "loss": 1.4093, "step": 426 }, { "epoch": 1.856521739130435, "grad_norm": 0.32846924662590027, "learning_rate": 2e-05, "loss": 1.4844, "step": 427 }, { "epoch": 1.8608695652173912, "grad_norm": 0.31925633549690247, "learning_rate": 2e-05, "loss": 1.4778, "step": 428 }, { "epoch": 1.865217391304348, "grad_norm": 0.31925633549690247, "learning_rate": 2e-05, "loss": 1.4719, "step": 429 }, { "epoch": 1.8695652173913042, "grad_norm": 0.31925633549690247, "learning_rate": 2e-05, "loss": 1.4404, "step": 430 }, { "epoch": 1.873913043478261, "grad_norm": 0.31925633549690247, "learning_rate": 2e-05, "loss": 1.4333, "step": 431 }, { "epoch": 1.8782608695652174, "grad_norm": 0.31410935521125793, "learning_rate": 2e-05, "loss": 1.452, "step": 432 }, { "epoch": 1.882608695652174, "grad_norm": 0.31410935521125793, "learning_rate": 2e-05, "loss": 1.4822, "step": 433 }, { "epoch": 1.8869565217391304, "grad_norm": 0.31410935521125793, "learning_rate": 2e-05, "loss": 1.4667, "step": 434 }, { "epoch": 1.891304347826087, "grad_norm": 0.31410935521125793, "learning_rate": 2e-05, "loss": 1.475, "step": 435 }, { "epoch": 1.8956521739130436, "grad_norm": 0.3550223112106323, "learning_rate": 2e-05, "loss": 1.3765, "step": 436 }, { "epoch": 1.9, "grad_norm": 0.3550223112106323, "learning_rate": 2e-05, "loss": 1.4709, "step": 437 }, { "epoch": 1.9043478260869566, "grad_norm": 0.3550223112106323, "learning_rate": 2e-05, "loss": 1.529, "step": 438 }, { "epoch": 1.908695652173913, "grad_norm": 0.3550223112106323, "learning_rate": 2e-05, "loss": 1.4646, "step": 439 }, { "epoch": 1.9130434782608696, "grad_norm": 0.31760966777801514, "learning_rate": 2e-05, "loss": 1.4878, "step": 440 }, { "epoch": 1.9173913043478261, "grad_norm": 0.31760966777801514, "learning_rate": 2e-05, "loss": 1.4272, "step": 441 }, { "epoch": 1.9217391304347826, "grad_norm": 0.31760966777801514, "learning_rate": 2e-05, "loss": 1.4009, "step": 442 }, { "epoch": 1.9260869565217391, "grad_norm": 0.31760966777801514, "learning_rate": 2e-05, "loss": 1.3982, "step": 443 }, { "epoch": 1.9304347826086956, "grad_norm": 0.3307594358921051, "learning_rate": 2e-05, "loss": 1.4638, "step": 444 }, { "epoch": 1.9347826086956523, "grad_norm": 0.3307594358921051, "learning_rate": 2e-05, "loss": 1.4494, "step": 445 }, { "epoch": 1.9391304347826086, "grad_norm": 0.3307594358921051, "learning_rate": 2e-05, "loss": 1.4583, "step": 446 }, { "epoch": 1.9434782608695653, "grad_norm": 0.3307594358921051, "learning_rate": 2e-05, "loss": 1.529, "step": 447 }, { "epoch": 1.9478260869565216, "grad_norm": 0.38704583048820496, "learning_rate": 2e-05, "loss": 1.4085, "step": 448 }, { "epoch": 1.9521739130434783, "grad_norm": 0.38704583048820496, "learning_rate": 2e-05, "loss": 1.4487, "step": 449 }, { "epoch": 1.9565217391304348, "grad_norm": 0.38704583048820496, "learning_rate": 2e-05, "loss": 1.4419, "step": 450 }, { "epoch": 1.9608695652173913, "grad_norm": 0.38704583048820496, "learning_rate": 2e-05, "loss": 1.4364, "step": 451 }, { "epoch": 1.9652173913043478, "grad_norm": 0.3177286684513092, "learning_rate": 2e-05, "loss": 1.5169, "step": 452 }, { "epoch": 1.9695652173913043, "grad_norm": 0.3177286684513092, "learning_rate": 2e-05, "loss": 1.4835, "step": 453 }, { "epoch": 1.973913043478261, "grad_norm": 0.3177286684513092, "learning_rate": 2e-05, "loss": 1.4178, "step": 454 }, { "epoch": 1.9782608695652173, "grad_norm": 0.3177286684513092, "learning_rate": 2e-05, "loss": 1.4891, "step": 455 }, { "epoch": 1.982608695652174, "grad_norm": 0.31212443113327026, "learning_rate": 2e-05, "loss": 1.4395, "step": 456 }, { "epoch": 1.9869565217391303, "grad_norm": 0.31212443113327026, "learning_rate": 2e-05, "loss": 1.4942, "step": 457 }, { "epoch": 1.991304347826087, "grad_norm": 0.31212443113327026, "learning_rate": 2e-05, "loss": 1.4559, "step": 458 }, { "epoch": 1.9956521739130435, "grad_norm": 0.31212443113327026, "learning_rate": 2e-05, "loss": 1.5684, "step": 459 }, { "epoch": 2.0, "grad_norm": 0.34978216886520386, "learning_rate": 2e-05, "loss": 1.4239, "step": 460 }, { "epoch": 2.0043478260869567, "grad_norm": 0.34978216886520386, "learning_rate": 2e-05, "loss": 1.4397, "step": 461 }, { "epoch": 2.008695652173913, "grad_norm": 0.34978216886520386, "learning_rate": 2e-05, "loss": 1.4092, "step": 462 }, { "epoch": 2.0130434782608697, "grad_norm": 0.34978216886520386, "learning_rate": 2e-05, "loss": 1.3932, "step": 463 }, { "epoch": 2.017391304347826, "grad_norm": 0.3233121931552887, "learning_rate": 2e-05, "loss": 1.4781, "step": 464 }, { "epoch": 2.0217391304347827, "grad_norm": 0.3233121931552887, "learning_rate": 2e-05, "loss": 1.3867, "step": 465 }, { "epoch": 2.026086956521739, "grad_norm": 0.3233121931552887, "learning_rate": 2e-05, "loss": 1.4451, "step": 466 }, { "epoch": 2.0304347826086957, "grad_norm": 0.3233121931552887, "learning_rate": 2e-05, "loss": 1.4717, "step": 467 }, { "epoch": 2.034782608695652, "grad_norm": 0.3593771159648895, "learning_rate": 2e-05, "loss": 1.4251, "step": 468 }, { "epoch": 2.0391304347826087, "grad_norm": 0.3593771159648895, "learning_rate": 2e-05, "loss": 1.3973, "step": 469 }, { "epoch": 2.0434782608695654, "grad_norm": 0.3593771159648895, "learning_rate": 2e-05, "loss": 1.4282, "step": 470 }, { "epoch": 2.0478260869565217, "grad_norm": 0.3593771159648895, "learning_rate": 2e-05, "loss": 1.413, "step": 471 }, { "epoch": 2.0521739130434784, "grad_norm": 0.3393575847148895, "learning_rate": 2e-05, "loss": 1.3597, "step": 472 }, { "epoch": 2.0565217391304347, "grad_norm": 0.3393575847148895, "learning_rate": 2e-05, "loss": 1.4215, "step": 473 }, { "epoch": 2.0608695652173914, "grad_norm": 0.3393575847148895, "learning_rate": 2e-05, "loss": 1.475, "step": 474 }, { "epoch": 2.0652173913043477, "grad_norm": 0.3393575847148895, "learning_rate": 2e-05, "loss": 1.3778, "step": 475 }, { "epoch": 2.0695652173913044, "grad_norm": 0.34234023094177246, "learning_rate": 2e-05, "loss": 1.4295, "step": 476 }, { "epoch": 2.0739130434782607, "grad_norm": 0.34234023094177246, "learning_rate": 2e-05, "loss": 1.4279, "step": 477 }, { "epoch": 2.0782608695652174, "grad_norm": 0.34234023094177246, "learning_rate": 2e-05, "loss": 1.4368, "step": 478 }, { "epoch": 2.082608695652174, "grad_norm": 0.34234023094177246, "learning_rate": 2e-05, "loss": 1.3915, "step": 479 }, { "epoch": 2.0869565217391304, "grad_norm": 0.34392404556274414, "learning_rate": 2e-05, "loss": 1.4547, "step": 480 }, { "epoch": 2.091304347826087, "grad_norm": 0.34392404556274414, "learning_rate": 2e-05, "loss": 1.395, "step": 481 }, { "epoch": 2.0956521739130434, "grad_norm": 0.34392404556274414, "learning_rate": 2e-05, "loss": 1.4057, "step": 482 }, { "epoch": 2.1, "grad_norm": 0.34392404556274414, "learning_rate": 2e-05, "loss": 1.3388, "step": 483 }, { "epoch": 2.1043478260869564, "grad_norm": 0.32572683691978455, "learning_rate": 2e-05, "loss": 1.3976, "step": 484 }, { "epoch": 2.108695652173913, "grad_norm": 0.32572683691978455, "learning_rate": 2e-05, "loss": 1.3961, "step": 485 }, { "epoch": 2.1130434782608694, "grad_norm": 0.32572683691978455, "learning_rate": 2e-05, "loss": 1.4213, "step": 486 }, { "epoch": 2.117391304347826, "grad_norm": 0.32572683691978455, "learning_rate": 2e-05, "loss": 1.4081, "step": 487 }, { "epoch": 2.121739130434783, "grad_norm": 0.33425086736679077, "learning_rate": 2e-05, "loss": 1.3635, "step": 488 }, { "epoch": 2.126086956521739, "grad_norm": 0.33425086736679077, "learning_rate": 2e-05, "loss": 1.4042, "step": 489 }, { "epoch": 2.130434782608696, "grad_norm": 0.33425086736679077, "learning_rate": 2e-05, "loss": 1.3905, "step": 490 }, { "epoch": 2.134782608695652, "grad_norm": 0.33425086736679077, "learning_rate": 2e-05, "loss": 1.4297, "step": 491 }, { "epoch": 2.139130434782609, "grad_norm": 0.3359292149543762, "learning_rate": 2e-05, "loss": 1.4222, "step": 492 }, { "epoch": 2.143478260869565, "grad_norm": 0.3359292149543762, "learning_rate": 2e-05, "loss": 1.4278, "step": 493 }, { "epoch": 2.1478260869565218, "grad_norm": 0.3359292149543762, "learning_rate": 2e-05, "loss": 1.4651, "step": 494 }, { "epoch": 2.1521739130434785, "grad_norm": 0.3359292149543762, "learning_rate": 2e-05, "loss": 1.3972, "step": 495 }, { "epoch": 2.1565217391304348, "grad_norm": 0.33683329820632935, "learning_rate": 2e-05, "loss": 1.4149, "step": 496 }, { "epoch": 2.1608695652173915, "grad_norm": 0.33683329820632935, "learning_rate": 2e-05, "loss": 1.4339, "step": 497 }, { "epoch": 2.1652173913043478, "grad_norm": 0.33683329820632935, "learning_rate": 2e-05, "loss": 1.3726, "step": 498 }, { "epoch": 2.1695652173913045, "grad_norm": 0.33683329820632935, "learning_rate": 2e-05, "loss": 1.4655, "step": 499 }, { "epoch": 2.1739130434782608, "grad_norm": 0.3291366994380951, "learning_rate": 2e-05, "loss": 1.4376, "step": 500 }, { "epoch": 2.1782608695652175, "grad_norm": 0.3291366994380951, "learning_rate": 2e-05, "loss": 1.4483, "step": 501 }, { "epoch": 2.1826086956521737, "grad_norm": 0.3291366994380951, "learning_rate": 2e-05, "loss": 1.4218, "step": 502 }, { "epoch": 2.1869565217391305, "grad_norm": 0.3291366994380951, "learning_rate": 2e-05, "loss": 1.4179, "step": 503 }, { "epoch": 2.1913043478260867, "grad_norm": 0.3259764015674591, "learning_rate": 2e-05, "loss": 1.4048, "step": 504 }, { "epoch": 2.1956521739130435, "grad_norm": 0.3259764015674591, "learning_rate": 2e-05, "loss": 1.4301, "step": 505 }, { "epoch": 2.2, "grad_norm": 0.3259764015674591, "learning_rate": 2e-05, "loss": 1.3742, "step": 506 }, { "epoch": 2.2043478260869565, "grad_norm": 0.3259764015674591, "learning_rate": 2e-05, "loss": 1.4225, "step": 507 }, { "epoch": 2.208695652173913, "grad_norm": 0.30931395292282104, "learning_rate": 2e-05, "loss": 1.438, "step": 508 }, { "epoch": 2.2130434782608694, "grad_norm": 0.30931395292282104, "learning_rate": 2e-05, "loss": 1.4397, "step": 509 }, { "epoch": 2.217391304347826, "grad_norm": 0.30931395292282104, "learning_rate": 2e-05, "loss": 1.4551, "step": 510 }, { "epoch": 2.2217391304347824, "grad_norm": 0.30931395292282104, "learning_rate": 2e-05, "loss": 1.3048, "step": 511 }, { "epoch": 2.226086956521739, "grad_norm": 0.3235810399055481, "learning_rate": 2e-05, "loss": 1.4198, "step": 512 }, { "epoch": 2.230434782608696, "grad_norm": 0.3235810399055481, "learning_rate": 2e-05, "loss": 1.359, "step": 513 }, { "epoch": 2.234782608695652, "grad_norm": 0.3235810399055481, "learning_rate": 2e-05, "loss": 1.3955, "step": 514 }, { "epoch": 2.239130434782609, "grad_norm": 0.3235810399055481, "learning_rate": 2e-05, "loss": 1.3925, "step": 515 }, { "epoch": 2.243478260869565, "grad_norm": 0.33742859959602356, "learning_rate": 2e-05, "loss": 1.5026, "step": 516 }, { "epoch": 2.247826086956522, "grad_norm": 0.33742859959602356, "learning_rate": 2e-05, "loss": 1.423, "step": 517 }, { "epoch": 2.252173913043478, "grad_norm": 0.33742859959602356, "learning_rate": 2e-05, "loss": 1.3906, "step": 518 }, { "epoch": 2.256521739130435, "grad_norm": 0.33742859959602356, "learning_rate": 2e-05, "loss": 1.3976, "step": 519 }, { "epoch": 2.260869565217391, "grad_norm": 0.3243946135044098, "learning_rate": 2e-05, "loss": 1.4636, "step": 520 }, { "epoch": 2.265217391304348, "grad_norm": 0.3243946135044098, "learning_rate": 2e-05, "loss": 1.4101, "step": 521 }, { "epoch": 2.269565217391304, "grad_norm": 0.3243946135044098, "learning_rate": 2e-05, "loss": 1.37, "step": 522 }, { "epoch": 2.273913043478261, "grad_norm": 0.3243946135044098, "learning_rate": 2e-05, "loss": 1.3731, "step": 523 }, { "epoch": 2.2782608695652176, "grad_norm": 0.3233046233654022, "learning_rate": 2e-05, "loss": 1.3996, "step": 524 }, { "epoch": 2.282608695652174, "grad_norm": 0.3233046233654022, "learning_rate": 2e-05, "loss": 1.3471, "step": 525 }, { "epoch": 2.2869565217391306, "grad_norm": 0.3233046233654022, "learning_rate": 2e-05, "loss": 1.4454, "step": 526 }, { "epoch": 2.291304347826087, "grad_norm": 0.3233046233654022, "learning_rate": 2e-05, "loss": 1.3706, "step": 527 }, { "epoch": 2.2956521739130435, "grad_norm": 0.30293363332748413, "learning_rate": 2e-05, "loss": 1.4508, "step": 528 }, { "epoch": 2.3, "grad_norm": 0.30293363332748413, "learning_rate": 2e-05, "loss": 1.4687, "step": 529 }, { "epoch": 2.3043478260869565, "grad_norm": 0.30293363332748413, "learning_rate": 2e-05, "loss": 1.4307, "step": 530 }, { "epoch": 2.3086956521739133, "grad_norm": 0.30293363332748413, "learning_rate": 2e-05, "loss": 1.3598, "step": 531 }, { "epoch": 2.3130434782608695, "grad_norm": 0.30868127942085266, "learning_rate": 2e-05, "loss": 1.4503, "step": 532 }, { "epoch": 2.3173913043478263, "grad_norm": 0.30868127942085266, "learning_rate": 2e-05, "loss": 1.3985, "step": 533 }, { "epoch": 2.3217391304347825, "grad_norm": 0.30868127942085266, "learning_rate": 2e-05, "loss": 1.367, "step": 534 }, { "epoch": 2.3260869565217392, "grad_norm": 0.30868127942085266, "learning_rate": 2e-05, "loss": 1.4399, "step": 535 }, { "epoch": 2.3304347826086955, "grad_norm": 0.31287699937820435, "learning_rate": 2e-05, "loss": 1.3822, "step": 536 }, { "epoch": 2.3347826086956522, "grad_norm": 0.31287699937820435, "learning_rate": 2e-05, "loss": 1.4136, "step": 537 }, { "epoch": 2.3391304347826085, "grad_norm": 0.31287699937820435, "learning_rate": 2e-05, "loss": 1.4002, "step": 538 }, { "epoch": 2.3434782608695652, "grad_norm": 0.31287699937820435, "learning_rate": 2e-05, "loss": 1.4282, "step": 539 }, { "epoch": 2.3478260869565215, "grad_norm": 0.3195163607597351, "learning_rate": 2e-05, "loss": 1.4068, "step": 540 }, { "epoch": 2.3521739130434782, "grad_norm": 0.3195163607597351, "learning_rate": 2e-05, "loss": 1.3813, "step": 541 }, { "epoch": 2.356521739130435, "grad_norm": 0.3195163607597351, "learning_rate": 2e-05, "loss": 1.4368, "step": 542 }, { "epoch": 2.360869565217391, "grad_norm": 0.3195163607597351, "learning_rate": 2e-05, "loss": 1.393, "step": 543 }, { "epoch": 2.365217391304348, "grad_norm": 0.31394919753074646, "learning_rate": 2e-05, "loss": 1.4161, "step": 544 }, { "epoch": 2.369565217391304, "grad_norm": 0.31394919753074646, "learning_rate": 2e-05, "loss": 1.4387, "step": 545 }, { "epoch": 2.373913043478261, "grad_norm": 0.31394919753074646, "learning_rate": 2e-05, "loss": 1.3624, "step": 546 }, { "epoch": 2.378260869565217, "grad_norm": 0.31394919753074646, "learning_rate": 2e-05, "loss": 1.452, "step": 547 }, { "epoch": 2.382608695652174, "grad_norm": 0.3255818784236908, "learning_rate": 2e-05, "loss": 1.4097, "step": 548 }, { "epoch": 2.3869565217391306, "grad_norm": 0.3255818784236908, "learning_rate": 2e-05, "loss": 1.3715, "step": 549 }, { "epoch": 2.391304347826087, "grad_norm": 0.3255818784236908, "learning_rate": 2e-05, "loss": 1.4356, "step": 550 }, { "epoch": 2.3956521739130436, "grad_norm": 0.3255818784236908, "learning_rate": 2e-05, "loss": 1.4727, "step": 551 }, { "epoch": 2.4, "grad_norm": 0.34206336736679077, "learning_rate": 2e-05, "loss": 1.4212, "step": 552 }, { "epoch": 2.4043478260869566, "grad_norm": 0.34206336736679077, "learning_rate": 2e-05, "loss": 1.3371, "step": 553 }, { "epoch": 2.408695652173913, "grad_norm": 0.34206336736679077, "learning_rate": 2e-05, "loss": 1.3715, "step": 554 }, { "epoch": 2.4130434782608696, "grad_norm": 0.34206336736679077, "learning_rate": 2e-05, "loss": 1.407, "step": 555 }, { "epoch": 2.417391304347826, "grad_norm": 0.32757705450057983, "learning_rate": 2e-05, "loss": 1.404, "step": 556 }, { "epoch": 2.4217391304347826, "grad_norm": 0.32757705450057983, "learning_rate": 2e-05, "loss": 1.3585, "step": 557 }, { "epoch": 2.426086956521739, "grad_norm": 0.32757705450057983, "learning_rate": 2e-05, "loss": 1.39, "step": 558 }, { "epoch": 2.4304347826086956, "grad_norm": 0.32757705450057983, "learning_rate": 2e-05, "loss": 1.4128, "step": 559 }, { "epoch": 2.4347826086956523, "grad_norm": 0.33430787920951843, "learning_rate": 2e-05, "loss": 1.4144, "step": 560 }, { "epoch": 2.4391304347826086, "grad_norm": 0.33430787920951843, "learning_rate": 2e-05, "loss": 1.4206, "step": 561 }, { "epoch": 2.4434782608695653, "grad_norm": 0.33430787920951843, "learning_rate": 2e-05, "loss": 1.374, "step": 562 }, { "epoch": 2.4478260869565216, "grad_norm": 0.33430787920951843, "learning_rate": 2e-05, "loss": 1.4283, "step": 563 }, { "epoch": 2.4521739130434783, "grad_norm": 0.3351895213127136, "learning_rate": 2e-05, "loss": 1.3168, "step": 564 }, { "epoch": 2.4565217391304346, "grad_norm": 0.3351895213127136, "learning_rate": 2e-05, "loss": 1.3109, "step": 565 }, { "epoch": 2.4608695652173913, "grad_norm": 0.3351895213127136, "learning_rate": 2e-05, "loss": 1.4633, "step": 566 }, { "epoch": 2.465217391304348, "grad_norm": 0.3351895213127136, "learning_rate": 2e-05, "loss": 1.3754, "step": 567 }, { "epoch": 2.4695652173913043, "grad_norm": 0.31671515107154846, "learning_rate": 2e-05, "loss": 1.4487, "step": 568 }, { "epoch": 2.473913043478261, "grad_norm": 0.31671515107154846, "learning_rate": 2e-05, "loss": 1.4277, "step": 569 }, { "epoch": 2.4782608695652173, "grad_norm": 0.31671515107154846, "learning_rate": 2e-05, "loss": 1.3566, "step": 570 }, { "epoch": 2.482608695652174, "grad_norm": 0.31671515107154846, "learning_rate": 2e-05, "loss": 1.4195, "step": 571 }, { "epoch": 2.4869565217391303, "grad_norm": 0.32078683376312256, "learning_rate": 2e-05, "loss": 1.4236, "step": 572 }, { "epoch": 2.491304347826087, "grad_norm": 0.32078683376312256, "learning_rate": 2e-05, "loss": 1.3728, "step": 573 }, { "epoch": 2.4956521739130437, "grad_norm": 0.32078683376312256, "learning_rate": 2e-05, "loss": 1.367, "step": 574 }, { "epoch": 2.5, "grad_norm": 0.32078683376312256, "learning_rate": 2e-05, "loss": 1.3892, "step": 575 }, { "epoch": 2.5043478260869563, "grad_norm": 0.3213704228401184, "learning_rate": 2e-05, "loss": 1.4073, "step": 576 }, { "epoch": 2.508695652173913, "grad_norm": 0.3213704228401184, "learning_rate": 2e-05, "loss": 1.38, "step": 577 }, { "epoch": 2.5130434782608697, "grad_norm": 0.3213704228401184, "learning_rate": 2e-05, "loss": 1.4, "step": 578 }, { "epoch": 2.517391304347826, "grad_norm": 0.3213704228401184, "learning_rate": 2e-05, "loss": 1.3417, "step": 579 }, { "epoch": 2.5217391304347827, "grad_norm": 0.33939409255981445, "learning_rate": 2e-05, "loss": 1.4088, "step": 580 }, { "epoch": 2.526086956521739, "grad_norm": 0.33939409255981445, "learning_rate": 2e-05, "loss": 1.3889, "step": 581 }, { "epoch": 2.5304347826086957, "grad_norm": 0.33939409255981445, "learning_rate": 2e-05, "loss": 1.3547, "step": 582 }, { "epoch": 2.534782608695652, "grad_norm": 0.33939409255981445, "learning_rate": 2e-05, "loss": 1.3643, "step": 583 }, { "epoch": 2.5391304347826087, "grad_norm": 0.362350732088089, "learning_rate": 2e-05, "loss": 1.3045, "step": 584 }, { "epoch": 2.5434782608695654, "grad_norm": 0.362350732088089, "learning_rate": 2e-05, "loss": 1.4226, "step": 585 }, { "epoch": 2.5478260869565217, "grad_norm": 0.362350732088089, "learning_rate": 2e-05, "loss": 1.3398, "step": 586 }, { "epoch": 2.5521739130434784, "grad_norm": 0.362350732088089, "learning_rate": 2e-05, "loss": 1.434, "step": 587 }, { "epoch": 2.5565217391304347, "grad_norm": 0.34566718339920044, "learning_rate": 2e-05, "loss": 1.3933, "step": 588 }, { "epoch": 2.5608695652173914, "grad_norm": 0.34566718339920044, "learning_rate": 2e-05, "loss": 1.3881, "step": 589 }, { "epoch": 2.5652173913043477, "grad_norm": 0.34566718339920044, "learning_rate": 2e-05, "loss": 1.3688, "step": 590 }, { "epoch": 2.5695652173913044, "grad_norm": 0.34566718339920044, "learning_rate": 2e-05, "loss": 1.4052, "step": 591 }, { "epoch": 2.573913043478261, "grad_norm": 0.3323229253292084, "learning_rate": 2e-05, "loss": 1.4094, "step": 592 }, { "epoch": 2.5782608695652174, "grad_norm": 0.3323229253292084, "learning_rate": 2e-05, "loss": 1.3552, "step": 593 }, { "epoch": 2.5826086956521737, "grad_norm": 0.3323229253292084, "learning_rate": 2e-05, "loss": 1.4367, "step": 594 }, { "epoch": 2.5869565217391304, "grad_norm": 0.3323229253292084, "learning_rate": 2e-05, "loss": 1.3934, "step": 595 }, { "epoch": 2.591304347826087, "grad_norm": 0.32952815294265747, "learning_rate": 2e-05, "loss": 1.3602, "step": 596 }, { "epoch": 2.5956521739130434, "grad_norm": 0.32952815294265747, "learning_rate": 2e-05, "loss": 1.3772, "step": 597 }, { "epoch": 2.6, "grad_norm": 0.32952815294265747, "learning_rate": 2e-05, "loss": 1.4432, "step": 598 }, { "epoch": 2.6043478260869564, "grad_norm": 0.32952815294265747, "learning_rate": 2e-05, "loss": 1.4353, "step": 599 }, { "epoch": 2.608695652173913, "grad_norm": 0.3243506848812103, "learning_rate": 2e-05, "loss": 1.3708, "step": 600 }, { "epoch": 2.6130434782608694, "grad_norm": 0.3243506848812103, "learning_rate": 2e-05, "loss": 1.3407, "step": 601 }, { "epoch": 2.617391304347826, "grad_norm": 0.3243506848812103, "learning_rate": 2e-05, "loss": 1.3312, "step": 602 }, { "epoch": 2.621739130434783, "grad_norm": 0.3243506848812103, "learning_rate": 2e-05, "loss": 1.3443, "step": 603 }, { "epoch": 2.626086956521739, "grad_norm": 0.31402692198753357, "learning_rate": 2e-05, "loss": 1.3297, "step": 604 }, { "epoch": 2.630434782608696, "grad_norm": 0.31402692198753357, "learning_rate": 2e-05, "loss": 1.3604, "step": 605 }, { "epoch": 2.634782608695652, "grad_norm": 0.31402692198753357, "learning_rate": 2e-05, "loss": 1.3888, "step": 606 }, { "epoch": 2.639130434782609, "grad_norm": 0.31402692198753357, "learning_rate": 2e-05, "loss": 1.3901, "step": 607 }, { "epoch": 2.643478260869565, "grad_norm": 0.3442557156085968, "learning_rate": 2e-05, "loss": 1.3481, "step": 608 }, { "epoch": 2.6478260869565218, "grad_norm": 0.3442557156085968, "learning_rate": 2e-05, "loss": 1.3451, "step": 609 }, { "epoch": 2.6521739130434785, "grad_norm": 0.3442557156085968, "learning_rate": 2e-05, "loss": 1.3634, "step": 610 }, { "epoch": 2.6565217391304348, "grad_norm": 0.3442557156085968, "learning_rate": 2e-05, "loss": 1.322, "step": 611 }, { "epoch": 2.660869565217391, "grad_norm": 0.33002713322639465, "learning_rate": 2e-05, "loss": 1.3747, "step": 612 }, { "epoch": 2.6652173913043478, "grad_norm": 0.33002713322639465, "learning_rate": 2e-05, "loss": 1.4328, "step": 613 }, { "epoch": 2.6695652173913045, "grad_norm": 0.33002713322639465, "learning_rate": 2e-05, "loss": 1.3775, "step": 614 }, { "epoch": 2.6739130434782608, "grad_norm": 0.33002713322639465, "learning_rate": 2e-05, "loss": 1.3485, "step": 615 }, { "epoch": 2.6782608695652175, "grad_norm": 0.32511940598487854, "learning_rate": 2e-05, "loss": 1.4626, "step": 616 }, { "epoch": 2.6826086956521737, "grad_norm": 0.32511940598487854, "learning_rate": 2e-05, "loss": 1.3966, "step": 617 }, { "epoch": 2.6869565217391305, "grad_norm": 0.32511940598487854, "learning_rate": 2e-05, "loss": 1.4073, "step": 618 }, { "epoch": 2.6913043478260867, "grad_norm": 0.32511940598487854, "learning_rate": 2e-05, "loss": 1.312, "step": 619 }, { "epoch": 2.6956521739130435, "grad_norm": 0.3344216048717499, "learning_rate": 2e-05, "loss": 1.4085, "step": 620 }, { "epoch": 2.7, "grad_norm": 0.3344216048717499, "learning_rate": 2e-05, "loss": 1.4103, "step": 621 }, { "epoch": 2.7043478260869565, "grad_norm": 0.3344216048717499, "learning_rate": 2e-05, "loss": 1.3982, "step": 622 }, { "epoch": 2.708695652173913, "grad_norm": 0.3344216048717499, "learning_rate": 2e-05, "loss": 1.4319, "step": 623 }, { "epoch": 2.7130434782608694, "grad_norm": 0.33260592818260193, "learning_rate": 2e-05, "loss": 1.407, "step": 624 }, { "epoch": 2.717391304347826, "grad_norm": 0.33260592818260193, "learning_rate": 2e-05, "loss": 1.3397, "step": 625 }, { "epoch": 2.7217391304347824, "grad_norm": 0.33260592818260193, "learning_rate": 2e-05, "loss": 1.3808, "step": 626 }, { "epoch": 2.726086956521739, "grad_norm": 0.33260592818260193, "learning_rate": 2e-05, "loss": 1.4175, "step": 627 }, { "epoch": 2.730434782608696, "grad_norm": 0.33184948563575745, "learning_rate": 2e-05, "loss": 1.3604, "step": 628 }, { "epoch": 2.734782608695652, "grad_norm": 0.33184948563575745, "learning_rate": 2e-05, "loss": 1.4572, "step": 629 }, { "epoch": 2.7391304347826084, "grad_norm": 0.33184948563575745, "learning_rate": 2e-05, "loss": 1.3895, "step": 630 }, { "epoch": 2.743478260869565, "grad_norm": 0.33184948563575745, "learning_rate": 2e-05, "loss": 1.4055, "step": 631 }, { "epoch": 2.747826086956522, "grad_norm": 0.32070547342300415, "learning_rate": 2e-05, "loss": 1.3849, "step": 632 }, { "epoch": 2.752173913043478, "grad_norm": 0.32070547342300415, "learning_rate": 2e-05, "loss": 1.3835, "step": 633 }, { "epoch": 2.756521739130435, "grad_norm": 0.32070547342300415, "learning_rate": 2e-05, "loss": 1.4008, "step": 634 }, { "epoch": 2.7608695652173916, "grad_norm": 0.32070547342300415, "learning_rate": 2e-05, "loss": 1.3944, "step": 635 }, { "epoch": 2.765217391304348, "grad_norm": 0.32883745431900024, "learning_rate": 2e-05, "loss": 1.3374, "step": 636 }, { "epoch": 2.769565217391304, "grad_norm": 0.32883745431900024, "learning_rate": 2e-05, "loss": 1.3797, "step": 637 }, { "epoch": 2.773913043478261, "grad_norm": 0.32883745431900024, "learning_rate": 2e-05, "loss": 1.3135, "step": 638 }, { "epoch": 2.7782608695652176, "grad_norm": 0.32883745431900024, "learning_rate": 2e-05, "loss": 1.3744, "step": 639 }, { "epoch": 2.782608695652174, "grad_norm": 0.3163747489452362, "learning_rate": 2e-05, "loss": 1.3676, "step": 640 }, { "epoch": 2.7869565217391306, "grad_norm": 0.3163747489452362, "learning_rate": 2e-05, "loss": 1.392, "step": 641 }, { "epoch": 2.791304347826087, "grad_norm": 0.3163747489452362, "learning_rate": 2e-05, "loss": 1.3777, "step": 642 }, { "epoch": 2.7956521739130435, "grad_norm": 0.3163747489452362, "learning_rate": 2e-05, "loss": 1.4129, "step": 643 }, { "epoch": 2.8, "grad_norm": 0.33478814363479614, "learning_rate": 2e-05, "loss": 1.3628, "step": 644 }, { "epoch": 2.8043478260869565, "grad_norm": 0.33478814363479614, "learning_rate": 2e-05, "loss": 1.3612, "step": 645 }, { "epoch": 2.8086956521739133, "grad_norm": 0.33478814363479614, "learning_rate": 2e-05, "loss": 1.4229, "step": 646 }, { "epoch": 2.8130434782608695, "grad_norm": 0.33478814363479614, "learning_rate": 2e-05, "loss": 1.355, "step": 647 }, { "epoch": 2.8173913043478263, "grad_norm": 0.3139325678348541, "learning_rate": 2e-05, "loss": 1.3561, "step": 648 }, { "epoch": 2.8217391304347825, "grad_norm": 0.3139325678348541, "learning_rate": 2e-05, "loss": 1.4152, "step": 649 }, { "epoch": 2.8260869565217392, "grad_norm": 0.3139325678348541, "learning_rate": 2e-05, "loss": 1.423, "step": 650 }, { "epoch": 2.8304347826086955, "grad_norm": 0.3139325678348541, "learning_rate": 2e-05, "loss": 1.3525, "step": 651 }, { "epoch": 2.8347826086956522, "grad_norm": 0.3335968554019928, "learning_rate": 2e-05, "loss": 1.3864, "step": 652 }, { "epoch": 2.839130434782609, "grad_norm": 0.3335968554019928, "learning_rate": 2e-05, "loss": 1.4314, "step": 653 }, { "epoch": 2.8434782608695652, "grad_norm": 0.3335968554019928, "learning_rate": 2e-05, "loss": 1.4349, "step": 654 }, { "epoch": 2.8478260869565215, "grad_norm": 0.3335968554019928, "learning_rate": 2e-05, "loss": 1.3767, "step": 655 }, { "epoch": 2.8521739130434782, "grad_norm": 0.3227713406085968, "learning_rate": 2e-05, "loss": 1.3651, "step": 656 }, { "epoch": 2.856521739130435, "grad_norm": 0.3227713406085968, "learning_rate": 2e-05, "loss": 1.3297, "step": 657 }, { "epoch": 2.860869565217391, "grad_norm": 0.3227713406085968, "learning_rate": 2e-05, "loss": 1.4148, "step": 658 }, { "epoch": 2.865217391304348, "grad_norm": 0.3227713406085968, "learning_rate": 2e-05, "loss": 1.4048, "step": 659 }, { "epoch": 2.869565217391304, "grad_norm": 0.3265250325202942, "learning_rate": 2e-05, "loss": 1.3643, "step": 660 }, { "epoch": 2.873913043478261, "grad_norm": 0.3265250325202942, "learning_rate": 2e-05, "loss": 1.4123, "step": 661 }, { "epoch": 2.878260869565217, "grad_norm": 0.3265250325202942, "learning_rate": 2e-05, "loss": 1.4146, "step": 662 }, { "epoch": 2.882608695652174, "grad_norm": 0.3265250325202942, "learning_rate": 2e-05, "loss": 1.4065, "step": 663 }, { "epoch": 2.8869565217391306, "grad_norm": 0.30972209572792053, "learning_rate": 2e-05, "loss": 1.3875, "step": 664 }, { "epoch": 2.891304347826087, "grad_norm": 0.30972209572792053, "learning_rate": 2e-05, "loss": 1.4828, "step": 665 }, { "epoch": 2.8956521739130436, "grad_norm": 0.30972209572792053, "learning_rate": 2e-05, "loss": 1.4318, "step": 666 }, { "epoch": 2.9, "grad_norm": 0.30972209572792053, "learning_rate": 2e-05, "loss": 1.4493, "step": 667 }, { "epoch": 2.9043478260869566, "grad_norm": 0.31576821208000183, "learning_rate": 2e-05, "loss": 1.3041, "step": 668 }, { "epoch": 2.908695652173913, "grad_norm": 0.31576821208000183, "learning_rate": 2e-05, "loss": 1.4027, "step": 669 }, { "epoch": 2.9130434782608696, "grad_norm": 0.31576821208000183, "learning_rate": 2e-05, "loss": 1.4331, "step": 670 }, { "epoch": 2.9173913043478263, "grad_norm": 0.31576821208000183, "learning_rate": 2e-05, "loss": 1.3375, "step": 671 }, { "epoch": 2.9217391304347826, "grad_norm": 0.3335813581943512, "learning_rate": 2e-05, "loss": 1.3935, "step": 672 }, { "epoch": 2.926086956521739, "grad_norm": 0.3335813581943512, "learning_rate": 2e-05, "loss": 1.3927, "step": 673 }, { "epoch": 2.9304347826086956, "grad_norm": 0.3335813581943512, "learning_rate": 2e-05, "loss": 1.3892, "step": 674 }, { "epoch": 2.9347826086956523, "grad_norm": 0.3335813581943512, "learning_rate": 2e-05, "loss": 1.3488, "step": 675 }, { "epoch": 2.9391304347826086, "grad_norm": 0.3342477083206177, "learning_rate": 2e-05, "loss": 1.3527, "step": 676 }, { "epoch": 2.9434782608695653, "grad_norm": 0.3342477083206177, "learning_rate": 2e-05, "loss": 1.4115, "step": 677 }, { "epoch": 2.9478260869565216, "grad_norm": 0.3342477083206177, "learning_rate": 2e-05, "loss": 1.3586, "step": 678 }, { "epoch": 2.9521739130434783, "grad_norm": 0.3342477083206177, "learning_rate": 2e-05, "loss": 1.4366, "step": 679 }, { "epoch": 2.9565217391304346, "grad_norm": 0.32498446106910706, "learning_rate": 2e-05, "loss": 1.3317, "step": 680 }, { "epoch": 2.9608695652173913, "grad_norm": 0.32498446106910706, "learning_rate": 2e-05, "loss": 1.4396, "step": 681 }, { "epoch": 2.965217391304348, "grad_norm": 0.32498446106910706, "learning_rate": 2e-05, "loss": 1.3385, "step": 682 }, { "epoch": 2.9695652173913043, "grad_norm": 0.32498446106910706, "learning_rate": 2e-05, "loss": 1.3554, "step": 683 }, { "epoch": 2.973913043478261, "grad_norm": 0.3209493160247803, "learning_rate": 2e-05, "loss": 1.3452, "step": 684 }, { "epoch": 2.9782608695652173, "grad_norm": 0.3209493160247803, "learning_rate": 2e-05, "loss": 1.3912, "step": 685 }, { "epoch": 2.982608695652174, "grad_norm": 0.3209493160247803, "learning_rate": 2e-05, "loss": 1.4286, "step": 686 }, { "epoch": 2.9869565217391303, "grad_norm": 0.3209493160247803, "learning_rate": 2e-05, "loss": 1.3302, "step": 687 }, { "epoch": 2.991304347826087, "grad_norm": 0.3104861378669739, "learning_rate": 2e-05, "loss": 1.3402, "step": 688 }, { "epoch": 2.9956521739130437, "grad_norm": 0.3104861378669739, "learning_rate": 2e-05, "loss": 1.328, "step": 689 }, { "epoch": 3.0, "grad_norm": 0.3104861378669739, "learning_rate": 2e-05, "loss": 1.3663, "step": 690 }, { "epoch": 3.0, "step": 690, "total_flos": 3.864388766479155e+17, "train_loss": 1.544883234086244, "train_runtime": 425.3662, "train_samples_per_second": 363.082, "train_steps_per_second": 1.622 } ], "logging_steps": 1.0, "max_steps": 690, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.864388766479155e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }