{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 2975, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016806722689075631, "grad_norm": 5.185371097391246, "learning_rate": 2.684563758389262e-07, "loss": 0.8806, "step": 1 }, { "epoch": 0.0033613445378151263, "grad_norm": 5.279033372025492, "learning_rate": 5.369127516778524e-07, "loss": 0.8836, "step": 2 }, { "epoch": 0.005042016806722689, "grad_norm": 5.275406345987167, "learning_rate": 8.053691275167786e-07, "loss": 0.8711, "step": 3 }, { "epoch": 0.0067226890756302525, "grad_norm": 5.2494757133342675, "learning_rate": 1.0738255033557048e-06, "loss": 0.8914, "step": 4 }, { "epoch": 0.008403361344537815, "grad_norm": 5.147570277304961, "learning_rate": 1.342281879194631e-06, "loss": 0.8761, "step": 5 }, { "epoch": 0.010084033613445379, "grad_norm": 4.807435520081194, "learning_rate": 1.6107382550335572e-06, "loss": 0.855, "step": 6 }, { "epoch": 0.011764705882352941, "grad_norm": 4.032522035965656, "learning_rate": 1.8791946308724835e-06, "loss": 0.8288, "step": 7 }, { "epoch": 0.013445378151260505, "grad_norm": 3.962601864776931, "learning_rate": 2.1476510067114096e-06, "loss": 0.8347, "step": 8 }, { "epoch": 0.015126050420168067, "grad_norm": 3.658596254234891, "learning_rate": 2.416107382550336e-06, "loss": 0.8251, "step": 9 }, { "epoch": 0.01680672268907563, "grad_norm": 2.1254677656735548, "learning_rate": 2.684563758389262e-06, "loss": 0.7736, "step": 10 }, { "epoch": 0.018487394957983194, "grad_norm": 1.9329109042951849, "learning_rate": 2.9530201342281885e-06, "loss": 0.782, "step": 11 }, { "epoch": 0.020168067226890758, "grad_norm": 1.8730567907233713, "learning_rate": 3.2214765100671143e-06, "loss": 0.7709, "step": 12 }, { "epoch": 0.021848739495798318, "grad_norm": 3.4066157637016286, "learning_rate": 3.4899328859060407e-06, "loss": 0.7629, "step": 13 }, { "epoch": 0.023529411764705882, "grad_norm": 3.600071670399184, "learning_rate": 3.758389261744967e-06, "loss": 0.7708, "step": 14 }, { "epoch": 0.025210084033613446, "grad_norm": 3.4866454754092273, "learning_rate": 4.026845637583892e-06, "loss": 0.7601, "step": 15 }, { "epoch": 0.02689075630252101, "grad_norm": 3.4947327504170036, "learning_rate": 4.295302013422819e-06, "loss": 0.7512, "step": 16 }, { "epoch": 0.02857142857142857, "grad_norm": 2.67281196136215, "learning_rate": 4.563758389261745e-06, "loss": 0.7161, "step": 17 }, { "epoch": 0.030252100840336135, "grad_norm": 2.4415312335622117, "learning_rate": 4.832214765100672e-06, "loss": 0.7002, "step": 18 }, { "epoch": 0.031932773109243695, "grad_norm": 2.009167982735997, "learning_rate": 5.1006711409395985e-06, "loss": 0.6971, "step": 19 }, { "epoch": 0.03361344537815126, "grad_norm": 1.4675142878309353, "learning_rate": 5.369127516778524e-06, "loss": 0.6763, "step": 20 }, { "epoch": 0.03529411764705882, "grad_norm": 1.1884168829087065, "learning_rate": 5.637583892617449e-06, "loss": 0.6614, "step": 21 }, { "epoch": 0.03697478991596639, "grad_norm": 1.3302996921544596, "learning_rate": 5.906040268456377e-06, "loss": 0.6667, "step": 22 }, { "epoch": 0.03865546218487395, "grad_norm": 1.4708639516257762, "learning_rate": 6.174496644295303e-06, "loss": 0.6474, "step": 23 }, { "epoch": 0.040336134453781515, "grad_norm": 1.4687331310286122, "learning_rate": 6.442953020134229e-06, "loss": 0.6423, "step": 24 }, { "epoch": 0.04201680672268908, "grad_norm": 1.1528821080356604, "learning_rate": 6.711409395973155e-06, "loss": 0.6193, "step": 25 }, { "epoch": 0.043697478991596636, "grad_norm": 0.892817848056295, "learning_rate": 6.979865771812081e-06, "loss": 0.6202, "step": 26 }, { "epoch": 0.0453781512605042, "grad_norm": 0.9260243441534799, "learning_rate": 7.248322147651007e-06, "loss": 0.6213, "step": 27 }, { "epoch": 0.047058823529411764, "grad_norm": 1.0201171210124738, "learning_rate": 7.516778523489934e-06, "loss": 0.6211, "step": 28 }, { "epoch": 0.04873949579831933, "grad_norm": 0.7580538383502603, "learning_rate": 7.785234899328859e-06, "loss": 0.6092, "step": 29 }, { "epoch": 0.05042016806722689, "grad_norm": 0.5911210736882891, "learning_rate": 8.053691275167785e-06, "loss": 0.6099, "step": 30 }, { "epoch": 0.052100840336134456, "grad_norm": 0.758397093823697, "learning_rate": 8.322147651006712e-06, "loss": 0.6059, "step": 31 }, { "epoch": 0.05378151260504202, "grad_norm": 0.7254223309853771, "learning_rate": 8.590604026845638e-06, "loss": 0.6078, "step": 32 }, { "epoch": 0.05546218487394958, "grad_norm": 0.5351729468711538, "learning_rate": 8.859060402684566e-06, "loss": 0.595, "step": 33 }, { "epoch": 0.05714285714285714, "grad_norm": 0.470267181454843, "learning_rate": 9.12751677852349e-06, "loss": 0.5851, "step": 34 }, { "epoch": 0.058823529411764705, "grad_norm": 0.6646339006447611, "learning_rate": 9.395973154362416e-06, "loss": 0.5888, "step": 35 }, { "epoch": 0.06050420168067227, "grad_norm": 0.582226379503529, "learning_rate": 9.664429530201343e-06, "loss": 0.5893, "step": 36 }, { "epoch": 0.06218487394957983, "grad_norm": 0.38986396764010534, "learning_rate": 9.93288590604027e-06, "loss": 0.576, "step": 37 }, { "epoch": 0.06386554621848739, "grad_norm": 0.4296746309254322, "learning_rate": 1.0201342281879197e-05, "loss": 0.5759, "step": 38 }, { "epoch": 0.06554621848739496, "grad_norm": 0.5497352043233644, "learning_rate": 1.0469798657718123e-05, "loss": 0.5801, "step": 39 }, { "epoch": 0.06722689075630252, "grad_norm": 0.42043388986799257, "learning_rate": 1.0738255033557049e-05, "loss": 0.5752, "step": 40 }, { "epoch": 0.06890756302521009, "grad_norm": 0.4019282578338675, "learning_rate": 1.1006711409395975e-05, "loss": 0.5707, "step": 41 }, { "epoch": 0.07058823529411765, "grad_norm": 0.5104832169261361, "learning_rate": 1.1275167785234899e-05, "loss": 0.5725, "step": 42 }, { "epoch": 0.07226890756302522, "grad_norm": 0.39707985109421395, "learning_rate": 1.1543624161073828e-05, "loss": 0.5816, "step": 43 }, { "epoch": 0.07394957983193277, "grad_norm": 0.43144539512859104, "learning_rate": 1.1812080536912754e-05, "loss": 0.5721, "step": 44 }, { "epoch": 0.07563025210084033, "grad_norm": 0.4400862418608138, "learning_rate": 1.208053691275168e-05, "loss": 0.5704, "step": 45 }, { "epoch": 0.0773109243697479, "grad_norm": 0.35824566512265993, "learning_rate": 1.2348993288590606e-05, "loss": 0.5597, "step": 46 }, { "epoch": 0.07899159663865546, "grad_norm": 0.39072503927107716, "learning_rate": 1.2617449664429532e-05, "loss": 0.5578, "step": 47 }, { "epoch": 0.08067226890756303, "grad_norm": 0.4236330830962998, "learning_rate": 1.2885906040268457e-05, "loss": 0.56, "step": 48 }, { "epoch": 0.08235294117647059, "grad_norm": 0.30918244945793655, "learning_rate": 1.3154362416107385e-05, "loss": 0.5512, "step": 49 }, { "epoch": 0.08403361344537816, "grad_norm": 0.39310431618519365, "learning_rate": 1.342281879194631e-05, "loss": 0.5627, "step": 50 }, { "epoch": 0.08571428571428572, "grad_norm": 0.32904897215114265, "learning_rate": 1.3691275167785237e-05, "loss": 0.5459, "step": 51 }, { "epoch": 0.08739495798319327, "grad_norm": 0.3122229027296676, "learning_rate": 1.3959731543624163e-05, "loss": 0.5392, "step": 52 }, { "epoch": 0.08907563025210084, "grad_norm": 0.31013611786686685, "learning_rate": 1.4228187919463088e-05, "loss": 0.544, "step": 53 }, { "epoch": 0.0907563025210084, "grad_norm": 0.27372913691874146, "learning_rate": 1.4496644295302014e-05, "loss": 0.538, "step": 54 }, { "epoch": 0.09243697478991597, "grad_norm": 0.3062183378801753, "learning_rate": 1.4765100671140942e-05, "loss": 0.5408, "step": 55 }, { "epoch": 0.09411764705882353, "grad_norm": 0.2647941669562589, "learning_rate": 1.5033557046979868e-05, "loss": 0.5435, "step": 56 }, { "epoch": 0.0957983193277311, "grad_norm": 0.2401009075476614, "learning_rate": 1.5302013422818792e-05, "loss": 0.5263, "step": 57 }, { "epoch": 0.09747899159663866, "grad_norm": 0.30155492829574987, "learning_rate": 1.5570469798657718e-05, "loss": 0.5287, "step": 58 }, { "epoch": 0.09915966386554621, "grad_norm": 0.24513139050453941, "learning_rate": 1.5838926174496644e-05, "loss": 0.5331, "step": 59 }, { "epoch": 0.10084033613445378, "grad_norm": 0.2914088266455597, "learning_rate": 1.610738255033557e-05, "loss": 0.5335, "step": 60 }, { "epoch": 0.10252100840336134, "grad_norm": 0.3113632504209663, "learning_rate": 1.63758389261745e-05, "loss": 0.5362, "step": 61 }, { "epoch": 0.10420168067226891, "grad_norm": 0.2743094241885386, "learning_rate": 1.6644295302013425e-05, "loss": 0.532, "step": 62 }, { "epoch": 0.10588235294117647, "grad_norm": 0.3220431352503825, "learning_rate": 1.691275167785235e-05, "loss": 0.5358, "step": 63 }, { "epoch": 0.10756302521008404, "grad_norm": 0.4384425797228442, "learning_rate": 1.7181208053691277e-05, "loss": 0.5222, "step": 64 }, { "epoch": 0.1092436974789916, "grad_norm": 0.3621618834820739, "learning_rate": 1.7449664429530202e-05, "loss": 0.5274, "step": 65 }, { "epoch": 0.11092436974789915, "grad_norm": 0.47996576342712854, "learning_rate": 1.771812080536913e-05, "loss": 0.5211, "step": 66 }, { "epoch": 0.11260504201680673, "grad_norm": 0.6172957205339873, "learning_rate": 1.7986577181208054e-05, "loss": 0.5269, "step": 67 }, { "epoch": 0.11428571428571428, "grad_norm": 0.6013617585000967, "learning_rate": 1.825503355704698e-05, "loss": 0.5182, "step": 68 }, { "epoch": 0.11596638655462185, "grad_norm": 0.452754915911596, "learning_rate": 1.8523489932885906e-05, "loss": 0.5163, "step": 69 }, { "epoch": 0.11764705882352941, "grad_norm": 0.4041245982738884, "learning_rate": 1.8791946308724832e-05, "loss": 0.5338, "step": 70 }, { "epoch": 0.11932773109243698, "grad_norm": 0.4221354977599781, "learning_rate": 1.9060402684563758e-05, "loss": 0.5309, "step": 71 }, { "epoch": 0.12100840336134454, "grad_norm": 0.44683157806831736, "learning_rate": 1.9328859060402687e-05, "loss": 0.5207, "step": 72 }, { "epoch": 0.1226890756302521, "grad_norm": 0.4742279523900657, "learning_rate": 1.9597315436241613e-05, "loss": 0.529, "step": 73 }, { "epoch": 0.12436974789915967, "grad_norm": 0.50562581842773, "learning_rate": 1.986577181208054e-05, "loss": 0.518, "step": 74 }, { "epoch": 0.12605042016806722, "grad_norm": 0.5202199933122749, "learning_rate": 2.0134228187919468e-05, "loss": 0.5245, "step": 75 }, { "epoch": 0.12773109243697478, "grad_norm": 0.5171357689706414, "learning_rate": 2.0402684563758394e-05, "loss": 0.5226, "step": 76 }, { "epoch": 0.12941176470588237, "grad_norm": 0.46655463450615436, "learning_rate": 2.067114093959732e-05, "loss": 0.5114, "step": 77 }, { "epoch": 0.13109243697478992, "grad_norm": 0.4327668822353372, "learning_rate": 2.0939597315436246e-05, "loss": 0.5184, "step": 78 }, { "epoch": 0.13277310924369748, "grad_norm": 0.5682287100428585, "learning_rate": 2.120805369127517e-05, "loss": 0.5201, "step": 79 }, { "epoch": 0.13445378151260504, "grad_norm": 0.9727134034892411, "learning_rate": 2.1476510067114097e-05, "loss": 0.511, "step": 80 }, { "epoch": 0.1361344537815126, "grad_norm": 1.4801286695906204, "learning_rate": 2.1744966442953023e-05, "loss": 0.5247, "step": 81 }, { "epoch": 0.13781512605042018, "grad_norm": 0.523137779946603, "learning_rate": 2.201342281879195e-05, "loss": 0.5154, "step": 82 }, { "epoch": 0.13949579831932774, "grad_norm": 1.2976040471329593, "learning_rate": 2.228187919463087e-05, "loss": 0.5127, "step": 83 }, { "epoch": 0.1411764705882353, "grad_norm": 0.8803075035053305, "learning_rate": 2.2550335570469797e-05, "loss": 0.5158, "step": 84 }, { "epoch": 0.14285714285714285, "grad_norm": 0.8604964330012713, "learning_rate": 2.2818791946308723e-05, "loss": 0.5083, "step": 85 }, { "epoch": 0.14453781512605043, "grad_norm": 1.2994850817089831, "learning_rate": 2.3087248322147656e-05, "loss": 0.5166, "step": 86 }, { "epoch": 0.146218487394958, "grad_norm": 0.5485685225802135, "learning_rate": 2.3355704697986582e-05, "loss": 0.512, "step": 87 }, { "epoch": 0.14789915966386555, "grad_norm": 1.0976417955602105, "learning_rate": 2.3624161073825508e-05, "loss": 0.5173, "step": 88 }, { "epoch": 0.1495798319327731, "grad_norm": 0.8712112232565923, "learning_rate": 2.3892617449664434e-05, "loss": 0.5026, "step": 89 }, { "epoch": 0.15126050420168066, "grad_norm": 0.7712656072863795, "learning_rate": 2.416107382550336e-05, "loss": 0.5153, "step": 90 }, { "epoch": 0.15294117647058825, "grad_norm": 0.8767108436369941, "learning_rate": 2.4429530201342285e-05, "loss": 0.5166, "step": 91 }, { "epoch": 0.1546218487394958, "grad_norm": 0.7297601566528265, "learning_rate": 2.469798657718121e-05, "loss": 0.5037, "step": 92 }, { "epoch": 0.15630252100840336, "grad_norm": 0.8154941514401665, "learning_rate": 2.4966442953020137e-05, "loss": 0.5085, "step": 93 }, { "epoch": 0.15798319327731092, "grad_norm": 0.8439586035195423, "learning_rate": 2.5234899328859063e-05, "loss": 0.5049, "step": 94 }, { "epoch": 0.15966386554621848, "grad_norm": 0.8715920540591497, "learning_rate": 2.550335570469799e-05, "loss": 0.5002, "step": 95 }, { "epoch": 0.16134453781512606, "grad_norm": 0.7635327767940013, "learning_rate": 2.5771812080536915e-05, "loss": 0.5177, "step": 96 }, { "epoch": 0.16302521008403362, "grad_norm": 0.8444451034266721, "learning_rate": 2.604026845637584e-05, "loss": 0.5023, "step": 97 }, { "epoch": 0.16470588235294117, "grad_norm": 0.8568114399710639, "learning_rate": 2.630872483221477e-05, "loss": 0.4991, "step": 98 }, { "epoch": 0.16638655462184873, "grad_norm": 0.7363639867148235, "learning_rate": 2.6577181208053696e-05, "loss": 0.5056, "step": 99 }, { "epoch": 0.16806722689075632, "grad_norm": 1.0220278662553064, "learning_rate": 2.684563758389262e-05, "loss": 0.4997, "step": 100 }, { "epoch": 0.16974789915966387, "grad_norm": 1.0014904232229733, "learning_rate": 2.7114093959731548e-05, "loss": 0.5017, "step": 101 }, { "epoch": 0.17142857142857143, "grad_norm": 1.16018967312553, "learning_rate": 2.7382550335570473e-05, "loss": 0.5053, "step": 102 }, { "epoch": 0.173109243697479, "grad_norm": 0.8455576579341144, "learning_rate": 2.76510067114094e-05, "loss": 0.5099, "step": 103 }, { "epoch": 0.17478991596638654, "grad_norm": 1.2076649874881717, "learning_rate": 2.7919463087248325e-05, "loss": 0.5116, "step": 104 }, { "epoch": 0.17647058823529413, "grad_norm": 0.9089724749958054, "learning_rate": 2.818791946308725e-05, "loss": 0.4999, "step": 105 }, { "epoch": 0.1781512605042017, "grad_norm": 1.053150015352144, "learning_rate": 2.8456375838926177e-05, "loss": 0.5121, "step": 106 }, { "epoch": 0.17983193277310924, "grad_norm": 0.8478778287622188, "learning_rate": 2.8724832214765103e-05, "loss": 0.5071, "step": 107 }, { "epoch": 0.1815126050420168, "grad_norm": 1.0242582040528652, "learning_rate": 2.899328859060403e-05, "loss": 0.4998, "step": 108 }, { "epoch": 0.18319327731092436, "grad_norm": 1.116197685608516, "learning_rate": 2.9261744966442958e-05, "loss": 0.5096, "step": 109 }, { "epoch": 0.18487394957983194, "grad_norm": 0.9306373262733914, "learning_rate": 2.9530201342281884e-05, "loss": 0.4994, "step": 110 }, { "epoch": 0.1865546218487395, "grad_norm": 1.1110710517186093, "learning_rate": 2.979865771812081e-05, "loss": 0.5072, "step": 111 }, { "epoch": 0.18823529411764706, "grad_norm": 0.695599290906805, "learning_rate": 3.0067114093959736e-05, "loss": 0.5005, "step": 112 }, { "epoch": 0.1899159663865546, "grad_norm": 0.9563394967004832, "learning_rate": 3.033557046979866e-05, "loss": 0.5049, "step": 113 }, { "epoch": 0.1915966386554622, "grad_norm": 0.9191773851212317, "learning_rate": 3.0604026845637584e-05, "loss": 0.4972, "step": 114 }, { "epoch": 0.19327731092436976, "grad_norm": 1.0426284408534972, "learning_rate": 3.087248322147651e-05, "loss": 0.5015, "step": 115 }, { "epoch": 0.1949579831932773, "grad_norm": 1.027192770025465, "learning_rate": 3.1140939597315436e-05, "loss": 0.4964, "step": 116 }, { "epoch": 0.19663865546218487, "grad_norm": 0.9997824245426175, "learning_rate": 3.140939597315436e-05, "loss": 0.4941, "step": 117 }, { "epoch": 0.19831932773109243, "grad_norm": 1.2456828324305413, "learning_rate": 3.167785234899329e-05, "loss": 0.5089, "step": 118 }, { "epoch": 0.2, "grad_norm": 0.7436237359599585, "learning_rate": 3.194630872483221e-05, "loss": 0.4948, "step": 119 }, { "epoch": 0.20168067226890757, "grad_norm": 1.1159711356945343, "learning_rate": 3.221476510067114e-05, "loss": 0.4925, "step": 120 }, { "epoch": 0.20336134453781513, "grad_norm": 0.5113095941756574, "learning_rate": 3.248322147651007e-05, "loss": 0.4966, "step": 121 }, { "epoch": 0.20504201680672268, "grad_norm": 0.9348041858964933, "learning_rate": 3.2751677852349e-05, "loss": 0.4972, "step": 122 }, { "epoch": 0.20672268907563024, "grad_norm": 0.948620972819323, "learning_rate": 3.3020134228187924e-05, "loss": 0.4993, "step": 123 }, { "epoch": 0.20840336134453782, "grad_norm": 1.0820531386745118, "learning_rate": 3.328859060402685e-05, "loss": 0.4925, "step": 124 }, { "epoch": 0.21008403361344538, "grad_norm": 0.8742949945544924, "learning_rate": 3.3557046979865775e-05, "loss": 0.4954, "step": 125 }, { "epoch": 0.21176470588235294, "grad_norm": 1.2018134415776005, "learning_rate": 3.38255033557047e-05, "loss": 0.4987, "step": 126 }, { "epoch": 0.2134453781512605, "grad_norm": 1.367557288384633, "learning_rate": 3.409395973154363e-05, "loss": 0.4975, "step": 127 }, { "epoch": 0.21512605042016808, "grad_norm": 0.5231887024171837, "learning_rate": 3.436241610738255e-05, "loss": 0.4898, "step": 128 }, { "epoch": 0.21680672268907564, "grad_norm": 1.100661854625, "learning_rate": 3.463087248322148e-05, "loss": 0.4994, "step": 129 }, { "epoch": 0.2184873949579832, "grad_norm": 1.3099418103670066, "learning_rate": 3.4899328859060405e-05, "loss": 0.4955, "step": 130 }, { "epoch": 0.22016806722689075, "grad_norm": 0.8934675457513748, "learning_rate": 3.516778523489933e-05, "loss": 0.4968, "step": 131 }, { "epoch": 0.2218487394957983, "grad_norm": 1.2712598225639788, "learning_rate": 3.543624161073826e-05, "loss": 0.5006, "step": 132 }, { "epoch": 0.2235294117647059, "grad_norm": 0.6165490098713385, "learning_rate": 3.570469798657719e-05, "loss": 0.499, "step": 133 }, { "epoch": 0.22521008403361345, "grad_norm": 1.2930427936536468, "learning_rate": 3.597315436241611e-05, "loss": 0.4924, "step": 134 }, { "epoch": 0.226890756302521, "grad_norm": 0.506834524105459, "learning_rate": 3.6241610738255034e-05, "loss": 0.4885, "step": 135 }, { "epoch": 0.22857142857142856, "grad_norm": 1.1139887942116513, "learning_rate": 3.651006711409396e-05, "loss": 0.4973, "step": 136 }, { "epoch": 0.23025210084033612, "grad_norm": 1.1944897274838888, "learning_rate": 3.6778523489932886e-05, "loss": 0.4967, "step": 137 }, { "epoch": 0.2319327731092437, "grad_norm": 0.7134335951153069, "learning_rate": 3.704697986577181e-05, "loss": 0.4906, "step": 138 }, { "epoch": 0.23361344537815126, "grad_norm": 0.9818944822408663, "learning_rate": 3.731543624161074e-05, "loss": 0.489, "step": 139 }, { "epoch": 0.23529411764705882, "grad_norm": 1.4811311843909853, "learning_rate": 3.7583892617449664e-05, "loss": 0.5024, "step": 140 }, { "epoch": 0.23697478991596638, "grad_norm": 1.1188978424785652, "learning_rate": 3.785234899328859e-05, "loss": 0.4862, "step": 141 }, { "epoch": 0.23865546218487396, "grad_norm": 1.4406475575017759, "learning_rate": 3.8120805369127515e-05, "loss": 0.4934, "step": 142 }, { "epoch": 0.24033613445378152, "grad_norm": 1.390037607387754, "learning_rate": 3.838926174496644e-05, "loss": 0.4955, "step": 143 }, { "epoch": 0.24201680672268908, "grad_norm": 1.3909224928127575, "learning_rate": 3.8657718120805374e-05, "loss": 0.497, "step": 144 }, { "epoch": 0.24369747899159663, "grad_norm": 1.4057880004139167, "learning_rate": 3.89261744966443e-05, "loss": 0.4887, "step": 145 }, { "epoch": 0.2453781512605042, "grad_norm": 1.4397097510099024, "learning_rate": 3.9194630872483226e-05, "loss": 0.495, "step": 146 }, { "epoch": 0.24705882352941178, "grad_norm": 0.8393793004401069, "learning_rate": 3.946308724832215e-05, "loss": 0.4811, "step": 147 }, { "epoch": 0.24873949579831933, "grad_norm": 1.8937754648974725, "learning_rate": 3.973154362416108e-05, "loss": 0.4983, "step": 148 }, { "epoch": 0.2504201680672269, "grad_norm": 1.3460108819073133, "learning_rate": 4e-05, "loss": 0.4897, "step": 149 }, { "epoch": 0.25210084033613445, "grad_norm": 1.697010846585393, "learning_rate": 4.0268456375838936e-05, "loss": 0.4912, "step": 150 }, { "epoch": 0.253781512605042, "grad_norm": 1.5233069758972664, "learning_rate": 4.0536912751677855e-05, "loss": 0.4956, "step": 151 }, { "epoch": 0.25546218487394956, "grad_norm": 1.1521812222170114, "learning_rate": 4.080536912751679e-05, "loss": 0.487, "step": 152 }, { "epoch": 0.2571428571428571, "grad_norm": 1.3495885192105475, "learning_rate": 4.107382550335571e-05, "loss": 0.489, "step": 153 }, { "epoch": 0.25882352941176473, "grad_norm": 0.9055533563219321, "learning_rate": 4.134228187919464e-05, "loss": 0.4919, "step": 154 }, { "epoch": 0.2605042016806723, "grad_norm": 1.0753015861270354, "learning_rate": 4.161073825503356e-05, "loss": 0.4848, "step": 155 }, { "epoch": 0.26218487394957984, "grad_norm": 1.2162165843878523, "learning_rate": 4.187919463087249e-05, "loss": 0.4951, "step": 156 }, { "epoch": 0.2638655462184874, "grad_norm": 0.7620205375633081, "learning_rate": 4.214765100671141e-05, "loss": 0.491, "step": 157 }, { "epoch": 0.26554621848739496, "grad_norm": 1.107611805053371, "learning_rate": 4.241610738255034e-05, "loss": 0.4949, "step": 158 }, { "epoch": 0.2672268907563025, "grad_norm": 1.2676920601242294, "learning_rate": 4.268456375838926e-05, "loss": 0.4863, "step": 159 }, { "epoch": 0.2689075630252101, "grad_norm": 0.7418017963922823, "learning_rate": 4.2953020134228195e-05, "loss": 0.4841, "step": 160 }, { "epoch": 0.27058823529411763, "grad_norm": 1.4040331829994255, "learning_rate": 4.322147651006712e-05, "loss": 0.4928, "step": 161 }, { "epoch": 0.2722689075630252, "grad_norm": 0.8472349238492176, "learning_rate": 4.3489932885906046e-05, "loss": 0.4849, "step": 162 }, { "epoch": 0.2739495798319328, "grad_norm": 1.235294748510048, "learning_rate": 4.375838926174497e-05, "loss": 0.4875, "step": 163 }, { "epoch": 0.27563025210084036, "grad_norm": 0.8548363863356004, "learning_rate": 4.40268456375839e-05, "loss": 0.4803, "step": 164 }, { "epoch": 0.2773109243697479, "grad_norm": 1.7946552379389356, "learning_rate": 4.4295302013422824e-05, "loss": 0.4929, "step": 165 }, { "epoch": 0.27899159663865547, "grad_norm": 0.8905138853266164, "learning_rate": 4.456375838926174e-05, "loss": 0.4841, "step": 166 }, { "epoch": 0.280672268907563, "grad_norm": 1.5604902384645059, "learning_rate": 4.4832214765100676e-05, "loss": 0.4906, "step": 167 }, { "epoch": 0.2823529411764706, "grad_norm": 1.258413586215208, "learning_rate": 4.5100671140939595e-05, "loss": 0.4877, "step": 168 }, { "epoch": 0.28403361344537814, "grad_norm": 1.158982189231407, "learning_rate": 4.536912751677853e-05, "loss": 0.4945, "step": 169 }, { "epoch": 0.2857142857142857, "grad_norm": 1.0836013590974625, "learning_rate": 4.563758389261745e-05, "loss": 0.4916, "step": 170 }, { "epoch": 0.28739495798319326, "grad_norm": 1.0019888748967192, "learning_rate": 4.590604026845638e-05, "loss": 0.4863, "step": 171 }, { "epoch": 0.28907563025210087, "grad_norm": 1.410652879013061, "learning_rate": 4.617449664429531e-05, "loss": 0.487, "step": 172 }, { "epoch": 0.2907563025210084, "grad_norm": 0.8091027196716256, "learning_rate": 4.644295302013423e-05, "loss": 0.4933, "step": 173 }, { "epoch": 0.292436974789916, "grad_norm": 1.2516553258481948, "learning_rate": 4.6711409395973164e-05, "loss": 0.4864, "step": 174 }, { "epoch": 0.29411764705882354, "grad_norm": 0.9635362263381467, "learning_rate": 4.697986577181208e-05, "loss": 0.4904, "step": 175 }, { "epoch": 0.2957983193277311, "grad_norm": 1.0180361867555137, "learning_rate": 4.7248322147651016e-05, "loss": 0.4841, "step": 176 }, { "epoch": 0.29747899159663865, "grad_norm": 1.277483529671902, "learning_rate": 4.7516778523489935e-05, "loss": 0.4973, "step": 177 }, { "epoch": 0.2991596638655462, "grad_norm": 0.9148147746605856, "learning_rate": 4.778523489932887e-05, "loss": 0.4818, "step": 178 }, { "epoch": 0.30084033613445377, "grad_norm": 1.1666633339379586, "learning_rate": 4.8053691275167786e-05, "loss": 0.5017, "step": 179 }, { "epoch": 0.3025210084033613, "grad_norm": 1.3704600101668534, "learning_rate": 4.832214765100672e-05, "loss": 0.487, "step": 180 }, { "epoch": 0.3042016806722689, "grad_norm": 0.7348054784901624, "learning_rate": 4.859060402684564e-05, "loss": 0.4892, "step": 181 }, { "epoch": 0.3058823529411765, "grad_norm": 1.113033661577877, "learning_rate": 4.885906040268457e-05, "loss": 0.4886, "step": 182 }, { "epoch": 0.30756302521008405, "grad_norm": 1.3140105905315715, "learning_rate": 4.912751677852349e-05, "loss": 0.4896, "step": 183 }, { "epoch": 0.3092436974789916, "grad_norm": 0.7231499672301288, "learning_rate": 4.939597315436242e-05, "loss": 0.4876, "step": 184 }, { "epoch": 0.31092436974789917, "grad_norm": 1.4184020691961938, "learning_rate": 4.966442953020135e-05, "loss": 0.49, "step": 185 }, { "epoch": 0.3126050420168067, "grad_norm": 0.9016010816297785, "learning_rate": 4.9932885906040274e-05, "loss": 0.4809, "step": 186 }, { "epoch": 0.3142857142857143, "grad_norm": 0.8860404290240751, "learning_rate": 5.02013422818792e-05, "loss": 0.4812, "step": 187 }, { "epoch": 0.31596638655462184, "grad_norm": 1.1054077717535455, "learning_rate": 5.0469798657718126e-05, "loss": 0.4835, "step": 188 }, { "epoch": 0.3176470588235294, "grad_norm": 1.2801851805617759, "learning_rate": 5.073825503355705e-05, "loss": 0.4895, "step": 189 }, { "epoch": 0.31932773109243695, "grad_norm": 1.0024175943206142, "learning_rate": 5.100671140939598e-05, "loss": 0.4847, "step": 190 }, { "epoch": 0.32100840336134456, "grad_norm": 1.8757539162837982, "learning_rate": 5.1275167785234904e-05, "loss": 0.4877, "step": 191 }, { "epoch": 0.3226890756302521, "grad_norm": 0.942356226869904, "learning_rate": 5.154362416107383e-05, "loss": 0.4711, "step": 192 }, { "epoch": 0.3243697478991597, "grad_norm": 2.0123035509745817, "learning_rate": 5.1812080536912755e-05, "loss": 0.4988, "step": 193 }, { "epoch": 0.32605042016806723, "grad_norm": 1.5267658900347683, "learning_rate": 5.208053691275168e-05, "loss": 0.4816, "step": 194 }, { "epoch": 0.3277310924369748, "grad_norm": 1.5925642166230356, "learning_rate": 5.234899328859061e-05, "loss": 0.482, "step": 195 }, { "epoch": 0.32941176470588235, "grad_norm": 1.4697181757574689, "learning_rate": 5.261744966442954e-05, "loss": 0.4835, "step": 196 }, { "epoch": 0.3310924369747899, "grad_norm": 1.1632147027246282, "learning_rate": 5.288590604026846e-05, "loss": 0.4874, "step": 197 }, { "epoch": 0.33277310924369746, "grad_norm": 1.2190679193642526, "learning_rate": 5.315436241610739e-05, "loss": 0.4831, "step": 198 }, { "epoch": 0.334453781512605, "grad_norm": 1.3143781539387362, "learning_rate": 5.342281879194631e-05, "loss": 0.4885, "step": 199 }, { "epoch": 0.33613445378151263, "grad_norm": 0.905150180886035, "learning_rate": 5.369127516778524e-05, "loss": 0.4716, "step": 200 }, { "epoch": 0.3378151260504202, "grad_norm": 1.0714799378494302, "learning_rate": 5.395973154362416e-05, "loss": 0.4846, "step": 201 }, { "epoch": 0.33949579831932775, "grad_norm": 0.8429397483205768, "learning_rate": 5.4228187919463095e-05, "loss": 0.4753, "step": 202 }, { "epoch": 0.3411764705882353, "grad_norm": 1.42963639046099, "learning_rate": 5.4496644295302014e-05, "loss": 0.4862, "step": 203 }, { "epoch": 0.34285714285714286, "grad_norm": 0.8981161586921399, "learning_rate": 5.476510067114095e-05, "loss": 0.484, "step": 204 }, { "epoch": 0.3445378151260504, "grad_norm": 1.2005532663139413, "learning_rate": 5.5033557046979866e-05, "loss": 0.4716, "step": 205 }, { "epoch": 0.346218487394958, "grad_norm": 0.9579894706708744, "learning_rate": 5.53020134228188e-05, "loss": 0.4912, "step": 206 }, { "epoch": 0.34789915966386553, "grad_norm": 1.1097327549924116, "learning_rate": 5.5570469798657725e-05, "loss": 0.4777, "step": 207 }, { "epoch": 0.3495798319327731, "grad_norm": 1.2950126796310506, "learning_rate": 5.583892617449665e-05, "loss": 0.4845, "step": 208 }, { "epoch": 0.35126050420168065, "grad_norm": 0.8936339922056112, "learning_rate": 5.6107382550335576e-05, "loss": 0.4906, "step": 209 }, { "epoch": 0.35294117647058826, "grad_norm": 1.2261703316784087, "learning_rate": 5.63758389261745e-05, "loss": 0.483, "step": 210 }, { "epoch": 0.3546218487394958, "grad_norm": 0.7783683381231354, "learning_rate": 5.664429530201343e-05, "loss": 0.4835, "step": 211 }, { "epoch": 0.3563025210084034, "grad_norm": 0.7523659842514121, "learning_rate": 5.6912751677852354e-05, "loss": 0.4724, "step": 212 }, { "epoch": 0.35798319327731093, "grad_norm": 1.5780709243091136, "learning_rate": 5.718120805369128e-05, "loss": 0.4762, "step": 213 }, { "epoch": 0.3596638655462185, "grad_norm": 0.565651935182512, "learning_rate": 5.7449664429530206e-05, "loss": 0.4639, "step": 214 }, { "epoch": 0.36134453781512604, "grad_norm": 1.5399868530780154, "learning_rate": 5.771812080536913e-05, "loss": 0.4797, "step": 215 }, { "epoch": 0.3630252100840336, "grad_norm": 0.9131675321825046, "learning_rate": 5.798657718120806e-05, "loss": 0.4793, "step": 216 }, { "epoch": 0.36470588235294116, "grad_norm": 1.0179241538422839, "learning_rate": 5.825503355704698e-05, "loss": 0.4852, "step": 217 }, { "epoch": 0.3663865546218487, "grad_norm": 1.1318674983581714, "learning_rate": 5.8523489932885916e-05, "loss": 0.4705, "step": 218 }, { "epoch": 0.3680672268907563, "grad_norm": 1.3507204764398357, "learning_rate": 5.8791946308724835e-05, "loss": 0.472, "step": 219 }, { "epoch": 0.3697478991596639, "grad_norm": 0.9718729655790517, "learning_rate": 5.906040268456377e-05, "loss": 0.4948, "step": 220 }, { "epoch": 0.37142857142857144, "grad_norm": 1.0710597693221284, "learning_rate": 5.932885906040269e-05, "loss": 0.477, "step": 221 }, { "epoch": 0.373109243697479, "grad_norm": 1.334281507713167, "learning_rate": 5.959731543624162e-05, "loss": 0.4722, "step": 222 }, { "epoch": 0.37478991596638656, "grad_norm": 0.7785464890886484, "learning_rate": 5.986577181208054e-05, "loss": 0.4821, "step": 223 }, { "epoch": 0.3764705882352941, "grad_norm": 1.4750528349399297, "learning_rate": 6.013422818791947e-05, "loss": 0.481, "step": 224 }, { "epoch": 0.37815126050420167, "grad_norm": 0.8027280478419658, "learning_rate": 6.040268456375839e-05, "loss": 0.4698, "step": 225 }, { "epoch": 0.3798319327731092, "grad_norm": 1.1260802379282986, "learning_rate": 6.067114093959732e-05, "loss": 0.4811, "step": 226 }, { "epoch": 0.3815126050420168, "grad_norm": 1.1073909476846955, "learning_rate": 6.093959731543624e-05, "loss": 0.4834, "step": 227 }, { "epoch": 0.3831932773109244, "grad_norm": 1.2734758410619993, "learning_rate": 6.120805369127517e-05, "loss": 0.4775, "step": 228 }, { "epoch": 0.38487394957983195, "grad_norm": 1.0758552285133052, "learning_rate": 6.14765100671141e-05, "loss": 0.4856, "step": 229 }, { "epoch": 0.3865546218487395, "grad_norm": 0.8171631948312451, "learning_rate": 6.174496644295302e-05, "loss": 0.4735, "step": 230 }, { "epoch": 0.38823529411764707, "grad_norm": 1.346585413687685, "learning_rate": 6.201342281879196e-05, "loss": 0.4768, "step": 231 }, { "epoch": 0.3899159663865546, "grad_norm": 0.7631666944185137, "learning_rate": 6.228187919463087e-05, "loss": 0.4817, "step": 232 }, { "epoch": 0.3915966386554622, "grad_norm": 0.8570227014968144, "learning_rate": 6.255033557046981e-05, "loss": 0.4639, "step": 233 }, { "epoch": 0.39327731092436974, "grad_norm": 1.2390202360653482, "learning_rate": 6.281879194630872e-05, "loss": 0.4724, "step": 234 }, { "epoch": 0.3949579831932773, "grad_norm": 0.6026156539726694, "learning_rate": 6.308724832214766e-05, "loss": 0.4789, "step": 235 }, { "epoch": 0.39663865546218485, "grad_norm": 0.8118350280371067, "learning_rate": 6.335570469798657e-05, "loss": 0.4741, "step": 236 }, { "epoch": 0.3983193277310924, "grad_norm": 0.8848637921907686, "learning_rate": 6.362416107382551e-05, "loss": 0.4782, "step": 237 }, { "epoch": 0.4, "grad_norm": 1.1685028605116359, "learning_rate": 6.389261744966443e-05, "loss": 0.4873, "step": 238 }, { "epoch": 0.4016806722689076, "grad_norm": 1.080884917276308, "learning_rate": 6.416107382550337e-05, "loss": 0.4817, "step": 239 }, { "epoch": 0.40336134453781514, "grad_norm": 1.242017778392259, "learning_rate": 6.442953020134228e-05, "loss": 0.4728, "step": 240 }, { "epoch": 0.4050420168067227, "grad_norm": 0.9489125430074402, "learning_rate": 6.469798657718122e-05, "loss": 0.4837, "step": 241 }, { "epoch": 0.40672268907563025, "grad_norm": 1.0454168165992488, "learning_rate": 6.496644295302014e-05, "loss": 0.4775, "step": 242 }, { "epoch": 0.4084033613445378, "grad_norm": 1.320892442905446, "learning_rate": 6.523489932885907e-05, "loss": 0.4803, "step": 243 }, { "epoch": 0.41008403361344536, "grad_norm": 0.8473876480944419, "learning_rate": 6.5503355704698e-05, "loss": 0.4718, "step": 244 }, { "epoch": 0.4117647058823529, "grad_norm": 1.199934079383766, "learning_rate": 6.577181208053692e-05, "loss": 0.4718, "step": 245 }, { "epoch": 0.4134453781512605, "grad_norm": 0.8302507752662015, "learning_rate": 6.604026845637585e-05, "loss": 0.4716, "step": 246 }, { "epoch": 0.4151260504201681, "grad_norm": 1.2285877750116911, "learning_rate": 6.630872483221477e-05, "loss": 0.4799, "step": 247 }, { "epoch": 0.41680672268907565, "grad_norm": 1.0509243441120923, "learning_rate": 6.65771812080537e-05, "loss": 0.4734, "step": 248 }, { "epoch": 0.4184873949579832, "grad_norm": 1.5409925762074905, "learning_rate": 6.684563758389262e-05, "loss": 0.4882, "step": 249 }, { "epoch": 0.42016806722689076, "grad_norm": 0.9884794461640918, "learning_rate": 6.711409395973155e-05, "loss": 0.4807, "step": 250 }, { "epoch": 0.4218487394957983, "grad_norm": 1.0325974693491777, "learning_rate": 6.738255033557048e-05, "loss": 0.4748, "step": 251 }, { "epoch": 0.4235294117647059, "grad_norm": 0.8525273344761205, "learning_rate": 6.76510067114094e-05, "loss": 0.4746, "step": 252 }, { "epoch": 0.42521008403361343, "grad_norm": 1.0103178245827906, "learning_rate": 6.791946308724833e-05, "loss": 0.4689, "step": 253 }, { "epoch": 0.426890756302521, "grad_norm": 1.1091607057566373, "learning_rate": 6.818791946308725e-05, "loss": 0.4704, "step": 254 }, { "epoch": 0.42857142857142855, "grad_norm": 1.5087877949454862, "learning_rate": 6.845637583892618e-05, "loss": 0.4753, "step": 255 }, { "epoch": 0.43025210084033616, "grad_norm": 0.7310888648379513, "learning_rate": 6.87248322147651e-05, "loss": 0.4685, "step": 256 }, { "epoch": 0.4319327731092437, "grad_norm": 1.0560210515777133, "learning_rate": 6.899328859060403e-05, "loss": 0.4613, "step": 257 }, { "epoch": 0.4336134453781513, "grad_norm": 1.5327134707604042, "learning_rate": 6.926174496644296e-05, "loss": 0.4807, "step": 258 }, { "epoch": 0.43529411764705883, "grad_norm": 0.8115159036447009, "learning_rate": 6.953020134228188e-05, "loss": 0.4756, "step": 259 }, { "epoch": 0.4369747899159664, "grad_norm": 1.7426117311043001, "learning_rate": 6.979865771812081e-05, "loss": 0.4755, "step": 260 }, { "epoch": 0.43865546218487395, "grad_norm": 1.012835074279992, "learning_rate": 7.006711409395974e-05, "loss": 0.4766, "step": 261 }, { "epoch": 0.4403361344537815, "grad_norm": 2.230539765184518, "learning_rate": 7.033557046979866e-05, "loss": 0.476, "step": 262 }, { "epoch": 0.44201680672268906, "grad_norm": 1.7250829969385808, "learning_rate": 7.060402684563759e-05, "loss": 0.476, "step": 263 }, { "epoch": 0.4436974789915966, "grad_norm": 1.3775555263265915, "learning_rate": 7.087248322147653e-05, "loss": 0.4724, "step": 264 }, { "epoch": 0.44537815126050423, "grad_norm": 1.3897463689570753, "learning_rate": 7.114093959731544e-05, "loss": 0.4838, "step": 265 }, { "epoch": 0.4470588235294118, "grad_norm": 1.0419110987939726, "learning_rate": 7.140939597315438e-05, "loss": 0.4779, "step": 266 }, { "epoch": 0.44873949579831934, "grad_norm": 0.8845423050443665, "learning_rate": 7.167785234899329e-05, "loss": 0.4805, "step": 267 }, { "epoch": 0.4504201680672269, "grad_norm": 0.9831118537456526, "learning_rate": 7.194630872483222e-05, "loss": 0.4777, "step": 268 }, { "epoch": 0.45210084033613446, "grad_norm": 0.7350220475405578, "learning_rate": 7.221476510067114e-05, "loss": 0.4724, "step": 269 }, { "epoch": 0.453781512605042, "grad_norm": 1.319849483414545, "learning_rate": 7.248322147651007e-05, "loss": 0.4819, "step": 270 }, { "epoch": 0.45546218487394957, "grad_norm": 0.8530248432804404, "learning_rate": 7.2751677852349e-05, "loss": 0.4701, "step": 271 }, { "epoch": 0.45714285714285713, "grad_norm": 0.9134297322028255, "learning_rate": 7.302013422818792e-05, "loss": 0.4652, "step": 272 }, { "epoch": 0.4588235294117647, "grad_norm": 1.2749005856833306, "learning_rate": 7.328859060402685e-05, "loss": 0.4703, "step": 273 }, { "epoch": 0.46050420168067224, "grad_norm": 0.9920740289887459, "learning_rate": 7.355704697986577e-05, "loss": 0.4722, "step": 274 }, { "epoch": 0.46218487394957986, "grad_norm": 0.8673197365789165, "learning_rate": 7.38255033557047e-05, "loss": 0.4764, "step": 275 }, { "epoch": 0.4638655462184874, "grad_norm": 1.2693160876499594, "learning_rate": 7.409395973154362e-05, "loss": 0.4805, "step": 276 }, { "epoch": 0.46554621848739497, "grad_norm": 1.2188878046861507, "learning_rate": 7.436241610738256e-05, "loss": 0.4717, "step": 277 }, { "epoch": 0.4672268907563025, "grad_norm": 0.8754532729114949, "learning_rate": 7.463087248322148e-05, "loss": 0.4647, "step": 278 }, { "epoch": 0.4689075630252101, "grad_norm": 0.6855846702036135, "learning_rate": 7.489932885906041e-05, "loss": 0.4736, "step": 279 }, { "epoch": 0.47058823529411764, "grad_norm": 0.603112645708118, "learning_rate": 7.516778523489933e-05, "loss": 0.4677, "step": 280 }, { "epoch": 0.4722689075630252, "grad_norm": 0.8007550008930027, "learning_rate": 7.543624161073827e-05, "loss": 0.4759, "step": 281 }, { "epoch": 0.47394957983193275, "grad_norm": 1.0443645383911266, "learning_rate": 7.570469798657718e-05, "loss": 0.4614, "step": 282 }, { "epoch": 0.4756302521008403, "grad_norm": 1.3993150374064818, "learning_rate": 7.597315436241612e-05, "loss": 0.4819, "step": 283 }, { "epoch": 0.4773109243697479, "grad_norm": 0.6547244384826703, "learning_rate": 7.624161073825503e-05, "loss": 0.4595, "step": 284 }, { "epoch": 0.4789915966386555, "grad_norm": 0.9599921793865691, "learning_rate": 7.651006711409397e-05, "loss": 0.4722, "step": 285 }, { "epoch": 0.48067226890756304, "grad_norm": 1.4780047924961124, "learning_rate": 7.677852348993288e-05, "loss": 0.4786, "step": 286 }, { "epoch": 0.4823529411764706, "grad_norm": 0.9293848732494505, "learning_rate": 7.704697986577182e-05, "loss": 0.4693, "step": 287 }, { "epoch": 0.48403361344537815, "grad_norm": 1.2627415930951078, "learning_rate": 7.731543624161075e-05, "loss": 0.4696, "step": 288 }, { "epoch": 0.4857142857142857, "grad_norm": 0.6877863917814994, "learning_rate": 7.758389261744967e-05, "loss": 0.4678, "step": 289 }, { "epoch": 0.48739495798319327, "grad_norm": 1.2961709011778173, "learning_rate": 7.78523489932886e-05, "loss": 0.4648, "step": 290 }, { "epoch": 0.4890756302521008, "grad_norm": 0.5988278909871823, "learning_rate": 7.812080536912753e-05, "loss": 0.4701, "step": 291 }, { "epoch": 0.4907563025210084, "grad_norm": 1.2272434507992431, "learning_rate": 7.838926174496645e-05, "loss": 0.4686, "step": 292 }, { "epoch": 0.492436974789916, "grad_norm": 0.6962804806604058, "learning_rate": 7.865771812080538e-05, "loss": 0.4568, "step": 293 }, { "epoch": 0.49411764705882355, "grad_norm": 1.0725122999017895, "learning_rate": 7.89261744966443e-05, "loss": 0.4699, "step": 294 }, { "epoch": 0.4957983193277311, "grad_norm": 0.666447880190579, "learning_rate": 7.919463087248323e-05, "loss": 0.4626, "step": 295 }, { "epoch": 0.49747899159663866, "grad_norm": 0.8171894326068043, "learning_rate": 7.946308724832215e-05, "loss": 0.4577, "step": 296 }, { "epoch": 0.4991596638655462, "grad_norm": 1.4566009655672951, "learning_rate": 7.973154362416108e-05, "loss": 0.4717, "step": 297 }, { "epoch": 0.5008403361344538, "grad_norm": 1.1083879837811905, "learning_rate": 8e-05, "loss": 0.4694, "step": 298 }, { "epoch": 0.5025210084033613, "grad_norm": 0.8954578233888306, "learning_rate": 7.999997245562333e-05, "loss": 0.4609, "step": 299 }, { "epoch": 0.5042016806722689, "grad_norm": 1.2956081681253464, "learning_rate": 7.999988982253123e-05, "loss": 0.4676, "step": 300 }, { "epoch": 0.5058823529411764, "grad_norm": 0.9521939981893466, "learning_rate": 7.999975210083752e-05, "loss": 0.4589, "step": 301 }, { "epoch": 0.507563025210084, "grad_norm": 1.1153550279691113, "learning_rate": 7.999955929073187e-05, "loss": 0.4704, "step": 302 }, { "epoch": 0.5092436974789916, "grad_norm": 1.1738570336852796, "learning_rate": 7.999931139247981e-05, "loss": 0.4675, "step": 303 }, { "epoch": 0.5109243697478991, "grad_norm": 1.2742103055604193, "learning_rate": 7.999900840642277e-05, "loss": 0.4587, "step": 304 }, { "epoch": 0.5126050420168067, "grad_norm": 0.711599441980962, "learning_rate": 7.999865033297801e-05, "loss": 0.4563, "step": 305 }, { "epoch": 0.5142857142857142, "grad_norm": 0.6538139490310148, "learning_rate": 7.999823717263868e-05, "loss": 0.4628, "step": 306 }, { "epoch": 0.5159663865546219, "grad_norm": 0.9189734429305162, "learning_rate": 7.999776892597381e-05, "loss": 0.4734, "step": 307 }, { "epoch": 0.5176470588235295, "grad_norm": 1.2258371894508977, "learning_rate": 7.999724559362824e-05, "loss": 0.472, "step": 308 }, { "epoch": 0.519327731092437, "grad_norm": 0.8376094489768933, "learning_rate": 7.999666717632274e-05, "loss": 0.4489, "step": 309 }, { "epoch": 0.5210084033613446, "grad_norm": 0.8806444877737181, "learning_rate": 7.999603367485393e-05, "loss": 0.4691, "step": 310 }, { "epoch": 0.5226890756302521, "grad_norm": 0.8241572663720752, "learning_rate": 7.999534509009426e-05, "loss": 0.4661, "step": 311 }, { "epoch": 0.5243697478991597, "grad_norm": 1.069605747807687, "learning_rate": 7.999460142299205e-05, "loss": 0.4651, "step": 312 }, { "epoch": 0.5260504201680672, "grad_norm": 1.3899526924665204, "learning_rate": 7.999380267457152e-05, "loss": 0.4616, "step": 313 }, { "epoch": 0.5277310924369748, "grad_norm": 0.4491717338933011, "learning_rate": 7.999294884593271e-05, "loss": 0.46, "step": 314 }, { "epoch": 0.5294117647058824, "grad_norm": 1.2327422831172332, "learning_rate": 7.999203993825153e-05, "loss": 0.4723, "step": 315 }, { "epoch": 0.5310924369747899, "grad_norm": 1.0326260434447077, "learning_rate": 7.999107595277974e-05, "loss": 0.4583, "step": 316 }, { "epoch": 0.5327731092436975, "grad_norm": 1.0861675345965072, "learning_rate": 7.999005689084495e-05, "loss": 0.4651, "step": 317 }, { "epoch": 0.534453781512605, "grad_norm": 0.9422445041640858, "learning_rate": 7.998898275385068e-05, "loss": 0.462, "step": 318 }, { "epoch": 0.5361344537815126, "grad_norm": 0.9761939276332894, "learning_rate": 7.998785354327617e-05, "loss": 0.455, "step": 319 }, { "epoch": 0.5378151260504201, "grad_norm": 0.9726866674553927, "learning_rate": 7.998666926067667e-05, "loss": 0.4669, "step": 320 }, { "epoch": 0.5394957983193277, "grad_norm": 1.3361088446108333, "learning_rate": 7.998542990768313e-05, "loss": 0.4644, "step": 321 }, { "epoch": 0.5411764705882353, "grad_norm": 0.6142259218141635, "learning_rate": 7.998413548600246e-05, "loss": 0.4686, "step": 322 }, { "epoch": 0.5428571428571428, "grad_norm": 0.9159352616160243, "learning_rate": 7.998278599741733e-05, "loss": 0.466, "step": 323 }, { "epoch": 0.5445378151260504, "grad_norm": 1.3465900437029323, "learning_rate": 7.998138144378631e-05, "loss": 0.4702, "step": 324 }, { "epoch": 0.5462184873949579, "grad_norm": 0.7494554087885942, "learning_rate": 7.997992182704376e-05, "loss": 0.4501, "step": 325 }, { "epoch": 0.5478991596638656, "grad_norm": 0.9978351878768936, "learning_rate": 7.997840714919989e-05, "loss": 0.4627, "step": 326 }, { "epoch": 0.5495798319327732, "grad_norm": 0.9980257245316938, "learning_rate": 7.997683741234075e-05, "loss": 0.4637, "step": 327 }, { "epoch": 0.5512605042016807, "grad_norm": 1.1057653482108614, "learning_rate": 7.997521261862821e-05, "loss": 0.4605, "step": 328 }, { "epoch": 0.5529411764705883, "grad_norm": 0.9611424433992446, "learning_rate": 7.997353277029995e-05, "loss": 0.4676, "step": 329 }, { "epoch": 0.5546218487394958, "grad_norm": 0.823163108545755, "learning_rate": 7.997179786966951e-05, "loss": 0.4619, "step": 330 }, { "epoch": 0.5563025210084034, "grad_norm": 1.097448513604207, "learning_rate": 7.997000791912623e-05, "loss": 0.4694, "step": 331 }, { "epoch": 0.5579831932773109, "grad_norm": 0.9620325679103203, "learning_rate": 7.996816292113525e-05, "loss": 0.4599, "step": 332 }, { "epoch": 0.5596638655462185, "grad_norm": 0.9800148645563171, "learning_rate": 7.996626287823754e-05, "loss": 0.4727, "step": 333 }, { "epoch": 0.561344537815126, "grad_norm": 1.153983425489295, "learning_rate": 7.996430779304987e-05, "loss": 0.4676, "step": 334 }, { "epoch": 0.5630252100840336, "grad_norm": 0.9406696309065081, "learning_rate": 7.996229766826484e-05, "loss": 0.4549, "step": 335 }, { "epoch": 0.5647058823529412, "grad_norm": 0.8244503433806202, "learning_rate": 7.99602325066508e-05, "loss": 0.4558, "step": 336 }, { "epoch": 0.5663865546218487, "grad_norm": 0.7336845621262541, "learning_rate": 7.995811231105196e-05, "loss": 0.462, "step": 337 }, { "epoch": 0.5680672268907563, "grad_norm": 0.8841089061566196, "learning_rate": 7.995593708438827e-05, "loss": 0.4607, "step": 338 }, { "epoch": 0.5697478991596638, "grad_norm": 0.8764697179818751, "learning_rate": 7.995370682965551e-05, "loss": 0.4536, "step": 339 }, { "epoch": 0.5714285714285714, "grad_norm": 0.8884104221692158, "learning_rate": 7.995142154992522e-05, "loss": 0.4568, "step": 340 }, { "epoch": 0.573109243697479, "grad_norm": 0.6555413282163642, "learning_rate": 7.994908124834473e-05, "loss": 0.4632, "step": 341 }, { "epoch": 0.5747899159663865, "grad_norm": 0.8165841833341266, "learning_rate": 7.994668592813715e-05, "loss": 0.4534, "step": 342 }, { "epoch": 0.5764705882352941, "grad_norm": 0.9625116000963834, "learning_rate": 7.994423559260137e-05, "loss": 0.4624, "step": 343 }, { "epoch": 0.5781512605042017, "grad_norm": 0.9047074442774421, "learning_rate": 7.9941730245112e-05, "loss": 0.4628, "step": 344 }, { "epoch": 0.5798319327731093, "grad_norm": 1.0855030261903311, "learning_rate": 7.99391698891195e-05, "loss": 0.4589, "step": 345 }, { "epoch": 0.5815126050420169, "grad_norm": 1.0526836198262537, "learning_rate": 7.993655452815003e-05, "loss": 0.459, "step": 346 }, { "epoch": 0.5831932773109244, "grad_norm": 0.7400997346360255, "learning_rate": 7.99338841658055e-05, "loss": 0.455, "step": 347 }, { "epoch": 0.584873949579832, "grad_norm": 0.45973551396739704, "learning_rate": 7.993115880576359e-05, "loss": 0.4539, "step": 348 }, { "epoch": 0.5865546218487395, "grad_norm": 0.4839066574425342, "learning_rate": 7.99283784517777e-05, "loss": 0.4575, "step": 349 }, { "epoch": 0.5882352941176471, "grad_norm": 0.6372771732433801, "learning_rate": 7.9925543107677e-05, "loss": 0.4538, "step": 350 }, { "epoch": 0.5899159663865546, "grad_norm": 0.6954763592535681, "learning_rate": 7.992265277736639e-05, "loss": 0.4569, "step": 351 }, { "epoch": 0.5915966386554622, "grad_norm": 0.7466439145156624, "learning_rate": 7.991970746482649e-05, "loss": 0.4574, "step": 352 }, { "epoch": 0.5932773109243697, "grad_norm": 0.790905541721855, "learning_rate": 7.99167071741136e-05, "loss": 0.4503, "step": 353 }, { "epoch": 0.5949579831932773, "grad_norm": 0.9291643040967381, "learning_rate": 7.991365190935982e-05, "loss": 0.4536, "step": 354 }, { "epoch": 0.5966386554621849, "grad_norm": 0.9964239648454367, "learning_rate": 7.991054167477292e-05, "loss": 0.4631, "step": 355 }, { "epoch": 0.5983193277310924, "grad_norm": 0.8824086685701953, "learning_rate": 7.990737647463633e-05, "loss": 0.4572, "step": 356 }, { "epoch": 0.6, "grad_norm": 0.7100575572407378, "learning_rate": 7.990415631330926e-05, "loss": 0.4584, "step": 357 }, { "epoch": 0.6016806722689075, "grad_norm": 0.7842795834436239, "learning_rate": 7.990088119522656e-05, "loss": 0.4624, "step": 358 }, { "epoch": 0.6033613445378151, "grad_norm": 1.034078065647351, "learning_rate": 7.98975511248988e-05, "loss": 0.4705, "step": 359 }, { "epoch": 0.6050420168067226, "grad_norm": 1.0722508523554277, "learning_rate": 7.989416610691219e-05, "loss": 0.4673, "step": 360 }, { "epoch": 0.6067226890756302, "grad_norm": 1.165338703319896, "learning_rate": 7.989072614592867e-05, "loss": 0.4603, "step": 361 }, { "epoch": 0.6084033613445378, "grad_norm": 0.9739671509052005, "learning_rate": 7.98872312466858e-05, "loss": 0.4584, "step": 362 }, { "epoch": 0.6100840336134454, "grad_norm": 0.8198152657322477, "learning_rate": 7.988368141399684e-05, "loss": 0.4624, "step": 363 }, { "epoch": 0.611764705882353, "grad_norm": 0.8223500741047297, "learning_rate": 7.988007665275065e-05, "loss": 0.4621, "step": 364 }, { "epoch": 0.6134453781512605, "grad_norm": 0.7992080694106389, "learning_rate": 7.987641696791182e-05, "loss": 0.465, "step": 365 }, { "epoch": 0.6151260504201681, "grad_norm": 0.8368875480888296, "learning_rate": 7.987270236452052e-05, "loss": 0.4656, "step": 366 }, { "epoch": 0.6168067226890757, "grad_norm": 1.0722729367633659, "learning_rate": 7.986893284769254e-05, "loss": 0.4638, "step": 367 }, { "epoch": 0.6184873949579832, "grad_norm": 0.769691915133592, "learning_rate": 7.986510842261939e-05, "loss": 0.4615, "step": 368 }, { "epoch": 0.6201680672268908, "grad_norm": 0.5144756032724467, "learning_rate": 7.986122909456808e-05, "loss": 0.4673, "step": 369 }, { "epoch": 0.6218487394957983, "grad_norm": 0.7214663948365585, "learning_rate": 7.985729486888132e-05, "loss": 0.4509, "step": 370 }, { "epoch": 0.6235294117647059, "grad_norm": 0.540048622820865, "learning_rate": 7.985330575097742e-05, "loss": 0.4568, "step": 371 }, { "epoch": 0.6252100840336134, "grad_norm": 0.6084211938318337, "learning_rate": 7.984926174635024e-05, "loss": 0.4547, "step": 372 }, { "epoch": 0.626890756302521, "grad_norm": 0.6905108393162471, "learning_rate": 7.984516286056926e-05, "loss": 0.4504, "step": 373 }, { "epoch": 0.6285714285714286, "grad_norm": 0.7035699183224882, "learning_rate": 7.984100909927955e-05, "loss": 0.4638, "step": 374 }, { "epoch": 0.6302521008403361, "grad_norm": 0.7159388707177279, "learning_rate": 7.983680046820176e-05, "loss": 0.4423, "step": 375 }, { "epoch": 0.6319327731092437, "grad_norm": 0.8051021876389326, "learning_rate": 7.983253697313207e-05, "loss": 0.451, "step": 376 }, { "epoch": 0.6336134453781512, "grad_norm": 0.9435925527380927, "learning_rate": 7.982821861994227e-05, "loss": 0.4604, "step": 377 }, { "epoch": 0.6352941176470588, "grad_norm": 0.9686324141407122, "learning_rate": 7.982384541457966e-05, "loss": 0.4561, "step": 378 }, { "epoch": 0.6369747899159663, "grad_norm": 0.8513757428258891, "learning_rate": 7.981941736306714e-05, "loss": 0.45, "step": 379 }, { "epoch": 0.6386554621848739, "grad_norm": 0.7221110043442212, "learning_rate": 7.981493447150303e-05, "loss": 0.4643, "step": 380 }, { "epoch": 0.6403361344537815, "grad_norm": 0.6461792694987035, "learning_rate": 7.981039674606132e-05, "loss": 0.4609, "step": 381 }, { "epoch": 0.6420168067226891, "grad_norm": 0.4409606098260902, "learning_rate": 7.980580419299143e-05, "loss": 0.4499, "step": 382 }, { "epoch": 0.6436974789915967, "grad_norm": 0.4527260222572918, "learning_rate": 7.980115681861832e-05, "loss": 0.4536, "step": 383 }, { "epoch": 0.6453781512605042, "grad_norm": 0.6248201920449988, "learning_rate": 7.979645462934241e-05, "loss": 0.456, "step": 384 }, { "epoch": 0.6470588235294118, "grad_norm": 0.7311832880913077, "learning_rate": 7.979169763163967e-05, "loss": 0.4586, "step": 385 }, { "epoch": 0.6487394957983194, "grad_norm": 0.7889263650307611, "learning_rate": 7.978688583206152e-05, "loss": 0.4589, "step": 386 }, { "epoch": 0.6504201680672269, "grad_norm": 0.9306247772029179, "learning_rate": 7.978201923723487e-05, "loss": 0.4621, "step": 387 }, { "epoch": 0.6521008403361345, "grad_norm": 1.1709416167141913, "learning_rate": 7.977709785386205e-05, "loss": 0.4632, "step": 388 }, { "epoch": 0.653781512605042, "grad_norm": 0.8294424113127506, "learning_rate": 7.977212168872093e-05, "loss": 0.4521, "step": 389 }, { "epoch": 0.6554621848739496, "grad_norm": 0.6345036316869391, "learning_rate": 7.976709074866476e-05, "loss": 0.4541, "step": 390 }, { "epoch": 0.6571428571428571, "grad_norm": 0.6248938595863183, "learning_rate": 7.976200504062226e-05, "loss": 0.4522, "step": 391 }, { "epoch": 0.6588235294117647, "grad_norm": 0.6950185296093198, "learning_rate": 7.975686457159751e-05, "loss": 0.4472, "step": 392 }, { "epoch": 0.6605042016806723, "grad_norm": 0.612689516953478, "learning_rate": 7.975166934867012e-05, "loss": 0.4594, "step": 393 }, { "epoch": 0.6621848739495798, "grad_norm": 0.5709805435766765, "learning_rate": 7.974641937899502e-05, "loss": 0.46, "step": 394 }, { "epoch": 0.6638655462184874, "grad_norm": 0.7615901634664017, "learning_rate": 7.974111466980258e-05, "loss": 0.4583, "step": 395 }, { "epoch": 0.6655462184873949, "grad_norm": 0.6937499768921052, "learning_rate": 7.973575522839853e-05, "loss": 0.4487, "step": 396 }, { "epoch": 0.6672268907563025, "grad_norm": 0.5515788577619807, "learning_rate": 7.973034106216401e-05, "loss": 0.4534, "step": 397 }, { "epoch": 0.66890756302521, "grad_norm": 0.45299136717259414, "learning_rate": 7.972487217855549e-05, "loss": 0.4629, "step": 398 }, { "epoch": 0.6705882352941176, "grad_norm": 0.5750223344282559, "learning_rate": 7.971934858510485e-05, "loss": 0.4391, "step": 399 }, { "epoch": 0.6722689075630253, "grad_norm": 0.6605122645956948, "learning_rate": 7.971377028941926e-05, "loss": 0.4577, "step": 400 }, { "epoch": 0.6739495798319328, "grad_norm": 0.7717489972001313, "learning_rate": 7.970813729918126e-05, "loss": 0.4464, "step": 401 }, { "epoch": 0.6756302521008404, "grad_norm": 1.0566032653945878, "learning_rate": 7.970244962214873e-05, "loss": 0.4551, "step": 402 }, { "epoch": 0.6773109243697479, "grad_norm": 1.2011693783357607, "learning_rate": 7.969670726615483e-05, "loss": 0.4505, "step": 403 }, { "epoch": 0.6789915966386555, "grad_norm": 0.6676236018890173, "learning_rate": 7.969091023910802e-05, "loss": 0.45, "step": 404 }, { "epoch": 0.680672268907563, "grad_norm": 0.36144211381848457, "learning_rate": 7.96850585489921e-05, "loss": 0.4507, "step": 405 }, { "epoch": 0.6823529411764706, "grad_norm": 0.5451852915773836, "learning_rate": 7.967915220386614e-05, "loss": 0.4426, "step": 406 }, { "epoch": 0.6840336134453782, "grad_norm": 0.7823033834724415, "learning_rate": 7.967319121186444e-05, "loss": 0.4471, "step": 407 }, { "epoch": 0.6857142857142857, "grad_norm": 0.9050520800450582, "learning_rate": 7.966717558119662e-05, "loss": 0.4525, "step": 408 }, { "epoch": 0.6873949579831933, "grad_norm": 0.9210359548429012, "learning_rate": 7.96611053201475e-05, "loss": 0.4514, "step": 409 }, { "epoch": 0.6890756302521008, "grad_norm": 0.7296176648268952, "learning_rate": 7.965498043707715e-05, "loss": 0.4481, "step": 410 }, { "epoch": 0.6907563025210084, "grad_norm": 0.6892006981377596, "learning_rate": 7.96488009404209e-05, "loss": 0.4524, "step": 411 }, { "epoch": 0.692436974789916, "grad_norm": 0.6519965285849953, "learning_rate": 7.964256683868923e-05, "loss": 0.4521, "step": 412 }, { "epoch": 0.6941176470588235, "grad_norm": 0.6234971284120305, "learning_rate": 7.963627814046791e-05, "loss": 0.4547, "step": 413 }, { "epoch": 0.6957983193277311, "grad_norm": 0.7692393933826233, "learning_rate": 7.962993485441783e-05, "loss": 0.446, "step": 414 }, { "epoch": 0.6974789915966386, "grad_norm": 0.6768144931594383, "learning_rate": 7.962353698927507e-05, "loss": 0.4671, "step": 415 }, { "epoch": 0.6991596638655462, "grad_norm": 0.6508335721896449, "learning_rate": 7.961708455385093e-05, "loss": 0.4518, "step": 416 }, { "epoch": 0.7008403361344537, "grad_norm": 0.6540828403334353, "learning_rate": 7.961057755703177e-05, "loss": 0.453, "step": 417 }, { "epoch": 0.7025210084033613, "grad_norm": 0.5924607971807357, "learning_rate": 7.96040160077792e-05, "loss": 0.4519, "step": 418 }, { "epoch": 0.704201680672269, "grad_norm": 0.5370192468171079, "learning_rate": 7.959739991512989e-05, "loss": 0.4444, "step": 419 }, { "epoch": 0.7058823529411765, "grad_norm": 0.41397075489020796, "learning_rate": 7.959072928819563e-05, "loss": 0.444, "step": 420 }, { "epoch": 0.7075630252100841, "grad_norm": 0.5706253639996917, "learning_rate": 7.958400413616335e-05, "loss": 0.457, "step": 421 }, { "epoch": 0.7092436974789916, "grad_norm": 0.7745752770176129, "learning_rate": 7.957722446829507e-05, "loss": 0.4498, "step": 422 }, { "epoch": 0.7109243697478992, "grad_norm": 0.9025882711665463, "learning_rate": 7.957039029392784e-05, "loss": 0.4417, "step": 423 }, { "epoch": 0.7126050420168067, "grad_norm": 0.9495295400812924, "learning_rate": 7.956350162247385e-05, "loss": 0.4506, "step": 424 }, { "epoch": 0.7142857142857143, "grad_norm": 1.0639221585851708, "learning_rate": 7.955655846342028e-05, "loss": 0.4547, "step": 425 }, { "epoch": 0.7159663865546219, "grad_norm": 1.1452088788356567, "learning_rate": 7.954956082632941e-05, "loss": 0.4542, "step": 426 }, { "epoch": 0.7176470588235294, "grad_norm": 0.815789189960643, "learning_rate": 7.954250872083849e-05, "loss": 0.4568, "step": 427 }, { "epoch": 0.719327731092437, "grad_norm": 0.7895513190486858, "learning_rate": 7.953540215665982e-05, "loss": 0.4451, "step": 428 }, { "epoch": 0.7210084033613445, "grad_norm": 0.8952387143322712, "learning_rate": 7.95282411435807e-05, "loss": 0.4506, "step": 429 }, { "epoch": 0.7226890756302521, "grad_norm": 0.9185831161654213, "learning_rate": 7.95210256914634e-05, "loss": 0.4464, "step": 430 }, { "epoch": 0.7243697478991596, "grad_norm": 1.0241047840590825, "learning_rate": 7.951375581024518e-05, "loss": 0.4448, "step": 431 }, { "epoch": 0.7260504201680672, "grad_norm": 1.1534204015735594, "learning_rate": 7.950643150993828e-05, "loss": 0.4471, "step": 432 }, { "epoch": 0.7277310924369748, "grad_norm": 0.6694585478212992, "learning_rate": 7.949905280062984e-05, "loss": 0.4478, "step": 433 }, { "epoch": 0.7294117647058823, "grad_norm": 0.641166817704749, "learning_rate": 7.949161969248196e-05, "loss": 0.4512, "step": 434 }, { "epoch": 0.7310924369747899, "grad_norm": 0.9556017264973636, "learning_rate": 7.948413219573166e-05, "loss": 0.4544, "step": 435 }, { "epoch": 0.7327731092436974, "grad_norm": 0.9367699270537879, "learning_rate": 7.947659032069086e-05, "loss": 0.45, "step": 436 }, { "epoch": 0.7344537815126051, "grad_norm": 0.7189411904894562, "learning_rate": 7.946899407774638e-05, "loss": 0.4457, "step": 437 }, { "epoch": 0.7361344537815127, "grad_norm": 0.5079556163550458, "learning_rate": 7.94613434773599e-05, "loss": 0.4461, "step": 438 }, { "epoch": 0.7378151260504202, "grad_norm": 0.5350790372045231, "learning_rate": 7.945363853006797e-05, "loss": 0.4459, "step": 439 }, { "epoch": 0.7394957983193278, "grad_norm": 0.7937043535174361, "learning_rate": 7.9445879246482e-05, "loss": 0.4476, "step": 440 }, { "epoch": 0.7411764705882353, "grad_norm": 0.7089826343191844, "learning_rate": 7.943806563728821e-05, "loss": 0.4421, "step": 441 }, { "epoch": 0.7428571428571429, "grad_norm": 0.5643272481971402, "learning_rate": 7.943019771324766e-05, "loss": 0.4474, "step": 442 }, { "epoch": 0.7445378151260504, "grad_norm": 0.8201123180282318, "learning_rate": 7.942227548519619e-05, "loss": 0.4539, "step": 443 }, { "epoch": 0.746218487394958, "grad_norm": 0.9211146862283482, "learning_rate": 7.941429896404443e-05, "loss": 0.444, "step": 444 }, { "epoch": 0.7478991596638656, "grad_norm": 0.8033199026817421, "learning_rate": 7.940626816077785e-05, "loss": 0.4441, "step": 445 }, { "epoch": 0.7495798319327731, "grad_norm": 0.8538516782748172, "learning_rate": 7.939818308645657e-05, "loss": 0.451, "step": 446 }, { "epoch": 0.7512605042016807, "grad_norm": 0.8214199817639242, "learning_rate": 7.939004375221552e-05, "loss": 0.4576, "step": 447 }, { "epoch": 0.7529411764705882, "grad_norm": 0.8002859443198643, "learning_rate": 7.938185016926435e-05, "loss": 0.4493, "step": 448 }, { "epoch": 0.7546218487394958, "grad_norm": 0.7901328079144773, "learning_rate": 7.937360234888742e-05, "loss": 0.4563, "step": 449 }, { "epoch": 0.7563025210084033, "grad_norm": 0.8247858856142821, "learning_rate": 7.936530030244378e-05, "loss": 0.4491, "step": 450 }, { "epoch": 0.7579831932773109, "grad_norm": 0.8145021868578016, "learning_rate": 7.935694404136715e-05, "loss": 0.4357, "step": 451 }, { "epoch": 0.7596638655462185, "grad_norm": 0.6323708676194418, "learning_rate": 7.934853357716596e-05, "loss": 0.4514, "step": 452 }, { "epoch": 0.761344537815126, "grad_norm": 0.3478121664741467, "learning_rate": 7.934006892142324e-05, "loss": 0.4473, "step": 453 }, { "epoch": 0.7630252100840336, "grad_norm": 0.34492924227414284, "learning_rate": 7.933155008579667e-05, "loss": 0.4432, "step": 454 }, { "epoch": 0.7647058823529411, "grad_norm": 0.42254055957457604, "learning_rate": 7.932297708201855e-05, "loss": 0.448, "step": 455 }, { "epoch": 0.7663865546218488, "grad_norm": 0.5206759716360051, "learning_rate": 7.93143499218958e-05, "loss": 0.4485, "step": 456 }, { "epoch": 0.7680672268907563, "grad_norm": 0.6251959141405861, "learning_rate": 7.93056686173099e-05, "loss": 0.4474, "step": 457 }, { "epoch": 0.7697478991596639, "grad_norm": 0.5498358548985528, "learning_rate": 7.92969331802169e-05, "loss": 0.4526, "step": 458 }, { "epoch": 0.7714285714285715, "grad_norm": 0.5953848985064495, "learning_rate": 7.928814362264742e-05, "loss": 0.4312, "step": 459 }, { "epoch": 0.773109243697479, "grad_norm": 0.7220427052423753, "learning_rate": 7.927929995670658e-05, "loss": 0.442, "step": 460 }, { "epoch": 0.7747899159663866, "grad_norm": 0.7274977660965468, "learning_rate": 7.927040219457406e-05, "loss": 0.4445, "step": 461 }, { "epoch": 0.7764705882352941, "grad_norm": 0.6636805095251171, "learning_rate": 7.926145034850404e-05, "loss": 0.4468, "step": 462 }, { "epoch": 0.7781512605042017, "grad_norm": 0.6300189998114643, "learning_rate": 7.925244443082514e-05, "loss": 0.4487, "step": 463 }, { "epoch": 0.7798319327731092, "grad_norm": 0.7111509859598584, "learning_rate": 7.92433844539405e-05, "loss": 0.4423, "step": 464 }, { "epoch": 0.7815126050420168, "grad_norm": 0.7629616183644943, "learning_rate": 7.923427043032769e-05, "loss": 0.454, "step": 465 }, { "epoch": 0.7831932773109244, "grad_norm": 0.7760134347541973, "learning_rate": 7.922510237253871e-05, "loss": 0.4581, "step": 466 }, { "epoch": 0.7848739495798319, "grad_norm": 0.8151488025970143, "learning_rate": 7.921588029319998e-05, "loss": 0.454, "step": 467 }, { "epoch": 0.7865546218487395, "grad_norm": 0.8170121782410553, "learning_rate": 7.920660420501233e-05, "loss": 0.447, "step": 468 }, { "epoch": 0.788235294117647, "grad_norm": 0.7789223104136569, "learning_rate": 7.919727412075094e-05, "loss": 0.4447, "step": 469 }, { "epoch": 0.7899159663865546, "grad_norm": 0.7015384168955207, "learning_rate": 7.918789005326542e-05, "loss": 0.4347, "step": 470 }, { "epoch": 0.7915966386554621, "grad_norm": 0.548753658948843, "learning_rate": 7.917845201547965e-05, "loss": 0.4431, "step": 471 }, { "epoch": 0.7932773109243697, "grad_norm": 0.4022347833617108, "learning_rate": 7.916896002039188e-05, "loss": 0.4399, "step": 472 }, { "epoch": 0.7949579831932773, "grad_norm": 0.431663205067307, "learning_rate": 7.915941408107466e-05, "loss": 0.4392, "step": 473 }, { "epoch": 0.7966386554621848, "grad_norm": 0.4681389481898292, "learning_rate": 7.914981421067486e-05, "loss": 0.4447, "step": 474 }, { "epoch": 0.7983193277310925, "grad_norm": 0.38861730722218124, "learning_rate": 7.914016042241358e-05, "loss": 0.4432, "step": 475 }, { "epoch": 0.8, "grad_norm": 0.43209688569954874, "learning_rate": 7.91304527295862e-05, "loss": 0.4438, "step": 476 }, { "epoch": 0.8016806722689076, "grad_norm": 0.5972655774394134, "learning_rate": 7.912069114556236e-05, "loss": 0.4484, "step": 477 }, { "epoch": 0.8033613445378152, "grad_norm": 0.7488957476648936, "learning_rate": 7.911087568378588e-05, "loss": 0.4509, "step": 478 }, { "epoch": 0.8050420168067227, "grad_norm": 0.8134558845881897, "learning_rate": 7.910100635777478e-05, "loss": 0.4502, "step": 479 }, { "epoch": 0.8067226890756303, "grad_norm": 0.8244108427448561, "learning_rate": 7.909108318112131e-05, "loss": 0.4554, "step": 480 }, { "epoch": 0.8084033613445378, "grad_norm": 0.8490429919572731, "learning_rate": 7.908110616749186e-05, "loss": 0.4386, "step": 481 }, { "epoch": 0.8100840336134454, "grad_norm": 0.7864928700539783, "learning_rate": 7.907107533062694e-05, "loss": 0.4521, "step": 482 }, { "epoch": 0.8117647058823529, "grad_norm": 0.6368024967486482, "learning_rate": 7.906099068434121e-05, "loss": 0.4411, "step": 483 }, { "epoch": 0.8134453781512605, "grad_norm": 0.5667421682349703, "learning_rate": 7.905085224252346e-05, "loss": 0.4506, "step": 484 }, { "epoch": 0.8151260504201681, "grad_norm": 0.6436907263891435, "learning_rate": 7.90406600191365e-05, "loss": 0.4365, "step": 485 }, { "epoch": 0.8168067226890756, "grad_norm": 0.7601175747439523, "learning_rate": 7.903041402821729e-05, "loss": 0.4502, "step": 486 }, { "epoch": 0.8184873949579832, "grad_norm": 0.8491846994130389, "learning_rate": 7.902011428387678e-05, "loss": 0.4475, "step": 487 }, { "epoch": 0.8201680672268907, "grad_norm": 0.8602777331269315, "learning_rate": 7.900976080029997e-05, "loss": 0.4369, "step": 488 }, { "epoch": 0.8218487394957983, "grad_norm": 0.7661368911241271, "learning_rate": 7.899935359174591e-05, "loss": 0.437, "step": 489 }, { "epoch": 0.8235294117647058, "grad_norm": 0.7266390867054168, "learning_rate": 7.898889267254754e-05, "loss": 0.4453, "step": 490 }, { "epoch": 0.8252100840336134, "grad_norm": 0.7775636971055834, "learning_rate": 7.89783780571119e-05, "loss": 0.4448, "step": 491 }, { "epoch": 0.826890756302521, "grad_norm": 0.808831750013845, "learning_rate": 7.896780975991988e-05, "loss": 0.439, "step": 492 }, { "epoch": 0.8285714285714286, "grad_norm": 0.6974849153551852, "learning_rate": 7.895718779552633e-05, "loss": 0.4353, "step": 493 }, { "epoch": 0.8302521008403362, "grad_norm": 0.5223106099608984, "learning_rate": 7.894651217856005e-05, "loss": 0.427, "step": 494 }, { "epoch": 0.8319327731092437, "grad_norm": 0.43738131110444095, "learning_rate": 7.893578292372367e-05, "loss": 0.4377, "step": 495 }, { "epoch": 0.8336134453781513, "grad_norm": 0.4149419516376177, "learning_rate": 7.892500004579375e-05, "loss": 0.4414, "step": 496 }, { "epoch": 0.8352941176470589, "grad_norm": 0.4746703378288258, "learning_rate": 7.891416355962065e-05, "loss": 0.444, "step": 497 }, { "epoch": 0.8369747899159664, "grad_norm": 0.49454437632123427, "learning_rate": 7.890327348012861e-05, "loss": 0.4469, "step": 498 }, { "epoch": 0.838655462184874, "grad_norm": 0.5822582490782191, "learning_rate": 7.889232982231562e-05, "loss": 0.436, "step": 499 }, { "epoch": 0.8403361344537815, "grad_norm": 0.6993925543448798, "learning_rate": 7.888133260125351e-05, "loss": 0.4463, "step": 500 }, { "epoch": 0.8420168067226891, "grad_norm": 0.7882169647881817, "learning_rate": 7.887028183208785e-05, "loss": 0.4503, "step": 501 }, { "epoch": 0.8436974789915966, "grad_norm": 0.8801320812820647, "learning_rate": 7.885917753003798e-05, "loss": 0.4566, "step": 502 }, { "epoch": 0.8453781512605042, "grad_norm": 0.9864984920799575, "learning_rate": 7.884801971039694e-05, "loss": 0.4495, "step": 503 }, { "epoch": 0.8470588235294118, "grad_norm": 0.9966965122474297, "learning_rate": 7.883680838853152e-05, "loss": 0.4463, "step": 504 }, { "epoch": 0.8487394957983193, "grad_norm": 1.010073147717239, "learning_rate": 7.882554357988212e-05, "loss": 0.4432, "step": 505 }, { "epoch": 0.8504201680672269, "grad_norm": 0.8442280068334713, "learning_rate": 7.881422529996286e-05, "loss": 0.4445, "step": 506 }, { "epoch": 0.8521008403361344, "grad_norm": 0.5601406712216319, "learning_rate": 7.880285356436151e-05, "loss": 0.4442, "step": 507 }, { "epoch": 0.853781512605042, "grad_norm": 0.37551404692690604, "learning_rate": 7.879142838873944e-05, "loss": 0.4401, "step": 508 }, { "epoch": 0.8554621848739495, "grad_norm": 0.5452668698035379, "learning_rate": 7.877994978883158e-05, "loss": 0.4343, "step": 509 }, { "epoch": 0.8571428571428571, "grad_norm": 0.6280456144515015, "learning_rate": 7.876841778044648e-05, "loss": 0.447, "step": 510 }, { "epoch": 0.8588235294117647, "grad_norm": 0.5937042139650256, "learning_rate": 7.875683237946627e-05, "loss": 0.439, "step": 511 }, { "epoch": 0.8605042016806723, "grad_norm": 0.5950465015425337, "learning_rate": 7.874519360184657e-05, "loss": 0.4485, "step": 512 }, { "epoch": 0.8621848739495799, "grad_norm": 0.563952087934072, "learning_rate": 7.873350146361653e-05, "loss": 0.452, "step": 513 }, { "epoch": 0.8638655462184874, "grad_norm": 0.4634454480885586, "learning_rate": 7.872175598087875e-05, "loss": 0.4353, "step": 514 }, { "epoch": 0.865546218487395, "grad_norm": 0.56008240778512, "learning_rate": 7.870995716980938e-05, "loss": 0.4335, "step": 515 }, { "epoch": 0.8672268907563025, "grad_norm": 0.6201598455756608, "learning_rate": 7.869810504665792e-05, "loss": 0.4417, "step": 516 }, { "epoch": 0.8689075630252101, "grad_norm": 0.5011838534342936, "learning_rate": 7.868619962774736e-05, "loss": 0.4449, "step": 517 }, { "epoch": 0.8705882352941177, "grad_norm": 0.452396087344912, "learning_rate": 7.867424092947404e-05, "loss": 0.4435, "step": 518 }, { "epoch": 0.8722689075630252, "grad_norm": 0.5419828234324432, "learning_rate": 7.866222896830776e-05, "loss": 0.4339, "step": 519 }, { "epoch": 0.8739495798319328, "grad_norm": 0.5576024317447995, "learning_rate": 7.865016376079158e-05, "loss": 0.4505, "step": 520 }, { "epoch": 0.8756302521008403, "grad_norm": 0.5886310690262299, "learning_rate": 7.863804532354193e-05, "loss": 0.4456, "step": 521 }, { "epoch": 0.8773109243697479, "grad_norm": 0.6843205638411988, "learning_rate": 7.862587367324855e-05, "loss": 0.4379, "step": 522 }, { "epoch": 0.8789915966386554, "grad_norm": 0.8611731336294365, "learning_rate": 7.861364882667449e-05, "loss": 0.448, "step": 523 }, { "epoch": 0.880672268907563, "grad_norm": 0.9850065657349637, "learning_rate": 7.860137080065601e-05, "loss": 0.451, "step": 524 }, { "epoch": 0.8823529411764706, "grad_norm": 0.9680701579620873, "learning_rate": 7.858903961210265e-05, "loss": 0.4528, "step": 525 }, { "epoch": 0.8840336134453781, "grad_norm": 0.7212409153805136, "learning_rate": 7.857665527799716e-05, "loss": 0.4402, "step": 526 }, { "epoch": 0.8857142857142857, "grad_norm": 0.6749426775422702, "learning_rate": 7.856421781539548e-05, "loss": 0.4374, "step": 527 }, { "epoch": 0.8873949579831932, "grad_norm": 0.8778888808363016, "learning_rate": 7.855172724142672e-05, "loss": 0.441, "step": 528 }, { "epoch": 0.8890756302521008, "grad_norm": 0.9302188890753615, "learning_rate": 7.85391835732931e-05, "loss": 0.4365, "step": 529 }, { "epoch": 0.8907563025210085, "grad_norm": 0.878301992004877, "learning_rate": 7.852658682827005e-05, "loss": 0.4528, "step": 530 }, { "epoch": 0.892436974789916, "grad_norm": 0.7182803896824043, "learning_rate": 7.8513937023706e-05, "loss": 0.4387, "step": 531 }, { "epoch": 0.8941176470588236, "grad_norm": 0.5074361148697082, "learning_rate": 7.85012341770225e-05, "loss": 0.4229, "step": 532 }, { "epoch": 0.8957983193277311, "grad_norm": 0.40441254299211044, "learning_rate": 7.848847830571419e-05, "loss": 0.4474, "step": 533 }, { "epoch": 0.8974789915966387, "grad_norm": 0.473596636222914, "learning_rate": 7.847566942734866e-05, "loss": 0.4457, "step": 534 }, { "epoch": 0.8991596638655462, "grad_norm": 0.4534411526167765, "learning_rate": 7.846280755956655e-05, "loss": 0.4415, "step": 535 }, { "epoch": 0.9008403361344538, "grad_norm": 0.3954173268545826, "learning_rate": 7.844989272008146e-05, "loss": 0.4341, "step": 536 }, { "epoch": 0.9025210084033614, "grad_norm": 0.38312162245178033, "learning_rate": 7.843692492667997e-05, "loss": 0.4412, "step": 537 }, { "epoch": 0.9042016806722689, "grad_norm": 0.43752959225212684, "learning_rate": 7.842390419722155e-05, "loss": 0.4416, "step": 538 }, { "epoch": 0.9058823529411765, "grad_norm": 0.40581665284861357, "learning_rate": 7.84108305496386e-05, "loss": 0.4317, "step": 539 }, { "epoch": 0.907563025210084, "grad_norm": 0.44147325607309706, "learning_rate": 7.839770400193639e-05, "loss": 0.4437, "step": 540 }, { "epoch": 0.9092436974789916, "grad_norm": 0.5305804045906902, "learning_rate": 7.838452457219306e-05, "loss": 0.4373, "step": 541 }, { "epoch": 0.9109243697478991, "grad_norm": 0.6351717215667315, "learning_rate": 7.837129227855955e-05, "loss": 0.4451, "step": 542 }, { "epoch": 0.9126050420168067, "grad_norm": 0.7295926915221591, "learning_rate": 7.835800713925964e-05, "loss": 0.4338, "step": 543 }, { "epoch": 0.9142857142857143, "grad_norm": 0.823485834441508, "learning_rate": 7.834466917258987e-05, "loss": 0.441, "step": 544 }, { "epoch": 0.9159663865546218, "grad_norm": 0.9923769178095391, "learning_rate": 7.833127839691954e-05, "loss": 0.4396, "step": 545 }, { "epoch": 0.9176470588235294, "grad_norm": 1.1291743400180596, "learning_rate": 7.831783483069066e-05, "loss": 0.4423, "step": 546 }, { "epoch": 0.9193277310924369, "grad_norm": 0.7211018685434838, "learning_rate": 7.8304338492418e-05, "loss": 0.4361, "step": 547 }, { "epoch": 0.9210084033613445, "grad_norm": 0.5009635924935609, "learning_rate": 7.829078940068894e-05, "loss": 0.4354, "step": 548 }, { "epoch": 0.9226890756302522, "grad_norm": 0.38505373447223407, "learning_rate": 7.827718757416355e-05, "loss": 0.4423, "step": 549 }, { "epoch": 0.9243697478991597, "grad_norm": 0.40889576368516306, "learning_rate": 7.826353303157455e-05, "loss": 0.4412, "step": 550 }, { "epoch": 0.9260504201680673, "grad_norm": 0.5501795868022142, "learning_rate": 7.824982579172718e-05, "loss": 0.4443, "step": 551 }, { "epoch": 0.9277310924369748, "grad_norm": 0.5985180412405634, "learning_rate": 7.823606587349935e-05, "loss": 0.4406, "step": 552 }, { "epoch": 0.9294117647058824, "grad_norm": 0.5944442597543493, "learning_rate": 7.822225329584147e-05, "loss": 0.4412, "step": 553 }, { "epoch": 0.9310924369747899, "grad_norm": 0.6202269068527828, "learning_rate": 7.820838807777646e-05, "loss": 0.4516, "step": 554 }, { "epoch": 0.9327731092436975, "grad_norm": 0.6186625860810193, "learning_rate": 7.81944702383998e-05, "loss": 0.4413, "step": 555 }, { "epoch": 0.934453781512605, "grad_norm": 0.6324171655119329, "learning_rate": 7.818049979687937e-05, "loss": 0.4466, "step": 556 }, { "epoch": 0.9361344537815126, "grad_norm": 0.6167781295659638, "learning_rate": 7.816647677245554e-05, "loss": 0.4471, "step": 557 }, { "epoch": 0.9378151260504202, "grad_norm": 0.5054888553258667, "learning_rate": 7.815240118444108e-05, "loss": 0.4369, "step": 558 }, { "epoch": 0.9394957983193277, "grad_norm": 0.4589749381422496, "learning_rate": 7.813827305222114e-05, "loss": 0.4429, "step": 559 }, { "epoch": 0.9411764705882353, "grad_norm": 0.4697311415735902, "learning_rate": 7.812409239525326e-05, "loss": 0.4399, "step": 560 }, { "epoch": 0.9428571428571428, "grad_norm": 0.6586400649638215, "learning_rate": 7.810985923306731e-05, "loss": 0.4526, "step": 561 }, { "epoch": 0.9445378151260504, "grad_norm": 0.9362645413018801, "learning_rate": 7.809557358526547e-05, "loss": 0.4368, "step": 562 }, { "epoch": 0.946218487394958, "grad_norm": 1.1040947666313237, "learning_rate": 7.808123547152222e-05, "loss": 0.4463, "step": 563 }, { "epoch": 0.9478991596638655, "grad_norm": 0.7157634328457568, "learning_rate": 7.806684491158425e-05, "loss": 0.4381, "step": 564 }, { "epoch": 0.9495798319327731, "grad_norm": 0.39714006242719824, "learning_rate": 7.805240192527052e-05, "loss": 0.4451, "step": 565 }, { "epoch": 0.9512605042016806, "grad_norm": 0.433919682117276, "learning_rate": 7.803790653247219e-05, "loss": 0.4338, "step": 566 }, { "epoch": 0.9529411764705882, "grad_norm": 0.680376399211333, "learning_rate": 7.802335875315259e-05, "loss": 0.4374, "step": 567 }, { "epoch": 0.9546218487394958, "grad_norm": 0.8132548394719255, "learning_rate": 7.800875860734718e-05, "loss": 0.4385, "step": 568 }, { "epoch": 0.9563025210084034, "grad_norm": 0.8501548565081195, "learning_rate": 7.799410611516356e-05, "loss": 0.4356, "step": 569 }, { "epoch": 0.957983193277311, "grad_norm": 0.7801838929729491, "learning_rate": 7.797940129678142e-05, "loss": 0.4376, "step": 570 }, { "epoch": 0.9596638655462185, "grad_norm": 0.667230669746139, "learning_rate": 7.796464417245252e-05, "loss": 0.4443, "step": 571 }, { "epoch": 0.9613445378151261, "grad_norm": 0.5348854624054132, "learning_rate": 7.794983476250065e-05, "loss": 0.4439, "step": 572 }, { "epoch": 0.9630252100840336, "grad_norm": 0.41247857085577333, "learning_rate": 7.79349730873216e-05, "loss": 0.4428, "step": 573 }, { "epoch": 0.9647058823529412, "grad_norm": 0.5330022301479793, "learning_rate": 7.792005916738316e-05, "loss": 0.4386, "step": 574 }, { "epoch": 0.9663865546218487, "grad_norm": 0.7577692937568961, "learning_rate": 7.790509302322503e-05, "loss": 0.443, "step": 575 }, { "epoch": 0.9680672268907563, "grad_norm": 0.8438648251376532, "learning_rate": 7.789007467545892e-05, "loss": 0.435, "step": 576 }, { "epoch": 0.9697478991596639, "grad_norm": 0.7548220822875487, "learning_rate": 7.787500414476833e-05, "loss": 0.4488, "step": 577 }, { "epoch": 0.9714285714285714, "grad_norm": 0.6496211212295644, "learning_rate": 7.785988145190871e-05, "loss": 0.4455, "step": 578 }, { "epoch": 0.973109243697479, "grad_norm": 0.6941465776553114, "learning_rate": 7.784470661770731e-05, "loss": 0.4394, "step": 579 }, { "epoch": 0.9747899159663865, "grad_norm": 0.6463770034662822, "learning_rate": 7.782947966306319e-05, "loss": 0.4429, "step": 580 }, { "epoch": 0.9764705882352941, "grad_norm": 0.5261709281303069, "learning_rate": 7.78142006089472e-05, "loss": 0.4395, "step": 581 }, { "epoch": 0.9781512605042016, "grad_norm": 0.4382790485123969, "learning_rate": 7.779886947640194e-05, "loss": 0.437, "step": 582 }, { "epoch": 0.9798319327731092, "grad_norm": 0.41274048135169494, "learning_rate": 7.778348628654175e-05, "loss": 0.4453, "step": 583 }, { "epoch": 0.9815126050420168, "grad_norm": 0.403227529524687, "learning_rate": 7.776805106055264e-05, "loss": 0.4426, "step": 584 }, { "epoch": 0.9831932773109243, "grad_norm": 0.4398457621677066, "learning_rate": 7.775256381969229e-05, "loss": 0.4342, "step": 585 }, { "epoch": 0.984873949579832, "grad_norm": 0.49874211945637087, "learning_rate": 7.773702458529001e-05, "loss": 0.4385, "step": 586 }, { "epoch": 0.9865546218487395, "grad_norm": 0.6906462330674743, "learning_rate": 7.772143337874674e-05, "loss": 0.4483, "step": 587 }, { "epoch": 0.9882352941176471, "grad_norm": 0.8075868548739334, "learning_rate": 7.770579022153499e-05, "loss": 0.4325, "step": 588 }, { "epoch": 0.9899159663865547, "grad_norm": 0.8339712379418146, "learning_rate": 7.76900951351988e-05, "loss": 0.4421, "step": 589 }, { "epoch": 0.9915966386554622, "grad_norm": 0.6807710772986726, "learning_rate": 7.767434814135373e-05, "loss": 0.4313, "step": 590 }, { "epoch": 0.9932773109243698, "grad_norm": 0.4409027137542364, "learning_rate": 7.765854926168685e-05, "loss": 0.4256, "step": 591 }, { "epoch": 0.9949579831932773, "grad_norm": 0.48244582084174703, "learning_rate": 7.764269851795668e-05, "loss": 0.438, "step": 592 }, { "epoch": 0.9966386554621849, "grad_norm": 0.6978088895411947, "learning_rate": 7.762679593199314e-05, "loss": 0.4377, "step": 593 }, { "epoch": 0.9983193277310924, "grad_norm": 0.6355208689444832, "learning_rate": 7.76108415256976e-05, "loss": 0.432, "step": 594 }, { "epoch": 1.0, "grad_norm": 0.46687936241554334, "learning_rate": 7.759483532104275e-05, "loss": 0.4455, "step": 595 }, { "epoch": 1.0016806722689076, "grad_norm": 0.48788474274535754, "learning_rate": 7.757877734007264e-05, "loss": 0.417, "step": 596 }, { "epoch": 1.0033613445378151, "grad_norm": 0.43687729559324223, "learning_rate": 7.756266760490262e-05, "loss": 0.4149, "step": 597 }, { "epoch": 1.0050420168067227, "grad_norm": 0.5070844880906, "learning_rate": 7.754650613771933e-05, "loss": 0.4139, "step": 598 }, { "epoch": 1.0067226890756302, "grad_norm": 0.610651218405719, "learning_rate": 7.753029296078063e-05, "loss": 0.4221, "step": 599 }, { "epoch": 1.0084033613445378, "grad_norm": 0.7549198748336643, "learning_rate": 7.751402809641563e-05, "loss": 0.4208, "step": 600 }, { "epoch": 1.0100840336134453, "grad_norm": 0.8265269075130188, "learning_rate": 7.749771156702463e-05, "loss": 0.4179, "step": 601 }, { "epoch": 1.011764705882353, "grad_norm": 0.7647572686080818, "learning_rate": 7.7481343395079e-05, "loss": 0.4166, "step": 602 }, { "epoch": 1.0134453781512605, "grad_norm": 0.6824354931340557, "learning_rate": 7.746492360312134e-05, "loss": 0.4192, "step": 603 }, { "epoch": 1.015126050420168, "grad_norm": 0.6343559789161084, "learning_rate": 7.744845221376528e-05, "loss": 0.4129, "step": 604 }, { "epoch": 1.0168067226890756, "grad_norm": 0.6183864414027355, "learning_rate": 7.743192924969553e-05, "loss": 0.416, "step": 605 }, { "epoch": 1.0184873949579831, "grad_norm": 0.5616199902208074, "learning_rate": 7.741535473366783e-05, "loss": 0.4298, "step": 606 }, { "epoch": 1.0201680672268907, "grad_norm": 0.6103302197197726, "learning_rate": 7.739872868850893e-05, "loss": 0.4142, "step": 607 }, { "epoch": 1.0218487394957982, "grad_norm": 0.6421422744076934, "learning_rate": 7.73820511371165e-05, "loss": 0.4173, "step": 608 }, { "epoch": 1.0235294117647058, "grad_norm": 0.6588501788886473, "learning_rate": 7.736532210245921e-05, "loss": 0.4197, "step": 609 }, { "epoch": 1.0252100840336134, "grad_norm": 0.578591461827814, "learning_rate": 7.734854160757658e-05, "loss": 0.4286, "step": 610 }, { "epoch": 1.026890756302521, "grad_norm": 0.5236889356021464, "learning_rate": 7.733170967557904e-05, "loss": 0.4318, "step": 611 }, { "epoch": 1.0285714285714285, "grad_norm": 0.6988074336206159, "learning_rate": 7.731482632964783e-05, "loss": 0.4195, "step": 612 }, { "epoch": 1.030252100840336, "grad_norm": 0.6880510961063336, "learning_rate": 7.729789159303501e-05, "loss": 0.4132, "step": 613 }, { "epoch": 1.0319327731092436, "grad_norm": 0.6546853481469133, "learning_rate": 7.728090548906343e-05, "loss": 0.417, "step": 614 }, { "epoch": 1.0336134453781514, "grad_norm": 0.5691041982940833, "learning_rate": 7.726386804112667e-05, "loss": 0.4093, "step": 615 }, { "epoch": 1.035294117647059, "grad_norm": 0.3740675746099188, "learning_rate": 7.724677927268902e-05, "loss": 0.4237, "step": 616 }, { "epoch": 1.0369747899159665, "grad_norm": 0.46625352463025527, "learning_rate": 7.722963920728545e-05, "loss": 0.4216, "step": 617 }, { "epoch": 1.038655462184874, "grad_norm": 0.5265148534675278, "learning_rate": 7.72124478685216e-05, "loss": 0.4151, "step": 618 }, { "epoch": 1.0403361344537816, "grad_norm": 0.5336661615823718, "learning_rate": 7.719520528007367e-05, "loss": 0.4197, "step": 619 }, { "epoch": 1.0420168067226891, "grad_norm": 0.5947202406120358, "learning_rate": 7.717791146568852e-05, "loss": 0.4112, "step": 620 }, { "epoch": 1.0436974789915967, "grad_norm": 0.6497342544338861, "learning_rate": 7.716056644918348e-05, "loss": 0.4194, "step": 621 }, { "epoch": 1.0453781512605043, "grad_norm": 0.48433129300861383, "learning_rate": 7.714317025444645e-05, "loss": 0.4203, "step": 622 }, { "epoch": 1.0470588235294118, "grad_norm": 0.37240199856198514, "learning_rate": 7.712572290543582e-05, "loss": 0.4164, "step": 623 }, { "epoch": 1.0487394957983194, "grad_norm": 0.39345294621933147, "learning_rate": 7.710822442618037e-05, "loss": 0.4177, "step": 624 }, { "epoch": 1.050420168067227, "grad_norm": 0.3670385023465729, "learning_rate": 7.709067484077935e-05, "loss": 0.4173, "step": 625 }, { "epoch": 1.0521008403361345, "grad_norm": 0.4384922884745366, "learning_rate": 7.707307417340236e-05, "loss": 0.4275, "step": 626 }, { "epoch": 1.053781512605042, "grad_norm": 0.43562272580564454, "learning_rate": 7.705542244828942e-05, "loss": 0.4179, "step": 627 }, { "epoch": 1.0554621848739496, "grad_norm": 0.4065421199985246, "learning_rate": 7.703771968975078e-05, "loss": 0.4085, "step": 628 }, { "epoch": 1.0571428571428572, "grad_norm": 0.4214907607963905, "learning_rate": 7.7019965922167e-05, "loss": 0.4098, "step": 629 }, { "epoch": 1.0588235294117647, "grad_norm": 0.5098913903222594, "learning_rate": 7.700216116998894e-05, "loss": 0.4188, "step": 630 }, { "epoch": 1.0605042016806723, "grad_norm": 0.6131926682979792, "learning_rate": 7.698430545773763e-05, "loss": 0.4148, "step": 631 }, { "epoch": 1.0621848739495798, "grad_norm": 0.8360475885983109, "learning_rate": 7.696639881000428e-05, "loss": 0.4178, "step": 632 }, { "epoch": 1.0638655462184874, "grad_norm": 1.0630690254526622, "learning_rate": 7.694844125145028e-05, "loss": 0.4161, "step": 633 }, { "epoch": 1.065546218487395, "grad_norm": 0.9383568269842568, "learning_rate": 7.693043280680709e-05, "loss": 0.4195, "step": 634 }, { "epoch": 1.0672268907563025, "grad_norm": 0.8048866380223046, "learning_rate": 7.69123735008763e-05, "loss": 0.4118, "step": 635 }, { "epoch": 1.06890756302521, "grad_norm": 0.6496139901540049, "learning_rate": 7.689426335852951e-05, "loss": 0.4159, "step": 636 }, { "epoch": 1.0705882352941176, "grad_norm": 0.46199858998399584, "learning_rate": 7.687610240470838e-05, "loss": 0.4256, "step": 637 }, { "epoch": 1.0722689075630252, "grad_norm": 0.3242118924085421, "learning_rate": 7.68578906644245e-05, "loss": 0.4158, "step": 638 }, { "epoch": 1.0739495798319327, "grad_norm": 0.3218446624580423, "learning_rate": 7.683962816275941e-05, "loss": 0.4242, "step": 639 }, { "epoch": 1.0756302521008403, "grad_norm": 0.4552075422759905, "learning_rate": 7.682131492486459e-05, "loss": 0.4177, "step": 640 }, { "epoch": 1.0773109243697478, "grad_norm": 0.5656053469526829, "learning_rate": 7.680295097596138e-05, "loss": 0.4161, "step": 641 }, { "epoch": 1.0789915966386554, "grad_norm": 0.5629683410803271, "learning_rate": 7.678453634134092e-05, "loss": 0.4152, "step": 642 }, { "epoch": 1.080672268907563, "grad_norm": 0.5062642971031328, "learning_rate": 7.676607104636423e-05, "loss": 0.4044, "step": 643 }, { "epoch": 1.0823529411764705, "grad_norm": 0.47453455854192367, "learning_rate": 7.674755511646205e-05, "loss": 0.4193, "step": 644 }, { "epoch": 1.084033613445378, "grad_norm": 0.41289832121465747, "learning_rate": 7.672898857713485e-05, "loss": 0.4149, "step": 645 }, { "epoch": 1.0857142857142856, "grad_norm": 0.39408303810701045, "learning_rate": 7.671037145395284e-05, "loss": 0.4117, "step": 646 }, { "epoch": 1.0873949579831932, "grad_norm": 0.48602357215203096, "learning_rate": 7.669170377255587e-05, "loss": 0.4213, "step": 647 }, { "epoch": 1.0890756302521007, "grad_norm": 0.5742610874136242, "learning_rate": 7.667298555865342e-05, "loss": 0.404, "step": 648 }, { "epoch": 1.0907563025210083, "grad_norm": 0.7079560692562716, "learning_rate": 7.665421683802457e-05, "loss": 0.4224, "step": 649 }, { "epoch": 1.092436974789916, "grad_norm": 0.8035671494210167, "learning_rate": 7.663539763651794e-05, "loss": 0.4131, "step": 650 }, { "epoch": 1.0941176470588236, "grad_norm": 0.8329996025967644, "learning_rate": 7.66165279800517e-05, "loss": 0.4224, "step": 651 }, { "epoch": 1.0957983193277312, "grad_norm": 0.8509433865895827, "learning_rate": 7.659760789461351e-05, "loss": 0.4211, "step": 652 }, { "epoch": 1.0974789915966388, "grad_norm": 0.9099103761530775, "learning_rate": 7.657863740626043e-05, "loss": 0.4253, "step": 653 }, { "epoch": 1.0991596638655463, "grad_norm": 0.983149422621047, "learning_rate": 7.655961654111902e-05, "loss": 0.4202, "step": 654 }, { "epoch": 1.1008403361344539, "grad_norm": 0.9359853162871273, "learning_rate": 7.654054532538513e-05, "loss": 0.4287, "step": 655 }, { "epoch": 1.1025210084033614, "grad_norm": 0.6269406538735026, "learning_rate": 7.652142378532403e-05, "loss": 0.4144, "step": 656 }, { "epoch": 1.104201680672269, "grad_norm": 0.555868726500765, "learning_rate": 7.650225194727026e-05, "loss": 0.4217, "step": 657 }, { "epoch": 1.1058823529411765, "grad_norm": 0.8324778004919178, "learning_rate": 7.648302983762763e-05, "loss": 0.4223, "step": 658 }, { "epoch": 1.107563025210084, "grad_norm": 0.8094099619001168, "learning_rate": 7.646375748286918e-05, "loss": 0.4217, "step": 659 }, { "epoch": 1.1092436974789917, "grad_norm": 0.46728184848686743, "learning_rate": 7.644443490953718e-05, "loss": 0.4208, "step": 660 }, { "epoch": 1.1109243697478992, "grad_norm": 0.44107905195340047, "learning_rate": 7.642506214424303e-05, "loss": 0.4132, "step": 661 }, { "epoch": 1.1126050420168068, "grad_norm": 0.6605465686975187, "learning_rate": 7.640563921366728e-05, "loss": 0.4202, "step": 662 }, { "epoch": 1.1142857142857143, "grad_norm": 0.6316884653539047, "learning_rate": 7.638616614455953e-05, "loss": 0.4066, "step": 663 }, { "epoch": 1.1159663865546219, "grad_norm": 0.4144915866253175, "learning_rate": 7.636664296373848e-05, "loss": 0.4095, "step": 664 }, { "epoch": 1.1176470588235294, "grad_norm": 0.5590020131584241, "learning_rate": 7.634706969809183e-05, "loss": 0.4293, "step": 665 }, { "epoch": 1.119327731092437, "grad_norm": 0.6451480608027411, "learning_rate": 7.632744637457622e-05, "loss": 0.4169, "step": 666 }, { "epoch": 1.1210084033613446, "grad_norm": 0.5152532892929356, "learning_rate": 7.630777302021728e-05, "loss": 0.4281, "step": 667 }, { "epoch": 1.122689075630252, "grad_norm": 0.4080908183835616, "learning_rate": 7.628804966210952e-05, "loss": 0.4182, "step": 668 }, { "epoch": 1.1243697478991597, "grad_norm": 0.44079055205472073, "learning_rate": 7.626827632741632e-05, "loss": 0.4069, "step": 669 }, { "epoch": 1.1260504201680672, "grad_norm": 0.38142904753292695, "learning_rate": 7.62484530433699e-05, "loss": 0.4091, "step": 670 }, { "epoch": 1.1277310924369748, "grad_norm": 0.37361296140813016, "learning_rate": 7.622857983727124e-05, "loss": 0.4172, "step": 671 }, { "epoch": 1.1294117647058823, "grad_norm": 0.5288875050238943, "learning_rate": 7.620865673649013e-05, "loss": 0.4164, "step": 672 }, { "epoch": 1.13109243697479, "grad_norm": 0.6093681970188259, "learning_rate": 7.618868376846498e-05, "loss": 0.4172, "step": 673 }, { "epoch": 1.1327731092436975, "grad_norm": 0.5906268765772769, "learning_rate": 7.616866096070299e-05, "loss": 0.4214, "step": 674 }, { "epoch": 1.134453781512605, "grad_norm": 0.5197514019386982, "learning_rate": 7.614858834077993e-05, "loss": 0.4246, "step": 675 }, { "epoch": 1.1361344537815126, "grad_norm": 0.3968428704107069, "learning_rate": 7.61284659363402e-05, "loss": 0.4249, "step": 676 }, { "epoch": 1.1378151260504201, "grad_norm": 0.3728486817281701, "learning_rate": 7.610829377509673e-05, "loss": 0.4234, "step": 677 }, { "epoch": 1.1394957983193277, "grad_norm": 0.4913698311275425, "learning_rate": 7.608807188483101e-05, "loss": 0.4209, "step": 678 }, { "epoch": 1.1411764705882352, "grad_norm": 0.6454599573511207, "learning_rate": 7.606780029339301e-05, "loss": 0.4236, "step": 679 }, { "epoch": 1.1428571428571428, "grad_norm": 0.7715911772146112, "learning_rate": 7.604747902870117e-05, "loss": 0.4168, "step": 680 }, { "epoch": 1.1445378151260504, "grad_norm": 0.736514426448835, "learning_rate": 7.60271081187423e-05, "loss": 0.4195, "step": 681 }, { "epoch": 1.146218487394958, "grad_norm": 0.5426866484620578, "learning_rate": 7.600668759157157e-05, "loss": 0.4139, "step": 682 }, { "epoch": 1.1478991596638655, "grad_norm": 0.4135438796043502, "learning_rate": 7.598621747531257e-05, "loss": 0.416, "step": 683 }, { "epoch": 1.149579831932773, "grad_norm": 0.4190139581521961, "learning_rate": 7.596569779815709e-05, "loss": 0.4156, "step": 684 }, { "epoch": 1.1512605042016806, "grad_norm": 0.5327478628069173, "learning_rate": 7.594512858836524e-05, "loss": 0.425, "step": 685 }, { "epoch": 1.1529411764705881, "grad_norm": 0.6310676723521408, "learning_rate": 7.59245098742653e-05, "loss": 0.4154, "step": 686 }, { "epoch": 1.1546218487394957, "grad_norm": 0.6127623459821396, "learning_rate": 7.590384168425378e-05, "loss": 0.4192, "step": 687 }, { "epoch": 1.1563025210084033, "grad_norm": 0.555177883069966, "learning_rate": 7.588312404679527e-05, "loss": 0.4131, "step": 688 }, { "epoch": 1.1579831932773108, "grad_norm": 0.47856261604841976, "learning_rate": 7.586235699042252e-05, "loss": 0.4199, "step": 689 }, { "epoch": 1.1596638655462184, "grad_norm": 0.5072335432171549, "learning_rate": 7.584154054373628e-05, "loss": 0.414, "step": 690 }, { "epoch": 1.1613445378151261, "grad_norm": 0.4754722390396806, "learning_rate": 7.582067473540538e-05, "loss": 0.4183, "step": 691 }, { "epoch": 1.1630252100840337, "grad_norm": 0.42613709636103764, "learning_rate": 7.57997595941666e-05, "loss": 0.4132, "step": 692 }, { "epoch": 1.1647058823529413, "grad_norm": 0.3351898661983106, "learning_rate": 7.577879514882465e-05, "loss": 0.4205, "step": 693 }, { "epoch": 1.1663865546218488, "grad_norm": 0.3468761299171179, "learning_rate": 7.575778142825217e-05, "loss": 0.4214, "step": 694 }, { "epoch": 1.1680672268907564, "grad_norm": 0.3928053738976945, "learning_rate": 7.573671846138965e-05, "loss": 0.4099, "step": 695 }, { "epoch": 1.169747899159664, "grad_norm": 0.42309452456547136, "learning_rate": 7.571560627724541e-05, "loss": 0.4181, "step": 696 }, { "epoch": 1.1714285714285715, "grad_norm": 0.4383712205397024, "learning_rate": 7.569444490489553e-05, "loss": 0.4287, "step": 697 }, { "epoch": 1.173109243697479, "grad_norm": 0.40802963391031566, "learning_rate": 7.567323437348387e-05, "loss": 0.4298, "step": 698 }, { "epoch": 1.1747899159663866, "grad_norm": 0.4392357126049293, "learning_rate": 7.565197471222197e-05, "loss": 0.4141, "step": 699 }, { "epoch": 1.1764705882352942, "grad_norm": 0.5514754113692921, "learning_rate": 7.563066595038904e-05, "loss": 0.4225, "step": 700 }, { "epoch": 1.1781512605042017, "grad_norm": 0.5156923350061713, "learning_rate": 7.560930811733187e-05, "loss": 0.4121, "step": 701 }, { "epoch": 1.1798319327731093, "grad_norm": 0.612241237491993, "learning_rate": 7.558790124246493e-05, "loss": 0.4216, "step": 702 }, { "epoch": 1.1815126050420168, "grad_norm": 0.7946251682931725, "learning_rate": 7.556644535527013e-05, "loss": 0.42, "step": 703 }, { "epoch": 1.1831932773109244, "grad_norm": 0.8506450131264243, "learning_rate": 7.554494048529693e-05, "loss": 0.4141, "step": 704 }, { "epoch": 1.184873949579832, "grad_norm": 0.8737677264120064, "learning_rate": 7.552338666216225e-05, "loss": 0.4164, "step": 705 }, { "epoch": 1.1865546218487395, "grad_norm": 0.9487966724722486, "learning_rate": 7.550178391555043e-05, "loss": 0.4202, "step": 706 }, { "epoch": 1.188235294117647, "grad_norm": 0.9489072714569157, "learning_rate": 7.548013227521315e-05, "loss": 0.4121, "step": 707 }, { "epoch": 1.1899159663865546, "grad_norm": 0.7902284896774713, "learning_rate": 7.545843177096948e-05, "loss": 0.4147, "step": 708 }, { "epoch": 1.1915966386554622, "grad_norm": 0.5993384913959577, "learning_rate": 7.543668243270574e-05, "loss": 0.4257, "step": 709 }, { "epoch": 1.1932773109243697, "grad_norm": 0.49229050776413785, "learning_rate": 7.541488429037558e-05, "loss": 0.4207, "step": 710 }, { "epoch": 1.1949579831932773, "grad_norm": 0.4876620970977695, "learning_rate": 7.539303737399974e-05, "loss": 0.4178, "step": 711 }, { "epoch": 1.1966386554621848, "grad_norm": 0.47067303971220886, "learning_rate": 7.537114171366625e-05, "loss": 0.4173, "step": 712 }, { "epoch": 1.1983193277310924, "grad_norm": 0.48002964116850694, "learning_rate": 7.534919733953023e-05, "loss": 0.4218, "step": 713 }, { "epoch": 1.2, "grad_norm": 0.5269330678245898, "learning_rate": 7.532720428181387e-05, "loss": 0.4089, "step": 714 }, { "epoch": 1.2016806722689075, "grad_norm": 0.5578763134710797, "learning_rate": 7.530516257080642e-05, "loss": 0.4142, "step": 715 }, { "epoch": 1.203361344537815, "grad_norm": 0.5690439824635096, "learning_rate": 7.528307223686415e-05, "loss": 0.424, "step": 716 }, { "epoch": 1.2050420168067226, "grad_norm": 0.5237198527845989, "learning_rate": 7.526093331041027e-05, "loss": 0.4093, "step": 717 }, { "epoch": 1.2067226890756302, "grad_norm": 0.4360056211939958, "learning_rate": 7.523874582193495e-05, "loss": 0.4106, "step": 718 }, { "epoch": 1.2084033613445377, "grad_norm": 0.4456947110977198, "learning_rate": 7.521650980199518e-05, "loss": 0.4195, "step": 719 }, { "epoch": 1.2100840336134453, "grad_norm": 0.45341435432516647, "learning_rate": 7.519422528121488e-05, "loss": 0.4255, "step": 720 }, { "epoch": 1.2117647058823529, "grad_norm": 0.4712152761929175, "learning_rate": 7.517189229028464e-05, "loss": 0.4173, "step": 721 }, { "epoch": 1.2134453781512604, "grad_norm": 0.476409971317272, "learning_rate": 7.514951085996195e-05, "loss": 0.4097, "step": 722 }, { "epoch": 1.2151260504201682, "grad_norm": 0.5206330925096843, "learning_rate": 7.512708102107089e-05, "loss": 0.4185, "step": 723 }, { "epoch": 1.2168067226890757, "grad_norm": 0.5856157405150452, "learning_rate": 7.510460280450227e-05, "loss": 0.4376, "step": 724 }, { "epoch": 1.2184873949579833, "grad_norm": 0.5612729291985443, "learning_rate": 7.508207624121351e-05, "loss": 0.4177, "step": 725 }, { "epoch": 1.2201680672268909, "grad_norm": 0.5010563205097154, "learning_rate": 7.505950136222863e-05, "loss": 0.4155, "step": 726 }, { "epoch": 1.2218487394957984, "grad_norm": 0.48644372596651153, "learning_rate": 7.503687819863816e-05, "loss": 0.4003, "step": 727 }, { "epoch": 1.223529411764706, "grad_norm": 0.5516074324183868, "learning_rate": 7.501420678159916e-05, "loss": 0.4158, "step": 728 }, { "epoch": 1.2252100840336135, "grad_norm": 0.6198537740819527, "learning_rate": 7.499148714233512e-05, "loss": 0.4181, "step": 729 }, { "epoch": 1.226890756302521, "grad_norm": 0.72833418875299, "learning_rate": 7.496871931213597e-05, "loss": 0.4158, "step": 730 }, { "epoch": 1.2285714285714286, "grad_norm": 0.8045849815095305, "learning_rate": 7.494590332235799e-05, "loss": 0.419, "step": 731 }, { "epoch": 1.2302521008403362, "grad_norm": 0.8596854757230934, "learning_rate": 7.492303920442379e-05, "loss": 0.4264, "step": 732 }, { "epoch": 1.2319327731092438, "grad_norm": 0.8138287384145947, "learning_rate": 7.490012698982225e-05, "loss": 0.413, "step": 733 }, { "epoch": 1.2336134453781513, "grad_norm": 0.695897726649887, "learning_rate": 7.487716671010852e-05, "loss": 0.4154, "step": 734 }, { "epoch": 1.2352941176470589, "grad_norm": 0.4965582676619232, "learning_rate": 7.485415839690394e-05, "loss": 0.4091, "step": 735 }, { "epoch": 1.2369747899159664, "grad_norm": 0.3513687911793114, "learning_rate": 7.483110208189596e-05, "loss": 0.4274, "step": 736 }, { "epoch": 1.238655462184874, "grad_norm": 0.414075469802254, "learning_rate": 7.480799779683819e-05, "loss": 0.4185, "step": 737 }, { "epoch": 1.2403361344537815, "grad_norm": 0.4681994646139174, "learning_rate": 7.478484557355029e-05, "loss": 0.4175, "step": 738 }, { "epoch": 1.242016806722689, "grad_norm": 0.4711343538778145, "learning_rate": 7.476164544391793e-05, "loss": 0.4189, "step": 739 }, { "epoch": 1.2436974789915967, "grad_norm": 0.4008812268234646, "learning_rate": 7.473839743989278e-05, "loss": 0.414, "step": 740 }, { "epoch": 1.2453781512605042, "grad_norm": 0.37783157164078224, "learning_rate": 7.47151015934924e-05, "loss": 0.4135, "step": 741 }, { "epoch": 1.2470588235294118, "grad_norm": 0.4329651312461925, "learning_rate": 7.469175793680031e-05, "loss": 0.4168, "step": 742 }, { "epoch": 1.2487394957983193, "grad_norm": 0.5371727310457672, "learning_rate": 7.466836650196579e-05, "loss": 0.4255, "step": 743 }, { "epoch": 1.250420168067227, "grad_norm": 0.5280535023893417, "learning_rate": 7.4644927321204e-05, "loss": 0.4129, "step": 744 }, { "epoch": 1.2521008403361344, "grad_norm": 0.4092279203127339, "learning_rate": 7.46214404267958e-05, "loss": 0.4091, "step": 745 }, { "epoch": 1.253781512605042, "grad_norm": 0.4166845901478314, "learning_rate": 7.459790585108778e-05, "loss": 0.4173, "step": 746 }, { "epoch": 1.2554621848739496, "grad_norm": 0.5304149107491916, "learning_rate": 7.457432362649222e-05, "loss": 0.4214, "step": 747 }, { "epoch": 1.2571428571428571, "grad_norm": 0.6113992634977135, "learning_rate": 7.455069378548698e-05, "loss": 0.4108, "step": 748 }, { "epoch": 1.2588235294117647, "grad_norm": 0.6077844347222788, "learning_rate": 7.452701636061555e-05, "loss": 0.4102, "step": 749 }, { "epoch": 1.2605042016806722, "grad_norm": 0.6200118625999493, "learning_rate": 7.450329138448691e-05, "loss": 0.4223, "step": 750 }, { "epoch": 1.2621848739495798, "grad_norm": 0.6909906376100323, "learning_rate": 7.447951888977555e-05, "loss": 0.4192, "step": 751 }, { "epoch": 1.2638655462184873, "grad_norm": 0.6321309724023012, "learning_rate": 7.44556989092214e-05, "loss": 0.407, "step": 752 }, { "epoch": 1.265546218487395, "grad_norm": 0.6041243594007785, "learning_rate": 7.443183147562975e-05, "loss": 0.4235, "step": 753 }, { "epoch": 1.2672268907563025, "grad_norm": 0.5866313861706614, "learning_rate": 7.440791662187134e-05, "loss": 0.413, "step": 754 }, { "epoch": 1.26890756302521, "grad_norm": 0.5510835148407986, "learning_rate": 7.43839543808821e-05, "loss": 0.4219, "step": 755 }, { "epoch": 1.2705882352941176, "grad_norm": 0.481363220437347, "learning_rate": 7.435994478566332e-05, "loss": 0.4169, "step": 756 }, { "epoch": 1.2722689075630251, "grad_norm": 0.48076506578625217, "learning_rate": 7.433588786928143e-05, "loss": 0.4351, "step": 757 }, { "epoch": 1.2739495798319327, "grad_norm": 0.4291923255314117, "learning_rate": 7.43117836648681e-05, "loss": 0.4166, "step": 758 }, { "epoch": 1.2756302521008402, "grad_norm": 0.43947647300221665, "learning_rate": 7.428763220562009e-05, "loss": 0.418, "step": 759 }, { "epoch": 1.2773109243697478, "grad_norm": 0.520709879477232, "learning_rate": 7.426343352479925e-05, "loss": 0.4215, "step": 760 }, { "epoch": 1.2789915966386554, "grad_norm": 0.6125778220015469, "learning_rate": 7.423918765573243e-05, "loss": 0.4188, "step": 761 }, { "epoch": 1.280672268907563, "grad_norm": 0.702063019090705, "learning_rate": 7.421489463181151e-05, "loss": 0.4158, "step": 762 }, { "epoch": 1.2823529411764705, "grad_norm": 0.6923350747270768, "learning_rate": 7.419055448649332e-05, "loss": 0.4153, "step": 763 }, { "epoch": 1.284033613445378, "grad_norm": 0.5687722012115959, "learning_rate": 7.416616725329953e-05, "loss": 0.4134, "step": 764 }, { "epoch": 1.2857142857142856, "grad_norm": 0.46943143699925355, "learning_rate": 7.414173296581674e-05, "loss": 0.4241, "step": 765 }, { "epoch": 1.2873949579831931, "grad_norm": 0.4951651855151075, "learning_rate": 7.411725165769628e-05, "loss": 0.4193, "step": 766 }, { "epoch": 1.289075630252101, "grad_norm": 0.506659409695784, "learning_rate": 7.409272336265428e-05, "loss": 0.4288, "step": 767 }, { "epoch": 1.2907563025210085, "grad_norm": 0.5351461135668523, "learning_rate": 7.406814811447156e-05, "loss": 0.4164, "step": 768 }, { "epoch": 1.292436974789916, "grad_norm": 0.4916009667391678, "learning_rate": 7.404352594699363e-05, "loss": 0.4149, "step": 769 }, { "epoch": 1.2941176470588236, "grad_norm": 0.44513750247716677, "learning_rate": 7.40188568941306e-05, "loss": 0.4248, "step": 770 }, { "epoch": 1.2957983193277312, "grad_norm": 0.41133339418148873, "learning_rate": 7.399414098985714e-05, "loss": 0.4242, "step": 771 }, { "epoch": 1.2974789915966387, "grad_norm": 0.39859897626299734, "learning_rate": 7.396937826821247e-05, "loss": 0.4149, "step": 772 }, { "epoch": 1.2991596638655463, "grad_norm": 0.36615895294594974, "learning_rate": 7.394456876330027e-05, "loss": 0.4184, "step": 773 }, { "epoch": 1.3008403361344538, "grad_norm": 0.4225752458934245, "learning_rate": 7.391971250928868e-05, "loss": 0.4088, "step": 774 }, { "epoch": 1.3025210084033614, "grad_norm": 0.48904449179303683, "learning_rate": 7.389480954041017e-05, "loss": 0.4173, "step": 775 }, { "epoch": 1.304201680672269, "grad_norm": 0.5101293696241938, "learning_rate": 7.38698598909616e-05, "loss": 0.4143, "step": 776 }, { "epoch": 1.3058823529411765, "grad_norm": 0.4487142507134038, "learning_rate": 7.384486359530407e-05, "loss": 0.4079, "step": 777 }, { "epoch": 1.307563025210084, "grad_norm": 0.4768949541634418, "learning_rate": 7.381982068786298e-05, "loss": 0.419, "step": 778 }, { "epoch": 1.3092436974789916, "grad_norm": 0.4457327818300364, "learning_rate": 7.379473120312788e-05, "loss": 0.4202, "step": 779 }, { "epoch": 1.3109243697478992, "grad_norm": 0.380106657139599, "learning_rate": 7.376959517565247e-05, "loss": 0.41, "step": 780 }, { "epoch": 1.3126050420168067, "grad_norm": 0.4149743171070702, "learning_rate": 7.374441264005459e-05, "loss": 0.413, "step": 781 }, { "epoch": 1.3142857142857143, "grad_norm": 0.4668816275671711, "learning_rate": 7.371918363101609e-05, "loss": 0.4139, "step": 782 }, { "epoch": 1.3159663865546218, "grad_norm": 0.48179445833313245, "learning_rate": 7.369390818328281e-05, "loss": 0.4127, "step": 783 }, { "epoch": 1.3176470588235294, "grad_norm": 0.4731089155083404, "learning_rate": 7.36685863316646e-05, "loss": 0.4081, "step": 784 }, { "epoch": 1.319327731092437, "grad_norm": 0.5309217615866072, "learning_rate": 7.36432181110352e-05, "loss": 0.4183, "step": 785 }, { "epoch": 1.3210084033613445, "grad_norm": 0.6758399427418307, "learning_rate": 7.361780355633217e-05, "loss": 0.4147, "step": 786 }, { "epoch": 1.322689075630252, "grad_norm": 0.7531547876433018, "learning_rate": 7.359234270255695e-05, "loss": 0.4141, "step": 787 }, { "epoch": 1.3243697478991596, "grad_norm": 0.710280998496504, "learning_rate": 7.356683558477466e-05, "loss": 0.4102, "step": 788 }, { "epoch": 1.3260504201680672, "grad_norm": 0.7047188311478196, "learning_rate": 7.354128223811422e-05, "loss": 0.4178, "step": 789 }, { "epoch": 1.3277310924369747, "grad_norm": 0.6318446284333504, "learning_rate": 7.351568269776818e-05, "loss": 0.4145, "step": 790 }, { "epoch": 1.3294117647058823, "grad_norm": 0.47127744579818104, "learning_rate": 7.349003699899269e-05, "loss": 0.4127, "step": 791 }, { "epoch": 1.3310924369747898, "grad_norm": 0.39634285538470143, "learning_rate": 7.34643451771075e-05, "loss": 0.4111, "step": 792 }, { "epoch": 1.3327731092436974, "grad_norm": 0.5285597259679089, "learning_rate": 7.343860726749586e-05, "loss": 0.4087, "step": 793 }, { "epoch": 1.334453781512605, "grad_norm": 0.7701094709365047, "learning_rate": 7.341282330560454e-05, "loss": 0.4176, "step": 794 }, { "epoch": 1.3361344537815127, "grad_norm": 0.948738622442119, "learning_rate": 7.338699332694365e-05, "loss": 0.4137, "step": 795 }, { "epoch": 1.3378151260504203, "grad_norm": 0.8863071475708546, "learning_rate": 7.336111736708674e-05, "loss": 0.4122, "step": 796 }, { "epoch": 1.3394957983193279, "grad_norm": 0.6773098911811348, "learning_rate": 7.333519546167068e-05, "loss": 0.4161, "step": 797 }, { "epoch": 1.3411764705882354, "grad_norm": 0.426999218345416, "learning_rate": 7.330922764639562e-05, "loss": 0.4139, "step": 798 }, { "epoch": 1.342857142857143, "grad_norm": 0.5794922423045844, "learning_rate": 7.328321395702488e-05, "loss": 0.421, "step": 799 }, { "epoch": 1.3445378151260505, "grad_norm": 0.6818647359873162, "learning_rate": 7.325715442938503e-05, "loss": 0.4129, "step": 800 }, { "epoch": 1.346218487394958, "grad_norm": 0.6556883750387814, "learning_rate": 7.323104909936574e-05, "loss": 0.4177, "step": 801 }, { "epoch": 1.3478991596638656, "grad_norm": 0.4839902297717083, "learning_rate": 7.320489800291977e-05, "loss": 0.4152, "step": 802 }, { "epoch": 1.3495798319327732, "grad_norm": 0.33243553569964146, "learning_rate": 7.317870117606289e-05, "loss": 0.4117, "step": 803 }, { "epoch": 1.3512605042016808, "grad_norm": 0.46454488566838253, "learning_rate": 7.315245865487387e-05, "loss": 0.4184, "step": 804 }, { "epoch": 1.3529411764705883, "grad_norm": 0.5553949207628237, "learning_rate": 7.31261704754944e-05, "loss": 0.4052, "step": 805 }, { "epoch": 1.3546218487394959, "grad_norm": 0.5212728405963298, "learning_rate": 7.309983667412905e-05, "loss": 0.4148, "step": 806 }, { "epoch": 1.3563025210084034, "grad_norm": 0.4093562245999426, "learning_rate": 7.307345728704525e-05, "loss": 0.4251, "step": 807 }, { "epoch": 1.357983193277311, "grad_norm": 0.35605625345507735, "learning_rate": 7.304703235057317e-05, "loss": 0.4154, "step": 808 }, { "epoch": 1.3596638655462185, "grad_norm": 0.3316397554221099, "learning_rate": 7.302056190110572e-05, "loss": 0.4069, "step": 809 }, { "epoch": 1.361344537815126, "grad_norm": 0.2796932258014461, "learning_rate": 7.299404597509852e-05, "loss": 0.4094, "step": 810 }, { "epoch": 1.3630252100840337, "grad_norm": 0.2878643543440376, "learning_rate": 7.296748460906981e-05, "loss": 0.4173, "step": 811 }, { "epoch": 1.3647058823529412, "grad_norm": 0.351919690561876, "learning_rate": 7.294087783960038e-05, "loss": 0.4111, "step": 812 }, { "epoch": 1.3663865546218488, "grad_norm": 0.4781797459091511, "learning_rate": 7.29142257033336e-05, "loss": 0.411, "step": 813 }, { "epoch": 1.3680672268907563, "grad_norm": 0.5398427785813774, "learning_rate": 7.288752823697527e-05, "loss": 0.425, "step": 814 }, { "epoch": 1.3697478991596639, "grad_norm": 0.5890424452124411, "learning_rate": 7.286078547729366e-05, "loss": 0.4163, "step": 815 }, { "epoch": 1.3714285714285714, "grad_norm": 0.5736285635659246, "learning_rate": 7.283399746111938e-05, "loss": 0.4062, "step": 816 }, { "epoch": 1.373109243697479, "grad_norm": 0.5502104438410899, "learning_rate": 7.280716422534542e-05, "loss": 0.4203, "step": 817 }, { "epoch": 1.3747899159663866, "grad_norm": 0.548976248451182, "learning_rate": 7.278028580692701e-05, "loss": 0.419, "step": 818 }, { "epoch": 1.3764705882352941, "grad_norm": 0.5733091709904149, "learning_rate": 7.275336224288159e-05, "loss": 0.409, "step": 819 }, { "epoch": 1.3781512605042017, "grad_norm": 0.5320682182946287, "learning_rate": 7.272639357028882e-05, "loss": 0.4092, "step": 820 }, { "epoch": 1.3798319327731092, "grad_norm": 0.5241429850210833, "learning_rate": 7.269937982629047e-05, "loss": 0.4242, "step": 821 }, { "epoch": 1.3815126050420168, "grad_norm": 0.6004673663784591, "learning_rate": 7.267232104809036e-05, "loss": 0.4209, "step": 822 }, { "epoch": 1.3831932773109243, "grad_norm": 0.7406536040935202, "learning_rate": 7.264521727295437e-05, "loss": 0.4075, "step": 823 }, { "epoch": 1.384873949579832, "grad_norm": 0.777741092126964, "learning_rate": 7.261806853821029e-05, "loss": 0.4177, "step": 824 }, { "epoch": 1.3865546218487395, "grad_norm": 0.5952649324861092, "learning_rate": 7.259087488124791e-05, "loss": 0.4096, "step": 825 }, { "epoch": 1.388235294117647, "grad_norm": 0.3899147700950411, "learning_rate": 7.256363633951884e-05, "loss": 0.4164, "step": 826 }, { "epoch": 1.3899159663865546, "grad_norm": 0.44423986485328637, "learning_rate": 7.253635295053651e-05, "loss": 0.4213, "step": 827 }, { "epoch": 1.3915966386554621, "grad_norm": 0.5582103825233151, "learning_rate": 7.250902475187609e-05, "loss": 0.4122, "step": 828 }, { "epoch": 1.3932773109243697, "grad_norm": 0.571247413892442, "learning_rate": 7.248165178117452e-05, "loss": 0.4057, "step": 829 }, { "epoch": 1.3949579831932772, "grad_norm": 0.5335793754167865, "learning_rate": 7.245423407613037e-05, "loss": 0.4152, "step": 830 }, { "epoch": 1.3966386554621848, "grad_norm": 0.4875516086868791, "learning_rate": 7.24267716745038e-05, "loss": 0.4124, "step": 831 }, { "epoch": 1.3983193277310924, "grad_norm": 0.37350888792711506, "learning_rate": 7.239926461411657e-05, "loss": 0.4124, "step": 832 }, { "epoch": 1.4, "grad_norm": 0.451719932096374, "learning_rate": 7.23717129328519e-05, "loss": 0.4193, "step": 833 }, { "epoch": 1.4016806722689075, "grad_norm": 0.4954304303265037, "learning_rate": 7.23441166686545e-05, "loss": 0.4138, "step": 834 }, { "epoch": 1.403361344537815, "grad_norm": 0.41611563025982345, "learning_rate": 7.231647585953047e-05, "loss": 0.4199, "step": 835 }, { "epoch": 1.4050420168067226, "grad_norm": 0.37833045364464446, "learning_rate": 7.228879054354722e-05, "loss": 0.4192, "step": 836 }, { "epoch": 1.4067226890756301, "grad_norm": 0.39374798582357046, "learning_rate": 7.226106075883353e-05, "loss": 0.4196, "step": 837 }, { "epoch": 1.4084033613445377, "grad_norm": 0.38541700077159063, "learning_rate": 7.223328654357936e-05, "loss": 0.4153, "step": 838 }, { "epoch": 1.4100840336134453, "grad_norm": 0.40550519705488225, "learning_rate": 7.220546793603587e-05, "loss": 0.4047, "step": 839 }, { "epoch": 1.4117647058823528, "grad_norm": 0.34971644964498677, "learning_rate": 7.21776049745154e-05, "loss": 0.4165, "step": 840 }, { "epoch": 1.4134453781512604, "grad_norm": 0.39553107927716574, "learning_rate": 7.214969769739133e-05, "loss": 0.417, "step": 841 }, { "epoch": 1.4151260504201681, "grad_norm": 0.5509180071847434, "learning_rate": 7.21217461430981e-05, "loss": 0.4225, "step": 842 }, { "epoch": 1.4168067226890757, "grad_norm": 0.6891713106865223, "learning_rate": 7.209375035013108e-05, "loss": 0.4098, "step": 843 }, { "epoch": 1.4184873949579833, "grad_norm": 0.6682030918141645, "learning_rate": 7.206571035704664e-05, "loss": 0.4133, "step": 844 }, { "epoch": 1.4201680672268908, "grad_norm": 0.5371776974273769, "learning_rate": 7.203762620246197e-05, "loss": 0.4201, "step": 845 }, { "epoch": 1.4218487394957984, "grad_norm": 0.5959026512017033, "learning_rate": 7.200949792505512e-05, "loss": 0.4182, "step": 846 }, { "epoch": 1.423529411764706, "grad_norm": 0.7360708804920642, "learning_rate": 7.198132556356485e-05, "loss": 0.4216, "step": 847 }, { "epoch": 1.4252100840336135, "grad_norm": 0.8084984485976898, "learning_rate": 7.195310915679068e-05, "loss": 0.4219, "step": 848 }, { "epoch": 1.426890756302521, "grad_norm": 0.767263524674779, "learning_rate": 7.19248487435928e-05, "loss": 0.4102, "step": 849 }, { "epoch": 1.4285714285714286, "grad_norm": 0.7439815010487166, "learning_rate": 7.189654436289195e-05, "loss": 0.4098, "step": 850 }, { "epoch": 1.4302521008403362, "grad_norm": 0.7006107826213422, "learning_rate": 7.186819605366946e-05, "loss": 0.416, "step": 851 }, { "epoch": 1.4319327731092437, "grad_norm": 0.44786939761286987, "learning_rate": 7.183980385496717e-05, "loss": 0.4174, "step": 852 }, { "epoch": 1.4336134453781513, "grad_norm": 0.34640515390125803, "learning_rate": 7.181136780588734e-05, "loss": 0.4087, "step": 853 }, { "epoch": 1.4352941176470588, "grad_norm": 0.5062224790555945, "learning_rate": 7.178288794559265e-05, "loss": 0.4206, "step": 854 }, { "epoch": 1.4369747899159664, "grad_norm": 0.5494325523242729, "learning_rate": 7.175436431330608e-05, "loss": 0.41, "step": 855 }, { "epoch": 1.438655462184874, "grad_norm": 0.4503531089882056, "learning_rate": 7.172579694831091e-05, "loss": 0.4201, "step": 856 }, { "epoch": 1.4403361344537815, "grad_norm": 0.3194069093537631, "learning_rate": 7.169718588995068e-05, "loss": 0.4098, "step": 857 }, { "epoch": 1.442016806722689, "grad_norm": 0.3730974279140584, "learning_rate": 7.166853117762907e-05, "loss": 0.4172, "step": 858 }, { "epoch": 1.4436974789915966, "grad_norm": 0.4042209501913334, "learning_rate": 7.163983285080987e-05, "loss": 0.4175, "step": 859 }, { "epoch": 1.4453781512605042, "grad_norm": 0.41039646268235697, "learning_rate": 7.161109094901696e-05, "loss": 0.4188, "step": 860 }, { "epoch": 1.4470588235294117, "grad_norm": 0.410753977547714, "learning_rate": 7.158230551183425e-05, "loss": 0.41, "step": 861 }, { "epoch": 1.4487394957983193, "grad_norm": 0.3505301227865825, "learning_rate": 7.155347657890556e-05, "loss": 0.4096, "step": 862 }, { "epoch": 1.4504201680672268, "grad_norm": 0.39108433963138584, "learning_rate": 7.152460418993466e-05, "loss": 0.411, "step": 863 }, { "epoch": 1.4521008403361344, "grad_norm": 0.488802745360071, "learning_rate": 7.149568838468513e-05, "loss": 0.4143, "step": 864 }, { "epoch": 1.453781512605042, "grad_norm": 0.45016206883376914, "learning_rate": 7.146672920298038e-05, "loss": 0.4082, "step": 865 }, { "epoch": 1.4554621848739495, "grad_norm": 0.41101435405159564, "learning_rate": 7.143772668470354e-05, "loss": 0.4037, "step": 866 }, { "epoch": 1.457142857142857, "grad_norm": 0.4874238935499363, "learning_rate": 7.140868086979741e-05, "loss": 0.4055, "step": 867 }, { "epoch": 1.4588235294117646, "grad_norm": 0.6125302069597601, "learning_rate": 7.137959179826443e-05, "loss": 0.4122, "step": 868 }, { "epoch": 1.4605042016806722, "grad_norm": 0.6395604867256046, "learning_rate": 7.135045951016664e-05, "loss": 0.406, "step": 869 }, { "epoch": 1.46218487394958, "grad_norm": 0.5849671386468663, "learning_rate": 7.132128404562554e-05, "loss": 0.4127, "step": 870 }, { "epoch": 1.4638655462184875, "grad_norm": 0.54161165343328, "learning_rate": 7.129206544482218e-05, "loss": 0.4107, "step": 871 }, { "epoch": 1.465546218487395, "grad_norm": 0.5105387511269015, "learning_rate": 7.126280374799693e-05, "loss": 0.4132, "step": 872 }, { "epoch": 1.4672268907563026, "grad_norm": 0.43684399362146975, "learning_rate": 7.123349899544956e-05, "loss": 0.4125, "step": 873 }, { "epoch": 1.4689075630252102, "grad_norm": 0.3988172042224663, "learning_rate": 7.120415122753912e-05, "loss": 0.4123, "step": 874 }, { "epoch": 1.4705882352941178, "grad_norm": 0.47568292134357787, "learning_rate": 7.117476048468393e-05, "loss": 0.415, "step": 875 }, { "epoch": 1.4722689075630253, "grad_norm": 0.5727372725765637, "learning_rate": 7.114532680736144e-05, "loss": 0.4122, "step": 876 }, { "epoch": 1.4739495798319329, "grad_norm": 0.6027877576310986, "learning_rate": 7.11158502361083e-05, "loss": 0.4098, "step": 877 }, { "epoch": 1.4756302521008404, "grad_norm": 0.5544362059160424, "learning_rate": 7.108633081152016e-05, "loss": 0.4133, "step": 878 }, { "epoch": 1.477310924369748, "grad_norm": 0.41621431502463346, "learning_rate": 7.105676857425177e-05, "loss": 0.4167, "step": 879 }, { "epoch": 1.4789915966386555, "grad_norm": 0.43273263309082943, "learning_rate": 7.102716356501678e-05, "loss": 0.4243, "step": 880 }, { "epoch": 1.480672268907563, "grad_norm": 0.5184450666039031, "learning_rate": 7.099751582458777e-05, "loss": 0.4158, "step": 881 }, { "epoch": 1.4823529411764707, "grad_norm": 0.5078609364529083, "learning_rate": 7.096782539379616e-05, "loss": 0.4175, "step": 882 }, { "epoch": 1.4840336134453782, "grad_norm": 0.5326889638622965, "learning_rate": 7.093809231353218e-05, "loss": 0.4139, "step": 883 }, { "epoch": 1.4857142857142858, "grad_norm": 0.49738180912125346, "learning_rate": 7.090831662474477e-05, "loss": 0.4158, "step": 884 }, { "epoch": 1.4873949579831933, "grad_norm": 0.3725551248732158, "learning_rate": 7.087849836844159e-05, "loss": 0.4192, "step": 885 }, { "epoch": 1.4890756302521009, "grad_norm": 0.3437252349818566, "learning_rate": 7.084863758568889e-05, "loss": 0.4128, "step": 886 }, { "epoch": 1.4907563025210084, "grad_norm": 0.3901480933567556, "learning_rate": 7.081873431761152e-05, "loss": 0.4067, "step": 887 }, { "epoch": 1.492436974789916, "grad_norm": 0.4010023715937298, "learning_rate": 7.07887886053928e-05, "loss": 0.4104, "step": 888 }, { "epoch": 1.4941176470588236, "grad_norm": 0.38122537393665545, "learning_rate": 7.075880049027455e-05, "loss": 0.4208, "step": 889 }, { "epoch": 1.495798319327731, "grad_norm": 0.3155460226841672, "learning_rate": 7.072877001355695e-05, "loss": 0.4142, "step": 890 }, { "epoch": 1.4974789915966387, "grad_norm": 0.3364333958647623, "learning_rate": 7.069869721659855e-05, "loss": 0.4248, "step": 891 }, { "epoch": 1.4991596638655462, "grad_norm": 0.4005346421237616, "learning_rate": 7.066858214081617e-05, "loss": 0.4228, "step": 892 }, { "epoch": 1.5008403361344538, "grad_norm": 0.3950169790924976, "learning_rate": 7.063842482768487e-05, "loss": 0.4111, "step": 893 }, { "epoch": 1.5025210084033613, "grad_norm": 0.40123334510997044, "learning_rate": 7.060822531873783e-05, "loss": 0.4138, "step": 894 }, { "epoch": 1.504201680672269, "grad_norm": 0.42656807394244345, "learning_rate": 7.057798365556644e-05, "loss": 0.4135, "step": 895 }, { "epoch": 1.5058823529411764, "grad_norm": 0.5172011542097631, "learning_rate": 7.054769987982003e-05, "loss": 0.4184, "step": 896 }, { "epoch": 1.507563025210084, "grad_norm": 0.5517049701451748, "learning_rate": 7.051737403320604e-05, "loss": 0.4194, "step": 897 }, { "epoch": 1.5092436974789916, "grad_norm": 0.554059911483115, "learning_rate": 7.048700615748974e-05, "loss": 0.4132, "step": 898 }, { "epoch": 1.5109243697478991, "grad_norm": 0.5954319006279227, "learning_rate": 7.045659629449439e-05, "loss": 0.4094, "step": 899 }, { "epoch": 1.5126050420168067, "grad_norm": 0.7148123127574896, "learning_rate": 7.042614448610101e-05, "loss": 0.4062, "step": 900 }, { "epoch": 1.5142857142857142, "grad_norm": 0.8398982066669234, "learning_rate": 7.039565077424842e-05, "loss": 0.4116, "step": 901 }, { "epoch": 1.5159663865546218, "grad_norm": 0.8532854718754649, "learning_rate": 7.03651152009331e-05, "loss": 0.4185, "step": 902 }, { "epoch": 1.5176470588235293, "grad_norm": 0.7480418955444158, "learning_rate": 7.033453780820923e-05, "loss": 0.4131, "step": 903 }, { "epoch": 1.519327731092437, "grad_norm": 0.49899673901794744, "learning_rate": 7.030391863818857e-05, "loss": 0.413, "step": 904 }, { "epoch": 1.5210084033613445, "grad_norm": 0.3367454783298432, "learning_rate": 7.027325773304043e-05, "loss": 0.4116, "step": 905 }, { "epoch": 1.522689075630252, "grad_norm": 0.5204147686576513, "learning_rate": 7.024255513499159e-05, "loss": 0.418, "step": 906 }, { "epoch": 1.5243697478991596, "grad_norm": 0.5362854021963376, "learning_rate": 7.021181088632621e-05, "loss": 0.4114, "step": 907 }, { "epoch": 1.5260504201680671, "grad_norm": 0.4834768728973426, "learning_rate": 7.018102502938588e-05, "loss": 0.4169, "step": 908 }, { "epoch": 1.5277310924369747, "grad_norm": 0.4794949037447628, "learning_rate": 7.015019760656946e-05, "loss": 0.4201, "step": 909 }, { "epoch": 1.5294117647058822, "grad_norm": 0.45832120721569936, "learning_rate": 7.011932866033306e-05, "loss": 0.4092, "step": 910 }, { "epoch": 1.5310924369747898, "grad_norm": 0.422962631936005, "learning_rate": 7.008841823318994e-05, "loss": 0.4125, "step": 911 }, { "epoch": 1.5327731092436974, "grad_norm": 0.3188017786499304, "learning_rate": 7.005746636771059e-05, "loss": 0.4186, "step": 912 }, { "epoch": 1.534453781512605, "grad_norm": 0.32082417780307887, "learning_rate": 7.002647310652242e-05, "loss": 0.4167, "step": 913 }, { "epoch": 1.5361344537815125, "grad_norm": 0.36761965941491104, "learning_rate": 6.999543849230998e-05, "loss": 0.4102, "step": 914 }, { "epoch": 1.53781512605042, "grad_norm": 0.28704955068341864, "learning_rate": 6.996436256781474e-05, "loss": 0.4101, "step": 915 }, { "epoch": 1.5394957983193276, "grad_norm": 0.32191827901590403, "learning_rate": 6.993324537583502e-05, "loss": 0.4123, "step": 916 }, { "epoch": 1.5411764705882351, "grad_norm": 0.3497230027983995, "learning_rate": 6.990208695922599e-05, "loss": 0.4198, "step": 917 }, { "epoch": 1.5428571428571427, "grad_norm": 0.3253160795325662, "learning_rate": 6.987088736089965e-05, "loss": 0.4103, "step": 918 }, { "epoch": 1.5445378151260503, "grad_norm": 0.3868241080107839, "learning_rate": 6.983964662382464e-05, "loss": 0.4192, "step": 919 }, { "epoch": 1.5462184873949578, "grad_norm": 0.3955523579612011, "learning_rate": 6.980836479102632e-05, "loss": 0.4071, "step": 920 }, { "epoch": 1.5478991596638656, "grad_norm": 0.32879231810841125, "learning_rate": 6.977704190558658e-05, "loss": 0.4091, "step": 921 }, { "epoch": 1.5495798319327732, "grad_norm": 0.29972571095348166, "learning_rate": 6.974567801064393e-05, "loss": 0.4002, "step": 922 }, { "epoch": 1.5512605042016807, "grad_norm": 0.2609068614944735, "learning_rate": 6.971427314939331e-05, "loss": 0.4101, "step": 923 }, { "epoch": 1.5529411764705883, "grad_norm": 0.3237135787710619, "learning_rate": 6.968282736508604e-05, "loss": 0.4144, "step": 924 }, { "epoch": 1.5546218487394958, "grad_norm": 0.3875442457179766, "learning_rate": 6.96513407010299e-05, "loss": 0.407, "step": 925 }, { "epoch": 1.5563025210084034, "grad_norm": 0.4249898738792853, "learning_rate": 6.96198132005889e-05, "loss": 0.4108, "step": 926 }, { "epoch": 1.557983193277311, "grad_norm": 0.4065742892727906, "learning_rate": 6.95882449071833e-05, "loss": 0.4129, "step": 927 }, { "epoch": 1.5596638655462185, "grad_norm": 0.38796289462898575, "learning_rate": 6.955663586428956e-05, "loss": 0.4138, "step": 928 }, { "epoch": 1.561344537815126, "grad_norm": 0.4673938750310089, "learning_rate": 6.952498611544023e-05, "loss": 0.4124, "step": 929 }, { "epoch": 1.5630252100840336, "grad_norm": 0.5278918221186115, "learning_rate": 6.949329570422397e-05, "loss": 0.4155, "step": 930 }, { "epoch": 1.5647058823529412, "grad_norm": 0.6390435926001264, "learning_rate": 6.946156467428538e-05, "loss": 0.4153, "step": 931 }, { "epoch": 1.5663865546218487, "grad_norm": 0.8813190795750412, "learning_rate": 6.942979306932504e-05, "loss": 0.4118, "step": 932 }, { "epoch": 1.5680672268907563, "grad_norm": 1.104415813156809, "learning_rate": 6.939798093309942e-05, "loss": 0.4087, "step": 933 }, { "epoch": 1.5697478991596638, "grad_norm": 0.8527391869136186, "learning_rate": 6.936612830942079e-05, "loss": 0.4147, "step": 934 }, { "epoch": 1.5714285714285714, "grad_norm": 0.6193870955530923, "learning_rate": 6.933423524215716e-05, "loss": 0.4104, "step": 935 }, { "epoch": 1.573109243697479, "grad_norm": 0.45440006774832864, "learning_rate": 6.930230177523228e-05, "loss": 0.4145, "step": 936 }, { "epoch": 1.5747899159663865, "grad_norm": 0.5013033416316524, "learning_rate": 6.927032795262552e-05, "loss": 0.4154, "step": 937 }, { "epoch": 1.576470588235294, "grad_norm": 0.5488752985373846, "learning_rate": 6.923831381837183e-05, "loss": 0.4146, "step": 938 }, { "epoch": 1.5781512605042018, "grad_norm": 0.4603916410898837, "learning_rate": 6.920625941656169e-05, "loss": 0.4134, "step": 939 }, { "epoch": 1.5798319327731094, "grad_norm": 0.4636818028572672, "learning_rate": 6.917416479134101e-05, "loss": 0.4207, "step": 940 }, { "epoch": 1.581512605042017, "grad_norm": 0.46374991833398505, "learning_rate": 6.914202998691112e-05, "loss": 0.4138, "step": 941 }, { "epoch": 1.5831932773109245, "grad_norm": 0.36708429999735037, "learning_rate": 6.910985504752866e-05, "loss": 0.4206, "step": 942 }, { "epoch": 1.584873949579832, "grad_norm": 0.4696688730355237, "learning_rate": 6.907764001750559e-05, "loss": 0.4219, "step": 943 }, { "epoch": 1.5865546218487396, "grad_norm": 0.5074268910356771, "learning_rate": 6.904538494120905e-05, "loss": 0.41, "step": 944 }, { "epoch": 1.5882352941176472, "grad_norm": 0.43406059926707985, "learning_rate": 6.901308986306131e-05, "loss": 0.4092, "step": 945 }, { "epoch": 1.5899159663865547, "grad_norm": 0.37305368774684927, "learning_rate": 6.89807548275398e-05, "loss": 0.4138, "step": 946 }, { "epoch": 1.5915966386554623, "grad_norm": 0.4610616507348138, "learning_rate": 6.894837987917692e-05, "loss": 0.4174, "step": 947 }, { "epoch": 1.5932773109243699, "grad_norm": 0.5726239916480035, "learning_rate": 6.891596506256006e-05, "loss": 0.4086, "step": 948 }, { "epoch": 1.5949579831932774, "grad_norm": 0.5710530239775734, "learning_rate": 6.888351042233153e-05, "loss": 0.4011, "step": 949 }, { "epoch": 1.596638655462185, "grad_norm": 0.4172317124122912, "learning_rate": 6.885101600318843e-05, "loss": 0.4177, "step": 950 }, { "epoch": 1.5983193277310925, "grad_norm": 0.35238305958710897, "learning_rate": 6.881848184988274e-05, "loss": 0.4126, "step": 951 }, { "epoch": 1.6, "grad_norm": 0.43733199313782684, "learning_rate": 6.878590800722107e-05, "loss": 0.4238, "step": 952 }, { "epoch": 1.6016806722689076, "grad_norm": 0.5850295078101878, "learning_rate": 6.875329452006475e-05, "loss": 0.4066, "step": 953 }, { "epoch": 1.6033613445378152, "grad_norm": 0.654131462385933, "learning_rate": 6.872064143332968e-05, "loss": 0.4168, "step": 954 }, { "epoch": 1.6050420168067228, "grad_norm": 0.560536419043286, "learning_rate": 6.86879487919863e-05, "loss": 0.4008, "step": 955 }, { "epoch": 1.6067226890756303, "grad_norm": 0.6009495866315013, "learning_rate": 6.865521664105954e-05, "loss": 0.418, "step": 956 }, { "epoch": 1.6084033613445379, "grad_norm": 0.7043602519125237, "learning_rate": 6.862244502562876e-05, "loss": 0.4097, "step": 957 }, { "epoch": 1.6100840336134454, "grad_norm": 0.8422116947029052, "learning_rate": 6.858963399082759e-05, "loss": 0.4278, "step": 958 }, { "epoch": 1.611764705882353, "grad_norm": 0.880663036937141, "learning_rate": 6.855678358184405e-05, "loss": 0.4175, "step": 959 }, { "epoch": 1.6134453781512605, "grad_norm": 0.7799149395655822, "learning_rate": 6.852389384392032e-05, "loss": 0.4158, "step": 960 }, { "epoch": 1.615126050420168, "grad_norm": 0.46556022929366764, "learning_rate": 6.849096482235277e-05, "loss": 0.4101, "step": 961 }, { "epoch": 1.6168067226890757, "grad_norm": 0.3235319860663099, "learning_rate": 6.845799656249187e-05, "loss": 0.4243, "step": 962 }, { "epoch": 1.6184873949579832, "grad_norm": 0.4133798688154145, "learning_rate": 6.842498910974214e-05, "loss": 0.412, "step": 963 }, { "epoch": 1.6201680672268908, "grad_norm": 0.4332766286311385, "learning_rate": 6.839194250956204e-05, "loss": 0.4075, "step": 964 }, { "epoch": 1.6218487394957983, "grad_norm": 0.4804698886540719, "learning_rate": 6.835885680746402e-05, "loss": 0.4174, "step": 965 }, { "epoch": 1.6235294117647059, "grad_norm": 0.4935879054802656, "learning_rate": 6.832573204901427e-05, "loss": 0.4093, "step": 966 }, { "epoch": 1.6252100840336134, "grad_norm": 0.44460364171704714, "learning_rate": 6.829256827983288e-05, "loss": 0.41, "step": 967 }, { "epoch": 1.626890756302521, "grad_norm": 0.317211219012059, "learning_rate": 6.825936554559356e-05, "loss": 0.4106, "step": 968 }, { "epoch": 1.6285714285714286, "grad_norm": 0.25881377770868386, "learning_rate": 6.82261238920238e-05, "loss": 0.4085, "step": 969 }, { "epoch": 1.6302521008403361, "grad_norm": 0.283168518387446, "learning_rate": 6.819284336490462e-05, "loss": 0.4084, "step": 970 }, { "epoch": 1.6319327731092437, "grad_norm": 0.3178958089168236, "learning_rate": 6.815952401007057e-05, "loss": 0.4165, "step": 971 }, { "epoch": 1.6336134453781512, "grad_norm": 0.3800239377925783, "learning_rate": 6.81261658734097e-05, "loss": 0.4256, "step": 972 }, { "epoch": 1.6352941176470588, "grad_norm": 0.3206517062051527, "learning_rate": 6.809276900086346e-05, "loss": 0.4086, "step": 973 }, { "epoch": 1.6369747899159663, "grad_norm": 0.3959976002577841, "learning_rate": 6.805933343842667e-05, "loss": 0.4091, "step": 974 }, { "epoch": 1.638655462184874, "grad_norm": 0.5515957543599064, "learning_rate": 6.80258592321474e-05, "loss": 0.4086, "step": 975 }, { "epoch": 1.6403361344537815, "grad_norm": 0.49877843972783353, "learning_rate": 6.799234642812694e-05, "loss": 0.4194, "step": 976 }, { "epoch": 1.642016806722689, "grad_norm": 0.49175675856276674, "learning_rate": 6.79587950725198e-05, "loss": 0.3999, "step": 977 }, { "epoch": 1.6436974789915966, "grad_norm": 0.5054579643809435, "learning_rate": 6.792520521153351e-05, "loss": 0.4138, "step": 978 }, { "epoch": 1.6453781512605041, "grad_norm": 0.5232176855623527, "learning_rate": 6.789157689142865e-05, "loss": 0.409, "step": 979 }, { "epoch": 1.6470588235294117, "grad_norm": 0.633630813060529, "learning_rate": 6.785791015851881e-05, "loss": 0.4121, "step": 980 }, { "epoch": 1.6487394957983192, "grad_norm": 0.6923853102220983, "learning_rate": 6.782420505917042e-05, "loss": 0.4149, "step": 981 }, { "epoch": 1.6504201680672268, "grad_norm": 0.7564542157032046, "learning_rate": 6.779046163980279e-05, "loss": 0.4114, "step": 982 }, { "epoch": 1.6521008403361344, "grad_norm": 0.8053239262153116, "learning_rate": 6.775667994688796e-05, "loss": 0.4145, "step": 983 }, { "epoch": 1.653781512605042, "grad_norm": 0.7145300572996082, "learning_rate": 6.772286002695076e-05, "loss": 0.4129, "step": 984 }, { "epoch": 1.6554621848739495, "grad_norm": 0.5314972286493588, "learning_rate": 6.768900192656861e-05, "loss": 0.4073, "step": 985 }, { "epoch": 1.657142857142857, "grad_norm": 0.38465020237466074, "learning_rate": 6.765510569237151e-05, "loss": 0.4182, "step": 986 }, { "epoch": 1.6588235294117646, "grad_norm": 0.48613303974881605, "learning_rate": 6.7621171371042e-05, "loss": 0.4063, "step": 987 }, { "epoch": 1.6605042016806721, "grad_norm": 0.4568566513618116, "learning_rate": 6.758719900931506e-05, "loss": 0.4112, "step": 988 }, { "epoch": 1.6621848739495797, "grad_norm": 0.4150376311208336, "learning_rate": 6.755318865397808e-05, "loss": 0.4015, "step": 989 }, { "epoch": 1.6638655462184873, "grad_norm": 0.4054497964107821, "learning_rate": 6.751914035187077e-05, "loss": 0.402, "step": 990 }, { "epoch": 1.6655462184873948, "grad_norm": 0.3753641709999822, "learning_rate": 6.748505414988504e-05, "loss": 0.4054, "step": 991 }, { "epoch": 1.6672268907563024, "grad_norm": 0.3508872451930597, "learning_rate": 6.74509300949651e-05, "loss": 0.4057, "step": 992 }, { "epoch": 1.66890756302521, "grad_norm": 0.4166389218560123, "learning_rate": 6.741676823410724e-05, "loss": 0.4186, "step": 993 }, { "epoch": 1.6705882352941175, "grad_norm": 0.44022476588885284, "learning_rate": 6.738256861435978e-05, "loss": 0.4086, "step": 994 }, { "epoch": 1.6722689075630253, "grad_norm": 0.4757042016843758, "learning_rate": 6.734833128282312e-05, "loss": 0.4182, "step": 995 }, { "epoch": 1.6739495798319328, "grad_norm": 0.4687917991031043, "learning_rate": 6.731405628664954e-05, "loss": 0.4129, "step": 996 }, { "epoch": 1.6756302521008404, "grad_norm": 0.3984847802214829, "learning_rate": 6.727974367304321e-05, "loss": 0.4109, "step": 997 }, { "epoch": 1.677310924369748, "grad_norm": 0.33574131274346647, "learning_rate": 6.724539348926011e-05, "loss": 0.4157, "step": 998 }, { "epoch": 1.6789915966386555, "grad_norm": 0.38321081901046566, "learning_rate": 6.721100578260796e-05, "loss": 0.4166, "step": 999 }, { "epoch": 1.680672268907563, "grad_norm": 0.42919804332591177, "learning_rate": 6.717658060044617e-05, "loss": 0.4069, "step": 1000 }, { "epoch": 1.6823529411764706, "grad_norm": 0.35630904847950917, "learning_rate": 6.714211799018573e-05, "loss": 0.4047, "step": 1001 }, { "epoch": 1.6840336134453782, "grad_norm": 0.3624344135207008, "learning_rate": 6.71076179992892e-05, "loss": 0.4195, "step": 1002 }, { "epoch": 1.6857142857142857, "grad_norm": 0.3484209980291502, "learning_rate": 6.707308067527062e-05, "loss": 0.4138, "step": 1003 }, { "epoch": 1.6873949579831933, "grad_norm": 0.32017538062200657, "learning_rate": 6.703850606569544e-05, "loss": 0.4053, "step": 1004 }, { "epoch": 1.6890756302521008, "grad_norm": 0.30722928888952067, "learning_rate": 6.70038942181805e-05, "loss": 0.4062, "step": 1005 }, { "epoch": 1.6907563025210084, "grad_norm": 0.35271755710667496, "learning_rate": 6.696924518039381e-05, "loss": 0.4058, "step": 1006 }, { "epoch": 1.692436974789916, "grad_norm": 0.5215134167644846, "learning_rate": 6.693455900005475e-05, "loss": 0.4093, "step": 1007 }, { "epoch": 1.6941176470588235, "grad_norm": 0.5439930379553499, "learning_rate": 6.689983572493374e-05, "loss": 0.408, "step": 1008 }, { "epoch": 1.695798319327731, "grad_norm": 0.5317195619586947, "learning_rate": 6.686507540285234e-05, "loss": 0.416, "step": 1009 }, { "epoch": 1.6974789915966386, "grad_norm": 0.5609065394382301, "learning_rate": 6.683027808168314e-05, "loss": 0.404, "step": 1010 }, { "epoch": 1.6991596638655462, "grad_norm": 0.5037816091843502, "learning_rate": 6.679544380934964e-05, "loss": 0.4044, "step": 1011 }, { "epoch": 1.7008403361344537, "grad_norm": 0.3460038637061333, "learning_rate": 6.676057263382627e-05, "loss": 0.4153, "step": 1012 }, { "epoch": 1.7025210084033613, "grad_norm": 0.3658032424656448, "learning_rate": 6.672566460313825e-05, "loss": 0.4047, "step": 1013 }, { "epoch": 1.704201680672269, "grad_norm": 0.4639185347622136, "learning_rate": 6.66907197653616e-05, "loss": 0.4069, "step": 1014 }, { "epoch": 1.7058823529411766, "grad_norm": 0.48909965219791113, "learning_rate": 6.6655738168623e-05, "loss": 0.4171, "step": 1015 }, { "epoch": 1.7075630252100842, "grad_norm": 0.48581151135705253, "learning_rate": 6.662071986109976e-05, "loss": 0.4077, "step": 1016 }, { "epoch": 1.7092436974789917, "grad_norm": 0.49155145678452494, "learning_rate": 6.658566489101976e-05, "loss": 0.4147, "step": 1017 }, { "epoch": 1.7109243697478993, "grad_norm": 0.5900627006481629, "learning_rate": 6.655057330666136e-05, "loss": 0.4018, "step": 1018 }, { "epoch": 1.7126050420168069, "grad_norm": 0.6210947596157823, "learning_rate": 6.651544515635336e-05, "loss": 0.415, "step": 1019 }, { "epoch": 1.7142857142857144, "grad_norm": 0.6632807872704964, "learning_rate": 6.648028048847491e-05, "loss": 0.4144, "step": 1020 }, { "epoch": 1.715966386554622, "grad_norm": 0.6498382081759296, "learning_rate": 6.644507935145542e-05, "loss": 0.41, "step": 1021 }, { "epoch": 1.7176470588235295, "grad_norm": 0.6164678104875658, "learning_rate": 6.64098417937746e-05, "loss": 0.4052, "step": 1022 }, { "epoch": 1.719327731092437, "grad_norm": 0.5869988258279654, "learning_rate": 6.637456786396226e-05, "loss": 0.4028, "step": 1023 }, { "epoch": 1.7210084033613446, "grad_norm": 0.4498721646752205, "learning_rate": 6.633925761059832e-05, "loss": 0.4177, "step": 1024 }, { "epoch": 1.7226890756302522, "grad_norm": 0.36640024269122123, "learning_rate": 6.630391108231273e-05, "loss": 0.4097, "step": 1025 }, { "epoch": 1.7243697478991598, "grad_norm": 0.4488728246225823, "learning_rate": 6.62685283277854e-05, "loss": 0.4105, "step": 1026 }, { "epoch": 1.7260504201680673, "grad_norm": 0.45722394677997796, "learning_rate": 6.623310939574611e-05, "loss": 0.4071, "step": 1027 }, { "epoch": 1.7277310924369749, "grad_norm": 0.42581105129352065, "learning_rate": 6.61976543349745e-05, "loss": 0.4005, "step": 1028 }, { "epoch": 1.7294117647058824, "grad_norm": 0.3838270747410389, "learning_rate": 6.616216319429993e-05, "loss": 0.4121, "step": 1029 }, { "epoch": 1.73109243697479, "grad_norm": 0.33251817119908517, "learning_rate": 6.612663602260147e-05, "loss": 0.408, "step": 1030 }, { "epoch": 1.7327731092436975, "grad_norm": 0.4104068764262944, "learning_rate": 6.609107286880783e-05, "loss": 0.413, "step": 1031 }, { "epoch": 1.734453781512605, "grad_norm": 0.38776707851960157, "learning_rate": 6.605547378189721e-05, "loss": 0.4179, "step": 1032 }, { "epoch": 1.7361344537815127, "grad_norm": 0.27996506606742855, "learning_rate": 6.601983881089742e-05, "loss": 0.4169, "step": 1033 }, { "epoch": 1.7378151260504202, "grad_norm": 0.3413358877822298, "learning_rate": 6.598416800488553e-05, "loss": 0.415, "step": 1034 }, { "epoch": 1.7394957983193278, "grad_norm": 0.3496484862755638, "learning_rate": 6.594846141298809e-05, "loss": 0.4077, "step": 1035 }, { "epoch": 1.7411764705882353, "grad_norm": 0.39060686242996234, "learning_rate": 6.591271908438087e-05, "loss": 0.405, "step": 1036 }, { "epoch": 1.7428571428571429, "grad_norm": 0.4419821881700148, "learning_rate": 6.58769410682889e-05, "loss": 0.407, "step": 1037 }, { "epoch": 1.7445378151260504, "grad_norm": 0.40552351884138244, "learning_rate": 6.584112741398634e-05, "loss": 0.4116, "step": 1038 }, { "epoch": 1.746218487394958, "grad_norm": 0.3976009636187432, "learning_rate": 6.580527817079641e-05, "loss": 0.4056, "step": 1039 }, { "epoch": 1.7478991596638656, "grad_norm": 0.4038480556941658, "learning_rate": 6.576939338809138e-05, "loss": 0.4107, "step": 1040 }, { "epoch": 1.749579831932773, "grad_norm": 0.3485770030628482, "learning_rate": 6.573347311529242e-05, "loss": 0.4066, "step": 1041 }, { "epoch": 1.7512605042016807, "grad_norm": 0.3182582032866884, "learning_rate": 6.569751740186963e-05, "loss": 0.4213, "step": 1042 }, { "epoch": 1.7529411764705882, "grad_norm": 0.3569405068886141, "learning_rate": 6.56615262973419e-05, "loss": 0.4105, "step": 1043 }, { "epoch": 1.7546218487394958, "grad_norm": 0.4175985068358299, "learning_rate": 6.562549985127683e-05, "loss": 0.4084, "step": 1044 }, { "epoch": 1.7563025210084033, "grad_norm": 0.43276334128222926, "learning_rate": 6.558943811329076e-05, "loss": 0.4091, "step": 1045 }, { "epoch": 1.757983193277311, "grad_norm": 0.45499048743282117, "learning_rate": 6.555334113304857e-05, "loss": 0.4033, "step": 1046 }, { "epoch": 1.7596638655462185, "grad_norm": 0.5105925662285289, "learning_rate": 6.55172089602637e-05, "loss": 0.4049, "step": 1047 }, { "epoch": 1.761344537815126, "grad_norm": 0.45306725427548994, "learning_rate": 6.548104164469806e-05, "loss": 0.4113, "step": 1048 }, { "epoch": 1.7630252100840336, "grad_norm": 0.3782847129818191, "learning_rate": 6.544483923616195e-05, "loss": 0.4067, "step": 1049 }, { "epoch": 1.7647058823529411, "grad_norm": 0.47153255474618194, "learning_rate": 6.540860178451403e-05, "loss": 0.4171, "step": 1050 }, { "epoch": 1.7663865546218487, "grad_norm": 0.5242892523721046, "learning_rate": 6.53723293396612e-05, "loss": 0.4112, "step": 1051 }, { "epoch": 1.7680672268907562, "grad_norm": 0.4774439017651856, "learning_rate": 6.533602195155853e-05, "loss": 0.418, "step": 1052 }, { "epoch": 1.7697478991596638, "grad_norm": 0.3407650472823471, "learning_rate": 6.529967967020926e-05, "loss": 0.406, "step": 1053 }, { "epoch": 1.7714285714285714, "grad_norm": 0.46547156362058617, "learning_rate": 6.526330254566466e-05, "loss": 0.4021, "step": 1054 }, { "epoch": 1.773109243697479, "grad_norm": 0.49794373091431476, "learning_rate": 6.522689062802398e-05, "loss": 0.4164, "step": 1055 }, { "epoch": 1.7747899159663865, "grad_norm": 0.3720231152564036, "learning_rate": 6.519044396743442e-05, "loss": 0.4157, "step": 1056 }, { "epoch": 1.776470588235294, "grad_norm": 0.31007203145207884, "learning_rate": 6.515396261409098e-05, "loss": 0.4088, "step": 1057 }, { "epoch": 1.7781512605042016, "grad_norm": 0.34333274320422136, "learning_rate": 6.51174466182365e-05, "loss": 0.3979, "step": 1058 }, { "epoch": 1.7798319327731091, "grad_norm": 0.3270197240737855, "learning_rate": 6.508089603016147e-05, "loss": 0.4026, "step": 1059 }, { "epoch": 1.7815126050420167, "grad_norm": 0.364919530406562, "learning_rate": 6.504431090020406e-05, "loss": 0.4069, "step": 1060 }, { "epoch": 1.7831932773109243, "grad_norm": 0.4155024087284706, "learning_rate": 6.500769127874998e-05, "loss": 0.406, "step": 1061 }, { "epoch": 1.7848739495798318, "grad_norm": 0.4436412931383234, "learning_rate": 6.497103721623251e-05, "loss": 0.4127, "step": 1062 }, { "epoch": 1.7865546218487394, "grad_norm": 0.49657049845554485, "learning_rate": 6.493434876313226e-05, "loss": 0.4051, "step": 1063 }, { "epoch": 1.788235294117647, "grad_norm": 0.5589105303063652, "learning_rate": 6.48976259699773e-05, "loss": 0.414, "step": 1064 }, { "epoch": 1.7899159663865545, "grad_norm": 0.5558049013342893, "learning_rate": 6.486086888734292e-05, "loss": 0.4123, "step": 1065 }, { "epoch": 1.791596638655462, "grad_norm": 0.4748167001808158, "learning_rate": 6.482407756585169e-05, "loss": 0.4066, "step": 1066 }, { "epoch": 1.7932773109243696, "grad_norm": 0.3152583003079238, "learning_rate": 6.478725205617331e-05, "loss": 0.4108, "step": 1067 }, { "epoch": 1.7949579831932772, "grad_norm": 0.2754093499617658, "learning_rate": 6.475039240902455e-05, "loss": 0.4071, "step": 1068 }, { "epoch": 1.7966386554621847, "grad_norm": 0.35445212326853415, "learning_rate": 6.471349867516922e-05, "loss": 0.4069, "step": 1069 }, { "epoch": 1.7983193277310925, "grad_norm": 0.3547604359992653, "learning_rate": 6.467657090541806e-05, "loss": 0.4084, "step": 1070 }, { "epoch": 1.8, "grad_norm": 0.3282291962302917, "learning_rate": 6.46396091506287e-05, "loss": 0.4143, "step": 1071 }, { "epoch": 1.8016806722689076, "grad_norm": 0.33062693961979445, "learning_rate": 6.460261346170556e-05, "loss": 0.4019, "step": 1072 }, { "epoch": 1.8033613445378152, "grad_norm": 0.38735911294838776, "learning_rate": 6.45655838895998e-05, "loss": 0.4081, "step": 1073 }, { "epoch": 1.8050420168067227, "grad_norm": 0.46880100733803637, "learning_rate": 6.452852048530925e-05, "loss": 0.4132, "step": 1074 }, { "epoch": 1.8067226890756303, "grad_norm": 0.5275491589710114, "learning_rate": 6.44914232998783e-05, "loss": 0.4038, "step": 1075 }, { "epoch": 1.8084033613445378, "grad_norm": 0.480671893753054, "learning_rate": 6.445429238439793e-05, "loss": 0.4062, "step": 1076 }, { "epoch": 1.8100840336134454, "grad_norm": 0.37506367186987566, "learning_rate": 6.441712779000552e-05, "loss": 0.4045, "step": 1077 }, { "epoch": 1.811764705882353, "grad_norm": 0.2742173892058475, "learning_rate": 6.437992956788485e-05, "loss": 0.404, "step": 1078 }, { "epoch": 1.8134453781512605, "grad_norm": 0.2776555360285855, "learning_rate": 6.434269776926601e-05, "loss": 0.4098, "step": 1079 }, { "epoch": 1.815126050420168, "grad_norm": 0.30135349682808804, "learning_rate": 6.430543244542532e-05, "loss": 0.411, "step": 1080 }, { "epoch": 1.8168067226890756, "grad_norm": 0.31677884273666573, "learning_rate": 6.426813364768531e-05, "loss": 0.4053, "step": 1081 }, { "epoch": 1.8184873949579832, "grad_norm": 0.39670409554130154, "learning_rate": 6.423080142741458e-05, "loss": 0.4074, "step": 1082 }, { "epoch": 1.8201680672268907, "grad_norm": 0.48753394926770094, "learning_rate": 6.419343583602776e-05, "loss": 0.4057, "step": 1083 }, { "epoch": 1.8218487394957983, "grad_norm": 0.5289474629704387, "learning_rate": 6.415603692498548e-05, "loss": 0.413, "step": 1084 }, { "epoch": 1.8235294117647058, "grad_norm": 0.6504797026611978, "learning_rate": 6.411860474579416e-05, "loss": 0.4063, "step": 1085 }, { "epoch": 1.8252100840336134, "grad_norm": 0.8026664402469593, "learning_rate": 6.408113935000614e-05, "loss": 0.4057, "step": 1086 }, { "epoch": 1.826890756302521, "grad_norm": 0.7651908293505008, "learning_rate": 6.40436407892195e-05, "loss": 0.416, "step": 1087 }, { "epoch": 1.8285714285714287, "grad_norm": 0.6066154896171815, "learning_rate": 6.400610911507789e-05, "loss": 0.4041, "step": 1088 }, { "epoch": 1.8302521008403363, "grad_norm": 0.4692501942653527, "learning_rate": 6.39685443792707e-05, "loss": 0.4071, "step": 1089 }, { "epoch": 1.8319327731092439, "grad_norm": 0.3457770074504733, "learning_rate": 6.393094663353277e-05, "loss": 0.4123, "step": 1090 }, { "epoch": 1.8336134453781514, "grad_norm": 0.293991150574311, "learning_rate": 6.389331592964442e-05, "loss": 0.4038, "step": 1091 }, { "epoch": 1.835294117647059, "grad_norm": 0.2976574778853427, "learning_rate": 6.385565231943137e-05, "loss": 0.4103, "step": 1092 }, { "epoch": 1.8369747899159665, "grad_norm": 0.40173935227637725, "learning_rate": 6.381795585476465e-05, "loss": 0.4076, "step": 1093 }, { "epoch": 1.838655462184874, "grad_norm": 0.4538056712214925, "learning_rate": 6.378022658756054e-05, "loss": 0.4116, "step": 1094 }, { "epoch": 1.8403361344537816, "grad_norm": 0.5178820860995883, "learning_rate": 6.374246456978051e-05, "loss": 0.4048, "step": 1095 }, { "epoch": 1.8420168067226892, "grad_norm": 0.5549696967085782, "learning_rate": 6.370466985343112e-05, "loss": 0.4036, "step": 1096 }, { "epoch": 1.8436974789915967, "grad_norm": 0.5600677491206888, "learning_rate": 6.366684249056394e-05, "loss": 0.4045, "step": 1097 }, { "epoch": 1.8453781512605043, "grad_norm": 0.5862033705262517, "learning_rate": 6.362898253327555e-05, "loss": 0.4053, "step": 1098 }, { "epoch": 1.8470588235294119, "grad_norm": 0.4857693356738776, "learning_rate": 6.359109003370739e-05, "loss": 0.4076, "step": 1099 }, { "epoch": 1.8487394957983194, "grad_norm": 0.34000993805995466, "learning_rate": 6.355316504404572e-05, "loss": 0.4134, "step": 1100 }, { "epoch": 1.850420168067227, "grad_norm": 0.3711672464519246, "learning_rate": 6.351520761652155e-05, "loss": 0.4172, "step": 1101 }, { "epoch": 1.8521008403361345, "grad_norm": 0.49679383940020555, "learning_rate": 6.347721780341058e-05, "loss": 0.4185, "step": 1102 }, { "epoch": 1.853781512605042, "grad_norm": 0.5331054748206874, "learning_rate": 6.343919565703308e-05, "loss": 0.4114, "step": 1103 }, { "epoch": 1.8554621848739496, "grad_norm": 0.48476331907173875, "learning_rate": 6.340114122975388e-05, "loss": 0.4153, "step": 1104 }, { "epoch": 1.8571428571428572, "grad_norm": 0.4473214323714141, "learning_rate": 6.336305457398224e-05, "loss": 0.4139, "step": 1105 }, { "epoch": 1.8588235294117648, "grad_norm": 0.4506666094780096, "learning_rate": 6.332493574217182e-05, "loss": 0.4093, "step": 1106 }, { "epoch": 1.8605042016806723, "grad_norm": 0.48511819080241986, "learning_rate": 6.32867847868206e-05, "loss": 0.4048, "step": 1107 }, { "epoch": 1.8621848739495799, "grad_norm": 0.5475585052943874, "learning_rate": 6.324860176047079e-05, "loss": 0.4045, "step": 1108 }, { "epoch": 1.8638655462184874, "grad_norm": 0.527970303154566, "learning_rate": 6.321038671570879e-05, "loss": 0.4072, "step": 1109 }, { "epoch": 1.865546218487395, "grad_norm": 0.373410916593926, "learning_rate": 6.317213970516504e-05, "loss": 0.4057, "step": 1110 }, { "epoch": 1.8672268907563025, "grad_norm": 0.23917059010099603, "learning_rate": 6.313386078151408e-05, "loss": 0.4096, "step": 1111 }, { "epoch": 1.86890756302521, "grad_norm": 0.3321515868485195, "learning_rate": 6.309554999747435e-05, "loss": 0.4076, "step": 1112 }, { "epoch": 1.8705882352941177, "grad_norm": 0.37452363185696064, "learning_rate": 6.305720740580819e-05, "loss": 0.4169, "step": 1113 }, { "epoch": 1.8722689075630252, "grad_norm": 0.34776554655093933, "learning_rate": 6.301883305932173e-05, "loss": 0.4154, "step": 1114 }, { "epoch": 1.8739495798319328, "grad_norm": 0.36794186782548405, "learning_rate": 6.298042701086484e-05, "loss": 0.4173, "step": 1115 }, { "epoch": 1.8756302521008403, "grad_norm": 0.4566078956072596, "learning_rate": 6.294198931333106e-05, "loss": 0.4121, "step": 1116 }, { "epoch": 1.877310924369748, "grad_norm": 0.4345380952178851, "learning_rate": 6.290352001965753e-05, "loss": 0.4045, "step": 1117 }, { "epoch": 1.8789915966386554, "grad_norm": 0.39740657863558243, "learning_rate": 6.286501918282486e-05, "loss": 0.4098, "step": 1118 }, { "epoch": 1.880672268907563, "grad_norm": 0.4551861890828323, "learning_rate": 6.282648685585713e-05, "loss": 0.4128, "step": 1119 }, { "epoch": 1.8823529411764706, "grad_norm": 0.44878018804937375, "learning_rate": 6.27879230918218e-05, "loss": 0.4094, "step": 1120 }, { "epoch": 1.8840336134453781, "grad_norm": 0.4031871337521206, "learning_rate": 6.27493279438296e-05, "loss": 0.4048, "step": 1121 }, { "epoch": 1.8857142857142857, "grad_norm": 0.36214614959972447, "learning_rate": 6.27107014650345e-05, "loss": 0.4052, "step": 1122 }, { "epoch": 1.8873949579831932, "grad_norm": 0.29777758494934653, "learning_rate": 6.26720437086336e-05, "loss": 0.4053, "step": 1123 }, { "epoch": 1.8890756302521008, "grad_norm": 0.2440520882175115, "learning_rate": 6.263335472786711e-05, "loss": 0.4069, "step": 1124 }, { "epoch": 1.8907563025210083, "grad_norm": 0.25745066013711687, "learning_rate": 6.259463457601822e-05, "loss": 0.4087, "step": 1125 }, { "epoch": 1.892436974789916, "grad_norm": 0.3369143654201446, "learning_rate": 6.255588330641304e-05, "loss": 0.4111, "step": 1126 }, { "epoch": 1.8941176470588235, "grad_norm": 0.44064963497237586, "learning_rate": 6.251710097242055e-05, "loss": 0.4138, "step": 1127 }, { "epoch": 1.895798319327731, "grad_norm": 0.4790151750565546, "learning_rate": 6.247828762745254e-05, "loss": 0.4062, "step": 1128 }, { "epoch": 1.8974789915966386, "grad_norm": 0.4140479613293109, "learning_rate": 6.243944332496345e-05, "loss": 0.416, "step": 1129 }, { "epoch": 1.8991596638655461, "grad_norm": 0.36105335203802014, "learning_rate": 6.24005681184504e-05, "loss": 0.4099, "step": 1130 }, { "epoch": 1.9008403361344537, "grad_norm": 0.3598186233030016, "learning_rate": 6.236166206145302e-05, "loss": 0.4087, "step": 1131 }, { "epoch": 1.9025210084033612, "grad_norm": 0.34647966559803706, "learning_rate": 6.23227252075535e-05, "loss": 0.4062, "step": 1132 }, { "epoch": 1.9042016806722688, "grad_norm": 0.3432171076789625, "learning_rate": 6.228375761037641e-05, "loss": 0.4059, "step": 1133 }, { "epoch": 1.9058823529411764, "grad_norm": 0.30982677399528985, "learning_rate": 6.224475932358865e-05, "loss": 0.412, "step": 1134 }, { "epoch": 1.907563025210084, "grad_norm": 0.382598134665834, "learning_rate": 6.22057304008994e-05, "loss": 0.406, "step": 1135 }, { "epoch": 1.9092436974789915, "grad_norm": 0.464885861442905, "learning_rate": 6.216667089606001e-05, "loss": 0.4086, "step": 1136 }, { "epoch": 1.910924369747899, "grad_norm": 0.4456349830892416, "learning_rate": 6.212758086286398e-05, "loss": 0.4122, "step": 1137 }, { "epoch": 1.9126050420168066, "grad_norm": 0.4331174998756414, "learning_rate": 6.208846035514684e-05, "loss": 0.406, "step": 1138 }, { "epoch": 1.9142857142857141, "grad_norm": 0.40321613856325317, "learning_rate": 6.204930942678609e-05, "loss": 0.4127, "step": 1139 }, { "epoch": 1.9159663865546217, "grad_norm": 0.43278762604318183, "learning_rate": 6.201012813170113e-05, "loss": 0.4148, "step": 1140 }, { "epoch": 1.9176470588235293, "grad_norm": 0.47361084431774975, "learning_rate": 6.197091652385317e-05, "loss": 0.4192, "step": 1141 }, { "epoch": 1.9193277310924368, "grad_norm": 0.48574953579524766, "learning_rate": 6.193167465724516e-05, "loss": 0.4092, "step": 1142 }, { "epoch": 1.9210084033613444, "grad_norm": 0.43926791807866444, "learning_rate": 6.189240258592178e-05, "loss": 0.4008, "step": 1143 }, { "epoch": 1.9226890756302522, "grad_norm": 0.37649448891212156, "learning_rate": 6.185310036396923e-05, "loss": 0.3982, "step": 1144 }, { "epoch": 1.9243697478991597, "grad_norm": 0.3557681198380595, "learning_rate": 6.181376804551526e-05, "loss": 0.4079, "step": 1145 }, { "epoch": 1.9260504201680673, "grad_norm": 0.27146767296184693, "learning_rate": 6.177440568472913e-05, "loss": 0.4112, "step": 1146 }, { "epoch": 1.9277310924369748, "grad_norm": 0.21108862307484616, "learning_rate": 6.173501333582138e-05, "loss": 0.3976, "step": 1147 }, { "epoch": 1.9294117647058824, "grad_norm": 0.2350870948682723, "learning_rate": 6.169559105304391e-05, "loss": 0.401, "step": 1148 }, { "epoch": 1.93109243697479, "grad_norm": 0.28975269017378863, "learning_rate": 6.165613889068984e-05, "loss": 0.4077, "step": 1149 }, { "epoch": 1.9327731092436975, "grad_norm": 0.3467584985217818, "learning_rate": 6.16166569030934e-05, "loss": 0.4087, "step": 1150 }, { "epoch": 1.934453781512605, "grad_norm": 0.36919494980457157, "learning_rate": 6.157714514462998e-05, "loss": 0.4041, "step": 1151 }, { "epoch": 1.9361344537815126, "grad_norm": 0.38486660939180384, "learning_rate": 6.153760366971586e-05, "loss": 0.4051, "step": 1152 }, { "epoch": 1.9378151260504202, "grad_norm": 0.41381017198659764, "learning_rate": 6.149803253280834e-05, "loss": 0.404, "step": 1153 }, { "epoch": 1.9394957983193277, "grad_norm": 0.3792443151636761, "learning_rate": 6.145843178840553e-05, "loss": 0.4021, "step": 1154 }, { "epoch": 1.9411764705882353, "grad_norm": 0.40760085772611065, "learning_rate": 6.14188014910463e-05, "loss": 0.4096, "step": 1155 }, { "epoch": 1.9428571428571428, "grad_norm": 0.4961577087329667, "learning_rate": 6.137914169531028e-05, "loss": 0.4086, "step": 1156 }, { "epoch": 1.9445378151260504, "grad_norm": 0.4735829866568803, "learning_rate": 6.133945245581765e-05, "loss": 0.4103, "step": 1157 }, { "epoch": 1.946218487394958, "grad_norm": 0.4588273066055638, "learning_rate": 6.12997338272292e-05, "loss": 0.4016, "step": 1158 }, { "epoch": 1.9478991596638655, "grad_norm": 0.49597906658466195, "learning_rate": 6.125998586424616e-05, "loss": 0.4131, "step": 1159 }, { "epoch": 1.949579831932773, "grad_norm": 0.4825166085634964, "learning_rate": 6.122020862161018e-05, "loss": 0.4045, "step": 1160 }, { "epoch": 1.9512605042016806, "grad_norm": 0.42201686031659064, "learning_rate": 6.118040215410324e-05, "loss": 0.4035, "step": 1161 }, { "epoch": 1.9529411764705882, "grad_norm": 0.39049503542155506, "learning_rate": 6.114056651654754e-05, "loss": 0.4087, "step": 1162 }, { "epoch": 1.954621848739496, "grad_norm": 0.4154455411821173, "learning_rate": 6.110070176380547e-05, "loss": 0.4033, "step": 1163 }, { "epoch": 1.9563025210084035, "grad_norm": 0.36024309570492263, "learning_rate": 6.106080795077952e-05, "loss": 0.399, "step": 1164 }, { "epoch": 1.957983193277311, "grad_norm": 0.3187760180751725, "learning_rate": 6.1020885132412225e-05, "loss": 0.3992, "step": 1165 }, { "epoch": 1.9596638655462186, "grad_norm": 0.32421581686234596, "learning_rate": 6.0980933363686004e-05, "loss": 0.412, "step": 1166 }, { "epoch": 1.9613445378151262, "grad_norm": 0.3755574972134325, "learning_rate": 6.094095269962321e-05, "loss": 0.4062, "step": 1167 }, { "epoch": 1.9630252100840337, "grad_norm": 0.4486207147791569, "learning_rate": 6.090094319528595e-05, "loss": 0.4071, "step": 1168 }, { "epoch": 1.9647058823529413, "grad_norm": 0.4009492194826018, "learning_rate": 6.086090490577608e-05, "loss": 0.4101, "step": 1169 }, { "epoch": 1.9663865546218489, "grad_norm": 0.29953883240606194, "learning_rate": 6.082083788623508e-05, "loss": 0.4041, "step": 1170 }, { "epoch": 1.9680672268907564, "grad_norm": 0.48429221435144637, "learning_rate": 6.078074219184402e-05, "loss": 0.4116, "step": 1171 }, { "epoch": 1.969747899159664, "grad_norm": 0.5874034788982692, "learning_rate": 6.0740617877823414e-05, "loss": 0.4005, "step": 1172 }, { "epoch": 1.9714285714285715, "grad_norm": 0.5294876325733971, "learning_rate": 6.070046499943325e-05, "loss": 0.4026, "step": 1173 }, { "epoch": 1.973109243697479, "grad_norm": 0.5108356415748515, "learning_rate": 6.066028361197282e-05, "loss": 0.4118, "step": 1174 }, { "epoch": 1.9747899159663866, "grad_norm": 0.5460706566089084, "learning_rate": 6.062007377078068e-05, "loss": 0.4138, "step": 1175 }, { "epoch": 1.9764705882352942, "grad_norm": 0.5582404242629482, "learning_rate": 6.05798355312346e-05, "loss": 0.3972, "step": 1176 }, { "epoch": 1.9781512605042018, "grad_norm": 0.4956188977266891, "learning_rate": 6.053956894875142e-05, "loss": 0.4049, "step": 1177 }, { "epoch": 1.9798319327731093, "grad_norm": 0.4751702512523244, "learning_rate": 6.049927407878705e-05, "loss": 0.4143, "step": 1178 }, { "epoch": 1.9815126050420169, "grad_norm": 0.4255926512107527, "learning_rate": 6.045895097683632e-05, "loss": 0.3972, "step": 1179 }, { "epoch": 1.9831932773109244, "grad_norm": 0.5258230431084419, "learning_rate": 6.041859969843301e-05, "loss": 0.4091, "step": 1180 }, { "epoch": 1.984873949579832, "grad_norm": 0.6302532729213637, "learning_rate": 6.037822029914962e-05, "loss": 0.4098, "step": 1181 }, { "epoch": 1.9865546218487395, "grad_norm": 0.5358471082744432, "learning_rate": 6.033781283459744e-05, "loss": 0.4044, "step": 1182 }, { "epoch": 1.988235294117647, "grad_norm": 0.38701682915836905, "learning_rate": 6.029737736042638e-05, "loss": 0.4035, "step": 1183 }, { "epoch": 1.9899159663865547, "grad_norm": 0.2555540002416816, "learning_rate": 6.025691393232494e-05, "loss": 0.4012, "step": 1184 }, { "epoch": 1.9915966386554622, "grad_norm": 0.3642800201722641, "learning_rate": 6.0216422606020126e-05, "loss": 0.399, "step": 1185 }, { "epoch": 1.9932773109243698, "grad_norm": 0.416822708519629, "learning_rate": 6.017590343727733e-05, "loss": 0.398, "step": 1186 }, { "epoch": 1.9949579831932773, "grad_norm": 0.3978204292341643, "learning_rate": 6.013535648190035e-05, "loss": 0.406, "step": 1187 }, { "epoch": 1.9966386554621849, "grad_norm": 0.35319552537688326, "learning_rate": 6.009478179573119e-05, "loss": 0.3963, "step": 1188 }, { "epoch": 1.9983193277310924, "grad_norm": 0.2831258708575588, "learning_rate": 6.005417943465007e-05, "loss": 0.4022, "step": 1189 }, { "epoch": 2.0, "grad_norm": 0.2691150873396986, "learning_rate": 6.001354945457534e-05, "loss": 0.3974, "step": 1190 }, { "epoch": 2.0016806722689076, "grad_norm": 0.3178879066227943, "learning_rate": 5.997289191146337e-05, "loss": 0.3814, "step": 1191 }, { "epoch": 2.003361344537815, "grad_norm": 0.40159734107815537, "learning_rate": 5.993220686130849e-05, "loss": 0.383, "step": 1192 }, { "epoch": 2.0050420168067227, "grad_norm": 0.43975456591215567, "learning_rate": 5.989149436014293e-05, "loss": 0.3759, "step": 1193 }, { "epoch": 2.0067226890756302, "grad_norm": 0.44848961349205874, "learning_rate": 5.9850754464036694e-05, "loss": 0.3886, "step": 1194 }, { "epoch": 2.008403361344538, "grad_norm": 0.37724451256137465, "learning_rate": 5.980998722909755e-05, "loss": 0.3774, "step": 1195 }, { "epoch": 2.0100840336134453, "grad_norm": 0.43962662926232987, "learning_rate": 5.976919271147089e-05, "loss": 0.3822, "step": 1196 }, { "epoch": 2.011764705882353, "grad_norm": 0.49501795822730993, "learning_rate": 5.972837096733971e-05, "loss": 0.3893, "step": 1197 }, { "epoch": 2.0134453781512605, "grad_norm": 0.3664019290605734, "learning_rate": 5.968752205292445e-05, "loss": 0.3681, "step": 1198 }, { "epoch": 2.015126050420168, "grad_norm": 0.2971911249256198, "learning_rate": 5.964664602448304e-05, "loss": 0.3852, "step": 1199 }, { "epoch": 2.0168067226890756, "grad_norm": 0.26306827316422504, "learning_rate": 5.9605742938310714e-05, "loss": 0.3749, "step": 1200 }, { "epoch": 2.018487394957983, "grad_norm": 0.3391507154824733, "learning_rate": 5.956481285073995e-05, "loss": 0.3781, "step": 1201 }, { "epoch": 2.0201680672268907, "grad_norm": 0.3839961182506537, "learning_rate": 5.952385581814044e-05, "loss": 0.3708, "step": 1202 }, { "epoch": 2.0218487394957982, "grad_norm": 0.3812151072813323, "learning_rate": 5.948287189691899e-05, "loss": 0.3751, "step": 1203 }, { "epoch": 2.023529411764706, "grad_norm": 0.39155760218370594, "learning_rate": 5.944186114351944e-05, "loss": 0.3851, "step": 1204 }, { "epoch": 2.0252100840336134, "grad_norm": 0.4705594240564956, "learning_rate": 5.940082361442255e-05, "loss": 0.3728, "step": 1205 }, { "epoch": 2.026890756302521, "grad_norm": 0.5492538239381354, "learning_rate": 5.9359759366145985e-05, "loss": 0.3761, "step": 1206 }, { "epoch": 2.0285714285714285, "grad_norm": 0.5931029853263909, "learning_rate": 5.9318668455244204e-05, "loss": 0.37, "step": 1207 }, { "epoch": 2.030252100840336, "grad_norm": 0.5907962737607789, "learning_rate": 5.9277550938308384e-05, "loss": 0.3881, "step": 1208 }, { "epoch": 2.0319327731092436, "grad_norm": 0.46006730487568204, "learning_rate": 5.9236406871966335e-05, "loss": 0.3921, "step": 1209 }, { "epoch": 2.033613445378151, "grad_norm": 0.33750526421597243, "learning_rate": 5.919523631288244e-05, "loss": 0.3747, "step": 1210 }, { "epoch": 2.0352941176470587, "grad_norm": 0.3702670342420941, "learning_rate": 5.915403931775759e-05, "loss": 0.3856, "step": 1211 }, { "epoch": 2.0369747899159663, "grad_norm": 0.4758786435732811, "learning_rate": 5.911281594332904e-05, "loss": 0.3883, "step": 1212 }, { "epoch": 2.038655462184874, "grad_norm": 0.5205114671638192, "learning_rate": 5.907156624637041e-05, "loss": 0.382, "step": 1213 }, { "epoch": 2.0403361344537814, "grad_norm": 0.4300922461833762, "learning_rate": 5.9030290283691536e-05, "loss": 0.3757, "step": 1214 }, { "epoch": 2.042016806722689, "grad_norm": 0.41054950611381713, "learning_rate": 5.8988988112138496e-05, "loss": 0.3782, "step": 1215 }, { "epoch": 2.0436974789915965, "grad_norm": 0.3572881729677034, "learning_rate": 5.894765978859338e-05, "loss": 0.3873, "step": 1216 }, { "epoch": 2.045378151260504, "grad_norm": 0.3243966992254374, "learning_rate": 5.8906305369974344e-05, "loss": 0.3822, "step": 1217 }, { "epoch": 2.0470588235294116, "grad_norm": 0.4383544697282829, "learning_rate": 5.8864924913235476e-05, "loss": 0.3762, "step": 1218 }, { "epoch": 2.048739495798319, "grad_norm": 0.41159033081781304, "learning_rate": 5.8823518475366714e-05, "loss": 0.374, "step": 1219 }, { "epoch": 2.0504201680672267, "grad_norm": 0.258100878014769, "learning_rate": 5.8782086113393794e-05, "loss": 0.3767, "step": 1220 }, { "epoch": 2.0521008403361343, "grad_norm": 0.313500606226554, "learning_rate": 5.8740627884378134e-05, "loss": 0.3784, "step": 1221 }, { "epoch": 2.053781512605042, "grad_norm": 0.4588102441733581, "learning_rate": 5.86991438454168e-05, "loss": 0.3738, "step": 1222 }, { "epoch": 2.0554621848739494, "grad_norm": 0.4697904483661579, "learning_rate": 5.865763405364239e-05, "loss": 0.3806, "step": 1223 }, { "epoch": 2.057142857142857, "grad_norm": 0.4025182105508747, "learning_rate": 5.8616098566222956e-05, "loss": 0.367, "step": 1224 }, { "epoch": 2.0588235294117645, "grad_norm": 0.401794601435679, "learning_rate": 5.857453744036196e-05, "loss": 0.3761, "step": 1225 }, { "epoch": 2.060504201680672, "grad_norm": 0.4235390238155572, "learning_rate": 5.853295073329818e-05, "loss": 0.3838, "step": 1226 }, { "epoch": 2.0621848739495796, "grad_norm": 0.39370500103793277, "learning_rate": 5.8491338502305606e-05, "loss": 0.3757, "step": 1227 }, { "epoch": 2.063865546218487, "grad_norm": 0.3290785268459111, "learning_rate": 5.8449700804693386e-05, "loss": 0.3799, "step": 1228 }, { "epoch": 2.065546218487395, "grad_norm": 0.3379339823879967, "learning_rate": 5.840803769780573e-05, "loss": 0.3828, "step": 1229 }, { "epoch": 2.0672268907563027, "grad_norm": 0.30814572060213824, "learning_rate": 5.8366349239021854e-05, "loss": 0.3871, "step": 1230 }, { "epoch": 2.0689075630252103, "grad_norm": 0.2789455977062216, "learning_rate": 5.8324635485755916e-05, "loss": 0.3758, "step": 1231 }, { "epoch": 2.070588235294118, "grad_norm": 0.2826230812112666, "learning_rate": 5.828289649545685e-05, "loss": 0.3861, "step": 1232 }, { "epoch": 2.0722689075630254, "grad_norm": 0.3136435671873403, "learning_rate": 5.824113232560839e-05, "loss": 0.3722, "step": 1233 }, { "epoch": 2.073949579831933, "grad_norm": 0.41071857622434593, "learning_rate": 5.8199343033728955e-05, "loss": 0.3862, "step": 1234 }, { "epoch": 2.0756302521008405, "grad_norm": 0.4095938590516487, "learning_rate": 5.8157528677371526e-05, "loss": 0.377, "step": 1235 }, { "epoch": 2.077310924369748, "grad_norm": 0.31023150449033826, "learning_rate": 5.8115689314123625e-05, "loss": 0.3792, "step": 1236 }, { "epoch": 2.0789915966386556, "grad_norm": 0.29926793242050576, "learning_rate": 5.807382500160721e-05, "loss": 0.3815, "step": 1237 }, { "epoch": 2.080672268907563, "grad_norm": 0.3073161396642086, "learning_rate": 5.8031935797478604e-05, "loss": 0.3809, "step": 1238 }, { "epoch": 2.0823529411764707, "grad_norm": 0.30157980300386433, "learning_rate": 5.799002175942841e-05, "loss": 0.3902, "step": 1239 }, { "epoch": 2.0840336134453783, "grad_norm": 0.33759810683529456, "learning_rate": 5.794808294518142e-05, "loss": 0.3769, "step": 1240 }, { "epoch": 2.085714285714286, "grad_norm": 0.35667724259028255, "learning_rate": 5.7906119412496565e-05, "loss": 0.3757, "step": 1241 }, { "epoch": 2.0873949579831934, "grad_norm": 0.34116696743246494, "learning_rate": 5.786413121916682e-05, "loss": 0.3843, "step": 1242 }, { "epoch": 2.089075630252101, "grad_norm": 0.36460125585086306, "learning_rate": 5.7822118423019115e-05, "loss": 0.3752, "step": 1243 }, { "epoch": 2.0907563025210085, "grad_norm": 0.35008848529801423, "learning_rate": 5.778008108191425e-05, "loss": 0.3827, "step": 1244 }, { "epoch": 2.092436974789916, "grad_norm": 0.33806241290713485, "learning_rate": 5.7738019253746846e-05, "loss": 0.3734, "step": 1245 }, { "epoch": 2.0941176470588236, "grad_norm": 0.35473489247783885, "learning_rate": 5.769593299644526e-05, "loss": 0.3903, "step": 1246 }, { "epoch": 2.095798319327731, "grad_norm": 0.36686973502281556, "learning_rate": 5.7653822367971475e-05, "loss": 0.3813, "step": 1247 }, { "epoch": 2.0974789915966388, "grad_norm": 0.3190433522683223, "learning_rate": 5.761168742632103e-05, "loss": 0.3891, "step": 1248 }, { "epoch": 2.0991596638655463, "grad_norm": 0.275585792962165, "learning_rate": 5.756952822952295e-05, "loss": 0.3678, "step": 1249 }, { "epoch": 2.100840336134454, "grad_norm": 0.2586467205526234, "learning_rate": 5.752734483563971e-05, "loss": 0.3889, "step": 1250 }, { "epoch": 2.1025210084033614, "grad_norm": 0.3458841989324355, "learning_rate": 5.748513730276705e-05, "loss": 0.3835, "step": 1251 }, { "epoch": 2.104201680672269, "grad_norm": 0.4525059933771688, "learning_rate": 5.744290568903399e-05, "loss": 0.3782, "step": 1252 }, { "epoch": 2.1058823529411765, "grad_norm": 0.40066844052041584, "learning_rate": 5.740065005260269e-05, "loss": 0.3768, "step": 1253 }, { "epoch": 2.107563025210084, "grad_norm": 0.3208061251766291, "learning_rate": 5.7358370451668414e-05, "loss": 0.3749, "step": 1254 }, { "epoch": 2.1092436974789917, "grad_norm": 0.3264339315109501, "learning_rate": 5.7316066944459434e-05, "loss": 0.3787, "step": 1255 }, { "epoch": 2.110924369747899, "grad_norm": 0.32014184759341685, "learning_rate": 5.727373958923692e-05, "loss": 0.3806, "step": 1256 }, { "epoch": 2.1126050420168068, "grad_norm": 0.3186672095800306, "learning_rate": 5.7231388444294926e-05, "loss": 0.3816, "step": 1257 }, { "epoch": 2.1142857142857143, "grad_norm": 0.29791392903166064, "learning_rate": 5.718901356796022e-05, "loss": 0.3881, "step": 1258 }, { "epoch": 2.115966386554622, "grad_norm": 0.25264579747617383, "learning_rate": 5.7146615018592306e-05, "loss": 0.3821, "step": 1259 }, { "epoch": 2.1176470588235294, "grad_norm": 0.3264965734487081, "learning_rate": 5.710419285458324e-05, "loss": 0.3768, "step": 1260 }, { "epoch": 2.119327731092437, "grad_norm": 0.4106233721974186, "learning_rate": 5.706174713435764e-05, "loss": 0.3961, "step": 1261 }, { "epoch": 2.1210084033613446, "grad_norm": 0.4327238029976947, "learning_rate": 5.701927791637255e-05, "loss": 0.3756, "step": 1262 }, { "epoch": 2.122689075630252, "grad_norm": 0.38801340958140595, "learning_rate": 5.697678525911737e-05, "loss": 0.3837, "step": 1263 }, { "epoch": 2.1243697478991597, "grad_norm": 0.2743978248009578, "learning_rate": 5.69342692211138e-05, "loss": 0.3804, "step": 1264 }, { "epoch": 2.1260504201680672, "grad_norm": 0.24521347779105496, "learning_rate": 5.6891729860915725e-05, "loss": 0.3725, "step": 1265 }, { "epoch": 2.127731092436975, "grad_norm": 0.3072946195472766, "learning_rate": 5.684916723710914e-05, "loss": 0.3817, "step": 1266 }, { "epoch": 2.1294117647058823, "grad_norm": 0.27709420318459377, "learning_rate": 5.680658140831211e-05, "loss": 0.3903, "step": 1267 }, { "epoch": 2.13109243697479, "grad_norm": 0.2659911189260197, "learning_rate": 5.6763972433174625e-05, "loss": 0.3835, "step": 1268 }, { "epoch": 2.1327731092436975, "grad_norm": 0.29146870719908946, "learning_rate": 5.672134037037858e-05, "loss": 0.3762, "step": 1269 }, { "epoch": 2.134453781512605, "grad_norm": 0.22828385931464878, "learning_rate": 5.667868527863764e-05, "loss": 0.3712, "step": 1270 }, { "epoch": 2.1361344537815126, "grad_norm": 0.21829610671647512, "learning_rate": 5.663600721669722e-05, "loss": 0.3819, "step": 1271 }, { "epoch": 2.13781512605042, "grad_norm": 0.29583066086477505, "learning_rate": 5.659330624333433e-05, "loss": 0.3919, "step": 1272 }, { "epoch": 2.1394957983193277, "grad_norm": 0.30252561969822883, "learning_rate": 5.655058241735757e-05, "loss": 0.3746, "step": 1273 }, { "epoch": 2.1411764705882352, "grad_norm": 0.29464788903121375, "learning_rate": 5.6507835797607006e-05, "loss": 0.3861, "step": 1274 }, { "epoch": 2.142857142857143, "grad_norm": 0.2962456932133165, "learning_rate": 5.646506644295405e-05, "loss": 0.3761, "step": 1275 }, { "epoch": 2.1445378151260504, "grad_norm": 0.3236169717011634, "learning_rate": 5.642227441230152e-05, "loss": 0.3761, "step": 1276 }, { "epoch": 2.146218487394958, "grad_norm": 0.28327976012067646, "learning_rate": 5.6379459764583355e-05, "loss": 0.3766, "step": 1277 }, { "epoch": 2.1478991596638655, "grad_norm": 0.19078157389198933, "learning_rate": 5.633662255876472e-05, "loss": 0.3754, "step": 1278 }, { "epoch": 2.149579831932773, "grad_norm": 0.22035749734473303, "learning_rate": 5.6293762853841814e-05, "loss": 0.3819, "step": 1279 }, { "epoch": 2.1512605042016806, "grad_norm": 0.29372371139667897, "learning_rate": 5.6250880708841825e-05, "loss": 0.3862, "step": 1280 }, { "epoch": 2.152941176470588, "grad_norm": 0.33803745370149546, "learning_rate": 5.620797618282286e-05, "loss": 0.3829, "step": 1281 }, { "epoch": 2.1546218487394957, "grad_norm": 0.3341321180203604, "learning_rate": 5.616504933487385e-05, "loss": 0.3794, "step": 1282 }, { "epoch": 2.1563025210084033, "grad_norm": 0.2436701894392329, "learning_rate": 5.612210022411443e-05, "loss": 0.3748, "step": 1283 }, { "epoch": 2.157983193277311, "grad_norm": 0.25073843078131947, "learning_rate": 5.607912890969495e-05, "loss": 0.3769, "step": 1284 }, { "epoch": 2.1596638655462184, "grad_norm": 0.29664226463474375, "learning_rate": 5.6036135450796294e-05, "loss": 0.3832, "step": 1285 }, { "epoch": 2.161344537815126, "grad_norm": 0.24958463872415035, "learning_rate": 5.599311990662989e-05, "loss": 0.3807, "step": 1286 }, { "epoch": 2.1630252100840335, "grad_norm": 0.2888128070973308, "learning_rate": 5.595008233643752e-05, "loss": 0.3769, "step": 1287 }, { "epoch": 2.164705882352941, "grad_norm": 0.29508680807732013, "learning_rate": 5.590702279949136e-05, "loss": 0.3772, "step": 1288 }, { "epoch": 2.1663865546218486, "grad_norm": 0.22476787847749835, "learning_rate": 5.5863941355093814e-05, "loss": 0.3758, "step": 1289 }, { "epoch": 2.168067226890756, "grad_norm": 0.2388514509299122, "learning_rate": 5.5820838062577456e-05, "loss": 0.3828, "step": 1290 }, { "epoch": 2.1697478991596637, "grad_norm": 0.24628358924915944, "learning_rate": 5.577771298130494e-05, "loss": 0.3759, "step": 1291 }, { "epoch": 2.1714285714285713, "grad_norm": 0.26985855589441854, "learning_rate": 5.573456617066896e-05, "loss": 0.3858, "step": 1292 }, { "epoch": 2.173109243697479, "grad_norm": 0.3225328502925552, "learning_rate": 5.569139769009211e-05, "loss": 0.3859, "step": 1293 }, { "epoch": 2.1747899159663864, "grad_norm": 0.2841005605808643, "learning_rate": 5.564820759902683e-05, "loss": 0.3814, "step": 1294 }, { "epoch": 2.176470588235294, "grad_norm": 0.21313817517985992, "learning_rate": 5.560499595695533e-05, "loss": 0.3717, "step": 1295 }, { "epoch": 2.1781512605042015, "grad_norm": 0.2418237546081809, "learning_rate": 5.556176282338949e-05, "loss": 0.3772, "step": 1296 }, { "epoch": 2.179831932773109, "grad_norm": 0.2621725329482153, "learning_rate": 5.55185082578708e-05, "loss": 0.3816, "step": 1297 }, { "epoch": 2.1815126050420166, "grad_norm": 0.3153856093729432, "learning_rate": 5.5475232319970275e-05, "loss": 0.3812, "step": 1298 }, { "epoch": 2.183193277310924, "grad_norm": 0.2997392431860689, "learning_rate": 5.543193506928832e-05, "loss": 0.3782, "step": 1299 }, { "epoch": 2.184873949579832, "grad_norm": 0.27767083334919995, "learning_rate": 5.538861656545477e-05, "loss": 0.3822, "step": 1300 }, { "epoch": 2.1865546218487397, "grad_norm": 0.2808005606384819, "learning_rate": 5.534527686812864e-05, "loss": 0.3764, "step": 1301 }, { "epoch": 2.1882352941176473, "grad_norm": 0.2606318102007651, "learning_rate": 5.530191603699821e-05, "loss": 0.3854, "step": 1302 }, { "epoch": 2.189915966386555, "grad_norm": 0.2535941977038486, "learning_rate": 5.525853413178081e-05, "loss": 0.3763, "step": 1303 }, { "epoch": 2.1915966386554624, "grad_norm": 0.3118837632739524, "learning_rate": 5.521513121222284e-05, "loss": 0.3889, "step": 1304 }, { "epoch": 2.19327731092437, "grad_norm": 0.344926082176097, "learning_rate": 5.517170733809959e-05, "loss": 0.3832, "step": 1305 }, { "epoch": 2.1949579831932775, "grad_norm": 0.3736767447335891, "learning_rate": 5.512826256921527e-05, "loss": 0.3837, "step": 1306 }, { "epoch": 2.196638655462185, "grad_norm": 0.39483655410038865, "learning_rate": 5.5084796965402794e-05, "loss": 0.3843, "step": 1307 }, { "epoch": 2.1983193277310926, "grad_norm": 0.4362583524182564, "learning_rate": 5.504131058652385e-05, "loss": 0.386, "step": 1308 }, { "epoch": 2.2, "grad_norm": 0.4520296303619181, "learning_rate": 5.499780349246867e-05, "loss": 0.3705, "step": 1309 }, { "epoch": 2.2016806722689077, "grad_norm": 0.3553255469397692, "learning_rate": 5.4954275743156054e-05, "loss": 0.3888, "step": 1310 }, { "epoch": 2.2033613445378153, "grad_norm": 0.28164790334652196, "learning_rate": 5.491072739853324e-05, "loss": 0.3785, "step": 1311 }, { "epoch": 2.205042016806723, "grad_norm": 0.294566626902417, "learning_rate": 5.486715851857583e-05, "loss": 0.3783, "step": 1312 }, { "epoch": 2.2067226890756304, "grad_norm": 0.27510403633462366, "learning_rate": 5.482356916328771e-05, "loss": 0.3851, "step": 1313 }, { "epoch": 2.208403361344538, "grad_norm": 0.35771987047142617, "learning_rate": 5.477995939270093e-05, "loss": 0.3678, "step": 1314 }, { "epoch": 2.2100840336134455, "grad_norm": 0.3735119112951117, "learning_rate": 5.473632926687573e-05, "loss": 0.3812, "step": 1315 }, { "epoch": 2.211764705882353, "grad_norm": 0.3588938953692829, "learning_rate": 5.469267884590031e-05, "loss": 0.3788, "step": 1316 }, { "epoch": 2.2134453781512606, "grad_norm": 0.31345093310383787, "learning_rate": 5.464900818989087e-05, "loss": 0.3798, "step": 1317 }, { "epoch": 2.215126050420168, "grad_norm": 0.23887366522443151, "learning_rate": 5.460531735899146e-05, "loss": 0.3688, "step": 1318 }, { "epoch": 2.2168067226890757, "grad_norm": 0.3184999695303908, "learning_rate": 5.45616064133739e-05, "loss": 0.3843, "step": 1319 }, { "epoch": 2.2184873949579833, "grad_norm": 0.32574663558330663, "learning_rate": 5.4517875413237736e-05, "loss": 0.3769, "step": 1320 }, { "epoch": 2.220168067226891, "grad_norm": 0.31746440250322544, "learning_rate": 5.447412441881012e-05, "loss": 0.3722, "step": 1321 }, { "epoch": 2.2218487394957984, "grad_norm": 0.30418945198594577, "learning_rate": 5.443035349034577e-05, "loss": 0.3848, "step": 1322 }, { "epoch": 2.223529411764706, "grad_norm": 0.2604358503737244, "learning_rate": 5.438656268812679e-05, "loss": 0.3868, "step": 1323 }, { "epoch": 2.2252100840336135, "grad_norm": 0.24370390237458806, "learning_rate": 5.4342752072462747e-05, "loss": 0.3825, "step": 1324 }, { "epoch": 2.226890756302521, "grad_norm": 0.2581288301725645, "learning_rate": 5.429892170369041e-05, "loss": 0.37, "step": 1325 }, { "epoch": 2.2285714285714286, "grad_norm": 0.23795055747924473, "learning_rate": 5.425507164217378e-05, "loss": 0.3843, "step": 1326 }, { "epoch": 2.230252100840336, "grad_norm": 0.27210699684782563, "learning_rate": 5.421120194830403e-05, "loss": 0.3699, "step": 1327 }, { "epoch": 2.2319327731092438, "grad_norm": 0.30722848986388235, "learning_rate": 5.4167312682499296e-05, "loss": 0.3735, "step": 1328 }, { "epoch": 2.2336134453781513, "grad_norm": 0.24443581429622233, "learning_rate": 5.4123403905204716e-05, "loss": 0.3826, "step": 1329 }, { "epoch": 2.235294117647059, "grad_norm": 0.2488595649618296, "learning_rate": 5.407947567689228e-05, "loss": 0.3779, "step": 1330 }, { "epoch": 2.2369747899159664, "grad_norm": 0.2186451900703093, "learning_rate": 5.403552805806076e-05, "loss": 0.3888, "step": 1331 }, { "epoch": 2.238655462184874, "grad_norm": 0.24877733382429146, "learning_rate": 5.399156110923567e-05, "loss": 0.3831, "step": 1332 }, { "epoch": 2.2403361344537815, "grad_norm": 0.28703529825064866, "learning_rate": 5.39475748909691e-05, "loss": 0.3789, "step": 1333 }, { "epoch": 2.242016806722689, "grad_norm": 0.22159940412423973, "learning_rate": 5.390356946383969e-05, "loss": 0.3789, "step": 1334 }, { "epoch": 2.2436974789915967, "grad_norm": 0.23994589234113975, "learning_rate": 5.385954488845258e-05, "loss": 0.381, "step": 1335 }, { "epoch": 2.245378151260504, "grad_norm": 0.30317202350788724, "learning_rate": 5.381550122543921e-05, "loss": 0.3829, "step": 1336 }, { "epoch": 2.2470588235294118, "grad_norm": 0.3251240455941484, "learning_rate": 5.377143853545736e-05, "loss": 0.3803, "step": 1337 }, { "epoch": 2.2487394957983193, "grad_norm": 0.23785403036762418, "learning_rate": 5.372735687919097e-05, "loss": 0.3678, "step": 1338 }, { "epoch": 2.250420168067227, "grad_norm": 0.20980164995944472, "learning_rate": 5.3683256317350165e-05, "loss": 0.3761, "step": 1339 }, { "epoch": 2.2521008403361344, "grad_norm": 0.28704270125325504, "learning_rate": 5.3639136910671045e-05, "loss": 0.3824, "step": 1340 }, { "epoch": 2.253781512605042, "grad_norm": 0.2932317009461062, "learning_rate": 5.35949987199157e-05, "loss": 0.3899, "step": 1341 }, { "epoch": 2.2554621848739496, "grad_norm": 0.2779825635771772, "learning_rate": 5.355084180587206e-05, "loss": 0.3785, "step": 1342 }, { "epoch": 2.257142857142857, "grad_norm": 0.31982345594784245, "learning_rate": 5.350666622935388e-05, "loss": 0.3837, "step": 1343 }, { "epoch": 2.2588235294117647, "grad_norm": 0.28485224805643766, "learning_rate": 5.346247205120059e-05, "loss": 0.3696, "step": 1344 }, { "epoch": 2.2605042016806722, "grad_norm": 0.29119567085902814, "learning_rate": 5.341825933227723e-05, "loss": 0.3851, "step": 1345 }, { "epoch": 2.26218487394958, "grad_norm": 0.3329098088266826, "learning_rate": 5.3374028133474406e-05, "loss": 0.3794, "step": 1346 }, { "epoch": 2.2638655462184873, "grad_norm": 0.2305616431936747, "learning_rate": 5.332977851570815e-05, "loss": 0.3832, "step": 1347 }, { "epoch": 2.265546218487395, "grad_norm": 0.2298746106302083, "learning_rate": 5.328551053991987e-05, "loss": 0.3838, "step": 1348 }, { "epoch": 2.2672268907563025, "grad_norm": 0.2532331034345443, "learning_rate": 5.3241224267076265e-05, "loss": 0.3689, "step": 1349 }, { "epoch": 2.26890756302521, "grad_norm": 0.2802319342696925, "learning_rate": 5.319691975816921e-05, "loss": 0.3839, "step": 1350 }, { "epoch": 2.2705882352941176, "grad_norm": 0.3015529619020571, "learning_rate": 5.315259707421571e-05, "loss": 0.3844, "step": 1351 }, { "epoch": 2.272268907563025, "grad_norm": 0.2817904986181791, "learning_rate": 5.3108256276257804e-05, "loss": 0.375, "step": 1352 }, { "epoch": 2.2739495798319327, "grad_norm": 0.25926863822805585, "learning_rate": 5.3063897425362494e-05, "loss": 0.3835, "step": 1353 }, { "epoch": 2.2756302521008402, "grad_norm": 0.2705562602709399, "learning_rate": 5.301952058262158e-05, "loss": 0.3748, "step": 1354 }, { "epoch": 2.277310924369748, "grad_norm": 0.31892288936516366, "learning_rate": 5.297512580915173e-05, "loss": 0.3757, "step": 1355 }, { "epoch": 2.2789915966386554, "grad_norm": 0.2897230375025978, "learning_rate": 5.293071316609425e-05, "loss": 0.3759, "step": 1356 }, { "epoch": 2.280672268907563, "grad_norm": 0.18398828086217575, "learning_rate": 5.288628271461505e-05, "loss": 0.3922, "step": 1357 }, { "epoch": 2.2823529411764705, "grad_norm": 0.24996737026315466, "learning_rate": 5.284183451590459e-05, "loss": 0.381, "step": 1358 }, { "epoch": 2.284033613445378, "grad_norm": 0.31463565943638117, "learning_rate": 5.279736863117779e-05, "loss": 0.3835, "step": 1359 }, { "epoch": 2.2857142857142856, "grad_norm": 0.31935407114628883, "learning_rate": 5.2752885121673884e-05, "loss": 0.3813, "step": 1360 }, { "epoch": 2.287394957983193, "grad_norm": 0.3491540625884755, "learning_rate": 5.270838404865639e-05, "loss": 0.383, "step": 1361 }, { "epoch": 2.2890756302521007, "grad_norm": 0.36798740300635896, "learning_rate": 5.266386547341305e-05, "loss": 0.3756, "step": 1362 }, { "epoch": 2.2907563025210083, "grad_norm": 0.302521680921954, "learning_rate": 5.261932945725567e-05, "loss": 0.3754, "step": 1363 }, { "epoch": 2.292436974789916, "grad_norm": 0.29443600005967635, "learning_rate": 5.257477606152009e-05, "loss": 0.3865, "step": 1364 }, { "epoch": 2.2941176470588234, "grad_norm": 0.32046712025935437, "learning_rate": 5.253020534756608e-05, "loss": 0.3881, "step": 1365 }, { "epoch": 2.295798319327731, "grad_norm": 0.24737698706526, "learning_rate": 5.248561737677728e-05, "loss": 0.3769, "step": 1366 }, { "epoch": 2.2974789915966385, "grad_norm": 0.20142990467148633, "learning_rate": 5.2441012210561075e-05, "loss": 0.3872, "step": 1367 }, { "epoch": 2.299159663865546, "grad_norm": 0.2070812884392602, "learning_rate": 5.239638991034854e-05, "loss": 0.3752, "step": 1368 }, { "epoch": 2.3008403361344536, "grad_norm": 0.2553202469208752, "learning_rate": 5.235175053759435e-05, "loss": 0.3837, "step": 1369 }, { "epoch": 2.302521008403361, "grad_norm": 0.260472568624179, "learning_rate": 5.2307094153776686e-05, "loss": 0.3799, "step": 1370 }, { "epoch": 2.3042016806722687, "grad_norm": 0.26928204106524056, "learning_rate": 5.226242082039717e-05, "loss": 0.3746, "step": 1371 }, { "epoch": 2.3058823529411763, "grad_norm": 0.27978505675938853, "learning_rate": 5.221773059898074e-05, "loss": 0.3832, "step": 1372 }, { "epoch": 2.307563025210084, "grad_norm": 0.2581778175527114, "learning_rate": 5.2173023551075624e-05, "loss": 0.3768, "step": 1373 }, { "epoch": 2.3092436974789914, "grad_norm": 0.2137253142184654, "learning_rate": 5.212829973825322e-05, "loss": 0.3784, "step": 1374 }, { "epoch": 2.310924369747899, "grad_norm": 0.22195027982565022, "learning_rate": 5.208355922210799e-05, "loss": 0.3825, "step": 1375 }, { "epoch": 2.3126050420168065, "grad_norm": 0.2688358596581987, "learning_rate": 5.2038802064257416e-05, "loss": 0.3831, "step": 1376 }, { "epoch": 2.314285714285714, "grad_norm": 0.24411980644125789, "learning_rate": 5.199402832634189e-05, "loss": 0.3782, "step": 1377 }, { "epoch": 2.3159663865546216, "grad_norm": 0.23918615898306111, "learning_rate": 5.194923807002468e-05, "loss": 0.3879, "step": 1378 }, { "epoch": 2.317647058823529, "grad_norm": 0.21179779937485252, "learning_rate": 5.190443135699175e-05, "loss": 0.3768, "step": 1379 }, { "epoch": 2.3193277310924367, "grad_norm": 0.21906887298963748, "learning_rate": 5.185960824895173e-05, "loss": 0.3798, "step": 1380 }, { "epoch": 2.3210084033613447, "grad_norm": 0.25196480827225204, "learning_rate": 5.1814768807635876e-05, "loss": 0.3854, "step": 1381 }, { "epoch": 2.3226890756302523, "grad_norm": 0.28069321688139187, "learning_rate": 5.176991309479791e-05, "loss": 0.3799, "step": 1382 }, { "epoch": 2.32436974789916, "grad_norm": 0.2973063905770544, "learning_rate": 5.172504117221396e-05, "loss": 0.3879, "step": 1383 }, { "epoch": 2.3260504201680674, "grad_norm": 0.2626919186404296, "learning_rate": 5.1680153101682455e-05, "loss": 0.3717, "step": 1384 }, { "epoch": 2.327731092436975, "grad_norm": 0.1925319575935791, "learning_rate": 5.1635248945024144e-05, "loss": 0.3822, "step": 1385 }, { "epoch": 2.3294117647058825, "grad_norm": 0.1838375313368681, "learning_rate": 5.1590328764081835e-05, "loss": 0.3847, "step": 1386 }, { "epoch": 2.33109243697479, "grad_norm": 0.21134204523258487, "learning_rate": 5.154539262072046e-05, "loss": 0.385, "step": 1387 }, { "epoch": 2.3327731092436976, "grad_norm": 0.2616090569325717, "learning_rate": 5.150044057682693e-05, "loss": 0.3798, "step": 1388 }, { "epoch": 2.334453781512605, "grad_norm": 0.2798427328063022, "learning_rate": 5.1455472694310026e-05, "loss": 0.3772, "step": 1389 }, { "epoch": 2.3361344537815127, "grad_norm": 0.24567800727292619, "learning_rate": 5.14104890351004e-05, "loss": 0.3805, "step": 1390 }, { "epoch": 2.3378151260504203, "grad_norm": 0.21429579088585826, "learning_rate": 5.136548966115036e-05, "loss": 0.3785, "step": 1391 }, { "epoch": 2.339495798319328, "grad_norm": 0.2137270018677992, "learning_rate": 5.1320474634433905e-05, "loss": 0.3733, "step": 1392 }, { "epoch": 2.3411764705882354, "grad_norm": 0.3022417405940602, "learning_rate": 5.127544401694657e-05, "loss": 0.3715, "step": 1393 }, { "epoch": 2.342857142857143, "grad_norm": 0.35167342479002905, "learning_rate": 5.123039787070537e-05, "loss": 0.3804, "step": 1394 }, { "epoch": 2.3445378151260505, "grad_norm": 0.33212608216761375, "learning_rate": 5.1185336257748736e-05, "loss": 0.3796, "step": 1395 }, { "epoch": 2.346218487394958, "grad_norm": 0.3521545177232956, "learning_rate": 5.114025924013632e-05, "loss": 0.3837, "step": 1396 }, { "epoch": 2.3478991596638656, "grad_norm": 0.35535071956227754, "learning_rate": 5.1095166879949066e-05, "loss": 0.381, "step": 1397 }, { "epoch": 2.349579831932773, "grad_norm": 0.3462764138417168, "learning_rate": 5.105005923928904e-05, "loss": 0.3799, "step": 1398 }, { "epoch": 2.3512605042016808, "grad_norm": 0.2659071548236109, "learning_rate": 5.10049363802793e-05, "loss": 0.367, "step": 1399 }, { "epoch": 2.3529411764705883, "grad_norm": 0.21614100893267013, "learning_rate": 5.09597983650639e-05, "loss": 0.3897, "step": 1400 }, { "epoch": 2.354621848739496, "grad_norm": 0.23151512174327898, "learning_rate": 5.09146452558078e-05, "loss": 0.3863, "step": 1401 }, { "epoch": 2.3563025210084034, "grad_norm": 0.2623634456708903, "learning_rate": 5.086947711469668e-05, "loss": 0.3821, "step": 1402 }, { "epoch": 2.357983193277311, "grad_norm": 0.25074542582395765, "learning_rate": 5.082429400393696e-05, "loss": 0.3784, "step": 1403 }, { "epoch": 2.3596638655462185, "grad_norm": 0.2082968263604987, "learning_rate": 5.077909598575566e-05, "loss": 0.3775, "step": 1404 }, { "epoch": 2.361344537815126, "grad_norm": 0.27603072509343995, "learning_rate": 5.073388312240038e-05, "loss": 0.3833, "step": 1405 }, { "epoch": 2.3630252100840337, "grad_norm": 0.2613980611503576, "learning_rate": 5.0688655476139105e-05, "loss": 0.3821, "step": 1406 }, { "epoch": 2.364705882352941, "grad_norm": 0.17779089060397876, "learning_rate": 5.0643413109260195e-05, "loss": 0.3814, "step": 1407 }, { "epoch": 2.3663865546218488, "grad_norm": 0.2233658995143273, "learning_rate": 5.0598156084072275e-05, "loss": 0.383, "step": 1408 }, { "epoch": 2.3680672268907563, "grad_norm": 0.22455034549583167, "learning_rate": 5.055288446290421e-05, "loss": 0.3804, "step": 1409 }, { "epoch": 2.369747899159664, "grad_norm": 0.22017060814913386, "learning_rate": 5.050759830810492e-05, "loss": 0.3796, "step": 1410 }, { "epoch": 2.3714285714285714, "grad_norm": 0.24889103183693453, "learning_rate": 5.046229768204332e-05, "loss": 0.377, "step": 1411 }, { "epoch": 2.373109243697479, "grad_norm": 0.2666540760573357, "learning_rate": 5.0416982647108316e-05, "loss": 0.3806, "step": 1412 }, { "epoch": 2.3747899159663866, "grad_norm": 0.26231766874057794, "learning_rate": 5.037165326570862e-05, "loss": 0.3812, "step": 1413 }, { "epoch": 2.376470588235294, "grad_norm": 0.23032388414923038, "learning_rate": 5.032630960027272e-05, "loss": 0.3847, "step": 1414 }, { "epoch": 2.3781512605042017, "grad_norm": 0.22063292943541205, "learning_rate": 5.0280951713248736e-05, "loss": 0.377, "step": 1415 }, { "epoch": 2.3798319327731092, "grad_norm": 0.21287019212896874, "learning_rate": 5.023557966710442e-05, "loss": 0.3756, "step": 1416 }, { "epoch": 2.381512605042017, "grad_norm": 0.22792926349905093, "learning_rate": 5.019019352432703e-05, "loss": 0.3752, "step": 1417 }, { "epoch": 2.3831932773109243, "grad_norm": 0.29103191516519217, "learning_rate": 5.014479334742319e-05, "loss": 0.3787, "step": 1418 }, { "epoch": 2.384873949579832, "grad_norm": 0.33556143717159814, "learning_rate": 5.009937919891888e-05, "loss": 0.3894, "step": 1419 }, { "epoch": 2.3865546218487395, "grad_norm": 0.3465811685153073, "learning_rate": 5.0053951141359334e-05, "loss": 0.3826, "step": 1420 }, { "epoch": 2.388235294117647, "grad_norm": 0.26283960481498236, "learning_rate": 5.0008509237308917e-05, "loss": 0.3884, "step": 1421 }, { "epoch": 2.3899159663865546, "grad_norm": 0.26064621227556645, "learning_rate": 4.9963053549351086e-05, "loss": 0.3771, "step": 1422 }, { "epoch": 2.391596638655462, "grad_norm": 0.27430097961175964, "learning_rate": 4.9917584140088246e-05, "loss": 0.3828, "step": 1423 }, { "epoch": 2.3932773109243697, "grad_norm": 0.23195918004094115, "learning_rate": 4.987210107214175e-05, "loss": 0.3767, "step": 1424 }, { "epoch": 2.3949579831932772, "grad_norm": 0.2091143257529225, "learning_rate": 4.982660440815174e-05, "loss": 0.379, "step": 1425 }, { "epoch": 2.396638655462185, "grad_norm": 0.2820058054120438, "learning_rate": 4.978109421077706e-05, "loss": 0.3804, "step": 1426 }, { "epoch": 2.3983193277310924, "grad_norm": 0.28386815182393293, "learning_rate": 4.973557054269521e-05, "loss": 0.3807, "step": 1427 }, { "epoch": 2.4, "grad_norm": 0.2499389331100949, "learning_rate": 4.969003346660226e-05, "loss": 0.3823, "step": 1428 }, { "epoch": 2.4016806722689075, "grad_norm": 0.21950685195650993, "learning_rate": 4.964448304521272e-05, "loss": 0.3781, "step": 1429 }, { "epoch": 2.403361344537815, "grad_norm": 0.2841046295031337, "learning_rate": 4.959891934125949e-05, "loss": 0.3799, "step": 1430 }, { "epoch": 2.4050420168067226, "grad_norm": 0.25970189526725584, "learning_rate": 4.9553342417493744e-05, "loss": 0.3785, "step": 1431 }, { "epoch": 2.40672268907563, "grad_norm": 0.2909892631069831, "learning_rate": 4.9507752336684906e-05, "loss": 0.3811, "step": 1432 }, { "epoch": 2.4084033613445377, "grad_norm": 0.3198397103386197, "learning_rate": 4.9462149161620476e-05, "loss": 0.3753, "step": 1433 }, { "epoch": 2.4100840336134453, "grad_norm": 0.3250123030199019, "learning_rate": 4.941653295510602e-05, "loss": 0.3802, "step": 1434 }, { "epoch": 2.411764705882353, "grad_norm": 1.1489053342354836, "learning_rate": 4.9370903779965014e-05, "loss": 0.3948, "step": 1435 }, { "epoch": 2.4134453781512604, "grad_norm": 0.21899260975349727, "learning_rate": 4.932526169903884e-05, "loss": 0.3774, "step": 1436 }, { "epoch": 2.415126050420168, "grad_norm": 0.2832985196308378, "learning_rate": 4.9279606775186635e-05, "loss": 0.3859, "step": 1437 }, { "epoch": 2.4168067226890755, "grad_norm": 0.3186803080246221, "learning_rate": 4.923393907128519e-05, "loss": 0.3715, "step": 1438 }, { "epoch": 2.418487394957983, "grad_norm": 0.353373304686362, "learning_rate": 4.918825865022896e-05, "loss": 0.3824, "step": 1439 }, { "epoch": 2.4201680672268906, "grad_norm": 0.3497907299159824, "learning_rate": 4.914256557492986e-05, "loss": 0.3856, "step": 1440 }, { "epoch": 2.421848739495798, "grad_norm": 0.2873387906598512, "learning_rate": 4.909685990831726e-05, "loss": 0.3825, "step": 1441 }, { "epoch": 2.4235294117647057, "grad_norm": 0.2701545264706445, "learning_rate": 4.905114171333787e-05, "loss": 0.3799, "step": 1442 }, { "epoch": 2.4252100840336133, "grad_norm": 0.23737027044558576, "learning_rate": 4.900541105295563e-05, "loss": 0.3825, "step": 1443 }, { "epoch": 2.426890756302521, "grad_norm": 0.3617688213838577, "learning_rate": 4.8959667990151694e-05, "loss": 0.3862, "step": 1444 }, { "epoch": 2.4285714285714284, "grad_norm": 0.1799462229858181, "learning_rate": 4.8913912587924266e-05, "loss": 0.3776, "step": 1445 }, { "epoch": 2.4302521008403364, "grad_norm": 0.22354408514894264, "learning_rate": 4.886814490928852e-05, "loss": 0.3777, "step": 1446 }, { "epoch": 2.431932773109244, "grad_norm": 0.30665184305912724, "learning_rate": 4.8822365017276585e-05, "loss": 0.3848, "step": 1447 }, { "epoch": 2.4336134453781515, "grad_norm": 0.3305637420050448, "learning_rate": 4.87765729749374e-05, "loss": 0.3848, "step": 1448 }, { "epoch": 2.435294117647059, "grad_norm": 0.3095371475116675, "learning_rate": 4.873076884533662e-05, "loss": 0.384, "step": 1449 }, { "epoch": 2.4369747899159666, "grad_norm": 0.24357907390245204, "learning_rate": 4.868495269155654e-05, "loss": 0.3942, "step": 1450 }, { "epoch": 2.438655462184874, "grad_norm": 0.20278153932511683, "learning_rate": 4.8639124576696045e-05, "loss": 0.3822, "step": 1451 }, { "epoch": 2.4403361344537817, "grad_norm": 0.2450407526658741, "learning_rate": 4.859328456387048e-05, "loss": 0.3838, "step": 1452 }, { "epoch": 2.4420168067226893, "grad_norm": 0.22288677683066282, "learning_rate": 4.854743271621157e-05, "loss": 0.3834, "step": 1453 }, { "epoch": 2.443697478991597, "grad_norm": 0.27253983206680865, "learning_rate": 4.850156909686733e-05, "loss": 0.3812, "step": 1454 }, { "epoch": 2.4453781512605044, "grad_norm": 0.35838646695050064, "learning_rate": 4.845569376900202e-05, "loss": 0.3793, "step": 1455 }, { "epoch": 2.447058823529412, "grad_norm": 0.37778090891580296, "learning_rate": 4.8409806795796e-05, "loss": 0.3746, "step": 1456 }, { "epoch": 2.4487394957983195, "grad_norm": 0.4015689011133594, "learning_rate": 4.836390824044568e-05, "loss": 0.3909, "step": 1457 }, { "epoch": 2.450420168067227, "grad_norm": 0.41118625172794737, "learning_rate": 4.831799816616339e-05, "loss": 0.3761, "step": 1458 }, { "epoch": 2.4521008403361346, "grad_norm": 0.4287544377143525, "learning_rate": 4.8272076636177366e-05, "loss": 0.3955, "step": 1459 }, { "epoch": 2.453781512605042, "grad_norm": 0.4356980246574912, "learning_rate": 4.8226143713731605e-05, "loss": 0.3888, "step": 1460 }, { "epoch": 2.4554621848739497, "grad_norm": 0.4644642908017435, "learning_rate": 4.81801994620858e-05, "loss": 0.3753, "step": 1461 }, { "epoch": 2.4571428571428573, "grad_norm": 0.48559179247037193, "learning_rate": 4.8134243944515226e-05, "loss": 0.3872, "step": 1462 }, { "epoch": 2.458823529411765, "grad_norm": 0.4558968050504929, "learning_rate": 4.80882772243107e-05, "loss": 0.3878, "step": 1463 }, { "epoch": 2.4605042016806724, "grad_norm": 0.36792252058325586, "learning_rate": 4.804229936477844e-05, "loss": 0.382, "step": 1464 }, { "epoch": 2.46218487394958, "grad_norm": 0.33409853815039087, "learning_rate": 4.799631042924003e-05, "loss": 0.381, "step": 1465 }, { "epoch": 2.4638655462184875, "grad_norm": 0.3325921875891175, "learning_rate": 4.795031048103229e-05, "loss": 0.3813, "step": 1466 }, { "epoch": 2.465546218487395, "grad_norm": 0.3980934277590801, "learning_rate": 4.790429958350723e-05, "loss": 0.3878, "step": 1467 }, { "epoch": 2.4672268907563026, "grad_norm": 0.3798222019843301, "learning_rate": 4.785827780003191e-05, "loss": 0.3791, "step": 1468 }, { "epoch": 2.46890756302521, "grad_norm": 0.31221452551798157, "learning_rate": 4.7812245193988406e-05, "loss": 0.3854, "step": 1469 }, { "epoch": 2.4705882352941178, "grad_norm": 0.29231916755686915, "learning_rate": 4.7766201828773674e-05, "loss": 0.3878, "step": 1470 }, { "epoch": 2.4722689075630253, "grad_norm": 0.279549872556132, "learning_rate": 4.772014776779953e-05, "loss": 0.3779, "step": 1471 }, { "epoch": 2.473949579831933, "grad_norm": 0.2392007520748598, "learning_rate": 4.7674083074492473e-05, "loss": 0.3758, "step": 1472 }, { "epoch": 2.4756302521008404, "grad_norm": 0.23665046339949955, "learning_rate": 4.7628007812293684e-05, "loss": 0.3797, "step": 1473 }, { "epoch": 2.477310924369748, "grad_norm": 0.24035011485587388, "learning_rate": 4.758192204465885e-05, "loss": 0.3799, "step": 1474 }, { "epoch": 2.4789915966386555, "grad_norm": 0.2653940586898501, "learning_rate": 4.75358258350582e-05, "loss": 0.3794, "step": 1475 }, { "epoch": 2.480672268907563, "grad_norm": 0.21828089796078476, "learning_rate": 4.7489719246976267e-05, "loss": 0.3731, "step": 1476 }, { "epoch": 2.4823529411764707, "grad_norm": 0.23564270061853745, "learning_rate": 4.744360234391194e-05, "loss": 0.3746, "step": 1477 }, { "epoch": 2.484033613445378, "grad_norm": 0.6946347428307272, "learning_rate": 4.739747518937826e-05, "loss": 0.3852, "step": 1478 }, { "epoch": 2.4857142857142858, "grad_norm": 0.24250646153863792, "learning_rate": 4.735133784690243e-05, "loss": 0.3822, "step": 1479 }, { "epoch": 2.4873949579831933, "grad_norm": 0.2106348223545447, "learning_rate": 4.730519038002567e-05, "loss": 0.3779, "step": 1480 }, { "epoch": 2.489075630252101, "grad_norm": 0.24166082336004666, "learning_rate": 4.725903285230312e-05, "loss": 0.3841, "step": 1481 }, { "epoch": 2.4907563025210084, "grad_norm": 0.2442991102684316, "learning_rate": 4.721286532730382e-05, "loss": 0.3784, "step": 1482 }, { "epoch": 2.492436974789916, "grad_norm": 0.20969462094520558, "learning_rate": 4.716668786861055e-05, "loss": 0.3772, "step": 1483 }, { "epoch": 2.4941176470588236, "grad_norm": 0.21707925313415927, "learning_rate": 4.712050053981976e-05, "loss": 0.3818, "step": 1484 }, { "epoch": 2.495798319327731, "grad_norm": 0.21757829827544684, "learning_rate": 4.7074303404541525e-05, "loss": 0.3811, "step": 1485 }, { "epoch": 2.4974789915966387, "grad_norm": 0.1894146740149861, "learning_rate": 4.7028096526399406e-05, "loss": 0.3798, "step": 1486 }, { "epoch": 2.499159663865546, "grad_norm": 0.32497255094486477, "learning_rate": 4.6981879969030385e-05, "loss": 0.3767, "step": 1487 }, { "epoch": 2.500840336134454, "grad_norm": 0.20656684071604556, "learning_rate": 4.693565379608478e-05, "loss": 0.3853, "step": 1488 }, { "epoch": 2.5025210084033613, "grad_norm": 0.282959952033557, "learning_rate": 4.6889418071226134e-05, "loss": 0.3758, "step": 1489 }, { "epoch": 2.504201680672269, "grad_norm": 0.24706284199666445, "learning_rate": 4.684317285813117e-05, "loss": 0.3836, "step": 1490 }, { "epoch": 2.5058823529411764, "grad_norm": 0.25046224871566497, "learning_rate": 4.6796918220489664e-05, "loss": 0.388, "step": 1491 }, { "epoch": 2.507563025210084, "grad_norm": 0.30745854494023894, "learning_rate": 4.675065422200437e-05, "loss": 0.3779, "step": 1492 }, { "epoch": 2.5092436974789916, "grad_norm": 0.2546210388493414, "learning_rate": 4.6704380926390934e-05, "loss": 0.381, "step": 1493 }, { "epoch": 2.510924369747899, "grad_norm": 0.2173880505989241, "learning_rate": 4.665809839737781e-05, "loss": 0.3802, "step": 1494 }, { "epoch": 2.5126050420168067, "grad_norm": 0.2840840264729783, "learning_rate": 4.66118066987062e-05, "loss": 0.3925, "step": 1495 }, { "epoch": 2.5142857142857142, "grad_norm": 0.30625067971133724, "learning_rate": 4.656550589412986e-05, "loss": 0.3851, "step": 1496 }, { "epoch": 2.515966386554622, "grad_norm": 0.2490538862753483, "learning_rate": 4.651919604741515e-05, "loss": 0.39, "step": 1497 }, { "epoch": 2.5176470588235293, "grad_norm": 0.23637889362802736, "learning_rate": 4.6472877222340874e-05, "loss": 0.3857, "step": 1498 }, { "epoch": 2.519327731092437, "grad_norm": 0.3082731074150229, "learning_rate": 4.642654948269817e-05, "loss": 0.3827, "step": 1499 }, { "epoch": 2.5210084033613445, "grad_norm": 0.2641448410165333, "learning_rate": 4.638021289229049e-05, "loss": 0.3792, "step": 1500 }, { "epoch": 2.522689075630252, "grad_norm": 0.28198291795587016, "learning_rate": 4.633386751493344e-05, "loss": 0.3805, "step": 1501 }, { "epoch": 2.5243697478991596, "grad_norm": 0.22018893119419475, "learning_rate": 4.628751341445476e-05, "loss": 0.3809, "step": 1502 }, { "epoch": 2.526050420168067, "grad_norm": 0.20208242537517315, "learning_rate": 4.624115065469419e-05, "loss": 0.3808, "step": 1503 }, { "epoch": 2.5277310924369747, "grad_norm": 0.28022426428106023, "learning_rate": 4.61947792995034e-05, "loss": 0.3839, "step": 1504 }, { "epoch": 2.5294117647058822, "grad_norm": 0.3322627584200892, "learning_rate": 4.614839941274588e-05, "loss": 0.3912, "step": 1505 }, { "epoch": 2.53109243697479, "grad_norm": 0.29680287487388846, "learning_rate": 4.6102011058296884e-05, "loss": 0.3836, "step": 1506 }, { "epoch": 2.5327731092436974, "grad_norm": 0.25600741759708723, "learning_rate": 4.605561430004335e-05, "loss": 0.3853, "step": 1507 }, { "epoch": 2.534453781512605, "grad_norm": 0.26208037117851524, "learning_rate": 4.600920920188374e-05, "loss": 0.3771, "step": 1508 }, { "epoch": 2.5361344537815125, "grad_norm": 0.24026195773618605, "learning_rate": 4.5962795827728036e-05, "loss": 0.3782, "step": 1509 }, { "epoch": 2.53781512605042, "grad_norm": 0.23878602822382236, "learning_rate": 4.5916374241497627e-05, "loss": 0.389, "step": 1510 }, { "epoch": 2.5394957983193276, "grad_norm": 0.2687599146434978, "learning_rate": 4.5869944507125185e-05, "loss": 0.3878, "step": 1511 }, { "epoch": 2.541176470588235, "grad_norm": 0.25151985666336285, "learning_rate": 4.58235066885546e-05, "loss": 0.3825, "step": 1512 }, { "epoch": 2.5428571428571427, "grad_norm": 0.25800437611010507, "learning_rate": 4.577706084974093e-05, "loss": 0.3781, "step": 1513 }, { "epoch": 2.5445378151260503, "grad_norm": 0.2843556369364158, "learning_rate": 4.5730607054650266e-05, "loss": 0.3837, "step": 1514 }, { "epoch": 2.546218487394958, "grad_norm": 0.2423302242599336, "learning_rate": 4.568414536725962e-05, "loss": 0.3782, "step": 1515 }, { "epoch": 2.5478991596638654, "grad_norm": 0.2351263532843389, "learning_rate": 4.5637675851556927e-05, "loss": 0.3827, "step": 1516 }, { "epoch": 2.549579831932773, "grad_norm": 0.2313008978343842, "learning_rate": 4.5591198571540874e-05, "loss": 0.3715, "step": 1517 }, { "epoch": 2.5512605042016805, "grad_norm": 0.2018672199711318, "learning_rate": 4.5544713591220845e-05, "loss": 0.38, "step": 1518 }, { "epoch": 2.552941176470588, "grad_norm": 0.21187537502547452, "learning_rate": 4.5498220974616837e-05, "loss": 0.3767, "step": 1519 }, { "epoch": 2.5546218487394956, "grad_norm": 0.1989715109173907, "learning_rate": 4.545172078575933e-05, "loss": 0.3759, "step": 1520 }, { "epoch": 2.556302521008403, "grad_norm": 0.1835655582938638, "learning_rate": 4.54052130886893e-05, "loss": 0.3844, "step": 1521 }, { "epoch": 2.5579831932773107, "grad_norm": 0.21427962921574253, "learning_rate": 4.5358697947457995e-05, "loss": 0.3787, "step": 1522 }, { "epoch": 2.5596638655462183, "grad_norm": 0.22549123601752127, "learning_rate": 4.5312175426126956e-05, "loss": 0.3782, "step": 1523 }, { "epoch": 2.561344537815126, "grad_norm": 0.2570127011053044, "learning_rate": 4.526564558876786e-05, "loss": 0.3851, "step": 1524 }, { "epoch": 2.5630252100840334, "grad_norm": 0.22138042562406132, "learning_rate": 4.521910849946249e-05, "loss": 0.3824, "step": 1525 }, { "epoch": 2.564705882352941, "grad_norm": 0.2362222829334786, "learning_rate": 4.517256422230261e-05, "loss": 0.3824, "step": 1526 }, { "epoch": 2.5663865546218485, "grad_norm": 0.21732184557970619, "learning_rate": 4.512601282138985e-05, "loss": 0.3873, "step": 1527 }, { "epoch": 2.568067226890756, "grad_norm": 0.2385931249829493, "learning_rate": 4.507945436083569e-05, "loss": 0.3793, "step": 1528 }, { "epoch": 2.5697478991596636, "grad_norm": 0.21195276760236234, "learning_rate": 4.5032888904761334e-05, "loss": 0.3744, "step": 1529 }, { "epoch": 2.571428571428571, "grad_norm": 0.17583204915821762, "learning_rate": 4.498631651729757e-05, "loss": 0.3857, "step": 1530 }, { "epoch": 2.5731092436974787, "grad_norm": 0.20208255911926357, "learning_rate": 4.49397372625848e-05, "loss": 0.3829, "step": 1531 }, { "epoch": 2.5747899159663863, "grad_norm": 0.21112771110776968, "learning_rate": 4.489315120477283e-05, "loss": 0.3786, "step": 1532 }, { "epoch": 2.576470588235294, "grad_norm": 0.2820187560814114, "learning_rate": 4.484655840802087e-05, "loss": 0.3903, "step": 1533 }, { "epoch": 2.578151260504202, "grad_norm": 0.29720272711737655, "learning_rate": 4.479995893649739e-05, "loss": 0.3741, "step": 1534 }, { "epoch": 2.5798319327731094, "grad_norm": 0.26822410075644165, "learning_rate": 4.475335285438006e-05, "loss": 0.3752, "step": 1535 }, { "epoch": 2.581512605042017, "grad_norm": 0.3230671724825467, "learning_rate": 4.4706740225855656e-05, "loss": 0.3933, "step": 1536 }, { "epoch": 2.5831932773109245, "grad_norm": 0.25905442086914476, "learning_rate": 4.4660121115119964e-05, "loss": 0.3867, "step": 1537 }, { "epoch": 2.584873949579832, "grad_norm": 0.21606592898841476, "learning_rate": 4.4613495586377716e-05, "loss": 0.3725, "step": 1538 }, { "epoch": 2.5865546218487396, "grad_norm": 0.24539552889345428, "learning_rate": 4.456686370384245e-05, "loss": 0.3782, "step": 1539 }, { "epoch": 2.588235294117647, "grad_norm": 0.20535881360887318, "learning_rate": 4.452022553173646e-05, "loss": 0.3691, "step": 1540 }, { "epoch": 2.5899159663865547, "grad_norm": 0.18724571573566828, "learning_rate": 4.447358113429076e-05, "loss": 0.3891, "step": 1541 }, { "epoch": 2.5915966386554623, "grad_norm": 0.20382515539754745, "learning_rate": 4.4426930575744864e-05, "loss": 0.3917, "step": 1542 }, { "epoch": 2.59327731092437, "grad_norm": 0.18295862117801767, "learning_rate": 4.43802739203468e-05, "loss": 0.3842, "step": 1543 }, { "epoch": 2.5949579831932774, "grad_norm": 0.1700037345241106, "learning_rate": 4.4333611232352984e-05, "loss": 0.3798, "step": 1544 }, { "epoch": 2.596638655462185, "grad_norm": 0.21109788755559675, "learning_rate": 4.428694257602817e-05, "loss": 0.3794, "step": 1545 }, { "epoch": 2.5983193277310925, "grad_norm": 0.39144615111636544, "learning_rate": 4.4240268015645305e-05, "loss": 0.3864, "step": 1546 }, { "epoch": 2.6, "grad_norm": 0.26253901190351503, "learning_rate": 4.419358761548546e-05, "loss": 0.3945, "step": 1547 }, { "epoch": 2.6016806722689076, "grad_norm": 0.2679929101159283, "learning_rate": 4.414690143983777e-05, "loss": 0.3892, "step": 1548 }, { "epoch": 2.603361344537815, "grad_norm": 0.20413704255165585, "learning_rate": 4.410020955299932e-05, "loss": 0.3862, "step": 1549 }, { "epoch": 2.6050420168067228, "grad_norm": 0.25594471707319383, "learning_rate": 4.405351201927505e-05, "loss": 0.3794, "step": 1550 }, { "epoch": 2.6067226890756303, "grad_norm": 0.2786556065267551, "learning_rate": 4.4006808902977666e-05, "loss": 0.3805, "step": 1551 }, { "epoch": 2.608403361344538, "grad_norm": 0.20929030936964635, "learning_rate": 4.396010026842763e-05, "loss": 0.3823, "step": 1552 }, { "epoch": 2.6100840336134454, "grad_norm": 0.3287438394902171, "learning_rate": 4.39133861799529e-05, "loss": 0.3841, "step": 1553 }, { "epoch": 2.611764705882353, "grad_norm": 0.3489254746871643, "learning_rate": 4.386666670188902e-05, "loss": 0.3772, "step": 1554 }, { "epoch": 2.6134453781512605, "grad_norm": 0.25124185212053224, "learning_rate": 4.381994189857893e-05, "loss": 0.3829, "step": 1555 }, { "epoch": 2.615126050420168, "grad_norm": 0.20673454566772226, "learning_rate": 4.377321183437293e-05, "loss": 0.3789, "step": 1556 }, { "epoch": 2.6168067226890757, "grad_norm": 0.2504546470635462, "learning_rate": 4.3726476573628516e-05, "loss": 0.3785, "step": 1557 }, { "epoch": 2.618487394957983, "grad_norm": 0.29363213234892743, "learning_rate": 4.3679736180710386e-05, "loss": 0.3824, "step": 1558 }, { "epoch": 2.6201680672268908, "grad_norm": 0.2729898004081585, "learning_rate": 4.363299071999027e-05, "loss": 0.3804, "step": 1559 }, { "epoch": 2.6218487394957983, "grad_norm": 0.24812111923599806, "learning_rate": 4.358624025584693e-05, "loss": 0.381, "step": 1560 }, { "epoch": 2.623529411764706, "grad_norm": 0.2311266757480812, "learning_rate": 4.3539484852665955e-05, "loss": 0.3847, "step": 1561 }, { "epoch": 2.6252100840336134, "grad_norm": 0.2065349299951128, "learning_rate": 4.349272457483978e-05, "loss": 0.3701, "step": 1562 }, { "epoch": 2.626890756302521, "grad_norm": 0.2093273912321658, "learning_rate": 4.344595948676753e-05, "loss": 0.3849, "step": 1563 }, { "epoch": 2.6285714285714286, "grad_norm": 0.2086180952093611, "learning_rate": 4.3399189652855e-05, "loss": 0.3774, "step": 1564 }, { "epoch": 2.630252100840336, "grad_norm": 0.23859049811658659, "learning_rate": 4.335241513751444e-05, "loss": 0.3847, "step": 1565 }, { "epoch": 2.6319327731092437, "grad_norm": 0.24434235752528385, "learning_rate": 4.330563600516462e-05, "loss": 0.3745, "step": 1566 }, { "epoch": 2.6336134453781512, "grad_norm": 0.23405289629639425, "learning_rate": 4.325885232023063e-05, "loss": 0.3805, "step": 1567 }, { "epoch": 2.635294117647059, "grad_norm": 0.22404376220453104, "learning_rate": 4.3212064147143854e-05, "loss": 0.3805, "step": 1568 }, { "epoch": 2.6369747899159663, "grad_norm": 0.2364434885672814, "learning_rate": 4.316527155034184e-05, "loss": 0.3762, "step": 1569 }, { "epoch": 2.638655462184874, "grad_norm": 0.26234127686153946, "learning_rate": 4.311847459426823e-05, "loss": 0.3786, "step": 1570 }, { "epoch": 2.6403361344537815, "grad_norm": 0.20832473262514695, "learning_rate": 4.307167334337267e-05, "loss": 0.3811, "step": 1571 }, { "epoch": 2.642016806722689, "grad_norm": 0.22870158551565242, "learning_rate": 4.3024867862110735e-05, "loss": 0.3797, "step": 1572 }, { "epoch": 2.6436974789915966, "grad_norm": 0.24301633690044003, "learning_rate": 4.297805821494382e-05, "loss": 0.3796, "step": 1573 }, { "epoch": 2.645378151260504, "grad_norm": 0.2747791310548839, "learning_rate": 4.293124446633903e-05, "loss": 0.3799, "step": 1574 }, { "epoch": 2.6470588235294117, "grad_norm": 0.2579596762095668, "learning_rate": 4.288442668076916e-05, "loss": 0.3824, "step": 1575 }, { "epoch": 2.6487394957983192, "grad_norm": 0.24110396522388008, "learning_rate": 4.2837604922712545e-05, "loss": 0.3828, "step": 1576 }, { "epoch": 2.650420168067227, "grad_norm": 0.22828839299856127, "learning_rate": 4.2790779256652975e-05, "loss": 0.3824, "step": 1577 }, { "epoch": 2.6521008403361344, "grad_norm": 0.23767695344427026, "learning_rate": 4.274394974707966e-05, "loss": 0.3776, "step": 1578 }, { "epoch": 2.653781512605042, "grad_norm": 0.24319863632141409, "learning_rate": 4.2697116458487074e-05, "loss": 0.3867, "step": 1579 }, { "epoch": 2.6554621848739495, "grad_norm": 0.2290850673064808, "learning_rate": 4.2650279455374904e-05, "loss": 0.3772, "step": 1580 }, { "epoch": 2.657142857142857, "grad_norm": 0.21247697882457772, "learning_rate": 4.2603438802247954e-05, "loss": 0.3819, "step": 1581 }, { "epoch": 2.6588235294117646, "grad_norm": 0.22899886902197497, "learning_rate": 4.255659456361605e-05, "loss": 0.3861, "step": 1582 }, { "epoch": 2.660504201680672, "grad_norm": 0.20881854934496707, "learning_rate": 4.250974680399396e-05, "loss": 0.3816, "step": 1583 }, { "epoch": 2.6621848739495797, "grad_norm": 0.2523561866925505, "learning_rate": 4.246289558790131e-05, "loss": 0.3764, "step": 1584 }, { "epoch": 2.6638655462184873, "grad_norm": 0.2746600834338096, "learning_rate": 4.241604097986247e-05, "loss": 0.3874, "step": 1585 }, { "epoch": 2.665546218487395, "grad_norm": 0.22793868061777595, "learning_rate": 4.236918304440648e-05, "loss": 0.3868, "step": 1586 }, { "epoch": 2.6672268907563024, "grad_norm": 0.18480237172350641, "learning_rate": 4.232232184606699e-05, "loss": 0.3905, "step": 1587 }, { "epoch": 2.66890756302521, "grad_norm": 0.22618256708205398, "learning_rate": 4.227545744938211e-05, "loss": 0.3808, "step": 1588 }, { "epoch": 2.6705882352941175, "grad_norm": 0.2532743073952666, "learning_rate": 4.222858991889439e-05, "loss": 0.3845, "step": 1589 }, { "epoch": 2.6722689075630255, "grad_norm": 0.27302017039504767, "learning_rate": 4.218171931915064e-05, "loss": 0.3799, "step": 1590 }, { "epoch": 2.673949579831933, "grad_norm": 0.22362192493176802, "learning_rate": 4.2134845714701964e-05, "loss": 0.3873, "step": 1591 }, { "epoch": 2.6756302521008406, "grad_norm": 0.18295867918282335, "learning_rate": 4.208796917010356e-05, "loss": 0.3717, "step": 1592 }, { "epoch": 2.677310924369748, "grad_norm": 0.26487486426204965, "learning_rate": 4.204108974991469e-05, "loss": 0.3817, "step": 1593 }, { "epoch": 2.6789915966386557, "grad_norm": 0.23924709275019643, "learning_rate": 4.199420751869857e-05, "loss": 0.3777, "step": 1594 }, { "epoch": 2.6806722689075633, "grad_norm": 0.20510920718526499, "learning_rate": 4.19473225410223e-05, "loss": 0.384, "step": 1595 }, { "epoch": 2.682352941176471, "grad_norm": 0.1984035540153016, "learning_rate": 4.190043488145676e-05, "loss": 0.3862, "step": 1596 }, { "epoch": 2.6840336134453784, "grad_norm": 0.21199525040240977, "learning_rate": 4.1853544604576504e-05, "loss": 0.391, "step": 1597 }, { "epoch": 2.685714285714286, "grad_norm": 0.20485698156098936, "learning_rate": 4.18066517749597e-05, "loss": 0.3788, "step": 1598 }, { "epoch": 2.6873949579831935, "grad_norm": 0.1931428188013294, "learning_rate": 4.175975645718805e-05, "loss": 0.3759, "step": 1599 }, { "epoch": 2.689075630252101, "grad_norm": 0.2297637360430046, "learning_rate": 4.171285871584667e-05, "loss": 0.3809, "step": 1600 }, { "epoch": 2.6907563025210086, "grad_norm": 0.19986262410539585, "learning_rate": 4.1665958615524005e-05, "loss": 0.3924, "step": 1601 }, { "epoch": 2.692436974789916, "grad_norm": 0.19433718567101033, "learning_rate": 4.161905622081175e-05, "loss": 0.38, "step": 1602 }, { "epoch": 2.6941176470588237, "grad_norm": 0.22812420247240836, "learning_rate": 4.157215159630478e-05, "loss": 0.372, "step": 1603 }, { "epoch": 2.6957983193277313, "grad_norm": 0.17568918026691466, "learning_rate": 4.152524480660103e-05, "loss": 0.3737, "step": 1604 }, { "epoch": 2.697478991596639, "grad_norm": 0.1912063762458744, "learning_rate": 4.147833591630139e-05, "loss": 0.3824, "step": 1605 }, { "epoch": 2.6991596638655464, "grad_norm": 0.22779091577178967, "learning_rate": 4.143142499000969e-05, "loss": 0.373, "step": 1606 }, { "epoch": 2.700840336134454, "grad_norm": 0.2154641689771944, "learning_rate": 4.1384512092332544e-05, "loss": 0.376, "step": 1607 }, { "epoch": 2.7025210084033615, "grad_norm": 0.24847069614446887, "learning_rate": 4.133759728787926e-05, "loss": 0.3858, "step": 1608 }, { "epoch": 2.704201680672269, "grad_norm": 0.1627936875850345, "learning_rate": 4.12906806412618e-05, "loss": 0.3841, "step": 1609 }, { "epoch": 2.7058823529411766, "grad_norm": 0.22943088941714626, "learning_rate": 4.124376221709465e-05, "loss": 0.3916, "step": 1610 }, { "epoch": 2.707563025210084, "grad_norm": 0.2363600362129113, "learning_rate": 4.1196842079994754e-05, "loss": 0.3784, "step": 1611 }, { "epoch": 2.7092436974789917, "grad_norm": 0.1822311632836212, "learning_rate": 4.114992029458141e-05, "loss": 0.3816, "step": 1612 }, { "epoch": 2.7109243697478993, "grad_norm": 0.20655550701532774, "learning_rate": 4.110299692547617e-05, "loss": 0.3892, "step": 1613 }, { "epoch": 2.712605042016807, "grad_norm": 0.20182921783726124, "learning_rate": 4.105607203730278e-05, "loss": 0.376, "step": 1614 }, { "epoch": 2.7142857142857144, "grad_norm": 0.16387151552776577, "learning_rate": 4.100914569468711e-05, "loss": 0.3812, "step": 1615 }, { "epoch": 2.715966386554622, "grad_norm": 0.15692411942698506, "learning_rate": 4.096221796225698e-05, "loss": 0.3776, "step": 1616 }, { "epoch": 2.7176470588235295, "grad_norm": 0.18679212073123183, "learning_rate": 4.091528890464215e-05, "loss": 0.3769, "step": 1617 }, { "epoch": 2.719327731092437, "grad_norm": 0.22906563131443367, "learning_rate": 4.086835858647419e-05, "loss": 0.384, "step": 1618 }, { "epoch": 2.7210084033613446, "grad_norm": 0.22686668893715872, "learning_rate": 4.082142707238644e-05, "loss": 0.3809, "step": 1619 }, { "epoch": 2.722689075630252, "grad_norm": 0.23941047005276977, "learning_rate": 4.077449442701386e-05, "loss": 0.3852, "step": 1620 }, { "epoch": 2.7243697478991598, "grad_norm": 0.17704561727852217, "learning_rate": 4.0727560714992955e-05, "loss": 0.3787, "step": 1621 }, { "epoch": 2.7260504201680673, "grad_norm": 0.19857021405201739, "learning_rate": 4.0680626000961744e-05, "loss": 0.3826, "step": 1622 }, { "epoch": 2.727731092436975, "grad_norm": 0.2455101184323949, "learning_rate": 4.063369034955958e-05, "loss": 0.3683, "step": 1623 }, { "epoch": 2.7294117647058824, "grad_norm": 0.19650081416404103, "learning_rate": 4.0586753825427136e-05, "loss": 0.3779, "step": 1624 }, { "epoch": 2.73109243697479, "grad_norm": 0.22116750966365933, "learning_rate": 4.0539816493206257e-05, "loss": 0.3805, "step": 1625 }, { "epoch": 2.7327731092436975, "grad_norm": 0.22751687006892543, "learning_rate": 4.0492878417539944e-05, "loss": 0.3852, "step": 1626 }, { "epoch": 2.734453781512605, "grad_norm": 0.18150621163688843, "learning_rate": 4.04459396630722e-05, "loss": 0.374, "step": 1627 }, { "epoch": 2.7361344537815127, "grad_norm": 0.22713890376091625, "learning_rate": 4.0399000294447955e-05, "loss": 0.3729, "step": 1628 }, { "epoch": 2.73781512605042, "grad_norm": 0.28929450628671005, "learning_rate": 4.0352060376312974e-05, "loss": 0.3786, "step": 1629 }, { "epoch": 2.7394957983193278, "grad_norm": 0.24088763336550975, "learning_rate": 4.030511997331382e-05, "loss": 0.385, "step": 1630 }, { "epoch": 2.7411764705882353, "grad_norm": 0.1707568229716921, "learning_rate": 4.02581791500977e-05, "loss": 0.3836, "step": 1631 }, { "epoch": 2.742857142857143, "grad_norm": 0.27276018447489403, "learning_rate": 4.0211237971312386e-05, "loss": 0.3787, "step": 1632 }, { "epoch": 2.7445378151260504, "grad_norm": 0.23619910517085452, "learning_rate": 4.016429650160615e-05, "loss": 0.3767, "step": 1633 }, { "epoch": 2.746218487394958, "grad_norm": 0.2114841338599277, "learning_rate": 4.011735480562769e-05, "loss": 0.3832, "step": 1634 }, { "epoch": 2.7478991596638656, "grad_norm": 0.2471231209356055, "learning_rate": 4.007041294802599e-05, "loss": 0.3788, "step": 1635 }, { "epoch": 2.749579831932773, "grad_norm": 0.2466095891439931, "learning_rate": 4.0023470993450234e-05, "loss": 0.3759, "step": 1636 }, { "epoch": 2.7512605042016807, "grad_norm": 0.19352996582028545, "learning_rate": 3.9976529006549786e-05, "loss": 0.3915, "step": 1637 }, { "epoch": 2.7529411764705882, "grad_norm": 0.2584007824477025, "learning_rate": 3.9929587051974025e-05, "loss": 0.3841, "step": 1638 }, { "epoch": 2.754621848739496, "grad_norm": 0.21609063270500545, "learning_rate": 3.988264519437233e-05, "loss": 0.3754, "step": 1639 }, { "epoch": 2.7563025210084033, "grad_norm": 0.17336842070368905, "learning_rate": 3.983570349839386e-05, "loss": 0.3792, "step": 1640 }, { "epoch": 2.757983193277311, "grad_norm": 0.21839046523057584, "learning_rate": 3.9788762028687634e-05, "loss": 0.3893, "step": 1641 }, { "epoch": 2.7596638655462185, "grad_norm": 0.1744968789924748, "learning_rate": 3.974182084990231e-05, "loss": 0.3763, "step": 1642 }, { "epoch": 2.761344537815126, "grad_norm": 0.20310403212029787, "learning_rate": 3.96948800266862e-05, "loss": 0.3857, "step": 1643 }, { "epoch": 2.7630252100840336, "grad_norm": 0.21921768342973283, "learning_rate": 3.964793962368703e-05, "loss": 0.3862, "step": 1644 }, { "epoch": 2.764705882352941, "grad_norm": 0.16268743552537554, "learning_rate": 3.9600999705552065e-05, "loss": 0.3808, "step": 1645 }, { "epoch": 2.7663865546218487, "grad_norm": 0.2292797605370842, "learning_rate": 3.955406033692781e-05, "loss": 0.3768, "step": 1646 }, { "epoch": 2.7680672268907562, "grad_norm": 0.2830869189399112, "learning_rate": 3.950712158246007e-05, "loss": 0.3726, "step": 1647 }, { "epoch": 2.769747899159664, "grad_norm": 0.22546740642094854, "learning_rate": 3.946018350679376e-05, "loss": 0.3753, "step": 1648 }, { "epoch": 2.7714285714285714, "grad_norm": 0.24030467310235162, "learning_rate": 3.941324617457289e-05, "loss": 0.3756, "step": 1649 }, { "epoch": 2.773109243697479, "grad_norm": 0.2748003172317303, "learning_rate": 3.936630965044043e-05, "loss": 0.3721, "step": 1650 }, { "epoch": 2.7747899159663865, "grad_norm": 0.23602000627546263, "learning_rate": 3.931937399903828e-05, "loss": 0.3829, "step": 1651 }, { "epoch": 2.776470588235294, "grad_norm": 0.22355142354415863, "learning_rate": 3.927243928500706e-05, "loss": 0.3817, "step": 1652 }, { "epoch": 2.7781512605042016, "grad_norm": 0.26109550584171337, "learning_rate": 3.9225505572986163e-05, "loss": 0.3898, "step": 1653 }, { "epoch": 2.779831932773109, "grad_norm": 0.24799925749981233, "learning_rate": 3.917857292761357e-05, "loss": 0.3832, "step": 1654 }, { "epoch": 2.7815126050420167, "grad_norm": 0.1939528829591152, "learning_rate": 3.913164141352581e-05, "loss": 0.3752, "step": 1655 }, { "epoch": 2.7831932773109243, "grad_norm": 0.2150312196830647, "learning_rate": 3.9084711095357863e-05, "loss": 0.3819, "step": 1656 }, { "epoch": 2.784873949579832, "grad_norm": 0.2405074318808764, "learning_rate": 3.903778203774302e-05, "loss": 0.3825, "step": 1657 }, { "epoch": 2.7865546218487394, "grad_norm": 0.15416439147389227, "learning_rate": 3.89908543053129e-05, "loss": 0.379, "step": 1658 }, { "epoch": 2.788235294117647, "grad_norm": 0.27918928467820797, "learning_rate": 3.894392796269721e-05, "loss": 0.3779, "step": 1659 }, { "epoch": 2.7899159663865545, "grad_norm": 0.2367763602821621, "learning_rate": 3.889700307452384e-05, "loss": 0.3862, "step": 1660 }, { "epoch": 2.791596638655462, "grad_norm": 0.1659609830579495, "learning_rate": 3.88500797054186e-05, "loss": 0.3949, "step": 1661 }, { "epoch": 2.7932773109243696, "grad_norm": 0.27033397529964337, "learning_rate": 3.880315792000525e-05, "loss": 0.3803, "step": 1662 }, { "epoch": 2.794957983193277, "grad_norm": 0.2140745416092664, "learning_rate": 3.875623778290535e-05, "loss": 0.3788, "step": 1663 }, { "epoch": 2.7966386554621847, "grad_norm": 0.18470299602739929, "learning_rate": 3.8709319358738203e-05, "loss": 0.3805, "step": 1664 }, { "epoch": 2.7983193277310923, "grad_norm": 0.24053203500434275, "learning_rate": 3.866240271212074e-05, "loss": 0.3824, "step": 1665 }, { "epoch": 2.8, "grad_norm": 0.22155212762000334, "learning_rate": 3.861548790766747e-05, "loss": 0.3776, "step": 1666 }, { "epoch": 2.8016806722689074, "grad_norm": 0.17284000327642937, "learning_rate": 3.8568575009990313e-05, "loss": 0.384, "step": 1667 }, { "epoch": 2.803361344537815, "grad_norm": 0.2059832760416997, "learning_rate": 3.8521664083698614e-05, "loss": 0.3713, "step": 1668 }, { "epoch": 2.8050420168067225, "grad_norm": 0.1996298444996923, "learning_rate": 3.847475519339898e-05, "loss": 0.3903, "step": 1669 }, { "epoch": 2.80672268907563, "grad_norm": 0.18470806845797713, "learning_rate": 3.8427848403695235e-05, "loss": 0.3802, "step": 1670 }, { "epoch": 2.8084033613445376, "grad_norm": 0.2054280496183822, "learning_rate": 3.838094377918826e-05, "loss": 0.379, "step": 1671 }, { "epoch": 2.810084033613445, "grad_norm": 0.2104972751669263, "learning_rate": 3.833404138447601e-05, "loss": 0.3773, "step": 1672 }, { "epoch": 2.8117647058823527, "grad_norm": 0.1993487224477167, "learning_rate": 3.828714128415334e-05, "loss": 0.3838, "step": 1673 }, { "epoch": 2.8134453781512603, "grad_norm": 0.18943491060810314, "learning_rate": 3.824024354281196e-05, "loss": 0.3779, "step": 1674 }, { "epoch": 2.815126050420168, "grad_norm": 0.20661147100491337, "learning_rate": 3.819334822504031e-05, "loss": 0.3791, "step": 1675 }, { "epoch": 2.8168067226890754, "grad_norm": 0.21831286795375526, "learning_rate": 3.8146455395423516e-05, "loss": 0.3786, "step": 1676 }, { "epoch": 2.818487394957983, "grad_norm": 0.19863810747020344, "learning_rate": 3.809956511854325e-05, "loss": 0.3818, "step": 1677 }, { "epoch": 2.8201680672268905, "grad_norm": 0.15758478911064144, "learning_rate": 3.805267745897772e-05, "loss": 0.3787, "step": 1678 }, { "epoch": 2.821848739495798, "grad_norm": 0.17810987060322025, "learning_rate": 3.800579248130144e-05, "loss": 0.3806, "step": 1679 }, { "epoch": 2.8235294117647056, "grad_norm": 0.1868084367175716, "learning_rate": 3.795891025008533e-05, "loss": 0.3746, "step": 1680 }, { "epoch": 2.825210084033613, "grad_norm": 0.19599174591181392, "learning_rate": 3.791203082989645e-05, "loss": 0.3789, "step": 1681 }, { "epoch": 2.8268907563025207, "grad_norm": 0.18395211135328424, "learning_rate": 3.7865154285298056e-05, "loss": 0.3866, "step": 1682 }, { "epoch": 2.8285714285714287, "grad_norm": 0.16507270633714777, "learning_rate": 3.7818280680849374e-05, "loss": 0.3769, "step": 1683 }, { "epoch": 2.8302521008403363, "grad_norm": 0.15177049560651867, "learning_rate": 3.777141008110564e-05, "loss": 0.3798, "step": 1684 }, { "epoch": 2.831932773109244, "grad_norm": 0.18482613695240938, "learning_rate": 3.77245425506179e-05, "loss": 0.3721, "step": 1685 }, { "epoch": 2.8336134453781514, "grad_norm": 0.20592248350975814, "learning_rate": 3.767767815393303e-05, "loss": 0.369, "step": 1686 }, { "epoch": 2.835294117647059, "grad_norm": 0.19483435160974505, "learning_rate": 3.763081695559353e-05, "loss": 0.3802, "step": 1687 }, { "epoch": 2.8369747899159665, "grad_norm": 0.17700775746605468, "learning_rate": 3.758395902013755e-05, "loss": 0.377, "step": 1688 }, { "epoch": 2.838655462184874, "grad_norm": 0.1982387018753701, "learning_rate": 3.75371044120987e-05, "loss": 0.3747, "step": 1689 }, { "epoch": 2.8403361344537816, "grad_norm": 0.1868500034420186, "learning_rate": 3.749025319600606e-05, "loss": 0.3709, "step": 1690 }, { "epoch": 2.842016806722689, "grad_norm": 0.39283741892036733, "learning_rate": 3.7443405436383956e-05, "loss": 0.3842, "step": 1691 }, { "epoch": 2.8436974789915967, "grad_norm": 0.2130677346305315, "learning_rate": 3.7396561197752046e-05, "loss": 0.3827, "step": 1692 }, { "epoch": 2.8453781512605043, "grad_norm": 0.18093513947745637, "learning_rate": 3.734972054462511e-05, "loss": 0.3704, "step": 1693 }, { "epoch": 2.847058823529412, "grad_norm": 0.19401983498246286, "learning_rate": 3.730288354151293e-05, "loss": 0.383, "step": 1694 }, { "epoch": 2.8487394957983194, "grad_norm": 0.2054958888003188, "learning_rate": 3.7256050252920346e-05, "loss": 0.3826, "step": 1695 }, { "epoch": 2.850420168067227, "grad_norm": 0.1988615489188937, "learning_rate": 3.720922074334702e-05, "loss": 0.3764, "step": 1696 }, { "epoch": 2.8521008403361345, "grad_norm": 0.20002781301366893, "learning_rate": 3.7162395077287475e-05, "loss": 0.3884, "step": 1697 }, { "epoch": 2.853781512605042, "grad_norm": 0.21012034035078564, "learning_rate": 3.711557331923084e-05, "loss": 0.3747, "step": 1698 }, { "epoch": 2.8554621848739496, "grad_norm": 0.1907152393874672, "learning_rate": 3.7068755533660976e-05, "loss": 0.3852, "step": 1699 }, { "epoch": 2.857142857142857, "grad_norm": 0.3816229271432276, "learning_rate": 3.702194178505619e-05, "loss": 0.3858, "step": 1700 }, { "epoch": 2.8588235294117648, "grad_norm": 0.19217683462441462, "learning_rate": 3.697513213788927e-05, "loss": 0.3863, "step": 1701 }, { "epoch": 2.8605042016806723, "grad_norm": 0.1814156883810769, "learning_rate": 3.6928326656627335e-05, "loss": 0.3731, "step": 1702 }, { "epoch": 2.86218487394958, "grad_norm": 0.21585449529852463, "learning_rate": 3.688152540573178e-05, "loss": 0.385, "step": 1703 }, { "epoch": 2.8638655462184874, "grad_norm": 0.18200278936610234, "learning_rate": 3.6834728449658165e-05, "loss": 0.3785, "step": 1704 }, { "epoch": 2.865546218487395, "grad_norm": 0.17623496082843146, "learning_rate": 3.678793585285616e-05, "loss": 0.3803, "step": 1705 }, { "epoch": 2.8672268907563025, "grad_norm": 0.22564274253698416, "learning_rate": 3.674114767976938e-05, "loss": 0.3678, "step": 1706 }, { "epoch": 2.86890756302521, "grad_norm": 0.19787886654285275, "learning_rate": 3.6694363994835397e-05, "loss": 0.3746, "step": 1707 }, { "epoch": 2.8705882352941177, "grad_norm": 0.17591906034626142, "learning_rate": 3.664758486248557e-05, "loss": 0.3796, "step": 1708 }, { "epoch": 2.872268907563025, "grad_norm": 0.18636391804467706, "learning_rate": 3.6600810347145026e-05, "loss": 0.3794, "step": 1709 }, { "epoch": 2.8739495798319328, "grad_norm": 0.2558808159128357, "learning_rate": 3.6554040513232474e-05, "loss": 0.3843, "step": 1710 }, { "epoch": 2.8756302521008403, "grad_norm": 0.23188576133093924, "learning_rate": 3.650727542516024e-05, "loss": 0.3838, "step": 1711 }, { "epoch": 2.877310924369748, "grad_norm": 0.21188520222458435, "learning_rate": 3.646051514733406e-05, "loss": 0.3806, "step": 1712 }, { "epoch": 2.8789915966386554, "grad_norm": 0.1867084146669841, "learning_rate": 3.64137597441531e-05, "loss": 0.3866, "step": 1713 }, { "epoch": 2.880672268907563, "grad_norm": 0.1946358444550931, "learning_rate": 3.636700928000974e-05, "loss": 0.376, "step": 1714 }, { "epoch": 2.8823529411764706, "grad_norm": 0.19339076341787054, "learning_rate": 3.632026381928964e-05, "loss": 0.3919, "step": 1715 }, { "epoch": 2.884033613445378, "grad_norm": 0.22138685456615412, "learning_rate": 3.62735234263715e-05, "loss": 0.3824, "step": 1716 }, { "epoch": 2.8857142857142857, "grad_norm": 0.2010513215219114, "learning_rate": 3.62267881656271e-05, "loss": 0.3715, "step": 1717 }, { "epoch": 2.8873949579831932, "grad_norm": 0.19374466305321583, "learning_rate": 3.6180058101421084e-05, "loss": 0.3826, "step": 1718 }, { "epoch": 2.889075630252101, "grad_norm": 0.22981720543034997, "learning_rate": 3.6133333298110995e-05, "loss": 0.3834, "step": 1719 }, { "epoch": 2.8907563025210083, "grad_norm": 0.2614975923297543, "learning_rate": 3.608661382004711e-05, "loss": 0.3772, "step": 1720 }, { "epoch": 2.892436974789916, "grad_norm": 0.34027753106056857, "learning_rate": 3.60398997315724e-05, "loss": 0.386, "step": 1721 }, { "epoch": 2.8941176470588235, "grad_norm": 0.28281853151099234, "learning_rate": 3.599319109702234e-05, "loss": 0.393, "step": 1722 }, { "epoch": 2.895798319327731, "grad_norm": 0.27533849340569566, "learning_rate": 3.594648798072498e-05, "loss": 0.3799, "step": 1723 }, { "epoch": 2.8974789915966386, "grad_norm": 0.20721190008874593, "learning_rate": 3.589979044700069e-05, "loss": 0.3759, "step": 1724 }, { "epoch": 2.899159663865546, "grad_norm": 0.22008896333391206, "learning_rate": 3.585309856016225e-05, "loss": 0.3819, "step": 1725 }, { "epoch": 2.9008403361344537, "grad_norm": 0.20372330475554204, "learning_rate": 3.580641238451455e-05, "loss": 0.3877, "step": 1726 }, { "epoch": 2.9025210084033612, "grad_norm": 0.2796744699092284, "learning_rate": 3.57597319843547e-05, "loss": 0.3793, "step": 1727 }, { "epoch": 2.904201680672269, "grad_norm": 0.2314324183277019, "learning_rate": 3.571305742397184e-05, "loss": 0.3798, "step": 1728 }, { "epoch": 2.9058823529411764, "grad_norm": 0.19444295056360406, "learning_rate": 3.566638876764702e-05, "loss": 0.3699, "step": 1729 }, { "epoch": 2.907563025210084, "grad_norm": 0.19933440120344284, "learning_rate": 3.5619726079653215e-05, "loss": 0.3861, "step": 1730 }, { "epoch": 2.9092436974789915, "grad_norm": 0.276354564305299, "learning_rate": 3.557306942425514e-05, "loss": 0.3749, "step": 1731 }, { "epoch": 2.910924369747899, "grad_norm": 0.1906818507099947, "learning_rate": 3.552641886570925e-05, "loss": 0.3797, "step": 1732 }, { "epoch": 2.9126050420168066, "grad_norm": 0.21304212971093986, "learning_rate": 3.547977446826354e-05, "loss": 0.3695, "step": 1733 }, { "epoch": 2.914285714285714, "grad_norm": 0.1930710791623887, "learning_rate": 3.5433136296157565e-05, "loss": 0.3762, "step": 1734 }, { "epoch": 2.9159663865546217, "grad_norm": 0.2350760035737123, "learning_rate": 3.538650441362229e-05, "loss": 0.3784, "step": 1735 }, { "epoch": 2.9176470588235293, "grad_norm": 0.2179319180109049, "learning_rate": 3.533987888488004e-05, "loss": 0.3733, "step": 1736 }, { "epoch": 2.919327731092437, "grad_norm": 0.1983094342500071, "learning_rate": 3.529325977414435e-05, "loss": 0.3775, "step": 1737 }, { "epoch": 2.9210084033613444, "grad_norm": 0.27337948236324233, "learning_rate": 3.524664714561995e-05, "loss": 0.3859, "step": 1738 }, { "epoch": 2.9226890756302524, "grad_norm": 0.1727249954908901, "learning_rate": 3.520004106350261e-05, "loss": 0.3724, "step": 1739 }, { "epoch": 2.92436974789916, "grad_norm": 0.2512563058481026, "learning_rate": 3.515344159197914e-05, "loss": 0.3873, "step": 1740 }, { "epoch": 2.9260504201680675, "grad_norm": 0.2249404832527237, "learning_rate": 3.510684879522718e-05, "loss": 0.3835, "step": 1741 }, { "epoch": 2.927731092436975, "grad_norm": 0.18370943254208957, "learning_rate": 3.506026273741521e-05, "loss": 0.3834, "step": 1742 }, { "epoch": 2.9294117647058826, "grad_norm": 0.20630212914103155, "learning_rate": 3.5013683482702427e-05, "loss": 0.3846, "step": 1743 }, { "epoch": 2.93109243697479, "grad_norm": 0.18769358395913438, "learning_rate": 3.4967111095238687e-05, "loss": 0.3874, "step": 1744 }, { "epoch": 2.9327731092436977, "grad_norm": 0.17373122419298404, "learning_rate": 3.4920545639164315e-05, "loss": 0.3825, "step": 1745 }, { "epoch": 2.9344537815126053, "grad_norm": 0.15867786920800214, "learning_rate": 3.487398717861017e-05, "loss": 0.3782, "step": 1746 }, { "epoch": 2.936134453781513, "grad_norm": 0.21689175952091708, "learning_rate": 3.4827435777697406e-05, "loss": 0.3819, "step": 1747 }, { "epoch": 2.9378151260504204, "grad_norm": 0.22655599370885887, "learning_rate": 3.4780891500537525e-05, "loss": 0.3773, "step": 1748 }, { "epoch": 2.939495798319328, "grad_norm": 0.16793950435180363, "learning_rate": 3.4734354411232156e-05, "loss": 0.3705, "step": 1749 }, { "epoch": 2.9411764705882355, "grad_norm": 0.18646425712548131, "learning_rate": 3.468782457387307e-05, "loss": 0.3767, "step": 1750 }, { "epoch": 2.942857142857143, "grad_norm": 0.2461602170618624, "learning_rate": 3.464130205254202e-05, "loss": 0.3748, "step": 1751 }, { "epoch": 2.9445378151260506, "grad_norm": 0.22106057699932288, "learning_rate": 3.4594786911310725e-05, "loss": 0.3806, "step": 1752 }, { "epoch": 2.946218487394958, "grad_norm": 0.21577664880795344, "learning_rate": 3.454827921424068e-05, "loss": 0.3873, "step": 1753 }, { "epoch": 2.9478991596638657, "grad_norm": 0.2336421662493669, "learning_rate": 3.4501779025383184e-05, "loss": 0.3792, "step": 1754 }, { "epoch": 2.9495798319327733, "grad_norm": 0.3108477071906607, "learning_rate": 3.445528640877916e-05, "loss": 0.3774, "step": 1755 }, { "epoch": 2.951260504201681, "grad_norm": 0.21370360452044412, "learning_rate": 3.4408801428459146e-05, "loss": 0.377, "step": 1756 }, { "epoch": 2.9529411764705884, "grad_norm": 0.17087478068704484, "learning_rate": 3.436232414844308e-05, "loss": 0.384, "step": 1757 }, { "epoch": 2.954621848739496, "grad_norm": 0.24358174177289607, "learning_rate": 3.431585463274039e-05, "loss": 0.3777, "step": 1758 }, { "epoch": 2.9563025210084035, "grad_norm": 0.35690251480283125, "learning_rate": 3.426939294534975e-05, "loss": 0.3861, "step": 1759 }, { "epoch": 2.957983193277311, "grad_norm": 0.27107483062620286, "learning_rate": 3.4222939150259085e-05, "loss": 0.3755, "step": 1760 }, { "epoch": 2.9596638655462186, "grad_norm": 0.21386055772892382, "learning_rate": 3.4176493311445403e-05, "loss": 0.3815, "step": 1761 }, { "epoch": 2.961344537815126, "grad_norm": 0.2859914753739517, "learning_rate": 3.413005549287482e-05, "loss": 0.3922, "step": 1762 }, { "epoch": 2.9630252100840337, "grad_norm": 0.22258684051306665, "learning_rate": 3.408362575850239e-05, "loss": 0.375, "step": 1763 }, { "epoch": 2.9647058823529413, "grad_norm": 0.1923323698340643, "learning_rate": 3.403720417227197e-05, "loss": 0.3781, "step": 1764 }, { "epoch": 2.966386554621849, "grad_norm": 0.2998710950303093, "learning_rate": 3.399079079811627e-05, "loss": 0.3737, "step": 1765 }, { "epoch": 2.9680672268907564, "grad_norm": 0.25623764847016844, "learning_rate": 3.394438569995666e-05, "loss": 0.3789, "step": 1766 }, { "epoch": 2.969747899159664, "grad_norm": 0.19268121135385496, "learning_rate": 3.389798894170312e-05, "loss": 0.3891, "step": 1767 }, { "epoch": 2.9714285714285715, "grad_norm": 0.2807468866016995, "learning_rate": 3.385160058725413e-05, "loss": 0.3922, "step": 1768 }, { "epoch": 2.973109243697479, "grad_norm": 0.28445311997047584, "learning_rate": 3.3805220700496614e-05, "loss": 0.3872, "step": 1769 }, { "epoch": 2.9747899159663866, "grad_norm": 0.17137517793930407, "learning_rate": 3.375884934530581e-05, "loss": 0.3766, "step": 1770 }, { "epoch": 2.976470588235294, "grad_norm": 0.22652297736427132, "learning_rate": 3.371248658554525e-05, "loss": 0.382, "step": 1771 }, { "epoch": 2.9781512605042018, "grad_norm": 0.2348624395705223, "learning_rate": 3.3666132485066564e-05, "loss": 0.3898, "step": 1772 }, { "epoch": 2.9798319327731093, "grad_norm": 0.3256756679645036, "learning_rate": 3.361978710770952e-05, "loss": 0.3813, "step": 1773 }, { "epoch": 2.981512605042017, "grad_norm": 0.21907765482929822, "learning_rate": 3.357345051730183e-05, "loss": 0.3815, "step": 1774 }, { "epoch": 2.9831932773109244, "grad_norm": 0.2507702986936654, "learning_rate": 3.352712277765914e-05, "loss": 0.3858, "step": 1775 }, { "epoch": 2.984873949579832, "grad_norm": 0.5197279678021213, "learning_rate": 3.348080395258485e-05, "loss": 0.3921, "step": 1776 }, { "epoch": 2.9865546218487395, "grad_norm": 0.1983835590145885, "learning_rate": 3.343449410587015e-05, "loss": 0.3832, "step": 1777 }, { "epoch": 2.988235294117647, "grad_norm": 0.5863016810162325, "learning_rate": 3.338819330129381e-05, "loss": 0.3781, "step": 1778 }, { "epoch": 2.9899159663865547, "grad_norm": 0.19523730196591113, "learning_rate": 3.3341901602622196e-05, "loss": 0.3895, "step": 1779 }, { "epoch": 2.991596638655462, "grad_norm": 0.19865126473059094, "learning_rate": 3.329561907360908e-05, "loss": 0.3811, "step": 1780 }, { "epoch": 2.9932773109243698, "grad_norm": 0.19791425424611847, "learning_rate": 3.324934577799565e-05, "loss": 0.3843, "step": 1781 }, { "epoch": 2.9949579831932773, "grad_norm": 0.194360267392776, "learning_rate": 3.3203081779510356e-05, "loss": 0.3839, "step": 1782 }, { "epoch": 2.996638655462185, "grad_norm": 0.20255766285528753, "learning_rate": 3.315682714186885e-05, "loss": 0.3752, "step": 1783 }, { "epoch": 2.9983193277310924, "grad_norm": 0.21695878628570248, "learning_rate": 3.311058192877388e-05, "loss": 0.381, "step": 1784 }, { "epoch": 3.0, "grad_norm": 0.21749500075504694, "learning_rate": 3.3064346203915234e-05, "loss": 0.3745, "step": 1785 }, { "epoch": 3.0016806722689076, "grad_norm": 0.22280617060986443, "learning_rate": 3.301812003096962e-05, "loss": 0.3628, "step": 1786 }, { "epoch": 3.003361344537815, "grad_norm": 0.20257816931286335, "learning_rate": 3.2971903473600614e-05, "loss": 0.3622, "step": 1787 }, { "epoch": 3.0050420168067227, "grad_norm": 0.23458748909048524, "learning_rate": 3.292569659545849e-05, "loss": 0.3569, "step": 1788 }, { "epoch": 3.0067226890756302, "grad_norm": 0.24613559568913423, "learning_rate": 3.2879499460180254e-05, "loss": 0.3514, "step": 1789 }, { "epoch": 3.008403361344538, "grad_norm": 0.2103556190208636, "learning_rate": 3.283331213138946e-05, "loss": 0.3584, "step": 1790 }, { "epoch": 3.0100840336134453, "grad_norm": 0.2685108960562619, "learning_rate": 3.27871346726962e-05, "loss": 0.3519, "step": 1791 }, { "epoch": 3.011764705882353, "grad_norm": 0.20626377041992328, "learning_rate": 3.274096714769689e-05, "loss": 0.3523, "step": 1792 }, { "epoch": 3.0134453781512605, "grad_norm": 0.24305325788137516, "learning_rate": 3.2694809619974357e-05, "loss": 0.3632, "step": 1793 }, { "epoch": 3.015126050420168, "grad_norm": 0.22416707842424805, "learning_rate": 3.264866215309758e-05, "loss": 0.352, "step": 1794 }, { "epoch": 3.0168067226890756, "grad_norm": 0.2100184116889488, "learning_rate": 3.260252481062176e-05, "loss": 0.3505, "step": 1795 }, { "epoch": 3.018487394957983, "grad_norm": 0.188998097817705, "learning_rate": 3.255639765608807e-05, "loss": 0.3579, "step": 1796 }, { "epoch": 3.0201680672268907, "grad_norm": 0.19500939710333584, "learning_rate": 3.2510280753023754e-05, "loss": 0.3588, "step": 1797 }, { "epoch": 3.0218487394957982, "grad_norm": 0.19507818701101728, "learning_rate": 3.246417416494182e-05, "loss": 0.3478, "step": 1798 }, { "epoch": 3.023529411764706, "grad_norm": 0.19591500637184806, "learning_rate": 3.241807795534115e-05, "loss": 0.3521, "step": 1799 }, { "epoch": 3.0252100840336134, "grad_norm": 0.17104011357079368, "learning_rate": 3.2371992187706336e-05, "loss": 0.3558, "step": 1800 }, { "epoch": 3.026890756302521, "grad_norm": 0.19915533284668036, "learning_rate": 3.2325916925507526e-05, "loss": 0.3509, "step": 1801 }, { "epoch": 3.0285714285714285, "grad_norm": 0.16487439436646487, "learning_rate": 3.227985223220049e-05, "loss": 0.3584, "step": 1802 }, { "epoch": 3.030252100840336, "grad_norm": 0.18877924075957264, "learning_rate": 3.223379817122633e-05, "loss": 0.3555, "step": 1803 }, { "epoch": 3.0319327731092436, "grad_norm": 0.23003525091827537, "learning_rate": 3.218775480601161e-05, "loss": 0.3541, "step": 1804 }, { "epoch": 3.033613445378151, "grad_norm": 0.17104497937299987, "learning_rate": 3.214172219996809e-05, "loss": 0.3596, "step": 1805 }, { "epoch": 3.0352941176470587, "grad_norm": 0.22927222057649294, "learning_rate": 3.209570041649278e-05, "loss": 0.35, "step": 1806 }, { "epoch": 3.0369747899159663, "grad_norm": 0.19336606887276167, "learning_rate": 3.204968951896772e-05, "loss": 0.355, "step": 1807 }, { "epoch": 3.038655462184874, "grad_norm": 0.20190685760253888, "learning_rate": 3.200368957075998e-05, "loss": 0.3584, "step": 1808 }, { "epoch": 3.0403361344537814, "grad_norm": 0.19584659323029102, "learning_rate": 3.1957700635221565e-05, "loss": 0.3512, "step": 1809 }, { "epoch": 3.042016806722689, "grad_norm": 0.1958492280349936, "learning_rate": 3.1911722775689315e-05, "loss": 0.356, "step": 1810 }, { "epoch": 3.0436974789915965, "grad_norm": 0.17787207913673278, "learning_rate": 3.186575605548478e-05, "loss": 0.3573, "step": 1811 }, { "epoch": 3.045378151260504, "grad_norm": 0.1917923147199365, "learning_rate": 3.1819800537914206e-05, "loss": 0.3623, "step": 1812 }, { "epoch": 3.0470588235294116, "grad_norm": 0.15541614098953865, "learning_rate": 3.1773856286268395e-05, "loss": 0.3529, "step": 1813 }, { "epoch": 3.048739495798319, "grad_norm": 0.16921467465671286, "learning_rate": 3.172792336382265e-05, "loss": 0.3646, "step": 1814 }, { "epoch": 3.0504201680672267, "grad_norm": 0.16782596938563882, "learning_rate": 3.168200183383662e-05, "loss": 0.3615, "step": 1815 }, { "epoch": 3.0521008403361343, "grad_norm": 0.16695321585760237, "learning_rate": 3.163609175955435e-05, "loss": 0.3456, "step": 1816 }, { "epoch": 3.053781512605042, "grad_norm": 0.16279435039021362, "learning_rate": 3.1590193204204006e-05, "loss": 0.3542, "step": 1817 }, { "epoch": 3.0554621848739494, "grad_norm": 0.15658185809633754, "learning_rate": 3.1544306230998e-05, "loss": 0.3605, "step": 1818 }, { "epoch": 3.057142857142857, "grad_norm": 0.16825765565346693, "learning_rate": 3.1498430903132677e-05, "loss": 0.356, "step": 1819 }, { "epoch": 3.0588235294117645, "grad_norm": 0.1714789480169634, "learning_rate": 3.1452567283788456e-05, "loss": 0.3545, "step": 1820 }, { "epoch": 3.060504201680672, "grad_norm": 0.4894547335916988, "learning_rate": 3.140671543612953e-05, "loss": 0.3609, "step": 1821 }, { "epoch": 3.0621848739495796, "grad_norm": 0.16782891871696737, "learning_rate": 3.1360875423303975e-05, "loss": 0.3581, "step": 1822 }, { "epoch": 3.063865546218487, "grad_norm": 0.18189573104086698, "learning_rate": 3.131504730844347e-05, "loss": 0.3498, "step": 1823 }, { "epoch": 3.065546218487395, "grad_norm": 0.1755385985931241, "learning_rate": 3.1269231154663405e-05, "loss": 0.3526, "step": 1824 }, { "epoch": 3.0672268907563027, "grad_norm": 0.14613581128086994, "learning_rate": 3.1223427025062604e-05, "loss": 0.3526, "step": 1825 }, { "epoch": 3.0689075630252103, "grad_norm": 0.2132527748923217, "learning_rate": 3.117763498272343e-05, "loss": 0.3536, "step": 1826 }, { "epoch": 3.070588235294118, "grad_norm": 0.14557869009782035, "learning_rate": 3.11318550907115e-05, "loss": 0.3593, "step": 1827 }, { "epoch": 3.0722689075630254, "grad_norm": 0.16114120746375266, "learning_rate": 3.108608741207576e-05, "loss": 0.3488, "step": 1828 }, { "epoch": 3.073949579831933, "grad_norm": 0.1479667758675598, "learning_rate": 3.104033200984831e-05, "loss": 0.3535, "step": 1829 }, { "epoch": 3.0756302521008405, "grad_norm": 0.1675586720991456, "learning_rate": 3.099458894704438e-05, "loss": 0.3471, "step": 1830 }, { "epoch": 3.077310924369748, "grad_norm": 0.17606523669711138, "learning_rate": 3.094885828666214e-05, "loss": 0.3502, "step": 1831 }, { "epoch": 3.0789915966386556, "grad_norm": 0.15758658583789104, "learning_rate": 3.090314009168276e-05, "loss": 0.353, "step": 1832 }, { "epoch": 3.080672268907563, "grad_norm": 0.17956054344497363, "learning_rate": 3.085743442507015e-05, "loss": 0.3563, "step": 1833 }, { "epoch": 3.0823529411764707, "grad_norm": 0.17230403810657138, "learning_rate": 3.081174134977105e-05, "loss": 0.3512, "step": 1834 }, { "epoch": 3.0840336134453783, "grad_norm": 0.1692997049512165, "learning_rate": 3.0766060928714815e-05, "loss": 0.3536, "step": 1835 }, { "epoch": 3.085714285714286, "grad_norm": 0.1576119770081443, "learning_rate": 3.072039322481337e-05, "loss": 0.3511, "step": 1836 }, { "epoch": 3.0873949579831934, "grad_norm": 0.2131159846669317, "learning_rate": 3.0674738300961164e-05, "loss": 0.3588, "step": 1837 }, { "epoch": 3.089075630252101, "grad_norm": 0.1574682025660605, "learning_rate": 3.0629096220034986e-05, "loss": 0.3506, "step": 1838 }, { "epoch": 3.0907563025210085, "grad_norm": 0.17357655136770564, "learning_rate": 3.058346704489399e-05, "loss": 0.3564, "step": 1839 }, { "epoch": 3.092436974789916, "grad_norm": 0.22368061431875697, "learning_rate": 3.0537850838379524e-05, "loss": 0.3488, "step": 1840 }, { "epoch": 3.0941176470588236, "grad_norm": 0.12566669111131115, "learning_rate": 3.049224766331511e-05, "loss": 0.3533, "step": 1841 }, { "epoch": 3.095798319327731, "grad_norm": 0.18357210756679374, "learning_rate": 3.0446657582506262e-05, "loss": 0.3522, "step": 1842 }, { "epoch": 3.0974789915966388, "grad_norm": 0.15993693452807173, "learning_rate": 3.0401080658740526e-05, "loss": 0.3571, "step": 1843 }, { "epoch": 3.0991596638655463, "grad_norm": 0.19134839559292116, "learning_rate": 3.035551695478728e-05, "loss": 0.3596, "step": 1844 }, { "epoch": 3.100840336134454, "grad_norm": 0.17116546452046968, "learning_rate": 3.030996653339775e-05, "loss": 0.3561, "step": 1845 }, { "epoch": 3.1025210084033614, "grad_norm": 0.17554699209149846, "learning_rate": 3.0264429457304795e-05, "loss": 0.3548, "step": 1846 }, { "epoch": 3.104201680672269, "grad_norm": 0.17364139806572726, "learning_rate": 3.0218905789222952e-05, "loss": 0.3482, "step": 1847 }, { "epoch": 3.1058823529411765, "grad_norm": 0.18844889803732087, "learning_rate": 3.0173395591848264e-05, "loss": 0.3564, "step": 1848 }, { "epoch": 3.107563025210084, "grad_norm": 0.1597317602991232, "learning_rate": 3.0127898927858255e-05, "loss": 0.3453, "step": 1849 }, { "epoch": 3.1092436974789917, "grad_norm": 0.1692008508657394, "learning_rate": 3.0082415859911757e-05, "loss": 0.3655, "step": 1850 }, { "epoch": 3.110924369747899, "grad_norm": 0.1826776779679122, "learning_rate": 3.0036946450648937e-05, "loss": 0.3524, "step": 1851 }, { "epoch": 3.1126050420168068, "grad_norm": 0.15880145488098465, "learning_rate": 2.9991490762691097e-05, "loss": 0.3605, "step": 1852 }, { "epoch": 3.1142857142857143, "grad_norm": 0.19643160797008022, "learning_rate": 2.9946048858640692e-05, "loss": 0.3627, "step": 1853 }, { "epoch": 3.115966386554622, "grad_norm": 0.19554983945840174, "learning_rate": 2.9900620801081133e-05, "loss": 0.3665, "step": 1854 }, { "epoch": 3.1176470588235294, "grad_norm": 0.13316735630541737, "learning_rate": 2.985520665257683e-05, "loss": 0.3577, "step": 1855 }, { "epoch": 3.119327731092437, "grad_norm": 0.18646691532126622, "learning_rate": 2.9809806475672985e-05, "loss": 0.3547, "step": 1856 }, { "epoch": 3.1210084033613446, "grad_norm": 0.17183736328169574, "learning_rate": 2.9764420332895594e-05, "loss": 0.3591, "step": 1857 }, { "epoch": 3.122689075630252, "grad_norm": 0.168273018842077, "learning_rate": 2.971904828675128e-05, "loss": 0.3413, "step": 1858 }, { "epoch": 3.1243697478991597, "grad_norm": 0.17224405998880396, "learning_rate": 2.9673690399727305e-05, "loss": 0.3477, "step": 1859 }, { "epoch": 3.1260504201680672, "grad_norm": 0.154830966722606, "learning_rate": 2.9628346734291383e-05, "loss": 0.3562, "step": 1860 }, { "epoch": 3.127731092436975, "grad_norm": 0.14295017107353047, "learning_rate": 2.9583017352891698e-05, "loss": 0.3523, "step": 1861 }, { "epoch": 3.1294117647058823, "grad_norm": 0.15574227844672175, "learning_rate": 2.9537702317956692e-05, "loss": 0.3507, "step": 1862 }, { "epoch": 3.13109243697479, "grad_norm": 0.15913364457511336, "learning_rate": 2.9492401691895104e-05, "loss": 0.3603, "step": 1863 }, { "epoch": 3.1327731092436975, "grad_norm": 0.14308202988772545, "learning_rate": 2.9447115537095795e-05, "loss": 0.354, "step": 1864 }, { "epoch": 3.134453781512605, "grad_norm": 0.17124762688984238, "learning_rate": 2.9401843915927732e-05, "loss": 0.3482, "step": 1865 }, { "epoch": 3.1361344537815126, "grad_norm": 0.14306150959979308, "learning_rate": 2.935658689073982e-05, "loss": 0.3497, "step": 1866 }, { "epoch": 3.13781512605042, "grad_norm": 0.17301406898997965, "learning_rate": 2.9311344523860915e-05, "loss": 0.3603, "step": 1867 }, { "epoch": 3.1394957983193277, "grad_norm": 0.164923586784263, "learning_rate": 2.9266116877599623e-05, "loss": 0.3589, "step": 1868 }, { "epoch": 3.1411764705882352, "grad_norm": 0.18120323294396565, "learning_rate": 2.9220904014244334e-05, "loss": 0.3594, "step": 1869 }, { "epoch": 3.142857142857143, "grad_norm": 0.16203667293626348, "learning_rate": 2.9175705996063054e-05, "loss": 0.3544, "step": 1870 }, { "epoch": 3.1445378151260504, "grad_norm": 0.1656881330883425, "learning_rate": 2.9130522885303328e-05, "loss": 0.3531, "step": 1871 }, { "epoch": 3.146218487394958, "grad_norm": 0.17566625598350963, "learning_rate": 2.908535474419222e-05, "loss": 0.3581, "step": 1872 }, { "epoch": 3.1478991596638655, "grad_norm": 0.15216880401050345, "learning_rate": 2.90402016349361e-05, "loss": 0.3549, "step": 1873 }, { "epoch": 3.149579831932773, "grad_norm": 0.18708249133611607, "learning_rate": 2.8995063619720717e-05, "loss": 0.3531, "step": 1874 }, { "epoch": 3.1512605042016806, "grad_norm": 0.1638543395499845, "learning_rate": 2.8949940760710967e-05, "loss": 0.3486, "step": 1875 }, { "epoch": 3.152941176470588, "grad_norm": 0.15322099157034438, "learning_rate": 2.890483312005094e-05, "loss": 0.3586, "step": 1876 }, { "epoch": 3.1546218487394957, "grad_norm": 0.17147191380916235, "learning_rate": 2.8859740759863686e-05, "loss": 0.3551, "step": 1877 }, { "epoch": 3.1563025210084033, "grad_norm": 0.1740841473790802, "learning_rate": 2.881466374225128e-05, "loss": 0.3585, "step": 1878 }, { "epoch": 3.157983193277311, "grad_norm": 0.16930973206964614, "learning_rate": 2.876960212929462e-05, "loss": 0.3519, "step": 1879 }, { "epoch": 3.1596638655462184, "grad_norm": 0.1542333433272394, "learning_rate": 2.8724555983053445e-05, "loss": 0.355, "step": 1880 }, { "epoch": 3.161344537815126, "grad_norm": 0.21294692930812253, "learning_rate": 2.867952536556611e-05, "loss": 0.3633, "step": 1881 }, { "epoch": 3.1630252100840335, "grad_norm": 0.17695508328774703, "learning_rate": 2.8634510338849648e-05, "loss": 0.3491, "step": 1882 }, { "epoch": 3.164705882352941, "grad_norm": 0.14525983197713926, "learning_rate": 2.8589510964899607e-05, "loss": 0.3521, "step": 1883 }, { "epoch": 3.1663865546218486, "grad_norm": 0.16659755688138533, "learning_rate": 2.8544527305689977e-05, "loss": 0.3569, "step": 1884 }, { "epoch": 3.168067226890756, "grad_norm": 0.1373294273200851, "learning_rate": 2.8499559423173078e-05, "loss": 0.3508, "step": 1885 }, { "epoch": 3.1697478991596637, "grad_norm": 0.14006450145575663, "learning_rate": 2.845460737927955e-05, "loss": 0.3579, "step": 1886 }, { "epoch": 3.1714285714285713, "grad_norm": 0.14606873229907272, "learning_rate": 2.8409671235918175e-05, "loss": 0.3461, "step": 1887 }, { "epoch": 3.173109243697479, "grad_norm": 0.1657317965818624, "learning_rate": 2.836475105497588e-05, "loss": 0.3493, "step": 1888 }, { "epoch": 3.1747899159663864, "grad_norm": 0.12048595660208694, "learning_rate": 2.831984689831755e-05, "loss": 0.3457, "step": 1889 }, { "epoch": 3.176470588235294, "grad_norm": 0.1404436049531355, "learning_rate": 2.8274958827786066e-05, "loss": 0.3545, "step": 1890 }, { "epoch": 3.1781512605042015, "grad_norm": 0.14322971230821063, "learning_rate": 2.82300869052021e-05, "loss": 0.3484, "step": 1891 }, { "epoch": 3.179831932773109, "grad_norm": 0.1510607887299229, "learning_rate": 2.8185231192364137e-05, "loss": 0.3505, "step": 1892 }, { "epoch": 3.1815126050420166, "grad_norm": 0.1609188154042469, "learning_rate": 2.814039175104828e-05, "loss": 0.3501, "step": 1893 }, { "epoch": 3.183193277310924, "grad_norm": 0.15572764951620163, "learning_rate": 2.8095568643008272e-05, "loss": 0.3533, "step": 1894 }, { "epoch": 3.184873949579832, "grad_norm": 0.15399633505578417, "learning_rate": 2.805076192997533e-05, "loss": 0.3506, "step": 1895 }, { "epoch": 3.1865546218487397, "grad_norm": 0.1386260143041474, "learning_rate": 2.800597167365812e-05, "loss": 0.3578, "step": 1896 }, { "epoch": 3.1882352941176473, "grad_norm": 0.1470820402838719, "learning_rate": 2.7961197935742594e-05, "loss": 0.3531, "step": 1897 }, { "epoch": 3.189915966386555, "grad_norm": 0.15574809358583958, "learning_rate": 2.791644077789203e-05, "loss": 0.3482, "step": 1898 }, { "epoch": 3.1915966386554624, "grad_norm": 0.13616793236802405, "learning_rate": 2.7871700261746794e-05, "loss": 0.3557, "step": 1899 }, { "epoch": 3.19327731092437, "grad_norm": 0.14835447145935918, "learning_rate": 2.782697644892438e-05, "loss": 0.3591, "step": 1900 }, { "epoch": 3.1949579831932775, "grad_norm": 0.13749393564475926, "learning_rate": 2.7782269401019263e-05, "loss": 0.3536, "step": 1901 }, { "epoch": 3.196638655462185, "grad_norm": 0.1449380775953962, "learning_rate": 2.7737579179602856e-05, "loss": 0.3592, "step": 1902 }, { "epoch": 3.1983193277310926, "grad_norm": 0.1467298489504593, "learning_rate": 2.7692905846223327e-05, "loss": 0.346, "step": 1903 }, { "epoch": 3.2, "grad_norm": 0.1464872562394112, "learning_rate": 2.764824946240567e-05, "loss": 0.3542, "step": 1904 }, { "epoch": 3.2016806722689077, "grad_norm": 0.14132356986052827, "learning_rate": 2.7603610089651467e-05, "loss": 0.3541, "step": 1905 }, { "epoch": 3.2033613445378153, "grad_norm": 0.13287422678018102, "learning_rate": 2.7558987789438928e-05, "loss": 0.3591, "step": 1906 }, { "epoch": 3.205042016806723, "grad_norm": 0.1439518600236742, "learning_rate": 2.7514382623222732e-05, "loss": 0.3517, "step": 1907 }, { "epoch": 3.2067226890756304, "grad_norm": 0.1250830625848471, "learning_rate": 2.7469794652433927e-05, "loss": 0.3569, "step": 1908 }, { "epoch": 3.208403361344538, "grad_norm": 0.14771480212495589, "learning_rate": 2.7425223938479918e-05, "loss": 0.3533, "step": 1909 }, { "epoch": 3.2100840336134455, "grad_norm": 0.1345728302442638, "learning_rate": 2.738067054274433e-05, "loss": 0.3481, "step": 1910 }, { "epoch": 3.211764705882353, "grad_norm": 0.14434203307299254, "learning_rate": 2.7336134526586957e-05, "loss": 0.3511, "step": 1911 }, { "epoch": 3.2134453781512606, "grad_norm": 0.1640356658144656, "learning_rate": 2.729161595134361e-05, "loss": 0.3551, "step": 1912 }, { "epoch": 3.215126050420168, "grad_norm": 0.16424862564723933, "learning_rate": 2.724711487832613e-05, "loss": 0.3658, "step": 1913 }, { "epoch": 3.2168067226890757, "grad_norm": 0.16021237698428148, "learning_rate": 2.7202631368822215e-05, "loss": 0.3541, "step": 1914 }, { "epoch": 3.2184873949579833, "grad_norm": 0.1886028258314919, "learning_rate": 2.7158165484095414e-05, "loss": 0.3492, "step": 1915 }, { "epoch": 3.220168067226891, "grad_norm": 0.16028609945825245, "learning_rate": 2.711371728538496e-05, "loss": 0.3477, "step": 1916 }, { "epoch": 3.2218487394957984, "grad_norm": 0.19723928108266392, "learning_rate": 2.7069286833905766e-05, "loss": 0.3577, "step": 1917 }, { "epoch": 3.223529411764706, "grad_norm": 0.12998572977757855, "learning_rate": 2.702487419084827e-05, "loss": 0.3553, "step": 1918 }, { "epoch": 3.2252100840336135, "grad_norm": 0.17743025937515647, "learning_rate": 2.6980479417378425e-05, "loss": 0.3549, "step": 1919 }, { "epoch": 3.226890756302521, "grad_norm": 0.1497825908713419, "learning_rate": 2.6936102574637522e-05, "loss": 0.3619, "step": 1920 }, { "epoch": 3.2285714285714286, "grad_norm": 0.16128196080098783, "learning_rate": 2.6891743723742202e-05, "loss": 0.3568, "step": 1921 }, { "epoch": 3.230252100840336, "grad_norm": 0.15538003193397912, "learning_rate": 2.68474029257843e-05, "loss": 0.3529, "step": 1922 }, { "epoch": 3.2319327731092438, "grad_norm": 0.16448770740135793, "learning_rate": 2.6803080241830816e-05, "loss": 0.3614, "step": 1923 }, { "epoch": 3.2336134453781513, "grad_norm": 0.14344999492768906, "learning_rate": 2.6758775732923755e-05, "loss": 0.351, "step": 1924 }, { "epoch": 3.235294117647059, "grad_norm": 0.1588586495409182, "learning_rate": 2.671448946008014e-05, "loss": 0.3499, "step": 1925 }, { "epoch": 3.2369747899159664, "grad_norm": 0.14759820325373654, "learning_rate": 2.667022148429186e-05, "loss": 0.3554, "step": 1926 }, { "epoch": 3.238655462184874, "grad_norm": 0.16898649269017468, "learning_rate": 2.6625971866525614e-05, "loss": 0.3622, "step": 1927 }, { "epoch": 3.2403361344537815, "grad_norm": 0.14237974604255083, "learning_rate": 2.6581740667722784e-05, "loss": 0.3516, "step": 1928 }, { "epoch": 3.242016806722689, "grad_norm": 0.14474891449352, "learning_rate": 2.6537527948799435e-05, "loss": 0.3501, "step": 1929 }, { "epoch": 3.2436974789915967, "grad_norm": 0.13729409453820543, "learning_rate": 2.649333377064613e-05, "loss": 0.3504, "step": 1930 }, { "epoch": 3.245378151260504, "grad_norm": 0.1462712282554382, "learning_rate": 2.6449158194127957e-05, "loss": 0.3539, "step": 1931 }, { "epoch": 3.2470588235294118, "grad_norm": 0.1390246256680536, "learning_rate": 2.640500128008431e-05, "loss": 0.3563, "step": 1932 }, { "epoch": 3.2487394957983193, "grad_norm": 0.16271921344485282, "learning_rate": 2.636086308932897e-05, "loss": 0.3589, "step": 1933 }, { "epoch": 3.250420168067227, "grad_norm": 0.1318720824936104, "learning_rate": 2.6316743682649842e-05, "loss": 0.3509, "step": 1934 }, { "epoch": 3.2521008403361344, "grad_norm": 0.6498289619284929, "learning_rate": 2.6272643120809037e-05, "loss": 0.3656, "step": 1935 }, { "epoch": 3.253781512605042, "grad_norm": 0.13914105700669235, "learning_rate": 2.6228561464542655e-05, "loss": 0.3526, "step": 1936 }, { "epoch": 3.2554621848739496, "grad_norm": 0.12913308094894888, "learning_rate": 2.618449877456081e-05, "loss": 0.345, "step": 1937 }, { "epoch": 3.257142857142857, "grad_norm": 0.2347287624258223, "learning_rate": 2.614045511154743e-05, "loss": 0.3518, "step": 1938 }, { "epoch": 3.2588235294117647, "grad_norm": 0.1467265256013064, "learning_rate": 2.609643053616031e-05, "loss": 0.3518, "step": 1939 }, { "epoch": 3.2605042016806722, "grad_norm": 0.15237025699760656, "learning_rate": 2.605242510903091e-05, "loss": 0.3458, "step": 1940 }, { "epoch": 3.26218487394958, "grad_norm": 0.15076042172564533, "learning_rate": 2.6008438890764334e-05, "loss": 0.3435, "step": 1941 }, { "epoch": 3.2638655462184873, "grad_norm": 0.15840805678184622, "learning_rate": 2.5964471941939248e-05, "loss": 0.3537, "step": 1942 }, { "epoch": 3.265546218487395, "grad_norm": 0.14728022730410523, "learning_rate": 2.592052432310773e-05, "loss": 0.3606, "step": 1943 }, { "epoch": 3.2672268907563025, "grad_norm": 0.1643614593679342, "learning_rate": 2.5876596094795287e-05, "loss": 0.3596, "step": 1944 }, { "epoch": 3.26890756302521, "grad_norm": 0.15045720403439705, "learning_rate": 2.58326873175007e-05, "loss": 0.3547, "step": 1945 }, { "epoch": 3.2705882352941176, "grad_norm": 0.1912461848316963, "learning_rate": 2.578879805169598e-05, "loss": 0.3604, "step": 1946 }, { "epoch": 3.272268907563025, "grad_norm": 0.15620250668445412, "learning_rate": 2.5744928357826217e-05, "loss": 0.3539, "step": 1947 }, { "epoch": 3.2739495798319327, "grad_norm": 0.14955305180934367, "learning_rate": 2.5701078296309608e-05, "loss": 0.3538, "step": 1948 }, { "epoch": 3.2756302521008402, "grad_norm": 0.14355373028055324, "learning_rate": 2.5657247927537263e-05, "loss": 0.3501, "step": 1949 }, { "epoch": 3.277310924369748, "grad_norm": 0.1559689203721795, "learning_rate": 2.5613437311873214e-05, "loss": 0.3437, "step": 1950 }, { "epoch": 3.2789915966386554, "grad_norm": 0.14335328283199514, "learning_rate": 2.5569646509654248e-05, "loss": 0.3546, "step": 1951 }, { "epoch": 3.280672268907563, "grad_norm": 0.15385030733552413, "learning_rate": 2.5525875581189885e-05, "loss": 0.3533, "step": 1952 }, { "epoch": 3.2823529411764705, "grad_norm": 0.161475220954115, "learning_rate": 2.5482124586762274e-05, "loss": 0.3566, "step": 1953 }, { "epoch": 3.284033613445378, "grad_norm": 0.12897737153040983, "learning_rate": 2.543839358662612e-05, "loss": 0.3548, "step": 1954 }, { "epoch": 3.2857142857142856, "grad_norm": 0.15081936529196188, "learning_rate": 2.5394682641008554e-05, "loss": 0.3541, "step": 1955 }, { "epoch": 3.287394957983193, "grad_norm": 0.16370805949462716, "learning_rate": 2.535099181010914e-05, "loss": 0.3498, "step": 1956 }, { "epoch": 3.2890756302521007, "grad_norm": 0.14678799306756024, "learning_rate": 2.5307321154099693e-05, "loss": 0.3522, "step": 1957 }, { "epoch": 3.2907563025210083, "grad_norm": 0.1882507708829521, "learning_rate": 2.5263670733124293e-05, "loss": 0.3564, "step": 1958 }, { "epoch": 3.292436974789916, "grad_norm": 0.1594483364998737, "learning_rate": 2.5220040607299078e-05, "loss": 0.3509, "step": 1959 }, { "epoch": 3.2941176470588234, "grad_norm": 0.1228387319949736, "learning_rate": 2.5176430836712315e-05, "loss": 0.3422, "step": 1960 }, { "epoch": 3.295798319327731, "grad_norm": 0.13918571956830172, "learning_rate": 2.5132841481424182e-05, "loss": 0.3572, "step": 1961 }, { "epoch": 3.2974789915966385, "grad_norm": 0.1366772881561171, "learning_rate": 2.5089272601466777e-05, "loss": 0.3525, "step": 1962 }, { "epoch": 3.299159663865546, "grad_norm": 0.1548837932666703, "learning_rate": 2.504572425684396e-05, "loss": 0.3538, "step": 1963 }, { "epoch": 3.3008403361344536, "grad_norm": 0.13932546830887108, "learning_rate": 2.5002196507531346e-05, "loss": 0.3471, "step": 1964 }, { "epoch": 3.302521008403361, "grad_norm": 1.119962274593432, "learning_rate": 2.4958689413476167e-05, "loss": 0.3626, "step": 1965 }, { "epoch": 3.3042016806722687, "grad_norm": 0.13964216066207474, "learning_rate": 2.4915203034597223e-05, "loss": 0.3484, "step": 1966 }, { "epoch": 3.3058823529411763, "grad_norm": 0.13688409026783685, "learning_rate": 2.4871737430784747e-05, "loss": 0.3426, "step": 1967 }, { "epoch": 3.307563025210084, "grad_norm": 0.1377143839801748, "learning_rate": 2.4828292661900424e-05, "loss": 0.3526, "step": 1968 }, { "epoch": 3.3092436974789914, "grad_norm": 0.15548976521696117, "learning_rate": 2.4784868787777178e-05, "loss": 0.359, "step": 1969 }, { "epoch": 3.310924369747899, "grad_norm": 0.1420902514692649, "learning_rate": 2.4741465868219197e-05, "loss": 0.3503, "step": 1970 }, { "epoch": 3.3126050420168065, "grad_norm": 0.14391209270559685, "learning_rate": 2.4698083963001797e-05, "loss": 0.3585, "step": 1971 }, { "epoch": 3.314285714285714, "grad_norm": 0.13572216577058324, "learning_rate": 2.465472313187137e-05, "loss": 0.3635, "step": 1972 }, { "epoch": 3.3159663865546216, "grad_norm": 0.13463331964539185, "learning_rate": 2.4611383434545245e-05, "loss": 0.3498, "step": 1973 }, { "epoch": 3.317647058823529, "grad_norm": 0.13336325098010765, "learning_rate": 2.4568064930711682e-05, "loss": 0.3506, "step": 1974 }, { "epoch": 3.3193277310924367, "grad_norm": 0.129887704645444, "learning_rate": 2.452476768002974e-05, "loss": 0.3436, "step": 1975 }, { "epoch": 3.3210084033613447, "grad_norm": 0.12792887563692792, "learning_rate": 2.44814917421292e-05, "loss": 0.3525, "step": 1976 }, { "epoch": 3.3226890756302523, "grad_norm": 0.13129589535437466, "learning_rate": 2.4438237176610527e-05, "loss": 0.354, "step": 1977 }, { "epoch": 3.32436974789916, "grad_norm": 0.1283245185291386, "learning_rate": 2.439500404304468e-05, "loss": 0.3481, "step": 1978 }, { "epoch": 3.3260504201680674, "grad_norm": 0.12232341709813775, "learning_rate": 2.435179240097318e-05, "loss": 0.345, "step": 1979 }, { "epoch": 3.327731092436975, "grad_norm": 0.1400385007717888, "learning_rate": 2.4308602309907893e-05, "loss": 0.3467, "step": 1980 }, { "epoch": 3.3294117647058825, "grad_norm": 0.5307501304564584, "learning_rate": 2.4265433829331046e-05, "loss": 0.3591, "step": 1981 }, { "epoch": 3.33109243697479, "grad_norm": 0.1578757350798885, "learning_rate": 2.4222287018695062e-05, "loss": 0.3542, "step": 1982 }, { "epoch": 3.3327731092436976, "grad_norm": 0.14667395611060827, "learning_rate": 2.4179161937422554e-05, "loss": 0.3504, "step": 1983 }, { "epoch": 3.334453781512605, "grad_norm": 0.1509615421886387, "learning_rate": 2.413605864490619e-05, "loss": 0.3611, "step": 1984 }, { "epoch": 3.3361344537815127, "grad_norm": 0.14429316412242552, "learning_rate": 2.4092977200508648e-05, "loss": 0.3556, "step": 1985 }, { "epoch": 3.3378151260504203, "grad_norm": 0.14931875308220333, "learning_rate": 2.404991766356249e-05, "loss": 0.3576, "step": 1986 }, { "epoch": 3.339495798319328, "grad_norm": 0.13167798804589512, "learning_rate": 2.400688009337013e-05, "loss": 0.3512, "step": 1987 }, { "epoch": 3.3411764705882354, "grad_norm": 0.13911776724643646, "learning_rate": 2.3963864549203706e-05, "loss": 0.3602, "step": 1988 }, { "epoch": 3.342857142857143, "grad_norm": 0.1342929112043367, "learning_rate": 2.3920871090305066e-05, "loss": 0.3552, "step": 1989 }, { "epoch": 3.3445378151260505, "grad_norm": 0.13594452089232215, "learning_rate": 2.3877899775885576e-05, "loss": 0.3547, "step": 1990 }, { "epoch": 3.346218487394958, "grad_norm": 0.1470117406529032, "learning_rate": 2.383495066512617e-05, "loss": 0.3462, "step": 1991 }, { "epoch": 3.3478991596638656, "grad_norm": 0.13704434405861168, "learning_rate": 2.3792023817177138e-05, "loss": 0.3511, "step": 1992 }, { "epoch": 3.349579831932773, "grad_norm": 0.14370877789246198, "learning_rate": 2.3749119291158192e-05, "loss": 0.3526, "step": 1993 }, { "epoch": 3.3512605042016808, "grad_norm": 0.1248478707121084, "learning_rate": 2.3706237146158203e-05, "loss": 0.3538, "step": 1994 }, { "epoch": 3.3529411764705883, "grad_norm": 0.1597970208546011, "learning_rate": 2.3663377441235297e-05, "loss": 0.3517, "step": 1995 }, { "epoch": 3.354621848739496, "grad_norm": 0.13160293582586544, "learning_rate": 2.3620540235416654e-05, "loss": 0.3569, "step": 1996 }, { "epoch": 3.3563025210084034, "grad_norm": 0.13995737391164267, "learning_rate": 2.3577725587698504e-05, "loss": 0.3553, "step": 1997 }, { "epoch": 3.357983193277311, "grad_norm": 0.12640257352920953, "learning_rate": 2.353493355704595e-05, "loss": 0.3494, "step": 1998 }, { "epoch": 3.3596638655462185, "grad_norm": 0.13475013203412362, "learning_rate": 2.3492164202393014e-05, "loss": 0.3627, "step": 1999 }, { "epoch": 3.361344537815126, "grad_norm": 0.137676770304597, "learning_rate": 2.344941758264243e-05, "loss": 0.3459, "step": 2000 }, { "epoch": 3.3630252100840337, "grad_norm": 0.135503848704927, "learning_rate": 2.3406693756665687e-05, "loss": 0.3526, "step": 2001 }, { "epoch": 3.364705882352941, "grad_norm": 0.13971320173003543, "learning_rate": 2.336399278330279e-05, "loss": 0.3546, "step": 2002 }, { "epoch": 3.3663865546218488, "grad_norm": 0.12530403753476302, "learning_rate": 2.3321314721362377e-05, "loss": 0.3521, "step": 2003 }, { "epoch": 3.3680672268907563, "grad_norm": 0.4532738266611213, "learning_rate": 2.3278659629621432e-05, "loss": 0.3465, "step": 2004 }, { "epoch": 3.369747899159664, "grad_norm": 0.13894280886916155, "learning_rate": 2.32360275668254e-05, "loss": 0.3491, "step": 2005 }, { "epoch": 3.3714285714285714, "grad_norm": 0.11642417413422929, "learning_rate": 2.3193418591687902e-05, "loss": 0.3658, "step": 2006 }, { "epoch": 3.373109243697479, "grad_norm": 0.1824985320433411, "learning_rate": 2.3150832762890866e-05, "loss": 0.357, "step": 2007 }, { "epoch": 3.3747899159663866, "grad_norm": 0.12365042927116046, "learning_rate": 2.310827013908429e-05, "loss": 0.3544, "step": 2008 }, { "epoch": 3.376470588235294, "grad_norm": 0.15125364872329294, "learning_rate": 2.306573077888621e-05, "loss": 0.3612, "step": 2009 }, { "epoch": 3.3781512605042017, "grad_norm": 0.1400494221730116, "learning_rate": 2.302321474088264e-05, "loss": 0.359, "step": 2010 }, { "epoch": 3.3798319327731092, "grad_norm": 0.24588610680042286, "learning_rate": 2.2980722083627463e-05, "loss": 0.3542, "step": 2011 }, { "epoch": 3.381512605042017, "grad_norm": 0.13924758802496517, "learning_rate": 2.293825286564236e-05, "loss": 0.3514, "step": 2012 }, { "epoch": 3.3831932773109243, "grad_norm": 0.1454470434892464, "learning_rate": 2.2895807145416768e-05, "loss": 0.3465, "step": 2013 }, { "epoch": 3.384873949579832, "grad_norm": 0.2073656350937141, "learning_rate": 2.2853384981407708e-05, "loss": 0.3522, "step": 2014 }, { "epoch": 3.3865546218487395, "grad_norm": 0.14154178464661812, "learning_rate": 2.2810986432039787e-05, "loss": 0.3544, "step": 2015 }, { "epoch": 3.388235294117647, "grad_norm": 0.135949047683396, "learning_rate": 2.276861155570509e-05, "loss": 0.3596, "step": 2016 }, { "epoch": 3.3899159663865546, "grad_norm": 0.1516206660854561, "learning_rate": 2.2726260410763078e-05, "loss": 0.353, "step": 2017 }, { "epoch": 3.391596638655462, "grad_norm": 0.13961564843061672, "learning_rate": 2.2683933055540582e-05, "loss": 0.3615, "step": 2018 }, { "epoch": 3.3932773109243697, "grad_norm": 0.15548801922591068, "learning_rate": 2.2641629548331592e-05, "loss": 0.3616, "step": 2019 }, { "epoch": 3.3949579831932772, "grad_norm": 0.14061081073541823, "learning_rate": 2.259934994739732e-05, "loss": 0.3519, "step": 2020 }, { "epoch": 3.396638655462185, "grad_norm": 0.13693149690107084, "learning_rate": 2.255709431096602e-05, "loss": 0.3517, "step": 2021 }, { "epoch": 3.3983193277310924, "grad_norm": 0.14279640422586992, "learning_rate": 2.2514862697232958e-05, "loss": 0.3557, "step": 2022 }, { "epoch": 3.4, "grad_norm": 0.13814539926255, "learning_rate": 2.24726551643603e-05, "loss": 0.3548, "step": 2023 }, { "epoch": 3.4016806722689075, "grad_norm": 0.13282845037169969, "learning_rate": 2.2430471770477055e-05, "loss": 0.3572, "step": 2024 }, { "epoch": 3.403361344537815, "grad_norm": 0.13807800897687802, "learning_rate": 2.2388312573678976e-05, "loss": 0.3634, "step": 2025 }, { "epoch": 3.4050420168067226, "grad_norm": 0.1422574744361144, "learning_rate": 2.2346177632028548e-05, "loss": 0.3594, "step": 2026 }, { "epoch": 3.40672268907563, "grad_norm": 0.15487212703959358, "learning_rate": 2.230406700355474e-05, "loss": 0.3561, "step": 2027 }, { "epoch": 3.4084033613445377, "grad_norm": 0.13700365886852733, "learning_rate": 2.2261980746253157e-05, "loss": 0.3617, "step": 2028 }, { "epoch": 3.4100840336134453, "grad_norm": 0.1329020108435227, "learning_rate": 2.2219918918085767e-05, "loss": 0.3531, "step": 2029 }, { "epoch": 3.411764705882353, "grad_norm": 0.13174353238908337, "learning_rate": 2.21778815769809e-05, "loss": 0.3439, "step": 2030 }, { "epoch": 3.4134453781512604, "grad_norm": 0.13696886938487163, "learning_rate": 2.2135868780833194e-05, "loss": 0.3618, "step": 2031 }, { "epoch": 3.415126050420168, "grad_norm": 0.4743652833850197, "learning_rate": 2.209388058750345e-05, "loss": 0.3634, "step": 2032 }, { "epoch": 3.4168067226890755, "grad_norm": 0.15678812987015114, "learning_rate": 2.205191705481859e-05, "loss": 0.3536, "step": 2033 }, { "epoch": 3.418487394957983, "grad_norm": 0.12381595569103596, "learning_rate": 2.2009978240571622e-05, "loss": 0.3564, "step": 2034 }, { "epoch": 3.4201680672268906, "grad_norm": 0.1656506967872263, "learning_rate": 2.1968064202521412e-05, "loss": 0.3548, "step": 2035 }, { "epoch": 3.421848739495798, "grad_norm": 0.12626513420899865, "learning_rate": 2.1926174998392808e-05, "loss": 0.3578, "step": 2036 }, { "epoch": 3.4235294117647057, "grad_norm": 0.13367419986645435, "learning_rate": 2.188431068587639e-05, "loss": 0.3665, "step": 2037 }, { "epoch": 3.4252100840336133, "grad_norm": 0.12406631690394243, "learning_rate": 2.1842471322628494e-05, "loss": 0.3556, "step": 2038 }, { "epoch": 3.426890756302521, "grad_norm": 0.2896534907079289, "learning_rate": 2.1800656966271045e-05, "loss": 0.3581, "step": 2039 }, { "epoch": 3.4285714285714284, "grad_norm": 0.12438236577911234, "learning_rate": 2.1758867674391623e-05, "loss": 0.354, "step": 2040 }, { "epoch": 3.4302521008403364, "grad_norm": 0.13661764871051763, "learning_rate": 2.1717103504543155e-05, "loss": 0.3546, "step": 2041 }, { "epoch": 3.431932773109244, "grad_norm": 0.1268290999715725, "learning_rate": 2.16753645142441e-05, "loss": 0.3524, "step": 2042 }, { "epoch": 3.4336134453781515, "grad_norm": 0.13690263373206535, "learning_rate": 2.163365076097815e-05, "loss": 0.3495, "step": 2043 }, { "epoch": 3.435294117647059, "grad_norm": 0.246485828675444, "learning_rate": 2.159196230219429e-05, "loss": 0.3663, "step": 2044 }, { "epoch": 3.4369747899159666, "grad_norm": 0.1264479972618893, "learning_rate": 2.1550299195306638e-05, "loss": 0.3617, "step": 2045 }, { "epoch": 3.438655462184874, "grad_norm": 0.12971662287690078, "learning_rate": 2.150866149769441e-05, "loss": 0.3467, "step": 2046 }, { "epoch": 3.4403361344537817, "grad_norm": 0.12286552588260316, "learning_rate": 2.1467049266701822e-05, "loss": 0.3556, "step": 2047 }, { "epoch": 3.4420168067226893, "grad_norm": 0.14976530810834598, "learning_rate": 2.142546255963804e-05, "loss": 0.355, "step": 2048 }, { "epoch": 3.443697478991597, "grad_norm": 0.13356928888702524, "learning_rate": 2.138390143377705e-05, "loss": 0.3558, "step": 2049 }, { "epoch": 3.4453781512605044, "grad_norm": 0.16810369576207623, "learning_rate": 2.1342365946357625e-05, "loss": 0.3525, "step": 2050 }, { "epoch": 3.447058823529412, "grad_norm": 0.13765696208851744, "learning_rate": 2.1300856154583212e-05, "loss": 0.3572, "step": 2051 }, { "epoch": 3.4487394957983195, "grad_norm": 0.16517600316404504, "learning_rate": 2.1259372115621863e-05, "loss": 0.3529, "step": 2052 }, { "epoch": 3.450420168067227, "grad_norm": 0.1310276167741679, "learning_rate": 2.121791388660622e-05, "loss": 0.3485, "step": 2053 }, { "epoch": 3.4521008403361346, "grad_norm": 0.21133331352328727, "learning_rate": 2.1176481524633282e-05, "loss": 0.3522, "step": 2054 }, { "epoch": 3.453781512605042, "grad_norm": 0.14756906950623105, "learning_rate": 2.1135075086764534e-05, "loss": 0.3578, "step": 2055 }, { "epoch": 3.4554621848739497, "grad_norm": 0.18082174294769926, "learning_rate": 2.1093694630025662e-05, "loss": 0.3553, "step": 2056 }, { "epoch": 3.4571428571428573, "grad_norm": 0.19343701636363542, "learning_rate": 2.105234021140663e-05, "loss": 0.3525, "step": 2057 }, { "epoch": 3.458823529411765, "grad_norm": 0.19550276980033612, "learning_rate": 2.101101188786152e-05, "loss": 0.3564, "step": 2058 }, { "epoch": 3.4605042016806724, "grad_norm": 0.53454078775467, "learning_rate": 2.0969709716308467e-05, "loss": 0.3434, "step": 2059 }, { "epoch": 3.46218487394958, "grad_norm": 0.25134725657686946, "learning_rate": 2.09284337536296e-05, "loss": 0.3582, "step": 2060 }, { "epoch": 3.4638655462184875, "grad_norm": 0.18169928193358886, "learning_rate": 2.0887184056670978e-05, "loss": 0.3627, "step": 2061 }, { "epoch": 3.465546218487395, "grad_norm": 0.19603731837033056, "learning_rate": 2.084596068224242e-05, "loss": 0.345, "step": 2062 }, { "epoch": 3.4672268907563026, "grad_norm": 0.16029643824063103, "learning_rate": 2.0804763687117565e-05, "loss": 0.3538, "step": 2063 }, { "epoch": 3.46890756302521, "grad_norm": 0.16921723210508055, "learning_rate": 2.0763593128033678e-05, "loss": 0.3538, "step": 2064 }, { "epoch": 3.4705882352941178, "grad_norm": 0.13266427861747837, "learning_rate": 2.0722449061691633e-05, "loss": 0.3556, "step": 2065 }, { "epoch": 3.4722689075630253, "grad_norm": 0.17988074208371513, "learning_rate": 2.0681331544755812e-05, "loss": 0.3524, "step": 2066 }, { "epoch": 3.473949579831933, "grad_norm": 0.127560398173901, "learning_rate": 2.0640240633854035e-05, "loss": 0.3577, "step": 2067 }, { "epoch": 3.4756302521008404, "grad_norm": 0.20620101725503098, "learning_rate": 2.059917638557746e-05, "loss": 0.3552, "step": 2068 }, { "epoch": 3.477310924369748, "grad_norm": 0.16137647064941318, "learning_rate": 2.0558138856480586e-05, "loss": 0.3614, "step": 2069 }, { "epoch": 3.4789915966386555, "grad_norm": 0.17000527242385904, "learning_rate": 2.051712810308101e-05, "loss": 0.3545, "step": 2070 }, { "epoch": 3.480672268907563, "grad_norm": 0.1836029772846834, "learning_rate": 2.047614418185957e-05, "loss": 0.3538, "step": 2071 }, { "epoch": 3.4823529411764707, "grad_norm": 0.13475600853017491, "learning_rate": 2.043518714926007e-05, "loss": 0.3531, "step": 2072 }, { "epoch": 3.484033613445378, "grad_norm": 0.18245807025193378, "learning_rate": 2.0394257061689313e-05, "loss": 0.3515, "step": 2073 }, { "epoch": 3.4857142857142858, "grad_norm": 0.1340115678891636, "learning_rate": 2.0353353975516955e-05, "loss": 0.358, "step": 2074 }, { "epoch": 3.4873949579831933, "grad_norm": 0.15041985754644085, "learning_rate": 2.0312477947075564e-05, "loss": 0.3517, "step": 2075 }, { "epoch": 3.489075630252101, "grad_norm": 0.18996994813691107, "learning_rate": 2.0271629032660307e-05, "loss": 0.3605, "step": 2076 }, { "epoch": 3.4907563025210084, "grad_norm": 0.38334122816590216, "learning_rate": 2.0230807288529117e-05, "loss": 0.3717, "step": 2077 }, { "epoch": 3.492436974789916, "grad_norm": 0.20350731481784023, "learning_rate": 2.0190012770902464e-05, "loss": 0.3537, "step": 2078 }, { "epoch": 3.4941176470588236, "grad_norm": 0.14339306421620568, "learning_rate": 2.0149245535963323e-05, "loss": 0.359, "step": 2079 }, { "epoch": 3.495798319327731, "grad_norm": 0.6577291112373262, "learning_rate": 2.0108505639857087e-05, "loss": 0.3485, "step": 2080 }, { "epoch": 3.4974789915966387, "grad_norm": 0.19324881095040244, "learning_rate": 2.006779313869152e-05, "loss": 0.3526, "step": 2081 }, { "epoch": 3.499159663865546, "grad_norm": 0.18623099374993574, "learning_rate": 2.0027108088536632e-05, "loss": 0.3607, "step": 2082 }, { "epoch": 3.500840336134454, "grad_norm": 0.20104206079118112, "learning_rate": 1.9986450545424666e-05, "loss": 0.3615, "step": 2083 }, { "epoch": 3.5025210084033613, "grad_norm": 0.1787753609574073, "learning_rate": 1.9945820565349934e-05, "loss": 0.3569, "step": 2084 }, { "epoch": 3.504201680672269, "grad_norm": 0.21038665519256577, "learning_rate": 1.9905218204268822e-05, "loss": 0.3626, "step": 2085 }, { "epoch": 3.5058823529411764, "grad_norm": 0.18561103881255656, "learning_rate": 1.9864643518099663e-05, "loss": 0.3578, "step": 2086 }, { "epoch": 3.507563025210084, "grad_norm": 0.904097089053752, "learning_rate": 1.982409656272266e-05, "loss": 0.3653, "step": 2087 }, { "epoch": 3.5092436974789916, "grad_norm": 0.21616635180695776, "learning_rate": 1.978357739397989e-05, "loss": 0.3544, "step": 2088 }, { "epoch": 3.510924369747899, "grad_norm": 0.22639151174478972, "learning_rate": 1.974308606767506e-05, "loss": 0.3561, "step": 2089 }, { "epoch": 3.5126050420168067, "grad_norm": 0.4214438760957591, "learning_rate": 1.9702622639573623e-05, "loss": 0.3517, "step": 2090 }, { "epoch": 3.5142857142857142, "grad_norm": 0.23038202240193936, "learning_rate": 1.9662187165402567e-05, "loss": 0.3611, "step": 2091 }, { "epoch": 3.515966386554622, "grad_norm": 0.22669633495419594, "learning_rate": 1.9621779700850387e-05, "loss": 0.3572, "step": 2092 }, { "epoch": 3.5176470588235293, "grad_norm": 0.18539897866202182, "learning_rate": 1.9581400301567e-05, "loss": 0.3663, "step": 2093 }, { "epoch": 3.519327731092437, "grad_norm": 0.2515122249464049, "learning_rate": 1.9541049023163683e-05, "loss": 0.3549, "step": 2094 }, { "epoch": 3.5210084033613445, "grad_norm": 0.17389447740739467, "learning_rate": 1.9500725921212958e-05, "loss": 0.3561, "step": 2095 }, { "epoch": 3.522689075630252, "grad_norm": 0.17396412792496932, "learning_rate": 1.94604310512486e-05, "loss": 0.3548, "step": 2096 }, { "epoch": 3.5243697478991596, "grad_norm": 0.21207664565944007, "learning_rate": 1.942016446876541e-05, "loss": 0.3544, "step": 2097 }, { "epoch": 3.526050420168067, "grad_norm": 0.14064024032129657, "learning_rate": 1.9379926229219325e-05, "loss": 0.3452, "step": 2098 }, { "epoch": 3.5277310924369747, "grad_norm": 0.1520246165348071, "learning_rate": 1.933971638802719e-05, "loss": 0.3521, "step": 2099 }, { "epoch": 3.5294117647058822, "grad_norm": 0.15980987699143961, "learning_rate": 1.929953500056676e-05, "loss": 0.359, "step": 2100 }, { "epoch": 3.53109243697479, "grad_norm": 0.17837346678211194, "learning_rate": 1.92593821221766e-05, "loss": 0.3482, "step": 2101 }, { "epoch": 3.5327731092436974, "grad_norm": 0.16280800227870554, "learning_rate": 1.9219257808156e-05, "loss": 0.3554, "step": 2102 }, { "epoch": 3.534453781512605, "grad_norm": 0.1692736557024281, "learning_rate": 1.917916211376492e-05, "loss": 0.3543, "step": 2103 }, { "epoch": 3.5361344537815125, "grad_norm": 0.1698064379279406, "learning_rate": 1.913909509422394e-05, "loss": 0.3586, "step": 2104 }, { "epoch": 3.53781512605042, "grad_norm": 0.19993503887573108, "learning_rate": 1.909905680471406e-05, "loss": 0.3528, "step": 2105 }, { "epoch": 3.5394957983193276, "grad_norm": 0.15064802332434513, "learning_rate": 1.9059047300376806e-05, "loss": 0.3613, "step": 2106 }, { "epoch": 3.541176470588235, "grad_norm": 0.14644648382843065, "learning_rate": 1.9019066636314016e-05, "loss": 0.3695, "step": 2107 }, { "epoch": 3.5428571428571427, "grad_norm": 0.1546706875274462, "learning_rate": 1.89791148675878e-05, "loss": 0.3537, "step": 2108 }, { "epoch": 3.5445378151260503, "grad_norm": 0.13968500708804485, "learning_rate": 1.8939192049220473e-05, "loss": 0.358, "step": 2109 }, { "epoch": 3.546218487394958, "grad_norm": 0.11926735647915593, "learning_rate": 1.889929823619455e-05, "loss": 0.3619, "step": 2110 }, { "epoch": 3.5478991596638654, "grad_norm": 0.1519001586181329, "learning_rate": 1.885943348345247e-05, "loss": 0.357, "step": 2111 }, { "epoch": 3.549579831932773, "grad_norm": 0.19277165573087238, "learning_rate": 1.8819597845896768e-05, "loss": 0.356, "step": 2112 }, { "epoch": 3.5512605042016805, "grad_norm": 0.14416892354034505, "learning_rate": 1.8779791378389826e-05, "loss": 0.3594, "step": 2113 }, { "epoch": 3.552941176470588, "grad_norm": 0.13659219186883598, "learning_rate": 1.874001413575385e-05, "loss": 0.3528, "step": 2114 }, { "epoch": 3.5546218487394956, "grad_norm": 0.11612164370442121, "learning_rate": 1.870026617277082e-05, "loss": 0.3554, "step": 2115 }, { "epoch": 3.556302521008403, "grad_norm": 0.21085070007207896, "learning_rate": 1.8660547544182365e-05, "loss": 0.3499, "step": 2116 }, { "epoch": 3.5579831932773107, "grad_norm": 0.1463880895836366, "learning_rate": 1.862085830468973e-05, "loss": 0.3506, "step": 2117 }, { "epoch": 3.5596638655462183, "grad_norm": 0.15627015026911273, "learning_rate": 1.8581198508953714e-05, "loss": 0.3567, "step": 2118 }, { "epoch": 3.561344537815126, "grad_norm": 0.16532065592278214, "learning_rate": 1.8541568211594483e-05, "loss": 0.3552, "step": 2119 }, { "epoch": 3.5630252100840334, "grad_norm": 0.41786875218637415, "learning_rate": 1.8501967467191667e-05, "loss": 0.3617, "step": 2120 }, { "epoch": 3.564705882352941, "grad_norm": 0.14438346351671424, "learning_rate": 1.846239633028415e-05, "loss": 0.3588, "step": 2121 }, { "epoch": 3.5663865546218485, "grad_norm": 0.14195378792669136, "learning_rate": 1.8422854855370022e-05, "loss": 0.3514, "step": 2122 }, { "epoch": 3.568067226890756, "grad_norm": 0.16654799232075645, "learning_rate": 1.8383343096906605e-05, "loss": 0.3638, "step": 2123 }, { "epoch": 3.5697478991596636, "grad_norm": 0.17179604114970215, "learning_rate": 1.8343861109310163e-05, "loss": 0.3679, "step": 2124 }, { "epoch": 3.571428571428571, "grad_norm": 0.13017601766499617, "learning_rate": 1.830440894695609e-05, "loss": 0.3598, "step": 2125 }, { "epoch": 3.5731092436974787, "grad_norm": 0.19036821312978622, "learning_rate": 1.826498666417863e-05, "loss": 0.3572, "step": 2126 }, { "epoch": 3.5747899159663863, "grad_norm": 0.1286267868930455, "learning_rate": 1.822559431527088e-05, "loss": 0.345, "step": 2127 }, { "epoch": 3.576470588235294, "grad_norm": 0.1837317089551673, "learning_rate": 1.8186231954484742e-05, "loss": 0.3458, "step": 2128 }, { "epoch": 3.578151260504202, "grad_norm": 0.13663834771902011, "learning_rate": 1.8146899636030794e-05, "loss": 0.351, "step": 2129 }, { "epoch": 3.5798319327731094, "grad_norm": 0.14149924753634288, "learning_rate": 1.8107597414078227e-05, "loss": 0.3556, "step": 2130 }, { "epoch": 3.581512605042017, "grad_norm": 0.22474462108038495, "learning_rate": 1.806832534275485e-05, "loss": 0.3487, "step": 2131 }, { "epoch": 3.5831932773109245, "grad_norm": 0.15428852078394817, "learning_rate": 1.802908347614684e-05, "loss": 0.3546, "step": 2132 }, { "epoch": 3.584873949579832, "grad_norm": 0.15673963182616266, "learning_rate": 1.7989871868298878e-05, "loss": 0.3558, "step": 2133 }, { "epoch": 3.5865546218487396, "grad_norm": 0.12250502792775061, "learning_rate": 1.7950690573213914e-05, "loss": 0.3593, "step": 2134 }, { "epoch": 3.588235294117647, "grad_norm": 0.14551169527054417, "learning_rate": 1.7911539644853165e-05, "loss": 0.3577, "step": 2135 }, { "epoch": 3.5899159663865547, "grad_norm": 0.19253831891593717, "learning_rate": 1.787241913713603e-05, "loss": 0.3582, "step": 2136 }, { "epoch": 3.5915966386554623, "grad_norm": 0.17277969651768865, "learning_rate": 1.783332910394001e-05, "loss": 0.3505, "step": 2137 }, { "epoch": 3.59327731092437, "grad_norm": 0.16535233292528148, "learning_rate": 1.7794269599100612e-05, "loss": 0.3488, "step": 2138 }, { "epoch": 3.5949579831932774, "grad_norm": 0.1383209198815878, "learning_rate": 1.7755240676411372e-05, "loss": 0.3618, "step": 2139 }, { "epoch": 3.596638655462185, "grad_norm": 0.15434453758198477, "learning_rate": 1.77162423896236e-05, "loss": 0.3513, "step": 2140 }, { "epoch": 3.5983193277310925, "grad_norm": 0.12392288517110711, "learning_rate": 1.7677274792446512e-05, "loss": 0.3575, "step": 2141 }, { "epoch": 3.6, "grad_norm": 0.15319898421327885, "learning_rate": 1.7638337938546998e-05, "loss": 0.3571, "step": 2142 }, { "epoch": 3.6016806722689076, "grad_norm": 0.13231053009382854, "learning_rate": 1.7599431881549636e-05, "loss": 0.3567, "step": 2143 }, { "epoch": 3.603361344537815, "grad_norm": 0.14952258243300906, "learning_rate": 1.756055667503656e-05, "loss": 0.3546, "step": 2144 }, { "epoch": 3.6050420168067228, "grad_norm": 0.12540139645205475, "learning_rate": 1.752171237254748e-05, "loss": 0.3474, "step": 2145 }, { "epoch": 3.6067226890756303, "grad_norm": 0.12995882674488501, "learning_rate": 1.7482899027579442e-05, "loss": 0.3535, "step": 2146 }, { "epoch": 3.608403361344538, "grad_norm": 0.11712492878127059, "learning_rate": 1.744411669358697e-05, "loss": 0.3447, "step": 2147 }, { "epoch": 3.6100840336134454, "grad_norm": 0.11131982662868037, "learning_rate": 1.7405365423981795e-05, "loss": 0.3563, "step": 2148 }, { "epoch": 3.611764705882353, "grad_norm": 0.11581598063080731, "learning_rate": 1.7366645272132903e-05, "loss": 0.3452, "step": 2149 }, { "epoch": 3.6134453781512605, "grad_norm": 0.10707120019457443, "learning_rate": 1.7327956291366415e-05, "loss": 0.354, "step": 2150 }, { "epoch": 3.615126050420168, "grad_norm": 0.13189891454061806, "learning_rate": 1.7289298534965527e-05, "loss": 0.3564, "step": 2151 }, { "epoch": 3.6168067226890757, "grad_norm": 0.1126195419686618, "learning_rate": 1.725067205617041e-05, "loss": 0.3512, "step": 2152 }, { "epoch": 3.618487394957983, "grad_norm": 0.18766683708392892, "learning_rate": 1.7212076908178225e-05, "loss": 0.3544, "step": 2153 }, { "epoch": 3.6201680672268908, "grad_norm": 0.14305127318790606, "learning_rate": 1.7173513144142876e-05, "loss": 0.3584, "step": 2154 }, { "epoch": 3.6218487394957983, "grad_norm": 0.12893212296533285, "learning_rate": 1.713498081717515e-05, "loss": 0.3536, "step": 2155 }, { "epoch": 3.623529411764706, "grad_norm": 0.1504774656411061, "learning_rate": 1.7096479980342483e-05, "loss": 0.3537, "step": 2156 }, { "epoch": 3.6252100840336134, "grad_norm": 0.11915747751180035, "learning_rate": 1.7058010686668928e-05, "loss": 0.3467, "step": 2157 }, { "epoch": 3.626890756302521, "grad_norm": 0.12486047254850713, "learning_rate": 1.7019572989135174e-05, "loss": 0.3487, "step": 2158 }, { "epoch": 3.6285714285714286, "grad_norm": 0.11651952042596257, "learning_rate": 1.698116694067828e-05, "loss": 0.3539, "step": 2159 }, { "epoch": 3.630252100840336, "grad_norm": 0.1193514336053274, "learning_rate": 1.694279259419182e-05, "loss": 0.3498, "step": 2160 }, { "epoch": 3.6319327731092437, "grad_norm": 0.3567798518276507, "learning_rate": 1.6904450002525656e-05, "loss": 0.3496, "step": 2161 }, { "epoch": 3.6336134453781512, "grad_norm": 0.11719008497934472, "learning_rate": 1.686613921848593e-05, "loss": 0.3505, "step": 2162 }, { "epoch": 3.635294117647059, "grad_norm": 0.15295519928221182, "learning_rate": 1.6827860294834973e-05, "loss": 0.3526, "step": 2163 }, { "epoch": 3.6369747899159663, "grad_norm": 0.11494634848418095, "learning_rate": 1.678961328429123e-05, "loss": 0.3519, "step": 2164 }, { "epoch": 3.638655462184874, "grad_norm": 0.19101207853209498, "learning_rate": 1.675139823952921e-05, "loss": 0.3475, "step": 2165 }, { "epoch": 3.6403361344537815, "grad_norm": 0.13274510999848269, "learning_rate": 1.6713215213179418e-05, "loss": 0.3544, "step": 2166 }, { "epoch": 3.642016806722689, "grad_norm": 0.1512193762421964, "learning_rate": 1.6675064257828183e-05, "loss": 0.3505, "step": 2167 }, { "epoch": 3.6436974789915966, "grad_norm": 0.1714332572938461, "learning_rate": 1.663694542601777e-05, "loss": 0.3469, "step": 2168 }, { "epoch": 3.645378151260504, "grad_norm": 0.12319436589409008, "learning_rate": 1.6598858770246127e-05, "loss": 0.3429, "step": 2169 }, { "epoch": 3.6470588235294117, "grad_norm": 0.18237616227529818, "learning_rate": 1.6560804342966923e-05, "loss": 0.3612, "step": 2170 }, { "epoch": 3.6487394957983192, "grad_norm": 0.12051267408169222, "learning_rate": 1.652278219658943e-05, "loss": 0.3601, "step": 2171 }, { "epoch": 3.650420168067227, "grad_norm": 0.22518848512608006, "learning_rate": 1.648479238347846e-05, "loss": 0.3602, "step": 2172 }, { "epoch": 3.6521008403361344, "grad_norm": 0.1055289555694089, "learning_rate": 1.6446834955954286e-05, "loss": 0.3517, "step": 2173 }, { "epoch": 3.653781512605042, "grad_norm": 0.18111378938267486, "learning_rate": 1.6408909966292638e-05, "loss": 0.3462, "step": 2174 }, { "epoch": 3.6554621848739495, "grad_norm": 0.11043049951612667, "learning_rate": 1.6371017466724467e-05, "loss": 0.3538, "step": 2175 }, { "epoch": 3.657142857142857, "grad_norm": 0.2183005219544272, "learning_rate": 1.6333157509436074e-05, "loss": 0.3522, "step": 2176 }, { "epoch": 3.6588235294117646, "grad_norm": 0.12405723286346375, "learning_rate": 1.6295330146568902e-05, "loss": 0.3571, "step": 2177 }, { "epoch": 3.660504201680672, "grad_norm": 0.15681793123438784, "learning_rate": 1.6257535430219503e-05, "loss": 0.3539, "step": 2178 }, { "epoch": 3.6621848739495797, "grad_norm": 0.13090210698484542, "learning_rate": 1.6219773412439457e-05, "loss": 0.3568, "step": 2179 }, { "epoch": 3.6638655462184873, "grad_norm": 0.1698035934530533, "learning_rate": 1.6182044145235368e-05, "loss": 0.3525, "step": 2180 }, { "epoch": 3.665546218487395, "grad_norm": 0.15244750634360063, "learning_rate": 1.6144347680568638e-05, "loss": 0.3527, "step": 2181 }, { "epoch": 3.6672268907563024, "grad_norm": 0.12864861947038125, "learning_rate": 1.6106684070355592e-05, "loss": 0.3553, "step": 2182 }, { "epoch": 3.66890756302521, "grad_norm": 0.11838016036360638, "learning_rate": 1.6069053366467246e-05, "loss": 0.3435, "step": 2183 }, { "epoch": 3.6705882352941175, "grad_norm": 0.13076252297610674, "learning_rate": 1.6031455620729316e-05, "loss": 0.3568, "step": 2184 }, { "epoch": 3.6722689075630255, "grad_norm": 0.16451860022913808, "learning_rate": 1.5993890884922122e-05, "loss": 0.3524, "step": 2185 }, { "epoch": 3.673949579831933, "grad_norm": 0.15263157270385444, "learning_rate": 1.5956359210780534e-05, "loss": 0.3455, "step": 2186 }, { "epoch": 3.6756302521008406, "grad_norm": 0.11725529693410514, "learning_rate": 1.5918860649993858e-05, "loss": 0.356, "step": 2187 }, { "epoch": 3.677310924369748, "grad_norm": 0.19502291795907697, "learning_rate": 1.5881395254205863e-05, "loss": 0.3497, "step": 2188 }, { "epoch": 3.6789915966386557, "grad_norm": 0.11899336471844904, "learning_rate": 1.584396307501454e-05, "loss": 0.3459, "step": 2189 }, { "epoch": 3.6806722689075633, "grad_norm": 0.1611989123154825, "learning_rate": 1.5806564163972236e-05, "loss": 0.3585, "step": 2190 }, { "epoch": 3.682352941176471, "grad_norm": 0.10827500053380945, "learning_rate": 1.5769198572585425e-05, "loss": 0.3477, "step": 2191 }, { "epoch": 3.6840336134453784, "grad_norm": 0.14957763332575089, "learning_rate": 1.5731866352314685e-05, "loss": 0.3493, "step": 2192 }, { "epoch": 3.685714285714286, "grad_norm": 0.10691893479124774, "learning_rate": 1.5694567554574694e-05, "loss": 0.3541, "step": 2193 }, { "epoch": 3.6873949579831935, "grad_norm": 0.11972748264831737, "learning_rate": 1.5657302230734003e-05, "loss": 0.3467, "step": 2194 }, { "epoch": 3.689075630252101, "grad_norm": 0.09903381027987933, "learning_rate": 1.5620070432115164e-05, "loss": 0.3485, "step": 2195 }, { "epoch": 3.6907563025210086, "grad_norm": 0.10203304584334479, "learning_rate": 1.5582872209994487e-05, "loss": 0.3431, "step": 2196 }, { "epoch": 3.692436974789916, "grad_norm": 0.11470347925574854, "learning_rate": 1.5545707615602074e-05, "loss": 0.3536, "step": 2197 }, { "epoch": 3.6941176470588237, "grad_norm": 0.09758946764508754, "learning_rate": 1.55085767001217e-05, "loss": 0.3555, "step": 2198 }, { "epoch": 3.6957983193277313, "grad_norm": 0.1107134312268977, "learning_rate": 1.547147951469077e-05, "loss": 0.3532, "step": 2199 }, { "epoch": 3.697478991596639, "grad_norm": 0.10393910133546629, "learning_rate": 1.5434416110400197e-05, "loss": 0.3553, "step": 2200 }, { "epoch": 3.6991596638655464, "grad_norm": 0.10103248289869214, "learning_rate": 1.5397386538294447e-05, "loss": 0.3513, "step": 2201 }, { "epoch": 3.700840336134454, "grad_norm": 0.11960214985366727, "learning_rate": 1.5360390849371296e-05, "loss": 0.3566, "step": 2202 }, { "epoch": 3.7025210084033615, "grad_norm": 0.10675358909838462, "learning_rate": 1.532342909458194e-05, "loss": 0.353, "step": 2203 }, { "epoch": 3.704201680672269, "grad_norm": 0.1044943943163477, "learning_rate": 1.5286501324830785e-05, "loss": 0.3507, "step": 2204 }, { "epoch": 3.7058823529411766, "grad_norm": 0.11278934783830089, "learning_rate": 1.5249607590975463e-05, "loss": 0.3569, "step": 2205 }, { "epoch": 3.707563025210084, "grad_norm": 0.11859448087796037, "learning_rate": 1.5212747943826708e-05, "loss": 0.3576, "step": 2206 }, { "epoch": 3.7092436974789917, "grad_norm": 0.10487331745286287, "learning_rate": 1.5175922434148324e-05, "loss": 0.3526, "step": 2207 }, { "epoch": 3.7109243697478993, "grad_norm": 0.11224750994641568, "learning_rate": 1.5139131112657083e-05, "loss": 0.3478, "step": 2208 }, { "epoch": 3.712605042016807, "grad_norm": 0.12106027315993423, "learning_rate": 1.5102374030022726e-05, "loss": 0.3555, "step": 2209 }, { "epoch": 3.7142857142857144, "grad_norm": 0.1125205307870147, "learning_rate": 1.5065651236867749e-05, "loss": 0.3462, "step": 2210 }, { "epoch": 3.715966386554622, "grad_norm": 0.13335923202553884, "learning_rate": 1.5028962783767508e-05, "loss": 0.348, "step": 2211 }, { "epoch": 3.7176470588235295, "grad_norm": 0.10791105563648336, "learning_rate": 1.4992308721250024e-05, "loss": 0.356, "step": 2212 }, { "epoch": 3.719327731092437, "grad_norm": 0.12647593918600214, "learning_rate": 1.4955689099795963e-05, "loss": 0.3466, "step": 2213 }, { "epoch": 3.7210084033613446, "grad_norm": 1.0401971143563964, "learning_rate": 1.4919103969838538e-05, "loss": 0.3626, "step": 2214 }, { "epoch": 3.722689075630252, "grad_norm": 0.16478285386285507, "learning_rate": 1.4882553381763521e-05, "loss": 0.3601, "step": 2215 }, { "epoch": 3.7243697478991598, "grad_norm": 0.11485911848468698, "learning_rate": 1.4846037385909018e-05, "loss": 0.3554, "step": 2216 }, { "epoch": 3.7260504201680673, "grad_norm": 0.1474793663735689, "learning_rate": 1.4809556032565589e-05, "loss": 0.3524, "step": 2217 }, { "epoch": 3.727731092436975, "grad_norm": 0.13548887021648998, "learning_rate": 1.4773109371976024e-05, "loss": 0.3593, "step": 2218 }, { "epoch": 3.7294117647058824, "grad_norm": 0.12276597731232224, "learning_rate": 1.4736697454335352e-05, "loss": 0.3459, "step": 2219 }, { "epoch": 3.73109243697479, "grad_norm": 0.1264828060456165, "learning_rate": 1.4700320329790754e-05, "loss": 0.3598, "step": 2220 }, { "epoch": 3.7327731092436975, "grad_norm": 0.13160252886482401, "learning_rate": 1.4663978048441484e-05, "loss": 0.3619, "step": 2221 }, { "epoch": 3.734453781512605, "grad_norm": 0.12352527000108911, "learning_rate": 1.4627670660338807e-05, "loss": 0.3601, "step": 2222 }, { "epoch": 3.7361344537815127, "grad_norm": 0.12675343281949025, "learning_rate": 1.4591398215485985e-05, "loss": 0.347, "step": 2223 }, { "epoch": 3.73781512605042, "grad_norm": 0.10442188149502399, "learning_rate": 1.4555160763838054e-05, "loss": 0.3489, "step": 2224 }, { "epoch": 3.7394957983193278, "grad_norm": 0.11510199095084521, "learning_rate": 1.4518958355301954e-05, "loss": 0.3457, "step": 2225 }, { "epoch": 3.7411764705882353, "grad_norm": 0.3219838569038828, "learning_rate": 1.448279103973632e-05, "loss": 0.354, "step": 2226 }, { "epoch": 3.742857142857143, "grad_norm": 0.12502035737498216, "learning_rate": 1.4446658866951434e-05, "loss": 0.3464, "step": 2227 }, { "epoch": 3.7445378151260504, "grad_norm": 0.15575765142638168, "learning_rate": 1.4410561886709253e-05, "loss": 0.3535, "step": 2228 }, { "epoch": 3.746218487394958, "grad_norm": 0.13454766600167192, "learning_rate": 1.437450014872316e-05, "loss": 0.363, "step": 2229 }, { "epoch": 3.7478991596638656, "grad_norm": 0.12240777894375707, "learning_rate": 1.4338473702658107e-05, "loss": 0.3456, "step": 2230 }, { "epoch": 3.749579831932773, "grad_norm": 0.12366099225932425, "learning_rate": 1.4302482598130372e-05, "loss": 0.3456, "step": 2231 }, { "epoch": 3.7512605042016807, "grad_norm": 0.1399650636913882, "learning_rate": 1.426652688470759e-05, "loss": 0.3515, "step": 2232 }, { "epoch": 3.7529411764705882, "grad_norm": 0.12956256426836793, "learning_rate": 1.423060661190864e-05, "loss": 0.3476, "step": 2233 }, { "epoch": 3.754621848739496, "grad_norm": 0.11997414645065294, "learning_rate": 1.4194721829203602e-05, "loss": 0.3587, "step": 2234 }, { "epoch": 3.7563025210084033, "grad_norm": 0.1520896118227503, "learning_rate": 1.4158872586013659e-05, "loss": 0.3538, "step": 2235 }, { "epoch": 3.757983193277311, "grad_norm": 0.12487929347090605, "learning_rate": 1.412305893171111e-05, "loss": 0.3545, "step": 2236 }, { "epoch": 3.7596638655462185, "grad_norm": 0.13214470337167317, "learning_rate": 1.4087280915619128e-05, "loss": 0.3571, "step": 2237 }, { "epoch": 3.761344537815126, "grad_norm": 0.14442331422903862, "learning_rate": 1.4051538587011923e-05, "loss": 0.3523, "step": 2238 }, { "epoch": 3.7630252100840336, "grad_norm": 0.1134591972623857, "learning_rate": 1.4015831995114483e-05, "loss": 0.3477, "step": 2239 }, { "epoch": 3.764705882352941, "grad_norm": 0.13590898736182486, "learning_rate": 1.3980161189102606e-05, "loss": 0.3597, "step": 2240 }, { "epoch": 3.7663865546218487, "grad_norm": 0.14257958777116073, "learning_rate": 1.3944526218102793e-05, "loss": 0.3522, "step": 2241 }, { "epoch": 3.7680672268907562, "grad_norm": 0.11812010731519437, "learning_rate": 1.3908927131192194e-05, "loss": 0.3612, "step": 2242 }, { "epoch": 3.769747899159664, "grad_norm": 0.12237910319661505, "learning_rate": 1.387336397739853e-05, "loss": 0.3544, "step": 2243 }, { "epoch": 3.7714285714285714, "grad_norm": 0.1383295216835273, "learning_rate": 1.383783680570009e-05, "loss": 0.3546, "step": 2244 }, { "epoch": 3.773109243697479, "grad_norm": 0.11759662594760031, "learning_rate": 1.3802345665025514e-05, "loss": 0.3588, "step": 2245 }, { "epoch": 3.7747899159663865, "grad_norm": 0.13534924909538734, "learning_rate": 1.37668906042539e-05, "loss": 0.358, "step": 2246 }, { "epoch": 3.776470588235294, "grad_norm": 0.12164135150142152, "learning_rate": 1.3731471672214616e-05, "loss": 0.3518, "step": 2247 }, { "epoch": 3.7781512605042016, "grad_norm": 0.11575236728646841, "learning_rate": 1.3696088917687282e-05, "loss": 0.3578, "step": 2248 }, { "epoch": 3.779831932773109, "grad_norm": 0.11461495713719515, "learning_rate": 1.3660742389401684e-05, "loss": 0.3589, "step": 2249 }, { "epoch": 3.7815126050420167, "grad_norm": 0.1266014478938057, "learning_rate": 1.3625432136037761e-05, "loss": 0.3564, "step": 2250 }, { "epoch": 3.7831932773109243, "grad_norm": 0.10841363421128722, "learning_rate": 1.359015820622541e-05, "loss": 0.3504, "step": 2251 }, { "epoch": 3.784873949579832, "grad_norm": 0.12124441314480666, "learning_rate": 1.3554920648544587e-05, "loss": 0.3515, "step": 2252 }, { "epoch": 3.7865546218487394, "grad_norm": 0.1303352140187431, "learning_rate": 1.3519719511525108e-05, "loss": 0.3613, "step": 2253 }, { "epoch": 3.788235294117647, "grad_norm": 0.11348397514351118, "learning_rate": 1.3484554843646649e-05, "loss": 0.3541, "step": 2254 }, { "epoch": 3.7899159663865545, "grad_norm": 0.12674645972736434, "learning_rate": 1.3449426693338646e-05, "loss": 0.3526, "step": 2255 }, { "epoch": 3.791596638655462, "grad_norm": 0.11275966034773673, "learning_rate": 1.3414335108980247e-05, "loss": 0.3538, "step": 2256 }, { "epoch": 3.7932773109243696, "grad_norm": 0.10840716964555974, "learning_rate": 1.3379280138900236e-05, "loss": 0.3469, "step": 2257 }, { "epoch": 3.794957983193277, "grad_norm": 0.11144517324357532, "learning_rate": 1.3344261831377017e-05, "loss": 0.3512, "step": 2258 }, { "epoch": 3.7966386554621847, "grad_norm": 0.12367731601403657, "learning_rate": 1.3309280234638404e-05, "loss": 0.3556, "step": 2259 }, { "epoch": 3.7983193277310923, "grad_norm": 0.10275069472795874, "learning_rate": 1.3274335396861755e-05, "loss": 0.3413, "step": 2260 }, { "epoch": 3.8, "grad_norm": 0.11253144376009917, "learning_rate": 1.3239427366173745e-05, "loss": 0.3583, "step": 2261 }, { "epoch": 3.8016806722689074, "grad_norm": 0.09769776662648352, "learning_rate": 1.3204556190650357e-05, "loss": 0.348, "step": 2262 }, { "epoch": 3.803361344537815, "grad_norm": 0.10491148481272414, "learning_rate": 1.3169721918316873e-05, "loss": 0.3534, "step": 2263 }, { "epoch": 3.8050420168067225, "grad_norm": 0.10088722194802113, "learning_rate": 1.3134924597147651e-05, "loss": 0.3543, "step": 2264 }, { "epoch": 3.80672268907563, "grad_norm": 0.09903748872793076, "learning_rate": 1.3100164275066262e-05, "loss": 0.3575, "step": 2265 }, { "epoch": 3.8084033613445376, "grad_norm": 0.09074040550451025, "learning_rate": 1.3065440999945259e-05, "loss": 0.3484, "step": 2266 }, { "epoch": 3.810084033613445, "grad_norm": 0.10014141787626939, "learning_rate": 1.3030754819606193e-05, "loss": 0.3483, "step": 2267 }, { "epoch": 3.8117647058823527, "grad_norm": 0.08878489317187951, "learning_rate": 1.2996105781819526e-05, "loss": 0.3485, "step": 2268 }, { "epoch": 3.8134453781512603, "grad_norm": 0.09662907930508324, "learning_rate": 1.2961493934304561e-05, "loss": 0.3517, "step": 2269 }, { "epoch": 3.815126050420168, "grad_norm": 0.0984992584728014, "learning_rate": 1.2926919324729376e-05, "loss": 0.3491, "step": 2270 }, { "epoch": 3.8168067226890754, "grad_norm": 0.08868311092000548, "learning_rate": 1.2892382000710818e-05, "loss": 0.3538, "step": 2271 }, { "epoch": 3.818487394957983, "grad_norm": 0.09931703700556671, "learning_rate": 1.2857882009814278e-05, "loss": 0.36, "step": 2272 }, { "epoch": 3.8201680672268905, "grad_norm": 0.13150696431713324, "learning_rate": 1.282341939955384e-05, "loss": 0.3634, "step": 2273 }, { "epoch": 3.821848739495798, "grad_norm": 0.09519059992008591, "learning_rate": 1.2788994217392045e-05, "loss": 0.3479, "step": 2274 }, { "epoch": 3.8235294117647056, "grad_norm": 0.11392036698216576, "learning_rate": 1.2754606510739903e-05, "loss": 0.359, "step": 2275 }, { "epoch": 3.825210084033613, "grad_norm": 0.11514978699445291, "learning_rate": 1.2720256326956805e-05, "loss": 0.359, "step": 2276 }, { "epoch": 3.8268907563025207, "grad_norm": 0.11336364798109408, "learning_rate": 1.268594371335048e-05, "loss": 0.3536, "step": 2277 }, { "epoch": 3.8285714285714287, "grad_norm": 0.11440506835197503, "learning_rate": 1.265166871717689e-05, "loss": 0.3525, "step": 2278 }, { "epoch": 3.8302521008403363, "grad_norm": 0.11026264033129309, "learning_rate": 1.261743138564024e-05, "loss": 0.3636, "step": 2279 }, { "epoch": 3.831932773109244, "grad_norm": 0.10105085351613931, "learning_rate": 1.2583231765892783e-05, "loss": 0.35, "step": 2280 }, { "epoch": 3.8336134453781514, "grad_norm": 0.1069719317490041, "learning_rate": 1.2549069905034909e-05, "loss": 0.3425, "step": 2281 }, { "epoch": 3.835294117647059, "grad_norm": 0.14595187124348408, "learning_rate": 1.2514945850114972e-05, "loss": 0.3582, "step": 2282 }, { "epoch": 3.8369747899159665, "grad_norm": 0.10813146850195637, "learning_rate": 1.2480859648129258e-05, "loss": 0.351, "step": 2283 }, { "epoch": 3.838655462184874, "grad_norm": 0.09893808079565328, "learning_rate": 1.2446811346021916e-05, "loss": 0.3401, "step": 2284 }, { "epoch": 3.8403361344537816, "grad_norm": 0.10579797734072958, "learning_rate": 1.2412800990684945e-05, "loss": 0.3529, "step": 2285 }, { "epoch": 3.842016806722689, "grad_norm": 0.10742362829497347, "learning_rate": 1.2378828628958002e-05, "loss": 0.3503, "step": 2286 }, { "epoch": 3.8436974789915967, "grad_norm": 0.10378549647978952, "learning_rate": 1.234489430762849e-05, "loss": 0.3578, "step": 2287 }, { "epoch": 3.8453781512605043, "grad_norm": 0.1094106897776916, "learning_rate": 1.2310998073431394e-05, "loss": 0.3539, "step": 2288 }, { "epoch": 3.847058823529412, "grad_norm": 0.09877930382691692, "learning_rate": 1.2277139973049242e-05, "loss": 0.3561, "step": 2289 }, { "epoch": 3.8487394957983194, "grad_norm": 0.11538083694027634, "learning_rate": 1.2243320053112049e-05, "loss": 0.3636, "step": 2290 }, { "epoch": 3.850420168067227, "grad_norm": 0.09780757134632526, "learning_rate": 1.220953836019724e-05, "loss": 0.3547, "step": 2291 }, { "epoch": 3.8521008403361345, "grad_norm": 0.10750333593772925, "learning_rate": 1.2175794940829593e-05, "loss": 0.355, "step": 2292 }, { "epoch": 3.853781512605042, "grad_norm": 0.13245659575036797, "learning_rate": 1.214208984148121e-05, "loss": 0.3515, "step": 2293 }, { "epoch": 3.8554621848739496, "grad_norm": 0.09703465793024003, "learning_rate": 1.2108423108571352e-05, "loss": 0.3583, "step": 2294 }, { "epoch": 3.857142857142857, "grad_norm": 0.1126617289770535, "learning_rate": 1.2074794788466501e-05, "loss": 0.3508, "step": 2295 }, { "epoch": 3.8588235294117648, "grad_norm": 0.13935791021327848, "learning_rate": 1.2041204927480208e-05, "loss": 0.3515, "step": 2296 }, { "epoch": 3.8605042016806723, "grad_norm": 0.10292399549508013, "learning_rate": 1.2007653571873053e-05, "loss": 0.3548, "step": 2297 }, { "epoch": 3.86218487394958, "grad_norm": 0.13794506292286182, "learning_rate": 1.197414076785262e-05, "loss": 0.363, "step": 2298 }, { "epoch": 3.8638655462184874, "grad_norm": 0.1116123940517732, "learning_rate": 1.1940666561573338e-05, "loss": 0.3634, "step": 2299 }, { "epoch": 3.865546218487395, "grad_norm": 0.11582729508656235, "learning_rate": 1.1907230999136541e-05, "loss": 0.3473, "step": 2300 }, { "epoch": 3.8672268907563025, "grad_norm": 0.10394339628610187, "learning_rate": 1.1873834126590307e-05, "loss": 0.3547, "step": 2301 }, { "epoch": 3.86890756302521, "grad_norm": 0.105372515547251, "learning_rate": 1.1840475989929438e-05, "loss": 0.352, "step": 2302 }, { "epoch": 3.8705882352941177, "grad_norm": 0.10793423647505844, "learning_rate": 1.1807156635095391e-05, "loss": 0.3563, "step": 2303 }, { "epoch": 3.872268907563025, "grad_norm": 0.09859671284935982, "learning_rate": 1.1773876107976201e-05, "loss": 0.3509, "step": 2304 }, { "epoch": 3.8739495798319328, "grad_norm": 0.09389926879762485, "learning_rate": 1.1740634454406434e-05, "loss": 0.3507, "step": 2305 }, { "epoch": 3.8756302521008403, "grad_norm": 0.10526529785700492, "learning_rate": 1.170743172016715e-05, "loss": 0.3407, "step": 2306 }, { "epoch": 3.877310924369748, "grad_norm": 0.10469818669972676, "learning_rate": 1.1674267950985736e-05, "loss": 0.3556, "step": 2307 }, { "epoch": 3.8789915966386554, "grad_norm": 0.10214985730167982, "learning_rate": 1.1641143192535997e-05, "loss": 0.3582, "step": 2308 }, { "epoch": 3.880672268907563, "grad_norm": 0.10759235047550737, "learning_rate": 1.1608057490437959e-05, "loss": 0.3509, "step": 2309 }, { "epoch": 3.8823529411764706, "grad_norm": 0.09332026061768348, "learning_rate": 1.1575010890257871e-05, "loss": 0.3529, "step": 2310 }, { "epoch": 3.884033613445378, "grad_norm": 0.10000271204386116, "learning_rate": 1.154200343750814e-05, "loss": 0.3635, "step": 2311 }, { "epoch": 3.8857142857142857, "grad_norm": 0.10209263145174159, "learning_rate": 1.150903517764725e-05, "loss": 0.3462, "step": 2312 }, { "epoch": 3.8873949579831932, "grad_norm": 0.09942234892304841, "learning_rate": 1.1476106156079693e-05, "loss": 0.3617, "step": 2313 }, { "epoch": 3.889075630252101, "grad_norm": 0.09868819200203104, "learning_rate": 1.1443216418155978e-05, "loss": 0.3584, "step": 2314 }, { "epoch": 3.8907563025210083, "grad_norm": 0.0962887867724304, "learning_rate": 1.141036600917242e-05, "loss": 0.3489, "step": 2315 }, { "epoch": 3.892436974789916, "grad_norm": 0.1031447339298465, "learning_rate": 1.1377554974371261e-05, "loss": 0.3469, "step": 2316 }, { "epoch": 3.8941176470588235, "grad_norm": 0.10370713057728337, "learning_rate": 1.1344783358940461e-05, "loss": 0.3489, "step": 2317 }, { "epoch": 3.895798319327731, "grad_norm": 0.102075766798489, "learning_rate": 1.1312051208013708e-05, "loss": 0.3581, "step": 2318 }, { "epoch": 3.8974789915966386, "grad_norm": 0.10316954510952683, "learning_rate": 1.1279358566670325e-05, "loss": 0.3529, "step": 2319 }, { "epoch": 3.899159663865546, "grad_norm": 0.1055611316656801, "learning_rate": 1.1246705479935267e-05, "loss": 0.3563, "step": 2320 }, { "epoch": 3.9008403361344537, "grad_norm": 0.09576570591100975, "learning_rate": 1.1214091992778932e-05, "loss": 0.3494, "step": 2321 }, { "epoch": 3.9025210084033612, "grad_norm": 0.11622216943923065, "learning_rate": 1.1181518150117268e-05, "loss": 0.3509, "step": 2322 }, { "epoch": 3.904201680672269, "grad_norm": 0.10574888453601104, "learning_rate": 1.1148983996811574e-05, "loss": 0.3525, "step": 2323 }, { "epoch": 3.9058823529411764, "grad_norm": 0.10888907680883923, "learning_rate": 1.1116489577668489e-05, "loss": 0.3518, "step": 2324 }, { "epoch": 3.907563025210084, "grad_norm": 0.10215491130178032, "learning_rate": 1.1084034937439947e-05, "loss": 0.3509, "step": 2325 }, { "epoch": 3.9092436974789915, "grad_norm": 0.1121185720038754, "learning_rate": 1.105162012082309e-05, "loss": 0.3524, "step": 2326 }, { "epoch": 3.910924369747899, "grad_norm": 0.10472298062487444, "learning_rate": 1.1019245172460198e-05, "loss": 0.3492, "step": 2327 }, { "epoch": 3.9126050420168066, "grad_norm": 0.10514815337520857, "learning_rate": 1.0986910136938702e-05, "loss": 0.3447, "step": 2328 }, { "epoch": 3.914285714285714, "grad_norm": 0.12387085037075818, "learning_rate": 1.0954615058790962e-05, "loss": 0.366, "step": 2329 }, { "epoch": 3.9159663865546217, "grad_norm": 0.10045241548396028, "learning_rate": 1.0922359982494419e-05, "loss": 0.3558, "step": 2330 }, { "epoch": 3.9176470588235293, "grad_norm": 0.12357967678106584, "learning_rate": 1.0890144952471347e-05, "loss": 0.3634, "step": 2331 }, { "epoch": 3.919327731092437, "grad_norm": 0.10246723265262465, "learning_rate": 1.0857970013088898e-05, "loss": 0.353, "step": 2332 }, { "epoch": 3.9210084033613444, "grad_norm": 0.11626542539495544, "learning_rate": 1.0825835208659007e-05, "loss": 0.3484, "step": 2333 }, { "epoch": 3.9226890756302524, "grad_norm": 0.10881574361749738, "learning_rate": 1.0793740583438312e-05, "loss": 0.3526, "step": 2334 }, { "epoch": 3.92436974789916, "grad_norm": 0.10369182730957352, "learning_rate": 1.0761686181628171e-05, "loss": 0.3623, "step": 2335 }, { "epoch": 3.9260504201680675, "grad_norm": 0.09772149420297072, "learning_rate": 1.0729672047374482e-05, "loss": 0.3537, "step": 2336 }, { "epoch": 3.927731092436975, "grad_norm": 0.1069475499740503, "learning_rate": 1.0697698224767725e-05, "loss": 0.3515, "step": 2337 }, { "epoch": 3.9294117647058826, "grad_norm": 0.09759907429465464, "learning_rate": 1.0665764757842853e-05, "loss": 0.352, "step": 2338 }, { "epoch": 3.93109243697479, "grad_norm": 0.10764641611564624, "learning_rate": 1.0633871690579229e-05, "loss": 0.351, "step": 2339 }, { "epoch": 3.9327731092436977, "grad_norm": 0.09764447634954752, "learning_rate": 1.0602019066900575e-05, "loss": 0.3553, "step": 2340 }, { "epoch": 3.9344537815126053, "grad_norm": 0.6083705446462734, "learning_rate": 1.057020693067497e-05, "loss": 0.3576, "step": 2341 }, { "epoch": 3.936134453781513, "grad_norm": 0.11648439155083624, "learning_rate": 1.0538435325714627e-05, "loss": 0.3396, "step": 2342 }, { "epoch": 3.9378151260504204, "grad_norm": 0.1069766698029467, "learning_rate": 1.0506704295776045e-05, "loss": 0.3519, "step": 2343 }, { "epoch": 3.939495798319328, "grad_norm": 0.10388450317967049, "learning_rate": 1.0475013884559772e-05, "loss": 0.3481, "step": 2344 }, { "epoch": 3.9411764705882355, "grad_norm": 0.10343606648145368, "learning_rate": 1.0443364135710454e-05, "loss": 0.3499, "step": 2345 }, { "epoch": 3.942857142857143, "grad_norm": 0.3038286890062019, "learning_rate": 1.0411755092816706e-05, "loss": 0.344, "step": 2346 }, { "epoch": 3.9445378151260506, "grad_norm": 0.10599717744178176, "learning_rate": 1.0380186799411111e-05, "loss": 0.3554, "step": 2347 }, { "epoch": 3.946218487394958, "grad_norm": 0.14997693940286552, "learning_rate": 1.0348659298970097e-05, "loss": 0.3522, "step": 2348 }, { "epoch": 3.9478991596638657, "grad_norm": 0.10417938572342696, "learning_rate": 1.0317172634913968e-05, "loss": 0.3503, "step": 2349 }, { "epoch": 3.9495798319327733, "grad_norm": 0.09521786381411468, "learning_rate": 1.0285726850606706e-05, "loss": 0.349, "step": 2350 }, { "epoch": 3.951260504201681, "grad_norm": 0.13346217325919144, "learning_rate": 1.025432198935607e-05, "loss": 0.3522, "step": 2351 }, { "epoch": 3.9529411764705884, "grad_norm": 0.1042748643529104, "learning_rate": 1.0222958094413418e-05, "loss": 0.3575, "step": 2352 }, { "epoch": 3.954621848739496, "grad_norm": 0.11126139569744413, "learning_rate": 1.0191635208973695e-05, "loss": 0.3465, "step": 2353 }, { "epoch": 3.9563025210084035, "grad_norm": 0.12158063646297253, "learning_rate": 1.0160353376175353e-05, "loss": 0.3542, "step": 2354 }, { "epoch": 3.957983193277311, "grad_norm": 0.09652859356264273, "learning_rate": 1.0129112639100364e-05, "loss": 0.3479, "step": 2355 }, { "epoch": 3.9596638655462186, "grad_norm": 0.12412222827694787, "learning_rate": 1.009791304077401e-05, "loss": 0.3483, "step": 2356 }, { "epoch": 3.961344537815126, "grad_norm": 0.1022149945903801, "learning_rate": 1.0066754624164993e-05, "loss": 0.3536, "step": 2357 }, { "epoch": 3.9630252100840337, "grad_norm": 0.10021990927921609, "learning_rate": 1.003563743218527e-05, "loss": 0.354, "step": 2358 }, { "epoch": 3.9647058823529413, "grad_norm": 0.11988060048661563, "learning_rate": 1.0004561507690021e-05, "loss": 0.358, "step": 2359 }, { "epoch": 3.966386554621849, "grad_norm": 0.16237216698385185, "learning_rate": 9.973526893477591e-06, "loss": 0.3532, "step": 2360 }, { "epoch": 3.9680672268907564, "grad_norm": 0.09947505106092717, "learning_rate": 9.94253363228944e-06, "loss": 0.3529, "step": 2361 }, { "epoch": 3.969747899159664, "grad_norm": 0.10940090744339583, "learning_rate": 9.911581766810055e-06, "loss": 0.3483, "step": 2362 }, { "epoch": 3.9714285714285715, "grad_norm": 0.10098090790271062, "learning_rate": 9.880671339666965e-06, "loss": 0.3516, "step": 2363 }, { "epoch": 3.973109243697479, "grad_norm": 0.09179626759631618, "learning_rate": 9.849802393430545e-06, "loss": 0.3563, "step": 2364 }, { "epoch": 3.9747899159663866, "grad_norm": 0.11087531568886852, "learning_rate": 9.818974970614125e-06, "loss": 0.3502, "step": 2365 }, { "epoch": 3.976470588235294, "grad_norm": 0.09934693304617247, "learning_rate": 9.788189113673799e-06, "loss": 0.3463, "step": 2366 }, { "epoch": 3.9781512605042018, "grad_norm": 0.09668343868994408, "learning_rate": 9.757444865008434e-06, "loss": 0.3591, "step": 2367 }, { "epoch": 3.9798319327731093, "grad_norm": 0.10526471079009012, "learning_rate": 9.726742266959582e-06, "loss": 0.3556, "step": 2368 }, { "epoch": 3.981512605042017, "grad_norm": 0.09790497493747256, "learning_rate": 9.69608136181143e-06, "loss": 0.3533, "step": 2369 }, { "epoch": 3.9831932773109244, "grad_norm": 0.09123414710741953, "learning_rate": 9.665462191790777e-06, "loss": 0.3513, "step": 2370 }, { "epoch": 3.984873949579832, "grad_norm": 0.10079316761935538, "learning_rate": 9.634884799066913e-06, "loss": 0.3553, "step": 2371 }, { "epoch": 3.9865546218487395, "grad_norm": 0.09161286989870943, "learning_rate": 9.604349225751597e-06, "loss": 0.3435, "step": 2372 }, { "epoch": 3.988235294117647, "grad_norm": 0.0990373195074711, "learning_rate": 9.57385551389899e-06, "loss": 0.3567, "step": 2373 }, { "epoch": 3.9899159663865547, "grad_norm": 0.0914140210933955, "learning_rate": 9.543403705505611e-06, "loss": 0.3529, "step": 2374 }, { "epoch": 3.991596638655462, "grad_norm": 0.10771885336916434, "learning_rate": 9.512993842510255e-06, "loss": 0.3472, "step": 2375 }, { "epoch": 3.9932773109243698, "grad_norm": 0.09218683325930689, "learning_rate": 9.482625966793985e-06, "loss": 0.3464, "step": 2376 }, { "epoch": 3.9949579831932773, "grad_norm": 0.09575560758728835, "learning_rate": 9.452300120179978e-06, "loss": 0.3594, "step": 2377 }, { "epoch": 3.996638655462185, "grad_norm": 0.09527089804336485, "learning_rate": 9.422016344433582e-06, "loss": 0.3519, "step": 2378 }, { "epoch": 3.9983193277310924, "grad_norm": 0.09647194655053191, "learning_rate": 9.391774681262178e-06, "loss": 0.3584, "step": 2379 }, { "epoch": 4.0, "grad_norm": 0.10675653811971857, "learning_rate": 9.361575172315157e-06, "loss": 0.3393, "step": 2380 }, { "epoch": 4.001680672268908, "grad_norm": 0.1199498503024151, "learning_rate": 9.331417859183842e-06, "loss": 0.3387, "step": 2381 }, { "epoch": 4.003361344537815, "grad_norm": 0.10779024382652437, "learning_rate": 9.301302783401463e-06, "loss": 0.3401, "step": 2382 }, { "epoch": 4.005042016806723, "grad_norm": 0.11089608462442815, "learning_rate": 9.271229986443054e-06, "loss": 0.33, "step": 2383 }, { "epoch": 4.00672268907563, "grad_norm": 0.11669860542249466, "learning_rate": 9.241199509725472e-06, "loss": 0.3337, "step": 2384 }, { "epoch": 4.008403361344538, "grad_norm": 0.12329474420453605, "learning_rate": 9.211211394607206e-06, "loss": 0.3307, "step": 2385 }, { "epoch": 4.010084033613445, "grad_norm": 0.12121001165265713, "learning_rate": 9.181265682388495e-06, "loss": 0.3364, "step": 2386 }, { "epoch": 4.011764705882353, "grad_norm": 0.11438495243728929, "learning_rate": 9.15136241431112e-06, "loss": 0.3404, "step": 2387 }, { "epoch": 4.0134453781512605, "grad_norm": 0.12087266969006094, "learning_rate": 9.121501631558426e-06, "loss": 0.3354, "step": 2388 }, { "epoch": 4.015126050420168, "grad_norm": 0.12287958939628094, "learning_rate": 9.091683375255233e-06, "loss": 0.3268, "step": 2389 }, { "epoch": 4.016806722689076, "grad_norm": 0.11575845921868504, "learning_rate": 9.061907686467841e-06, "loss": 0.3338, "step": 2390 }, { "epoch": 4.018487394957983, "grad_norm": 0.11347699232482532, "learning_rate": 9.032174606203847e-06, "loss": 0.3368, "step": 2391 }, { "epoch": 4.020168067226891, "grad_norm": 0.11850495411345756, "learning_rate": 9.002484175412238e-06, "loss": 0.3291, "step": 2392 }, { "epoch": 4.021848739495798, "grad_norm": 0.11521790002856067, "learning_rate": 8.972836434983225e-06, "loss": 0.3314, "step": 2393 }, { "epoch": 4.023529411764706, "grad_norm": 0.11696143031227353, "learning_rate": 8.943231425748235e-06, "loss": 0.3292, "step": 2394 }, { "epoch": 4.025210084033613, "grad_norm": 0.1023549102340406, "learning_rate": 8.913669188479846e-06, "loss": 0.3274, "step": 2395 }, { "epoch": 4.026890756302521, "grad_norm": 0.12411319497040221, "learning_rate": 8.884149763891723e-06, "loss": 0.3413, "step": 2396 }, { "epoch": 4.0285714285714285, "grad_norm": 0.10767175905612476, "learning_rate": 8.85467319263857e-06, "loss": 0.3375, "step": 2397 }, { "epoch": 4.030252100840336, "grad_norm": 0.1002183172580371, "learning_rate": 8.8252395153161e-06, "loss": 0.3269, "step": 2398 }, { "epoch": 4.031932773109244, "grad_norm": 0.10481901603553802, "learning_rate": 8.795848772460891e-06, "loss": 0.3323, "step": 2399 }, { "epoch": 4.033613445378151, "grad_norm": 0.10678226395373731, "learning_rate": 8.766501004550453e-06, "loss": 0.3341, "step": 2400 }, { "epoch": 4.035294117647059, "grad_norm": 0.09213371971648683, "learning_rate": 8.737196252003084e-06, "loss": 0.3361, "step": 2401 }, { "epoch": 4.036974789915966, "grad_norm": 0.09909082972346815, "learning_rate": 8.707934555177835e-06, "loss": 0.3351, "step": 2402 }, { "epoch": 4.038655462184874, "grad_norm": 0.11147776684671225, "learning_rate": 8.678715954374466e-06, "loss": 0.3325, "step": 2403 }, { "epoch": 4.040336134453781, "grad_norm": 0.09525798810973968, "learning_rate": 8.64954048983337e-06, "loss": 0.3308, "step": 2404 }, { "epoch": 4.042016806722689, "grad_norm": 0.09955769694935754, "learning_rate": 8.620408201735579e-06, "loss": 0.3262, "step": 2405 }, { "epoch": 4.0436974789915965, "grad_norm": 0.10299292339074874, "learning_rate": 8.591319130202605e-06, "loss": 0.3319, "step": 2406 }, { "epoch": 4.045378151260504, "grad_norm": 0.09675699228052216, "learning_rate": 8.562273315296475e-06, "loss": 0.3319, "step": 2407 }, { "epoch": 4.047058823529412, "grad_norm": 0.09148958330459493, "learning_rate": 8.53327079701963e-06, "loss": 0.3309, "step": 2408 }, { "epoch": 4.048739495798319, "grad_norm": 0.097096247149616, "learning_rate": 8.504311615314878e-06, "loss": 0.3247, "step": 2409 }, { "epoch": 4.050420168067227, "grad_norm": 0.0990492892044789, "learning_rate": 8.475395810065348e-06, "loss": 0.3434, "step": 2410 }, { "epoch": 4.052100840336134, "grad_norm": 0.0892934880650849, "learning_rate": 8.446523421094457e-06, "loss": 0.3316, "step": 2411 }, { "epoch": 4.053781512605042, "grad_norm": 0.09033247225312316, "learning_rate": 8.417694488165766e-06, "loss": 0.3461, "step": 2412 }, { "epoch": 4.055462184873949, "grad_norm": 0.10132755034255188, "learning_rate": 8.38890905098305e-06, "loss": 0.3324, "step": 2413 }, { "epoch": 4.057142857142857, "grad_norm": 0.09113423032556765, "learning_rate": 8.360167149190146e-06, "loss": 0.3295, "step": 2414 }, { "epoch": 4.0588235294117645, "grad_norm": 0.09513299408146962, "learning_rate": 8.331468822370947e-06, "loss": 0.3313, "step": 2415 }, { "epoch": 4.060504201680672, "grad_norm": 0.09383141014795054, "learning_rate": 8.302814110049327e-06, "loss": 0.3263, "step": 2416 }, { "epoch": 4.06218487394958, "grad_norm": 0.0932317938065123, "learning_rate": 8.274203051689094e-06, "loss": 0.3283, "step": 2417 }, { "epoch": 4.063865546218487, "grad_norm": 0.0952109128410542, "learning_rate": 8.245635686693925e-06, "loss": 0.3357, "step": 2418 }, { "epoch": 4.065546218487395, "grad_norm": 0.09093068165620247, "learning_rate": 8.217112054407366e-06, "loss": 0.327, "step": 2419 }, { "epoch": 4.067226890756302, "grad_norm": 0.08778039590695538, "learning_rate": 8.18863219411266e-06, "loss": 0.3349, "step": 2420 }, { "epoch": 4.06890756302521, "grad_norm": 0.08991929821328302, "learning_rate": 8.160196145032838e-06, "loss": 0.3365, "step": 2421 }, { "epoch": 4.070588235294117, "grad_norm": 0.1026124970944456, "learning_rate": 8.131803946330552e-06, "loss": 0.3406, "step": 2422 }, { "epoch": 4.072268907563025, "grad_norm": 0.09053560904370488, "learning_rate": 8.103455637108069e-06, "loss": 0.3354, "step": 2423 }, { "epoch": 4.0739495798319325, "grad_norm": 0.09339022036417857, "learning_rate": 8.07515125640721e-06, "loss": 0.3309, "step": 2424 }, { "epoch": 4.07563025210084, "grad_norm": 0.09609531624804346, "learning_rate": 8.046890843209327e-06, "loss": 0.3443, "step": 2425 }, { "epoch": 4.077310924369748, "grad_norm": 0.09150291721409645, "learning_rate": 8.018674436435155e-06, "loss": 0.3311, "step": 2426 }, { "epoch": 4.078991596638655, "grad_norm": 0.08938933586163132, "learning_rate": 7.99050207494489e-06, "loss": 0.3383, "step": 2427 }, { "epoch": 4.080672268907563, "grad_norm": 0.1081287595448833, "learning_rate": 7.962373797538028e-06, "loss": 0.3373, "step": 2428 }, { "epoch": 4.08235294117647, "grad_norm": 0.09221023599784126, "learning_rate": 7.93428964295337e-06, "loss": 0.344, "step": 2429 }, { "epoch": 4.084033613445378, "grad_norm": 0.08330965470083779, "learning_rate": 7.906249649868929e-06, "loss": 0.333, "step": 2430 }, { "epoch": 4.085714285714285, "grad_norm": 0.09091810119140786, "learning_rate": 7.878253856901925e-06, "loss": 0.3241, "step": 2431 }, { "epoch": 4.087394957983193, "grad_norm": 0.0919245381744328, "learning_rate": 7.850302302608672e-06, "loss": 0.3291, "step": 2432 }, { "epoch": 4.0890756302521005, "grad_norm": 0.09284880167956168, "learning_rate": 7.822395025484612e-06, "loss": 0.3366, "step": 2433 }, { "epoch": 4.090756302521008, "grad_norm": 0.1114847286426954, "learning_rate": 7.79453206396413e-06, "loss": 0.3395, "step": 2434 }, { "epoch": 4.092436974789916, "grad_norm": 0.08641576254425276, "learning_rate": 7.766713456420656e-06, "loss": 0.3308, "step": 2435 }, { "epoch": 4.094117647058823, "grad_norm": 0.09205488029245515, "learning_rate": 7.738939241166484e-06, "loss": 0.3381, "step": 2436 }, { "epoch": 4.095798319327731, "grad_norm": 0.09059633655019855, "learning_rate": 7.711209456452789e-06, "loss": 0.3343, "step": 2437 }, { "epoch": 4.097478991596638, "grad_norm": 0.08717299744626239, "learning_rate": 7.683524140469551e-06, "loss": 0.3392, "step": 2438 }, { "epoch": 4.099159663865546, "grad_norm": 0.08599482794603706, "learning_rate": 7.655883331345512e-06, "loss": 0.3384, "step": 2439 }, { "epoch": 4.100840336134453, "grad_norm": 0.08931005064512303, "learning_rate": 7.6282870671481055e-06, "loss": 0.3356, "step": 2440 }, { "epoch": 4.102521008403361, "grad_norm": 0.08531703768388452, "learning_rate": 7.60073538588344e-06, "loss": 0.3329, "step": 2441 }, { "epoch": 4.1042016806722685, "grad_norm": 0.08849761359078556, "learning_rate": 7.573228325496207e-06, "loss": 0.3293, "step": 2442 }, { "epoch": 4.105882352941176, "grad_norm": 0.09216803603056635, "learning_rate": 7.545765923869645e-06, "loss": 0.3288, "step": 2443 }, { "epoch": 4.107563025210084, "grad_norm": 0.08492112085242333, "learning_rate": 7.5183482188254885e-06, "loss": 0.3282, "step": 2444 }, { "epoch": 4.109243697478991, "grad_norm": 0.08310526335442774, "learning_rate": 7.490975248123913e-06, "loss": 0.3285, "step": 2445 }, { "epoch": 4.110924369747899, "grad_norm": 0.08694723434705177, "learning_rate": 7.463647049463514e-06, "loss": 0.3247, "step": 2446 }, { "epoch": 4.112605042016806, "grad_norm": 0.09443018000255975, "learning_rate": 7.4363636604811625e-06, "loss": 0.336, "step": 2447 }, { "epoch": 4.114285714285714, "grad_norm": 0.08845307286327596, "learning_rate": 7.409125118752087e-06, "loss": 0.333, "step": 2448 }, { "epoch": 4.115966386554621, "grad_norm": 0.0877530911053768, "learning_rate": 7.381931461789711e-06, "loss": 0.3333, "step": 2449 }, { "epoch": 4.117647058823529, "grad_norm": 0.08704257865885467, "learning_rate": 7.354782727045648e-06, "loss": 0.3322, "step": 2450 }, { "epoch": 4.1193277310924366, "grad_norm": 0.0939600340357566, "learning_rate": 7.327678951909654e-06, "loss": 0.3271, "step": 2451 }, { "epoch": 4.121008403361344, "grad_norm": 0.08685744938480158, "learning_rate": 7.300620173709547e-06, "loss": 0.3281, "step": 2452 }, { "epoch": 4.122689075630252, "grad_norm": 0.08252204795720121, "learning_rate": 7.273606429711182e-06, "loss": 0.3344, "step": 2453 }, { "epoch": 4.124369747899159, "grad_norm": 0.10605669799721319, "learning_rate": 7.246637757118429e-06, "loss": 0.3407, "step": 2454 }, { "epoch": 4.126050420168067, "grad_norm": 0.08257034844392677, "learning_rate": 7.219714193073004e-06, "loss": 0.335, "step": 2455 }, { "epoch": 4.127731092436974, "grad_norm": 0.08649049155754407, "learning_rate": 7.192835774654585e-06, "loss": 0.3318, "step": 2456 }, { "epoch": 4.129411764705883, "grad_norm": 0.10009584525335757, "learning_rate": 7.166002538880623e-06, "loss": 0.3324, "step": 2457 }, { "epoch": 4.13109243697479, "grad_norm": 0.08380247622278542, "learning_rate": 7.139214522706357e-06, "loss": 0.3291, "step": 2458 }, { "epoch": 4.132773109243698, "grad_norm": 0.08771976774377395, "learning_rate": 7.1124717630247285e-06, "loss": 0.3373, "step": 2459 }, { "epoch": 4.1344537815126055, "grad_norm": 0.09192831723777052, "learning_rate": 7.085774296666414e-06, "loss": 0.3307, "step": 2460 }, { "epoch": 4.136134453781513, "grad_norm": 0.08890248557788129, "learning_rate": 7.059122160399616e-06, "loss": 0.3313, "step": 2461 }, { "epoch": 4.137815126050421, "grad_norm": 0.08611878550285151, "learning_rate": 7.0325153909301905e-06, "loss": 0.3365, "step": 2462 }, { "epoch": 4.139495798319328, "grad_norm": 0.08400447822895396, "learning_rate": 7.005954024901478e-06, "loss": 0.335, "step": 2463 }, { "epoch": 4.141176470588236, "grad_norm": 0.09436136084199594, "learning_rate": 6.979438098894284e-06, "loss": 0.3304, "step": 2464 }, { "epoch": 4.142857142857143, "grad_norm": 0.09051399440332873, "learning_rate": 6.952967649426847e-06, "loss": 0.3336, "step": 2465 }, { "epoch": 4.144537815126051, "grad_norm": 0.0882155767130185, "learning_rate": 6.926542712954765e-06, "loss": 0.3362, "step": 2466 }, { "epoch": 4.146218487394958, "grad_norm": 0.08645304622536261, "learning_rate": 6.9001633258709475e-06, "loss": 0.3337, "step": 2467 }, { "epoch": 4.147899159663866, "grad_norm": 0.08856347968626853, "learning_rate": 6.873829524505615e-06, "loss": 0.332, "step": 2468 }, { "epoch": 4.1495798319327735, "grad_norm": 0.08727665639959868, "learning_rate": 6.847541345126139e-06, "loss": 0.3254, "step": 2469 }, { "epoch": 4.151260504201681, "grad_norm": 0.09442255208020339, "learning_rate": 6.821298823937117e-06, "loss": 0.3388, "step": 2470 }, { "epoch": 4.152941176470589, "grad_norm": 0.09770752189306202, "learning_rate": 6.795101997080236e-06, "loss": 0.3272, "step": 2471 }, { "epoch": 4.154621848739496, "grad_norm": 0.10172573771831644, "learning_rate": 6.768950900634266e-06, "loss": 0.3309, "step": 2472 }, { "epoch": 4.156302521008404, "grad_norm": 0.08798825240931665, "learning_rate": 6.74284557061498e-06, "loss": 0.3283, "step": 2473 }, { "epoch": 4.157983193277311, "grad_norm": 0.09021723343112954, "learning_rate": 6.716786042975134e-06, "loss": 0.3312, "step": 2474 }, { "epoch": 4.159663865546219, "grad_norm": 0.094038411273032, "learning_rate": 6.690772353604393e-06, "loss": 0.3418, "step": 2475 }, { "epoch": 4.161344537815126, "grad_norm": 0.08692086557994695, "learning_rate": 6.664804538329317e-06, "loss": 0.3347, "step": 2476 }, { "epoch": 4.163025210084034, "grad_norm": 0.08589232196414025, "learning_rate": 6.638882632913261e-06, "loss": 0.3337, "step": 2477 }, { "epoch": 4.1647058823529415, "grad_norm": 0.08980045918276984, "learning_rate": 6.613006673056363e-06, "loss": 0.3289, "step": 2478 }, { "epoch": 4.166386554621849, "grad_norm": 0.08375942795058218, "learning_rate": 6.58717669439548e-06, "loss": 0.3293, "step": 2479 }, { "epoch": 4.168067226890757, "grad_norm": 0.09433111242785339, "learning_rate": 6.561392732504135e-06, "loss": 0.3308, "step": 2480 }, { "epoch": 4.169747899159664, "grad_norm": 0.08899792665884339, "learning_rate": 6.535654822892517e-06, "loss": 0.3325, "step": 2481 }, { "epoch": 4.171428571428572, "grad_norm": 0.08369070687165289, "learning_rate": 6.509963001007315e-06, "loss": 0.3386, "step": 2482 }, { "epoch": 4.173109243697479, "grad_norm": 0.08235028991958863, "learning_rate": 6.484317302231828e-06, "loss": 0.3263, "step": 2483 }, { "epoch": 4.174789915966387, "grad_norm": 0.08637933114749481, "learning_rate": 6.458717761885785e-06, "loss": 0.3226, "step": 2484 }, { "epoch": 4.176470588235294, "grad_norm": 0.08991158228795468, "learning_rate": 6.433164415225346e-06, "loss": 0.3293, "step": 2485 }, { "epoch": 4.178151260504202, "grad_norm": 0.08261930261663042, "learning_rate": 6.407657297443069e-06, "loss": 0.3393, "step": 2486 }, { "epoch": 4.1798319327731095, "grad_norm": 0.08670455248695087, "learning_rate": 6.382196443667834e-06, "loss": 0.3265, "step": 2487 }, { "epoch": 4.181512605042017, "grad_norm": 0.09756363190475703, "learning_rate": 6.356781888964803e-06, "loss": 0.3349, "step": 2488 }, { "epoch": 4.183193277310925, "grad_norm": 0.07937602966814962, "learning_rate": 6.331413668335402e-06, "loss": 0.3373, "step": 2489 }, { "epoch": 4.184873949579832, "grad_norm": 0.08832716814146911, "learning_rate": 6.3060918167171926e-06, "loss": 0.3325, "step": 2490 }, { "epoch": 4.18655462184874, "grad_norm": 0.0918749595782592, "learning_rate": 6.280816368983922e-06, "loss": 0.3349, "step": 2491 }, { "epoch": 4.188235294117647, "grad_norm": 0.08563103778205133, "learning_rate": 6.255587359945412e-06, "loss": 0.3331, "step": 2492 }, { "epoch": 4.189915966386555, "grad_norm": 0.08674107606429529, "learning_rate": 6.230404824347531e-06, "loss": 0.3323, "step": 2493 }, { "epoch": 4.191596638655462, "grad_norm": 0.0922741183054131, "learning_rate": 6.205268796872123e-06, "loss": 0.3336, "step": 2494 }, { "epoch": 4.19327731092437, "grad_norm": 0.08896684465460153, "learning_rate": 6.180179312137036e-06, "loss": 0.3371, "step": 2495 }, { "epoch": 4.1949579831932775, "grad_norm": 0.08913660158305972, "learning_rate": 6.155136404695934e-06, "loss": 0.3374, "step": 2496 }, { "epoch": 4.196638655462185, "grad_norm": 0.09235337538903898, "learning_rate": 6.130140109038416e-06, "loss": 0.334, "step": 2497 }, { "epoch": 4.198319327731093, "grad_norm": 0.10176119487953011, "learning_rate": 6.105190459589842e-06, "loss": 0.3396, "step": 2498 }, { "epoch": 4.2, "grad_norm": 0.08833638856233234, "learning_rate": 6.080287490711336e-06, "loss": 0.3468, "step": 2499 }, { "epoch": 4.201680672268908, "grad_norm": 0.0884474852586273, "learning_rate": 6.055431236699734e-06, "loss": 0.3353, "step": 2500 }, { "epoch": 4.203361344537815, "grad_norm": 0.09334992414313713, "learning_rate": 6.03062173178754e-06, "loss": 0.3223, "step": 2501 }, { "epoch": 4.205042016806723, "grad_norm": 0.09137056559417517, "learning_rate": 6.0058590101428645e-06, "loss": 0.3394, "step": 2502 }, { "epoch": 4.20672268907563, "grad_norm": 0.08914005368199106, "learning_rate": 5.981143105869418e-06, "loss": 0.3309, "step": 2503 }, { "epoch": 4.208403361344538, "grad_norm": 0.08561207484672721, "learning_rate": 5.956474053006376e-06, "loss": 0.3399, "step": 2504 }, { "epoch": 4.2100840336134455, "grad_norm": 0.08431019690863648, "learning_rate": 5.931851885528446e-06, "loss": 0.331, "step": 2505 }, { "epoch": 4.211764705882353, "grad_norm": 0.08484273290250516, "learning_rate": 5.9072766373457335e-06, "loss": 0.3346, "step": 2506 }, { "epoch": 4.213445378151261, "grad_norm": 0.08820619197332003, "learning_rate": 5.882748342303735e-06, "loss": 0.3346, "step": 2507 }, { "epoch": 4.215126050420168, "grad_norm": 0.0910444197912663, "learning_rate": 5.858267034183276e-06, "loss": 0.3334, "step": 2508 }, { "epoch": 4.216806722689076, "grad_norm": 0.08127458853364329, "learning_rate": 5.833832746700481e-06, "loss": 0.337, "step": 2509 }, { "epoch": 4.218487394957983, "grad_norm": 0.08787612951618741, "learning_rate": 5.809445513506693e-06, "loss": 0.3334, "step": 2510 }, { "epoch": 4.220168067226891, "grad_norm": 0.08664779839760861, "learning_rate": 5.7851053681884994e-06, "loss": 0.3309, "step": 2511 }, { "epoch": 4.221848739495798, "grad_norm": 0.08244019775277667, "learning_rate": 5.760812344267588e-06, "loss": 0.3258, "step": 2512 }, { "epoch": 4.223529411764706, "grad_norm": 0.08321486535306509, "learning_rate": 5.736566475200769e-06, "loss": 0.3293, "step": 2513 }, { "epoch": 4.2252100840336135, "grad_norm": 0.0894155427047037, "learning_rate": 5.712367794379918e-06, "loss": 0.3325, "step": 2514 }, { "epoch": 4.226890756302521, "grad_norm": 0.08813315108647926, "learning_rate": 5.688216335131898e-06, "loss": 0.332, "step": 2515 }, { "epoch": 4.228571428571429, "grad_norm": 0.08561008236861008, "learning_rate": 5.66411213071858e-06, "loss": 0.3358, "step": 2516 }, { "epoch": 4.230252100840336, "grad_norm": 0.08720142300515017, "learning_rate": 5.640055214336695e-06, "loss": 0.3378, "step": 2517 }, { "epoch": 4.231932773109244, "grad_norm": 0.08400980846814311, "learning_rate": 5.616045619117909e-06, "loss": 0.3357, "step": 2518 }, { "epoch": 4.233613445378151, "grad_norm": 0.08704460017052819, "learning_rate": 5.5920833781286785e-06, "loss": 0.3312, "step": 2519 }, { "epoch": 4.235294117647059, "grad_norm": 0.08314744193143735, "learning_rate": 5.568168524370254e-06, "loss": 0.3321, "step": 2520 }, { "epoch": 4.236974789915966, "grad_norm": 0.09204303633801034, "learning_rate": 5.544301090778623e-06, "loss": 0.3394, "step": 2521 }, { "epoch": 4.238655462184874, "grad_norm": 0.08663136856436619, "learning_rate": 5.5204811102244606e-06, "loss": 0.3248, "step": 2522 }, { "epoch": 4.2403361344537815, "grad_norm": 0.0837364358060016, "learning_rate": 5.496708615513089e-06, "loss": 0.3399, "step": 2523 }, { "epoch": 4.242016806722689, "grad_norm": 0.08788968220087828, "learning_rate": 5.472983639384448e-06, "loss": 0.3342, "step": 2524 }, { "epoch": 4.243697478991597, "grad_norm": 0.08517161153266949, "learning_rate": 5.449306214513015e-06, "loss": 0.3321, "step": 2525 }, { "epoch": 4.245378151260504, "grad_norm": 0.08292666158497757, "learning_rate": 5.425676373507789e-06, "loss": 0.3429, "step": 2526 }, { "epoch": 4.247058823529412, "grad_norm": 0.08379525670240134, "learning_rate": 5.402094148912227e-06, "loss": 0.3394, "step": 2527 }, { "epoch": 4.248739495798319, "grad_norm": 0.08359397043110066, "learning_rate": 5.3785595732042165e-06, "loss": 0.3298, "step": 2528 }, { "epoch": 4.250420168067227, "grad_norm": 0.08471907443699639, "learning_rate": 5.355072678796007e-06, "loss": 0.3336, "step": 2529 }, { "epoch": 4.2521008403361344, "grad_norm": 0.08806490064839279, "learning_rate": 5.3316334980342185e-06, "loss": 0.3299, "step": 2530 }, { "epoch": 4.253781512605042, "grad_norm": 0.08680216342865354, "learning_rate": 5.3082420631997e-06, "loss": 0.3344, "step": 2531 }, { "epoch": 4.25546218487395, "grad_norm": 0.0857070134021209, "learning_rate": 5.284898406507597e-06, "loss": 0.3326, "step": 2532 }, { "epoch": 4.257142857142857, "grad_norm": 0.08997067808758227, "learning_rate": 5.261602560107228e-06, "loss": 0.3335, "step": 2533 }, { "epoch": 4.258823529411765, "grad_norm": 0.08954379211553636, "learning_rate": 5.238354556082073e-06, "loss": 0.3312, "step": 2534 }, { "epoch": 4.260504201680672, "grad_norm": 0.08802777839320015, "learning_rate": 5.215154426449722e-06, "loss": 0.3331, "step": 2535 }, { "epoch": 4.26218487394958, "grad_norm": 0.0813688644672805, "learning_rate": 5.192002203161823e-06, "loss": 0.3394, "step": 2536 }, { "epoch": 4.263865546218487, "grad_norm": 0.09018953832625091, "learning_rate": 5.168897918104048e-06, "loss": 0.3297, "step": 2537 }, { "epoch": 4.265546218487395, "grad_norm": 0.09018883415598813, "learning_rate": 5.145841603096084e-06, "loss": 0.339, "step": 2538 }, { "epoch": 4.2672268907563025, "grad_norm": 0.08094233478487674, "learning_rate": 5.122833289891489e-06, "loss": 0.3323, "step": 2539 }, { "epoch": 4.26890756302521, "grad_norm": 0.08517854921647727, "learning_rate": 5.099873010177763e-06, "loss": 0.3363, "step": 2540 }, { "epoch": 4.270588235294118, "grad_norm": 1.062225032820902, "learning_rate": 5.076960795576229e-06, "loss": 0.3363, "step": 2541 }, { "epoch": 4.272268907563025, "grad_norm": 0.08700957824460567, "learning_rate": 5.054096677642024e-06, "loss": 0.3325, "step": 2542 }, { "epoch": 4.273949579831933, "grad_norm": 0.08956625524892858, "learning_rate": 5.031280687864044e-06, "loss": 0.3326, "step": 2543 }, { "epoch": 4.27563025210084, "grad_norm": 0.0837573036972431, "learning_rate": 5.008512857664891e-06, "loss": 0.33, "step": 2544 }, { "epoch": 4.277310924369748, "grad_norm": 0.0987392003590681, "learning_rate": 4.985793218400852e-06, "loss": 0.3464, "step": 2545 }, { "epoch": 4.278991596638655, "grad_norm": 0.08605960502877327, "learning_rate": 4.9631218013618565e-06, "loss": 0.3331, "step": 2546 }, { "epoch": 4.280672268907563, "grad_norm": 0.08351973603897438, "learning_rate": 4.940498637771382e-06, "loss": 0.3304, "step": 2547 }, { "epoch": 4.2823529411764705, "grad_norm": 0.09125355058489516, "learning_rate": 4.917923758786498e-06, "loss": 0.3376, "step": 2548 }, { "epoch": 4.284033613445378, "grad_norm": 0.08365519567183174, "learning_rate": 4.895397195497742e-06, "loss": 0.3346, "step": 2549 }, { "epoch": 4.285714285714286, "grad_norm": 0.08465516624945739, "learning_rate": 4.872918978929115e-06, "loss": 0.337, "step": 2550 }, { "epoch": 4.287394957983193, "grad_norm": 0.09047315609982828, "learning_rate": 4.8504891400380635e-06, "loss": 0.3348, "step": 2551 }, { "epoch": 4.289075630252101, "grad_norm": 0.08691953666842762, "learning_rate": 4.828107709715357e-06, "loss": 0.3369, "step": 2552 }, { "epoch": 4.290756302521008, "grad_norm": 0.08179323927134177, "learning_rate": 4.805774718785139e-06, "loss": 0.3358, "step": 2553 }, { "epoch": 4.292436974789916, "grad_norm": 0.08408912410766053, "learning_rate": 4.783490198004823e-06, "loss": 0.3318, "step": 2554 }, { "epoch": 4.294117647058823, "grad_norm": 0.08370293111255196, "learning_rate": 4.761254178065069e-06, "loss": 0.3321, "step": 2555 }, { "epoch": 4.295798319327731, "grad_norm": 0.08515259087787702, "learning_rate": 4.7390666895897355e-06, "loss": 0.3341, "step": 2556 }, { "epoch": 4.2974789915966385, "grad_norm": 0.1126271355800037, "learning_rate": 4.716927763135868e-06, "loss": 0.3387, "step": 2557 }, { "epoch": 4.299159663865546, "grad_norm": 0.0870884758982915, "learning_rate": 4.694837429193588e-06, "loss": 0.3361, "step": 2558 }, { "epoch": 4.300840336134454, "grad_norm": 0.09758044037161584, "learning_rate": 4.672795718186138e-06, "loss": 0.3416, "step": 2559 }, { "epoch": 4.302521008403361, "grad_norm": 0.08470010816452032, "learning_rate": 4.6508026604697775e-06, "loss": 0.3252, "step": 2560 }, { "epoch": 4.304201680672269, "grad_norm": 0.5859537195439442, "learning_rate": 4.6288582863337525e-06, "loss": 0.336, "step": 2561 }, { "epoch": 4.305882352941176, "grad_norm": 0.08635285864401622, "learning_rate": 4.606962626000267e-06, "loss": 0.3382, "step": 2562 }, { "epoch": 4.307563025210084, "grad_norm": 0.08795346069568773, "learning_rate": 4.585115709624442e-06, "loss": 0.3321, "step": 2563 }, { "epoch": 4.309243697478991, "grad_norm": 0.08621634458316951, "learning_rate": 4.563317567294254e-06, "loss": 0.3443, "step": 2564 }, { "epoch": 4.310924369747899, "grad_norm": 0.08255113326279563, "learning_rate": 4.541568229030535e-06, "loss": 0.3403, "step": 2565 }, { "epoch": 4.3126050420168065, "grad_norm": 0.09212932889924356, "learning_rate": 4.519867724786857e-06, "loss": 0.3383, "step": 2566 }, { "epoch": 4.314285714285714, "grad_norm": 0.08431697585391326, "learning_rate": 4.49821608444958e-06, "loss": 0.3348, "step": 2567 }, { "epoch": 4.315966386554622, "grad_norm": 0.08854121705129955, "learning_rate": 4.4766133378377495e-06, "loss": 0.3369, "step": 2568 }, { "epoch": 4.317647058823529, "grad_norm": 0.08579682491041972, "learning_rate": 4.455059514703073e-06, "loss": 0.3381, "step": 2569 }, { "epoch": 4.319327731092437, "grad_norm": 0.08679072653431283, "learning_rate": 4.43355464472988e-06, "loss": 0.3319, "step": 2570 }, { "epoch": 4.321008403361344, "grad_norm": 0.08323913471703785, "learning_rate": 4.412098757535082e-06, "loss": 0.3249, "step": 2571 }, { "epoch": 4.322689075630252, "grad_norm": 0.08757442100793895, "learning_rate": 4.390691882668128e-06, "loss": 0.3391, "step": 2572 }, { "epoch": 4.324369747899159, "grad_norm": 0.08535605937339462, "learning_rate": 4.369334049610987e-06, "loss": 0.3301, "step": 2573 }, { "epoch": 4.326050420168067, "grad_norm": 0.08939367182429969, "learning_rate": 4.3480252877780375e-06, "loss": 0.3479, "step": 2574 }, { "epoch": 4.3277310924369745, "grad_norm": 0.08040336201963015, "learning_rate": 4.326765626516136e-06, "loss": 0.3397, "step": 2575 }, { "epoch": 4.329411764705882, "grad_norm": 0.08128584580522208, "learning_rate": 4.305555095104477e-06, "loss": 0.3353, "step": 2576 }, { "epoch": 4.33109243697479, "grad_norm": 0.08236680607518185, "learning_rate": 4.284393722754607e-06, "loss": 0.3279, "step": 2577 }, { "epoch": 4.332773109243697, "grad_norm": 0.08506362068938433, "learning_rate": 4.263281538610362e-06, "loss": 0.3401, "step": 2578 }, { "epoch": 4.334453781512605, "grad_norm": 0.08066383799487215, "learning_rate": 4.242218571747842e-06, "loss": 0.3312, "step": 2579 }, { "epoch": 4.336134453781512, "grad_norm": 0.08653729353291971, "learning_rate": 4.2212048511753556e-06, "loss": 0.3408, "step": 2580 }, { "epoch": 4.33781512605042, "grad_norm": 0.08304747753500683, "learning_rate": 4.200240405833418e-06, "loss": 0.3249, "step": 2581 }, { "epoch": 4.339495798319327, "grad_norm": 0.08341621463530578, "learning_rate": 4.179325264594622e-06, "loss": 0.3438, "step": 2582 }, { "epoch": 4.341176470588235, "grad_norm": 0.08003613338328289, "learning_rate": 4.158459456263724e-06, "loss": 0.3306, "step": 2583 }, { "epoch": 4.3428571428571425, "grad_norm": 0.08372783665058042, "learning_rate": 4.137643009577495e-06, "loss": 0.3277, "step": 2584 }, { "epoch": 4.34453781512605, "grad_norm": 0.08469493989820803, "learning_rate": 4.116875953204731e-06, "loss": 0.346, "step": 2585 }, { "epoch": 4.346218487394958, "grad_norm": 0.0816087615748333, "learning_rate": 4.096158315746235e-06, "loss": 0.3281, "step": 2586 }, { "epoch": 4.347899159663865, "grad_norm": 0.0844165217384175, "learning_rate": 4.075490125734703e-06, "loss": 0.3397, "step": 2587 }, { "epoch": 4.349579831932773, "grad_norm": 0.0915850355522151, "learning_rate": 4.054871411634773e-06, "loss": 0.3444, "step": 2588 }, { "epoch": 4.35126050420168, "grad_norm": 0.08402842200523021, "learning_rate": 4.034302201842919e-06, "loss": 0.33, "step": 2589 }, { "epoch": 4.352941176470588, "grad_norm": 0.08989466105427354, "learning_rate": 4.013782524687444e-06, "loss": 0.3354, "step": 2590 }, { "epoch": 4.354621848739495, "grad_norm": 0.08565544184307342, "learning_rate": 3.9933124084284315e-06, "loss": 0.3317, "step": 2591 }, { "epoch": 4.356302521008403, "grad_norm": 0.09147101812898967, "learning_rate": 3.972891881257725e-06, "loss": 0.3331, "step": 2592 }, { "epoch": 4.3579831932773105, "grad_norm": 0.08350148899356287, "learning_rate": 3.952520971298839e-06, "loss": 0.3264, "step": 2593 }, { "epoch": 4.359663865546218, "grad_norm": 0.07924330265632717, "learning_rate": 3.932199706606987e-06, "loss": 0.3385, "step": 2594 }, { "epoch": 4.361344537815126, "grad_norm": 0.08428435391764419, "learning_rate": 3.911928115168997e-06, "loss": 0.3314, "step": 2595 }, { "epoch": 4.363025210084033, "grad_norm": 0.08412738589229395, "learning_rate": 3.891706224903287e-06, "loss": 0.3421, "step": 2596 }, { "epoch": 4.364705882352941, "grad_norm": 0.08352770886736458, "learning_rate": 3.871534063659819e-06, "loss": 0.3267, "step": 2597 }, { "epoch": 4.366386554621848, "grad_norm": 0.07917224956893648, "learning_rate": 3.851411659220077e-06, "loss": 0.3419, "step": 2598 }, { "epoch": 4.368067226890757, "grad_norm": 0.08415678392313791, "learning_rate": 3.831339039297009e-06, "loss": 0.3437, "step": 2599 }, { "epoch": 4.369747899159664, "grad_norm": 0.08113787590695479, "learning_rate": 3.8113162315350207e-06, "loss": 0.3323, "step": 2600 }, { "epoch": 4.371428571428572, "grad_norm": 0.0825472413969988, "learning_rate": 3.791343263509881e-06, "loss": 0.3325, "step": 2601 }, { "epoch": 4.373109243697479, "grad_norm": 0.07838381366339636, "learning_rate": 3.7714201627287517e-06, "loss": 0.3313, "step": 2602 }, { "epoch": 4.374789915966387, "grad_norm": 0.0804878574417966, "learning_rate": 3.751546956630101e-06, "loss": 0.3405, "step": 2603 }, { "epoch": 4.376470588235295, "grad_norm": 0.07703841528836143, "learning_rate": 3.7317236725836803e-06, "loss": 0.3338, "step": 2604 }, { "epoch": 4.378151260504202, "grad_norm": 0.08274966704676819, "learning_rate": 3.7119503378904865e-06, "loss": 0.3325, "step": 2605 }, { "epoch": 4.37983193277311, "grad_norm": 0.08979732227995174, "learning_rate": 3.6922269797827314e-06, "loss": 0.3309, "step": 2606 }, { "epoch": 4.381512605042017, "grad_norm": 0.08268096338407861, "learning_rate": 3.672553625423785e-06, "loss": 0.3343, "step": 2607 }, { "epoch": 4.383193277310925, "grad_norm": 0.07923758686245207, "learning_rate": 3.6529303019081908e-06, "loss": 0.3365, "step": 2608 }, { "epoch": 4.384873949579832, "grad_norm": 0.07458565119036356, "learning_rate": 3.633357036261522e-06, "loss": 0.3343, "step": 2609 }, { "epoch": 4.38655462184874, "grad_norm": 0.08084153454200493, "learning_rate": 3.6138338554404784e-06, "loss": 0.3303, "step": 2610 }, { "epoch": 4.3882352941176475, "grad_norm": 0.07842115962461844, "learning_rate": 3.594360786332738e-06, "loss": 0.3362, "step": 2611 }, { "epoch": 4.389915966386555, "grad_norm": 0.08575885412455382, "learning_rate": 3.574937855756977e-06, "loss": 0.3375, "step": 2612 }, { "epoch": 4.391596638655463, "grad_norm": 0.07884149703780789, "learning_rate": 3.555565090462829e-06, "loss": 0.34, "step": 2613 }, { "epoch": 4.39327731092437, "grad_norm": 0.0766069321699475, "learning_rate": 3.5362425171308277e-06, "loss": 0.3326, "step": 2614 }, { "epoch": 4.394957983193278, "grad_norm": 0.07725597858441346, "learning_rate": 3.5169701623723793e-06, "loss": 0.3338, "step": 2615 }, { "epoch": 4.396638655462185, "grad_norm": 0.07857763110922689, "learning_rate": 3.4977480527297505e-06, "loss": 0.3396, "step": 2616 }, { "epoch": 4.398319327731093, "grad_norm": 0.08316748333218513, "learning_rate": 3.4785762146759682e-06, "loss": 0.3425, "step": 2617 }, { "epoch": 4.4, "grad_norm": 0.07826611623610961, "learning_rate": 3.459454674614868e-06, "loss": 0.3384, "step": 2618 }, { "epoch": 4.401680672268908, "grad_norm": 0.0815860381793515, "learning_rate": 3.4403834588809935e-06, "loss": 0.3312, "step": 2619 }, { "epoch": 4.4033613445378155, "grad_norm": 0.0760351845138154, "learning_rate": 3.4213625937395658e-06, "loss": 0.3411, "step": 2620 }, { "epoch": 4.405042016806723, "grad_norm": 0.08692877458443105, "learning_rate": 3.4023921053865095e-06, "loss": 0.3318, "step": 2621 }, { "epoch": 4.406722689075631, "grad_norm": 0.07825336813804905, "learning_rate": 3.3834720199482997e-06, "loss": 0.333, "step": 2622 }, { "epoch": 4.408403361344538, "grad_norm": 0.07921812456231384, "learning_rate": 3.3646023634820657e-06, "loss": 0.3393, "step": 2623 }, { "epoch": 4.410084033613446, "grad_norm": 0.0858330675435136, "learning_rate": 3.3457831619754423e-06, "loss": 0.3334, "step": 2624 }, { "epoch": 4.411764705882353, "grad_norm": 0.08474414349691398, "learning_rate": 3.3270144413465854e-06, "loss": 0.3319, "step": 2625 }, { "epoch": 4.413445378151261, "grad_norm": 0.08368322661837499, "learning_rate": 3.30829622744413e-06, "loss": 0.3467, "step": 2626 }, { "epoch": 4.415126050420168, "grad_norm": 0.08009460091465903, "learning_rate": 3.2896285460471656e-06, "loss": 0.3384, "step": 2627 }, { "epoch": 4.416806722689076, "grad_norm": 0.08364278967679625, "learning_rate": 3.2710114228651533e-06, "loss": 0.3408, "step": 2628 }, { "epoch": 4.4184873949579835, "grad_norm": 0.07740916871334334, "learning_rate": 3.252444883537966e-06, "loss": 0.3277, "step": 2629 }, { "epoch": 4.420168067226891, "grad_norm": 0.07750158290620719, "learning_rate": 3.2339289536357808e-06, "loss": 0.3372, "step": 2630 }, { "epoch": 4.421848739495799, "grad_norm": 0.08119869113316686, "learning_rate": 3.2154636586590883e-06, "loss": 0.3337, "step": 2631 }, { "epoch": 4.423529411764706, "grad_norm": 0.08249805087152542, "learning_rate": 3.1970490240386433e-06, "loss": 0.3295, "step": 2632 }, { "epoch": 4.425210084033614, "grad_norm": 0.07879139625924311, "learning_rate": 3.178685075135417e-06, "loss": 0.3346, "step": 2633 }, { "epoch": 4.426890756302521, "grad_norm": 0.0771677827668004, "learning_rate": 3.160371837240592e-06, "loss": 0.3375, "step": 2634 }, { "epoch": 4.428571428571429, "grad_norm": 0.07888624767107004, "learning_rate": 3.142109335575514e-06, "loss": 0.3355, "step": 2635 }, { "epoch": 4.430252100840336, "grad_norm": 0.0785123298310197, "learning_rate": 3.123897595291623e-06, "loss": 0.336, "step": 2636 }, { "epoch": 4.431932773109244, "grad_norm": 0.07672849604133891, "learning_rate": 3.1057366414704917e-06, "loss": 0.331, "step": 2637 }, { "epoch": 4.4336134453781515, "grad_norm": 0.07634154974104455, "learning_rate": 3.0876264991237125e-06, "loss": 0.336, "step": 2638 }, { "epoch": 4.435294117647059, "grad_norm": 0.08120329921982936, "learning_rate": 3.0695671931929263e-06, "loss": 0.3359, "step": 2639 }, { "epoch": 4.436974789915967, "grad_norm": 0.08169624553707755, "learning_rate": 3.0515587485497434e-06, "loss": 0.3313, "step": 2640 }, { "epoch": 4.438655462184874, "grad_norm": 0.07721357462290462, "learning_rate": 3.033601189995734e-06, "loss": 0.3233, "step": 2641 }, { "epoch": 4.440336134453782, "grad_norm": 0.08130330078124431, "learning_rate": 3.0156945422623774e-06, "loss": 0.3372, "step": 2642 }, { "epoch": 4.442016806722689, "grad_norm": 0.07582797414779661, "learning_rate": 2.997838830011066e-06, "loss": 0.3311, "step": 2643 }, { "epoch": 4.443697478991597, "grad_norm": 0.0825590477754094, "learning_rate": 2.9800340778330007e-06, "loss": 0.3468, "step": 2644 }, { "epoch": 4.445378151260504, "grad_norm": 0.07596697926639318, "learning_rate": 2.962280310249237e-06, "loss": 0.3433, "step": 2645 }, { "epoch": 4.447058823529412, "grad_norm": 0.07907067680444488, "learning_rate": 2.94457755171059e-06, "loss": 0.3308, "step": 2646 }, { "epoch": 4.4487394957983195, "grad_norm": 0.08055416595600622, "learning_rate": 2.9269258265976374e-06, "loss": 0.336, "step": 2647 }, { "epoch": 4.450420168067227, "grad_norm": 0.07953760164212507, "learning_rate": 2.909325159220666e-06, "loss": 0.3312, "step": 2648 }, { "epoch": 4.452100840336135, "grad_norm": 0.08560402458492748, "learning_rate": 2.891775573819646e-06, "loss": 0.3369, "step": 2649 }, { "epoch": 4.453781512605042, "grad_norm": 0.07769032865156635, "learning_rate": 2.8742770945641907e-06, "loss": 0.3409, "step": 2650 }, { "epoch": 4.45546218487395, "grad_norm": 0.07886376911723401, "learning_rate": 2.856829745553551e-06, "loss": 0.3441, "step": 2651 }, { "epoch": 4.457142857142857, "grad_norm": 0.07601417430302916, "learning_rate": 2.839433550816524e-06, "loss": 0.3311, "step": 2652 }, { "epoch": 4.458823529411765, "grad_norm": 0.07836748294935963, "learning_rate": 2.822088534311491e-06, "loss": 0.3334, "step": 2653 }, { "epoch": 4.460504201680672, "grad_norm": 0.07925208183682175, "learning_rate": 2.804794719926336e-06, "loss": 0.3372, "step": 2654 }, { "epoch": 4.46218487394958, "grad_norm": 0.07878219807114034, "learning_rate": 2.7875521314784147e-06, "loss": 0.3401, "step": 2655 }, { "epoch": 4.4638655462184875, "grad_norm": 0.07484917167525021, "learning_rate": 2.7703607927145594e-06, "loss": 0.326, "step": 2656 }, { "epoch": 4.465546218487395, "grad_norm": 0.07725215238605801, "learning_rate": 2.75322072731099e-06, "loss": 0.3378, "step": 2657 }, { "epoch": 4.467226890756303, "grad_norm": 0.07564165208302857, "learning_rate": 2.7361319588733406e-06, "loss": 0.345, "step": 2658 }, { "epoch": 4.46890756302521, "grad_norm": 0.07713979350464503, "learning_rate": 2.7190945109365797e-06, "loss": 0.3392, "step": 2659 }, { "epoch": 4.470588235294118, "grad_norm": 0.07442541893367137, "learning_rate": 2.7021084069650006e-06, "loss": 0.331, "step": 2660 }, { "epoch": 4.472268907563025, "grad_norm": 0.07664120912750066, "learning_rate": 2.685173670352179e-06, "loss": 0.3401, "step": 2661 }, { "epoch": 4.473949579831933, "grad_norm": 0.08099104227090868, "learning_rate": 2.6682903244209745e-06, "loss": 0.3498, "step": 2662 }, { "epoch": 4.47563025210084, "grad_norm": 0.5039609894064592, "learning_rate": 2.6514583924234227e-06, "loss": 0.3422, "step": 2663 }, { "epoch": 4.477310924369748, "grad_norm": 0.07513914374683217, "learning_rate": 2.6346778975407985e-06, "loss": 0.3317, "step": 2664 }, { "epoch": 4.4789915966386555, "grad_norm": 0.07931223406925907, "learning_rate": 2.6179488628835036e-06, "loss": 0.338, "step": 2665 }, { "epoch": 4.480672268907563, "grad_norm": 0.0795227607504858, "learning_rate": 2.6012713114910824e-06, "loss": 0.3271, "step": 2666 }, { "epoch": 4.482352941176471, "grad_norm": 0.07753718053052441, "learning_rate": 2.584645266332171e-06, "loss": 0.3348, "step": 2667 }, { "epoch": 4.484033613445378, "grad_norm": 0.0756536854744665, "learning_rate": 2.5680707503044787e-06, "loss": 0.3358, "step": 2668 }, { "epoch": 4.485714285714286, "grad_norm": 0.07767522864588389, "learning_rate": 2.5515477862347293e-06, "loss": 0.3281, "step": 2669 }, { "epoch": 4.487394957983193, "grad_norm": 0.07519670258447995, "learning_rate": 2.535076396878675e-06, "loss": 0.3351, "step": 2670 }, { "epoch": 4.489075630252101, "grad_norm": 0.07997055519478093, "learning_rate": 2.5186566049210105e-06, "loss": 0.3442, "step": 2671 }, { "epoch": 4.490756302521008, "grad_norm": 0.07604464554157288, "learning_rate": 2.5022884329753882e-06, "loss": 0.3296, "step": 2672 }, { "epoch": 4.492436974789916, "grad_norm": 0.07650165504506642, "learning_rate": 2.485971903584363e-06, "loss": 0.3426, "step": 2673 }, { "epoch": 4.4941176470588236, "grad_norm": 0.08031781291297499, "learning_rate": 2.469707039219369e-06, "loss": 0.3372, "step": 2674 }, { "epoch": 4.495798319327731, "grad_norm": 0.07988155755593232, "learning_rate": 2.453493862280678e-06, "loss": 0.3409, "step": 2675 }, { "epoch": 4.497478991596639, "grad_norm": 0.07774318508496271, "learning_rate": 2.437332395097389e-06, "loss": 0.3323, "step": 2676 }, { "epoch": 4.499159663865546, "grad_norm": 0.07649313330566568, "learning_rate": 2.4212226599273646e-06, "loss": 0.3355, "step": 2677 }, { "epoch": 4.500840336134454, "grad_norm": 0.07808584715062078, "learning_rate": 2.4051646789572614e-06, "loss": 0.3379, "step": 2678 }, { "epoch": 4.502521008403361, "grad_norm": 0.07759185806602914, "learning_rate": 2.389158474302402e-06, "loss": 0.3337, "step": 2679 }, { "epoch": 4.504201680672269, "grad_norm": 0.07982863319426065, "learning_rate": 2.373204068006856e-06, "loss": 0.3396, "step": 2680 }, { "epoch": 4.5058823529411764, "grad_norm": 0.07604056436985747, "learning_rate": 2.357301482043326e-06, "loss": 0.326, "step": 2681 }, { "epoch": 4.507563025210084, "grad_norm": 0.07807553874979914, "learning_rate": 2.3414507383131512e-06, "loss": 0.3316, "step": 2682 }, { "epoch": 4.509243697478992, "grad_norm": 0.07910497059114395, "learning_rate": 2.3256518586462786e-06, "loss": 0.3393, "step": 2683 }, { "epoch": 4.510924369747899, "grad_norm": 0.07538404831213012, "learning_rate": 2.3099048648012135e-06, "loss": 0.3258, "step": 2684 }, { "epoch": 4.512605042016807, "grad_norm": 0.07548508287773663, "learning_rate": 2.2942097784650174e-06, "loss": 0.3323, "step": 2685 }, { "epoch": 4.514285714285714, "grad_norm": 0.07594407200476615, "learning_rate": 2.2785666212532663e-06, "loss": 0.3423, "step": 2686 }, { "epoch": 4.515966386554622, "grad_norm": 0.08104622007546759, "learning_rate": 2.262975414709998e-06, "loss": 0.336, "step": 2687 }, { "epoch": 4.517647058823529, "grad_norm": 0.07838200848169276, "learning_rate": 2.2474361803077248e-06, "loss": 0.3305, "step": 2688 }, { "epoch": 4.519327731092437, "grad_norm": 0.07667155879764316, "learning_rate": 2.231948939447368e-06, "loss": 0.3258, "step": 2689 }, { "epoch": 4.5210084033613445, "grad_norm": 0.07431948868145574, "learning_rate": 2.216513713458248e-06, "loss": 0.3392, "step": 2690 }, { "epoch": 4.522689075630252, "grad_norm": 0.07911001559849108, "learning_rate": 2.2011305235980627e-06, "loss": 0.3411, "step": 2691 }, { "epoch": 4.52436974789916, "grad_norm": 0.07460881048280583, "learning_rate": 2.185799391052803e-06, "loss": 0.3354, "step": 2692 }, { "epoch": 4.526050420168067, "grad_norm": 0.07598746940624555, "learning_rate": 2.1705203369368145e-06, "loss": 0.3322, "step": 2693 }, { "epoch": 4.527731092436975, "grad_norm": 0.07561273399681986, "learning_rate": 2.155293382292696e-06, "loss": 0.3381, "step": 2694 }, { "epoch": 4.529411764705882, "grad_norm": 0.07405134867980455, "learning_rate": 2.1401185480912945e-06, "loss": 0.3306, "step": 2695 }, { "epoch": 4.53109243697479, "grad_norm": 0.07459344167080617, "learning_rate": 2.1249958552316664e-06, "loss": 0.341, "step": 2696 }, { "epoch": 4.532773109243697, "grad_norm": 0.07918843062645917, "learning_rate": 2.1099253245410887e-06, "loss": 0.337, "step": 2697 }, { "epoch": 4.534453781512605, "grad_norm": 0.07484283340130968, "learning_rate": 2.094906976774964e-06, "loss": 0.3345, "step": 2698 }, { "epoch": 4.5361344537815125, "grad_norm": 0.07825154471723364, "learning_rate": 2.0799408326168535e-06, "loss": 0.3301, "step": 2699 }, { "epoch": 4.53781512605042, "grad_norm": 0.07811953556885866, "learning_rate": 2.0650269126784072e-06, "loss": 0.3344, "step": 2700 }, { "epoch": 4.539495798319328, "grad_norm": 0.07556378495286234, "learning_rate": 2.0501652374993597e-06, "loss": 0.3298, "step": 2701 }, { "epoch": 4.541176470588235, "grad_norm": 0.0759914114822111, "learning_rate": 2.035355827547485e-06, "loss": 0.3378, "step": 2702 }, { "epoch": 4.542857142857143, "grad_norm": 0.07623013204088087, "learning_rate": 2.020598703218588e-06, "loss": 0.3263, "step": 2703 }, { "epoch": 4.54453781512605, "grad_norm": 0.07287500887664777, "learning_rate": 2.005893884836452e-06, "loss": 0.3392, "step": 2704 }, { "epoch": 4.546218487394958, "grad_norm": 0.07508794761886683, "learning_rate": 1.991241392652841e-06, "loss": 0.3395, "step": 2705 }, { "epoch": 4.547899159663865, "grad_norm": 0.07369231864156138, "learning_rate": 1.976641246847426e-06, "loss": 0.3308, "step": 2706 }, { "epoch": 4.549579831932773, "grad_norm": 0.07635397988508649, "learning_rate": 1.962093467527817e-06, "loss": 0.3299, "step": 2707 }, { "epoch": 4.5512605042016805, "grad_norm": 0.07269465515638357, "learning_rate": 1.9475980747294845e-06, "loss": 0.3346, "step": 2708 }, { "epoch": 4.552941176470588, "grad_norm": 0.07424948208571039, "learning_rate": 1.933155088415757e-06, "loss": 0.3401, "step": 2709 }, { "epoch": 4.554621848739496, "grad_norm": 0.07740481684549737, "learning_rate": 1.91876452847779e-06, "loss": 0.3381, "step": 2710 }, { "epoch": 4.556302521008403, "grad_norm": 0.07814827923731087, "learning_rate": 1.904426414734526e-06, "loss": 0.3354, "step": 2711 }, { "epoch": 4.557983193277311, "grad_norm": 0.07635932403317687, "learning_rate": 1.8901407669326887e-06, "loss": 0.3329, "step": 2712 }, { "epoch": 4.559663865546218, "grad_norm": 0.07668338391713518, "learning_rate": 1.8759076047467495e-06, "loss": 0.3345, "step": 2713 }, { "epoch": 4.561344537815126, "grad_norm": 0.0716382991459258, "learning_rate": 1.8617269477788723e-06, "loss": 0.3319, "step": 2714 }, { "epoch": 4.563025210084033, "grad_norm": 0.07911880111429734, "learning_rate": 1.8475988155589375e-06, "loss": 0.3389, "step": 2715 }, { "epoch": 4.564705882352941, "grad_norm": 0.0749609629970055, "learning_rate": 1.8335232275444692e-06, "loss": 0.33, "step": 2716 }, { "epoch": 4.5663865546218485, "grad_norm": 0.07498897711905675, "learning_rate": 1.8195002031206367e-06, "loss": 0.3341, "step": 2717 }, { "epoch": 4.568067226890756, "grad_norm": 0.07719800060491405, "learning_rate": 1.8055297616002088e-06, "loss": 0.3328, "step": 2718 }, { "epoch": 4.569747899159664, "grad_norm": 0.0726038661594311, "learning_rate": 1.7916119222235417e-06, "loss": 0.3396, "step": 2719 }, { "epoch": 4.571428571428571, "grad_norm": 0.07821632768089458, "learning_rate": 1.7777467041585427e-06, "loss": 0.3477, "step": 2720 }, { "epoch": 4.573109243697479, "grad_norm": 0.08224219412935112, "learning_rate": 1.7639341265006616e-06, "loss": 0.3358, "step": 2721 }, { "epoch": 4.574789915966386, "grad_norm": 0.07401610064454849, "learning_rate": 1.7501742082728279e-06, "loss": 0.3351, "step": 2722 }, { "epoch": 4.576470588235294, "grad_norm": 0.07939584699922261, "learning_rate": 1.7364669684254698e-06, "loss": 0.3367, "step": 2723 }, { "epoch": 4.578151260504201, "grad_norm": 0.07468351314630442, "learning_rate": 1.7228124258364509e-06, "loss": 0.3343, "step": 2724 }, { "epoch": 4.579831932773109, "grad_norm": 0.07753723040882435, "learning_rate": 1.7092105993110664e-06, "loss": 0.3387, "step": 2725 }, { "epoch": 4.5815126050420165, "grad_norm": 0.07300610319829243, "learning_rate": 1.6956615075820115e-06, "loss": 0.3243, "step": 2726 }, { "epoch": 4.583193277310924, "grad_norm": 0.07365172145072177, "learning_rate": 1.6821651693093376e-06, "loss": 0.3288, "step": 2727 }, { "epoch": 4.584873949579832, "grad_norm": 0.07709032550028795, "learning_rate": 1.6687216030804697e-06, "loss": 0.3301, "step": 2728 }, { "epoch": 4.586554621848739, "grad_norm": 0.07516127214498249, "learning_rate": 1.6553308274101354e-06, "loss": 0.3347, "step": 2729 }, { "epoch": 4.588235294117647, "grad_norm": 0.07295966777584915, "learning_rate": 1.6419928607403647e-06, "loss": 0.3316, "step": 2730 }, { "epoch": 4.589915966386554, "grad_norm": 0.07210889942975067, "learning_rate": 1.62870772144045e-06, "loss": 0.3405, "step": 2731 }, { "epoch": 4.591596638655462, "grad_norm": 0.07585980131389897, "learning_rate": 1.6154754278069517e-06, "loss": 0.3243, "step": 2732 }, { "epoch": 4.593277310924369, "grad_norm": 0.07767973532589138, "learning_rate": 1.6022959980636122e-06, "loss": 0.3341, "step": 2733 }, { "epoch": 4.594957983193277, "grad_norm": 0.07289138154907306, "learning_rate": 1.5891694503614053e-06, "loss": 0.3296, "step": 2734 }, { "epoch": 4.5966386554621845, "grad_norm": 0.07901618366164445, "learning_rate": 1.5760958027784568e-06, "loss": 0.3424, "step": 2735 }, { "epoch": 4.598319327731092, "grad_norm": 0.07570127253676485, "learning_rate": 1.5630750733200351e-06, "loss": 0.3355, "step": 2736 }, { "epoch": 4.6, "grad_norm": 0.07650937303818936, "learning_rate": 1.550107279918538e-06, "loss": 0.3334, "step": 2737 }, { "epoch": 4.601680672268907, "grad_norm": 0.07320664864988856, "learning_rate": 1.537192440433457e-06, "loss": 0.3347, "step": 2738 }, { "epoch": 4.603361344537815, "grad_norm": 0.07821331167839703, "learning_rate": 1.5243305726513468e-06, "loss": 0.3395, "step": 2739 }, { "epoch": 4.605042016806722, "grad_norm": 0.13741282384079773, "learning_rate": 1.5115216942858247e-06, "loss": 0.3305, "step": 2740 }, { "epoch": 4.60672268907563, "grad_norm": 0.07511165418208943, "learning_rate": 1.4987658229774994e-06, "loss": 0.3316, "step": 2741 }, { "epoch": 4.608403361344537, "grad_norm": 0.0789326710371173, "learning_rate": 1.486062976294016e-06, "loss": 0.3396, "step": 2742 }, { "epoch": 4.610084033613445, "grad_norm": 0.073452671730366, "learning_rate": 1.473413171729967e-06, "loss": 0.3325, "step": 2743 }, { "epoch": 4.6117647058823525, "grad_norm": 0.07166060798185817, "learning_rate": 1.4608164267069014e-06, "loss": 0.3329, "step": 2744 }, { "epoch": 4.61344537815126, "grad_norm": 0.07394700169171686, "learning_rate": 1.448272758573297e-06, "loss": 0.3363, "step": 2745 }, { "epoch": 4.615126050420168, "grad_norm": 0.07397937336747207, "learning_rate": 1.4357821846045217e-06, "loss": 0.3231, "step": 2746 }, { "epoch": 4.616806722689075, "grad_norm": 0.08121641256033343, "learning_rate": 1.423344722002833e-06, "loss": 0.3267, "step": 2747 }, { "epoch": 4.618487394957983, "grad_norm": 0.07754467147662653, "learning_rate": 1.410960387897351e-06, "loss": 0.3341, "step": 2748 }, { "epoch": 4.62016806722689, "grad_norm": 0.0750950164944087, "learning_rate": 1.3986291993439928e-06, "loss": 0.3288, "step": 2749 }, { "epoch": 4.621848739495798, "grad_norm": 0.0694735283637858, "learning_rate": 1.3863511733255196e-06, "loss": 0.3242, "step": 2750 }, { "epoch": 4.623529411764705, "grad_norm": 0.07495422131113165, "learning_rate": 1.3741263267514505e-06, "loss": 0.3388, "step": 2751 }, { "epoch": 4.625210084033613, "grad_norm": 0.07355157234662685, "learning_rate": 1.3619546764580815e-06, "loss": 0.3365, "step": 2752 }, { "epoch": 4.626890756302521, "grad_norm": 0.07342715687977053, "learning_rate": 1.3498362392084307e-06, "loss": 0.3345, "step": 2753 }, { "epoch": 4.628571428571428, "grad_norm": 0.0745500294631132, "learning_rate": 1.3377710316922455e-06, "loss": 0.3398, "step": 2754 }, { "epoch": 4.630252100840336, "grad_norm": 0.07912682576687503, "learning_rate": 1.3257590705259493e-06, "loss": 0.3308, "step": 2755 }, { "epoch": 4.631932773109243, "grad_norm": 0.0723565110760415, "learning_rate": 1.313800372252656e-06, "loss": 0.3302, "step": 2756 }, { "epoch": 4.633613445378151, "grad_norm": 0.07612766013891362, "learning_rate": 1.3018949533420933e-06, "loss": 0.3355, "step": 2757 }, { "epoch": 4.635294117647058, "grad_norm": 0.07444038241060053, "learning_rate": 1.2900428301906342e-06, "loss": 0.33, "step": 2758 }, { "epoch": 4.636974789915966, "grad_norm": 0.07432094076643861, "learning_rate": 1.278244019121253e-06, "loss": 0.3304, "step": 2759 }, { "epoch": 4.6386554621848735, "grad_norm": 0.0747369266784318, "learning_rate": 1.2664985363834847e-06, "loss": 0.3415, "step": 2760 }, { "epoch": 4.640336134453781, "grad_norm": 0.3767942067659613, "learning_rate": 1.2548063981534343e-06, "loss": 0.3346, "step": 2761 }, { "epoch": 4.6420168067226895, "grad_norm": 0.07511266329848278, "learning_rate": 1.2431676205337274e-06, "loss": 0.3339, "step": 2762 }, { "epoch": 4.643697478991597, "grad_norm": 0.08362047254670665, "learning_rate": 1.23158221955352e-06, "loss": 0.343, "step": 2763 }, { "epoch": 4.645378151260505, "grad_norm": 0.07546647191187704, "learning_rate": 1.2200502111684354e-06, "loss": 0.339, "step": 2764 }, { "epoch": 4.647058823529412, "grad_norm": 0.07426723974983658, "learning_rate": 1.2085716112605782e-06, "loss": 0.3292, "step": 2765 }, { "epoch": 4.64873949579832, "grad_norm": 0.07275746927479186, "learning_rate": 1.1971464356384898e-06, "loss": 0.3361, "step": 2766 }, { "epoch": 4.650420168067227, "grad_norm": 0.0745293640263058, "learning_rate": 1.1857747000371389e-06, "loss": 0.333, "step": 2767 }, { "epoch": 4.652100840336135, "grad_norm": 0.07365150097918186, "learning_rate": 1.1744564201178865e-06, "loss": 0.338, "step": 2768 }, { "epoch": 4.653781512605042, "grad_norm": 0.07371624933274945, "learning_rate": 1.1631916114684949e-06, "loss": 0.3305, "step": 2769 }, { "epoch": 4.65546218487395, "grad_norm": 0.07105909960168079, "learning_rate": 1.1519802896030562e-06, "loss": 0.323, "step": 2770 }, { "epoch": 4.6571428571428575, "grad_norm": 0.07217763394633166, "learning_rate": 1.140822469962024e-06, "loss": 0.3306, "step": 2771 }, { "epoch": 4.658823529411765, "grad_norm": 0.0779947752252101, "learning_rate": 1.1297181679121549e-06, "loss": 0.3313, "step": 2772 }, { "epoch": 4.660504201680673, "grad_norm": 0.07771486054426338, "learning_rate": 1.1186673987465002e-06, "loss": 0.3321, "step": 2773 }, { "epoch": 4.66218487394958, "grad_norm": 0.07516545407714731, "learning_rate": 1.1076701776843835e-06, "loss": 0.3368, "step": 2774 }, { "epoch": 4.663865546218488, "grad_norm": 0.07389998933568423, "learning_rate": 1.0967265198714006e-06, "loss": 0.3295, "step": 2775 }, { "epoch": 4.665546218487395, "grad_norm": 0.07852863684436098, "learning_rate": 1.0858364403793487e-06, "loss": 0.3378, "step": 2776 }, { "epoch": 4.667226890756303, "grad_norm": 0.07340532442160293, "learning_rate": 1.0749999542062529e-06, "loss": 0.335, "step": 2777 }, { "epoch": 4.66890756302521, "grad_norm": 0.07495405388664747, "learning_rate": 1.0642170762763303e-06, "loss": 0.3344, "step": 2778 }, { "epoch": 4.670588235294118, "grad_norm": 0.07334510340570795, "learning_rate": 1.0534878214399603e-06, "loss": 0.3329, "step": 2779 }, { "epoch": 4.6722689075630255, "grad_norm": 0.07596754886798267, "learning_rate": 1.0428122044736733e-06, "loss": 0.3391, "step": 2780 }, { "epoch": 4.673949579831933, "grad_norm": 0.07989818728762628, "learning_rate": 1.0321902400801352e-06, "loss": 0.3354, "step": 2781 }, { "epoch": 4.675630252100841, "grad_norm": 0.07334356775401943, "learning_rate": 1.0216219428881114e-06, "loss": 0.3351, "step": 2782 }, { "epoch": 4.677310924369748, "grad_norm": 0.07191187135540035, "learning_rate": 1.0111073274524608e-06, "loss": 0.3375, "step": 2783 }, { "epoch": 4.678991596638656, "grad_norm": 0.0749761156483234, "learning_rate": 1.0006464082541067e-06, "loss": 0.3321, "step": 2784 }, { "epoch": 4.680672268907563, "grad_norm": 0.07321826573221325, "learning_rate": 9.902391997000315e-07, "loss": 0.3315, "step": 2785 }, { "epoch": 4.682352941176471, "grad_norm": 0.07183807140532462, "learning_rate": 9.798857161232323e-07, "loss": 0.3362, "step": 2786 }, { "epoch": 4.684033613445378, "grad_norm": 0.07139690504621792, "learning_rate": 9.695859717827294e-07, "loss": 0.3331, "step": 2787 }, { "epoch": 4.685714285714286, "grad_norm": 0.07626141793352766, "learning_rate": 9.59339980863514e-07, "loss": 0.3435, "step": 2788 }, { "epoch": 4.6873949579831935, "grad_norm": 0.07216218322209839, "learning_rate": 9.491477574765606e-07, "loss": 0.3343, "step": 2789 }, { "epoch": 4.689075630252101, "grad_norm": 0.07055330177113761, "learning_rate": 9.390093156587965e-07, "loss": 0.3294, "step": 2790 }, { "epoch": 4.690756302521009, "grad_norm": 0.07396951976314821, "learning_rate": 9.289246693730746e-07, "loss": 0.3341, "step": 2791 }, { "epoch": 4.692436974789916, "grad_norm": 0.07609153066970092, "learning_rate": 9.18893832508152e-07, "loss": 0.3369, "step": 2792 }, { "epoch": 4.694117647058824, "grad_norm": 0.07544422890995602, "learning_rate": 9.089168188786935e-07, "loss": 0.3309, "step": 2793 }, { "epoch": 4.695798319327731, "grad_norm": 0.07471399789663853, "learning_rate": 8.989936422252232e-07, "loss": 0.3411, "step": 2794 }, { "epoch": 4.697478991596639, "grad_norm": 0.07155036809083479, "learning_rate": 8.891243162141383e-07, "loss": 0.3305, "step": 2795 }, { "epoch": 4.699159663865546, "grad_norm": 0.07512052513906325, "learning_rate": 8.793088544376504e-07, "loss": 0.3349, "step": 2796 }, { "epoch": 4.700840336134454, "grad_norm": 0.07361780742362171, "learning_rate": 8.695472704137997e-07, "loss": 0.338, "step": 2797 }, { "epoch": 4.7025210084033615, "grad_norm": 0.1264259614519206, "learning_rate": 8.598395775864277e-07, "loss": 0.3299, "step": 2798 }, { "epoch": 4.704201680672269, "grad_norm": 0.07620889956316024, "learning_rate": 8.501857893251463e-07, "loss": 0.3314, "step": 2799 }, { "epoch": 4.705882352941177, "grad_norm": 0.07390963183260883, "learning_rate": 8.405859189253429e-07, "loss": 0.3224, "step": 2800 }, { "epoch": 4.707563025210084, "grad_norm": 0.07344933391266796, "learning_rate": 8.310399796081259e-07, "loss": 0.3487, "step": 2801 }, { "epoch": 4.709243697478992, "grad_norm": 0.0729136715255949, "learning_rate": 8.215479845203611e-07, "loss": 0.3324, "step": 2802 }, { "epoch": 4.710924369747899, "grad_norm": 0.07265947112388588, "learning_rate": 8.121099467345872e-07, "loss": 0.3297, "step": 2803 }, { "epoch": 4.712605042016807, "grad_norm": 0.07031429420246972, "learning_rate": 8.027258792490556e-07, "loss": 0.3275, "step": 2804 }, { "epoch": 4.714285714285714, "grad_norm": 0.07227444889001385, "learning_rate": 7.933957949876769e-07, "loss": 0.3336, "step": 2805 }, { "epoch": 4.715966386554622, "grad_norm": 0.07335667114277507, "learning_rate": 7.841197068000262e-07, "loss": 0.3405, "step": 2806 }, { "epoch": 4.7176470588235295, "grad_norm": 0.07160183874840885, "learning_rate": 7.748976274612974e-07, "loss": 0.3279, "step": 2807 }, { "epoch": 4.719327731092437, "grad_norm": 0.07288764118956018, "learning_rate": 7.657295696723133e-07, "loss": 0.3363, "step": 2808 }, { "epoch": 4.721008403361345, "grad_norm": 0.07462963323594883, "learning_rate": 7.566155460594982e-07, "loss": 0.3363, "step": 2809 }, { "epoch": 4.722689075630252, "grad_norm": 0.06891215789599908, "learning_rate": 7.475555691748648e-07, "loss": 0.3284, "step": 2810 }, { "epoch": 4.72436974789916, "grad_norm": 0.07360977538873295, "learning_rate": 7.3854965149597e-07, "loss": 0.3351, "step": 2811 }, { "epoch": 4.726050420168067, "grad_norm": 0.07232202664014274, "learning_rate": 7.295978054259412e-07, "loss": 0.3397, "step": 2812 }, { "epoch": 4.727731092436975, "grad_norm": 0.07396237969843529, "learning_rate": 7.207000432934274e-07, "loss": 0.3375, "step": 2813 }, { "epoch": 4.729411764705882, "grad_norm": 0.07146176061548781, "learning_rate": 7.118563773525911e-07, "loss": 0.3382, "step": 2814 }, { "epoch": 4.73109243697479, "grad_norm": 0.0747046035776833, "learning_rate": 7.030668197831025e-07, "loss": 0.3379, "step": 2815 }, { "epoch": 4.7327731092436975, "grad_norm": 0.07367328250203871, "learning_rate": 6.943313826901055e-07, "loss": 0.3299, "step": 2816 }, { "epoch": 4.734453781512605, "grad_norm": 0.07122739730860346, "learning_rate": 6.856500781041986e-07, "loss": 0.3271, "step": 2817 }, { "epoch": 4.736134453781513, "grad_norm": 0.0695089635526133, "learning_rate": 6.770229179814492e-07, "loss": 0.3278, "step": 2818 }, { "epoch": 4.73781512605042, "grad_norm": 0.0708439169459932, "learning_rate": 6.684499142033352e-07, "loss": 0.3281, "step": 2819 }, { "epoch": 4.739495798319328, "grad_norm": 0.0728466676723447, "learning_rate": 6.599310785767676e-07, "loss": 0.3292, "step": 2820 }, { "epoch": 4.741176470588235, "grad_norm": 0.07088951131599884, "learning_rate": 6.514664228340461e-07, "loss": 0.3361, "step": 2821 }, { "epoch": 4.742857142857143, "grad_norm": 0.13327375028507815, "learning_rate": 6.4305595863285e-07, "loss": 0.3278, "step": 2822 }, { "epoch": 4.74453781512605, "grad_norm": 0.07355115185959447, "learning_rate": 6.346996975562292e-07, "loss": 0.3409, "step": 2823 }, { "epoch": 4.746218487394958, "grad_norm": 0.07451992408768346, "learning_rate": 6.263976511125869e-07, "loss": 0.3321, "step": 2824 }, { "epoch": 4.7478991596638656, "grad_norm": 0.07710830673432931, "learning_rate": 6.181498307356482e-07, "loss": 0.3362, "step": 2825 }, { "epoch": 4.749579831932773, "grad_norm": 0.07357955683330006, "learning_rate": 6.099562477844867e-07, "loss": 0.338, "step": 2826 }, { "epoch": 4.751260504201681, "grad_norm": 0.07282273865638589, "learning_rate": 6.01816913543436e-07, "loss": 0.3353, "step": 2827 }, { "epoch": 4.752941176470588, "grad_norm": 0.0723366401535732, "learning_rate": 5.937318392221602e-07, "loss": 0.3266, "step": 2828 }, { "epoch": 4.754621848739496, "grad_norm": 0.07208581667169485, "learning_rate": 5.857010359555615e-07, "loss": 0.3324, "step": 2829 }, { "epoch": 4.756302521008403, "grad_norm": 0.07590148233624472, "learning_rate": 5.777245148038236e-07, "loss": 0.3355, "step": 2830 }, { "epoch": 4.757983193277311, "grad_norm": 0.0756585860885383, "learning_rate": 5.698022867523545e-07, "loss": 0.332, "step": 2831 }, { "epoch": 4.7596638655462185, "grad_norm": 0.0762257840278863, "learning_rate": 5.619343627118001e-07, "loss": 0.3464, "step": 2832 }, { "epoch": 4.761344537815126, "grad_norm": 0.07065225927268676, "learning_rate": 5.541207535180082e-07, "loss": 0.3387, "step": 2833 }, { "epoch": 4.763025210084034, "grad_norm": 0.07315195828408202, "learning_rate": 5.46361469932033e-07, "loss": 0.3311, "step": 2834 }, { "epoch": 4.764705882352941, "grad_norm": 0.07110842526763334, "learning_rate": 5.386565226401086e-07, "loss": 0.3319, "step": 2835 }, { "epoch": 4.766386554621849, "grad_norm": 0.07774934250991844, "learning_rate": 5.310059222536268e-07, "loss": 0.3301, "step": 2836 }, { "epoch": 4.768067226890756, "grad_norm": 0.0727501513784572, "learning_rate": 5.234096793091458e-07, "loss": 0.3456, "step": 2837 }, { "epoch": 4.769747899159664, "grad_norm": 0.06949308393272172, "learning_rate": 5.158678042683463e-07, "loss": 0.334, "step": 2838 }, { "epoch": 4.771428571428571, "grad_norm": 0.07259965706978978, "learning_rate": 5.083803075180482e-07, "loss": 0.3393, "step": 2839 }, { "epoch": 4.773109243697479, "grad_norm": 0.07257945508665885, "learning_rate": 5.00947199370172e-07, "loss": 0.3281, "step": 2840 }, { "epoch": 4.7747899159663865, "grad_norm": 0.07210437957510765, "learning_rate": 4.935684900617288e-07, "loss": 0.3409, "step": 2841 }, { "epoch": 4.776470588235294, "grad_norm": 0.07358781862218523, "learning_rate": 4.862441897548209e-07, "loss": 0.3397, "step": 2842 }, { "epoch": 4.778151260504202, "grad_norm": 0.5613962510627222, "learning_rate": 4.789743085366106e-07, "loss": 0.3374, "step": 2843 }, { "epoch": 4.779831932773109, "grad_norm": 0.07418651178700938, "learning_rate": 4.7175885641931097e-07, "loss": 0.3288, "step": 2844 }, { "epoch": 4.781512605042017, "grad_norm": 0.07463560772623774, "learning_rate": 4.6459784334019096e-07, "loss": 0.3304, "step": 2845 }, { "epoch": 4.783193277310924, "grad_norm": 0.07030062206602385, "learning_rate": 4.5749127916151715e-07, "loss": 0.3318, "step": 2846 }, { "epoch": 4.784873949579832, "grad_norm": 0.07118173965559571, "learning_rate": 4.504391736705982e-07, "loss": 0.3369, "step": 2847 }, { "epoch": 4.786554621848739, "grad_norm": 0.07345732748589241, "learning_rate": 4.434415365797184e-07, "loss": 0.3386, "step": 2848 }, { "epoch": 4.788235294117647, "grad_norm": 0.06928060152286751, "learning_rate": 4.3649837752615555e-07, "loss": 0.3338, "step": 2849 }, { "epoch": 4.7899159663865545, "grad_norm": 0.07219461060606094, "learning_rate": 4.2960970607215825e-07, "loss": 0.333, "step": 2850 }, { "epoch": 4.791596638655462, "grad_norm": 0.07106391683002627, "learning_rate": 4.22775531704942e-07, "loss": 0.3328, "step": 2851 }, { "epoch": 4.79327731092437, "grad_norm": 0.0729694023395653, "learning_rate": 4.15995863836649e-07, "loss": 0.3323, "step": 2852 }, { "epoch": 4.794957983193277, "grad_norm": 0.07267849829140234, "learning_rate": 4.0927071180437483e-07, "loss": 0.336, "step": 2853 }, { "epoch": 4.796638655462185, "grad_norm": 0.07017186302182755, "learning_rate": 4.0260008487011947e-07, "loss": 0.3339, "step": 2854 }, { "epoch": 4.798319327731092, "grad_norm": 0.07118675838465849, "learning_rate": 3.959839922208053e-07, "loss": 0.3369, "step": 2855 }, { "epoch": 4.8, "grad_norm": 0.07355140572640384, "learning_rate": 3.894224429682281e-07, "loss": 0.3374, "step": 2856 }, { "epoch": 4.801680672268907, "grad_norm": 0.0753591852009976, "learning_rate": 3.829154461490836e-07, "loss": 0.333, "step": 2857 }, { "epoch": 4.803361344537815, "grad_norm": 0.06965932415792905, "learning_rate": 3.7646301072492784e-07, "loss": 0.3265, "step": 2858 }, { "epoch": 4.8050420168067225, "grad_norm": 0.07070648656126338, "learning_rate": 3.7006514558217686e-07, "loss": 0.3293, "step": 2859 }, { "epoch": 4.80672268907563, "grad_norm": 0.07570786668652481, "learning_rate": 3.637218595320935e-07, "loss": 0.3456, "step": 2860 }, { "epoch": 4.808403361344538, "grad_norm": 0.07109329231462885, "learning_rate": 3.5743316131076953e-07, "loss": 0.3408, "step": 2861 }, { "epoch": 4.810084033613445, "grad_norm": 0.07336274344637005, "learning_rate": 3.511990595791126e-07, "loss": 0.3291, "step": 2862 }, { "epoch": 4.811764705882353, "grad_norm": 0.0813518002101307, "learning_rate": 3.450195629228548e-07, "loss": 0.334, "step": 2863 }, { "epoch": 4.81344537815126, "grad_norm": 0.07975250233743121, "learning_rate": 3.388946798525128e-07, "loss": 0.3338, "step": 2864 }, { "epoch": 4.815126050420168, "grad_norm": 0.07397312640913922, "learning_rate": 3.32824418803388e-07, "loss": 0.3303, "step": 2865 }, { "epoch": 4.816806722689075, "grad_norm": 0.0696096298252683, "learning_rate": 3.268087881355575e-07, "loss": 0.3362, "step": 2866 }, { "epoch": 4.818487394957983, "grad_norm": 0.06957705434673465, "learning_rate": 3.208477961338652e-07, "loss": 0.3339, "step": 2867 }, { "epoch": 4.8201680672268905, "grad_norm": 0.07211105352952532, "learning_rate": 3.149414510078952e-07, "loss": 0.3347, "step": 2868 }, { "epoch": 4.821848739495798, "grad_norm": 0.07134549433108683, "learning_rate": 3.0908976089198514e-07, "loss": 0.3344, "step": 2869 }, { "epoch": 4.823529411764706, "grad_norm": 0.5617024497655523, "learning_rate": 3.0329273384518634e-07, "loss": 0.3495, "step": 2870 }, { "epoch": 4.825210084033613, "grad_norm": 0.07389217447742008, "learning_rate": 2.975503778512767e-07, "loss": 0.3356, "step": 2871 }, { "epoch": 4.826890756302521, "grad_norm": 0.07234259405440109, "learning_rate": 2.9186270081873916e-07, "loss": 0.3374, "step": 2872 }, { "epoch": 4.828571428571428, "grad_norm": 0.06954598929900785, "learning_rate": 2.862297105807432e-07, "loss": 0.3233, "step": 2873 }, { "epoch": 4.830252100840336, "grad_norm": 0.07417026432358358, "learning_rate": 2.806514148951589e-07, "loss": 0.3427, "step": 2874 }, { "epoch": 4.831932773109243, "grad_norm": 0.07211075614259456, "learning_rate": 2.7512782144451187e-07, "loss": 0.3273, "step": 2875 }, { "epoch": 4.833613445378151, "grad_norm": 0.07704119732264453, "learning_rate": 2.6965893783599706e-07, "loss": 0.3376, "step": 2876 }, { "epoch": 4.8352941176470585, "grad_norm": 0.06973632088419854, "learning_rate": 2.64244771601474e-07, "loss": 0.3373, "step": 2877 }, { "epoch": 4.836974789915966, "grad_norm": 0.06878660606039466, "learning_rate": 2.5888533019742703e-07, "loss": 0.326, "step": 2878 }, { "epoch": 4.838655462184874, "grad_norm": 0.07216881608029187, "learning_rate": 2.535806210049785e-07, "loss": 0.3391, "step": 2879 }, { "epoch": 4.840336134453781, "grad_norm": 0.07053854730236686, "learning_rate": 2.483306513298844e-07, "loss": 0.3328, "step": 2880 }, { "epoch": 4.842016806722689, "grad_norm": 0.07100611083623684, "learning_rate": 2.4313542840248563e-07, "loss": 0.339, "step": 2881 }, { "epoch": 4.843697478991596, "grad_norm": 0.07208029952706169, "learning_rate": 2.379949593777564e-07, "loss": 0.3341, "step": 2882 }, { "epoch": 4.845378151260504, "grad_norm": 0.07335677046755623, "learning_rate": 2.3290925133523822e-07, "loss": 0.3347, "step": 2883 }, { "epoch": 4.847058823529411, "grad_norm": 0.07155956826017562, "learning_rate": 2.278783112790661e-07, "loss": 0.3459, "step": 2884 }, { "epoch": 4.848739495798319, "grad_norm": 0.06906730214483309, "learning_rate": 2.229021461379466e-07, "loss": 0.3353, "step": 2885 }, { "epoch": 4.8504201680672265, "grad_norm": 0.07084791802058345, "learning_rate": 2.1798076276514867e-07, "loss": 0.3314, "step": 2886 }, { "epoch": 4.852100840336134, "grad_norm": 0.07000007626693382, "learning_rate": 2.1311416793848628e-07, "loss": 0.3347, "step": 2887 }, { "epoch": 4.853781512605042, "grad_norm": 0.06986656959378969, "learning_rate": 2.0830236836034022e-07, "loss": 0.3323, "step": 2888 }, { "epoch": 4.855462184873949, "grad_norm": 0.0699902471385287, "learning_rate": 2.035453706576007e-07, "loss": 0.3306, "step": 2889 }, { "epoch": 4.857142857142857, "grad_norm": 0.07079274889346905, "learning_rate": 1.988431813816938e-07, "loss": 0.3353, "step": 2890 }, { "epoch": 4.858823529411764, "grad_norm": 0.07050301738740979, "learning_rate": 1.9419580700857254e-07, "loss": 0.3406, "step": 2891 }, { "epoch": 4.860504201680673, "grad_norm": 0.07128532207818952, "learning_rate": 1.8960325393867717e-07, "loss": 0.3402, "step": 2892 }, { "epoch": 4.86218487394958, "grad_norm": 0.07074964843688175, "learning_rate": 1.8506552849696602e-07, "loss": 0.3314, "step": 2893 }, { "epoch": 4.863865546218488, "grad_norm": 0.07167164460166978, "learning_rate": 1.8058263693288003e-07, "loss": 0.3284, "step": 2894 }, { "epoch": 4.865546218487395, "grad_norm": 0.0698812016869339, "learning_rate": 1.7615458542033391e-07, "loss": 0.3301, "step": 2895 }, { "epoch": 4.867226890756303, "grad_norm": 0.07084942220741997, "learning_rate": 1.7178138005773392e-07, "loss": 0.3329, "step": 2896 }, { "epoch": 4.8689075630252105, "grad_norm": 0.07016604256795736, "learning_rate": 1.6746302686793336e-07, "loss": 0.3419, "step": 2897 }, { "epoch": 4.870588235294118, "grad_norm": 0.06879324627648252, "learning_rate": 1.6319953179825044e-07, "loss": 0.3359, "step": 2898 }, { "epoch": 4.872268907563026, "grad_norm": 0.0697799102007613, "learning_rate": 1.5899090072045486e-07, "loss": 0.333, "step": 2899 }, { "epoch": 4.873949579831933, "grad_norm": 0.07162324531777793, "learning_rate": 1.548371394307502e-07, "loss": 0.3355, "step": 2900 }, { "epoch": 4.875630252100841, "grad_norm": 0.07423111900098947, "learning_rate": 1.507382536497781e-07, "loss": 0.3474, "step": 2901 }, { "epoch": 4.877310924369748, "grad_norm": 0.06806030007406456, "learning_rate": 1.4669424902259644e-07, "loss": 0.3281, "step": 2902 }, { "epoch": 4.878991596638656, "grad_norm": 0.06909802213717489, "learning_rate": 1.4270513111868335e-07, "loss": 0.331, "step": 2903 }, { "epoch": 4.8806722689075634, "grad_norm": 0.07238791076886332, "learning_rate": 1.3877090543193306e-07, "loss": 0.3389, "step": 2904 }, { "epoch": 4.882352941176471, "grad_norm": 0.07100227310201052, "learning_rate": 1.348915773806292e-07, "loss": 0.3435, "step": 2905 }, { "epoch": 4.884033613445379, "grad_norm": 0.07346227631046029, "learning_rate": 1.3106715230745804e-07, "loss": 0.3342, "step": 2906 }, { "epoch": 4.885714285714286, "grad_norm": 0.0706383227551099, "learning_rate": 1.2729763547949525e-07, "loss": 0.335, "step": 2907 }, { "epoch": 4.887394957983194, "grad_norm": 0.0703813994593793, "learning_rate": 1.2358303208818369e-07, "loss": 0.3347, "step": 2908 }, { "epoch": 4.889075630252101, "grad_norm": 0.06996048721402706, "learning_rate": 1.1992334724934662e-07, "loss": 0.3336, "step": 2909 }, { "epoch": 4.890756302521009, "grad_norm": 0.07039648718518178, "learning_rate": 1.1631858600317014e-07, "loss": 0.3385, "step": 2910 }, { "epoch": 4.892436974789916, "grad_norm": 0.06889792789843621, "learning_rate": 1.1276875331419856e-07, "loss": 0.3369, "step": 2911 }, { "epoch": 4.894117647058824, "grad_norm": 0.07082088423400383, "learning_rate": 1.0927385407133451e-07, "loss": 0.3317, "step": 2912 }, { "epoch": 4.8957983193277315, "grad_norm": 0.07335758672557095, "learning_rate": 1.0583389308781223e-07, "loss": 0.3433, "step": 2913 }, { "epoch": 4.897478991596639, "grad_norm": 0.19611676313888196, "learning_rate": 1.0244887510121094e-07, "loss": 0.3464, "step": 2914 }, { "epoch": 4.899159663865547, "grad_norm": 0.07023177196140648, "learning_rate": 9.911880477344593e-08, "loss": 0.3283, "step": 2915 }, { "epoch": 4.900840336134454, "grad_norm": 0.07119529024836718, "learning_rate": 9.584368669075084e-08, "loss": 0.324, "step": 2916 }, { "epoch": 4.902521008403362, "grad_norm": 0.0731684682768762, "learning_rate": 9.262352536367759e-08, "loss": 0.3369, "step": 2917 }, { "epoch": 4.904201680672269, "grad_norm": 0.07009144024906018, "learning_rate": 8.945832522709641e-08, "loss": 0.3367, "step": 2918 }, { "epoch": 4.905882352941177, "grad_norm": 0.07474373470405969, "learning_rate": 8.634809064017813e-08, "loss": 0.3429, "step": 2919 }, { "epoch": 4.907563025210084, "grad_norm": 0.07140755535537448, "learning_rate": 8.329282588639853e-08, "loss": 0.3243, "step": 2920 }, { "epoch": 4.909243697478992, "grad_norm": 0.07260304201787195, "learning_rate": 8.029253517352509e-08, "loss": 0.3323, "step": 2921 }, { "epoch": 4.9109243697478995, "grad_norm": 0.06971218268963965, "learning_rate": 7.734722263360805e-08, "loss": 0.3336, "step": 2922 }, { "epoch": 4.912605042016807, "grad_norm": 0.07224467148828273, "learning_rate": 7.445689232299825e-08, "loss": 0.3367, "step": 2923 }, { "epoch": 4.914285714285715, "grad_norm": 0.07207179334721515, "learning_rate": 7.162154822230705e-08, "loss": 0.3322, "step": 2924 }, { "epoch": 4.915966386554622, "grad_norm": 0.07083510340288979, "learning_rate": 6.884119423642421e-08, "loss": 0.3346, "step": 2925 }, { "epoch": 4.91764705882353, "grad_norm": 0.07101727759256929, "learning_rate": 6.611583419450895e-08, "loss": 0.34, "step": 2926 }, { "epoch": 4.919327731092437, "grad_norm": 1.1175723343292177, "learning_rate": 6.344547184997219e-08, "loss": 0.3663, "step": 2927 }, { "epoch": 4.921008403361345, "grad_norm": 0.06935301210149025, "learning_rate": 6.083011088049428e-08, "loss": 0.3424, "step": 2928 }, { "epoch": 4.922689075630252, "grad_norm": 0.06907053229510791, "learning_rate": 5.8269754887994024e-08, "loss": 0.3272, "step": 2929 }, { "epoch": 4.92436974789916, "grad_norm": 0.4705419487426645, "learning_rate": 5.5764407398641865e-08, "loss": 0.3443, "step": 2930 }, { "epoch": 4.9260504201680675, "grad_norm": 0.06950005226117378, "learning_rate": 5.331407186285553e-08, "loss": 0.3271, "step": 2931 }, { "epoch": 4.927731092436975, "grad_norm": 0.07064313100639817, "learning_rate": 5.091875165527782e-08, "loss": 0.3279, "step": 2932 }, { "epoch": 4.929411764705883, "grad_norm": 0.0710899402295265, "learning_rate": 4.85784500747899e-08, "loss": 0.3328, "step": 2933 }, { "epoch": 4.93109243697479, "grad_norm": 0.06953443630439608, "learning_rate": 4.6293170344497985e-08, "loss": 0.3382, "step": 2934 }, { "epoch": 4.932773109243698, "grad_norm": 0.07258390993592052, "learning_rate": 4.4062915611737814e-08, "loss": 0.3333, "step": 2935 }, { "epoch": 4.934453781512605, "grad_norm": 0.07298347380055746, "learning_rate": 4.18876889480524e-08, "loss": 0.3447, "step": 2936 }, { "epoch": 4.936134453781513, "grad_norm": 0.0750302285553371, "learning_rate": 3.9767493349205376e-08, "loss": 0.3326, "step": 2937 }, { "epoch": 4.93781512605042, "grad_norm": 0.0713445045645601, "learning_rate": 3.770233173517213e-08, "loss": 0.3366, "step": 2938 }, { "epoch": 4.939495798319328, "grad_norm": 0.0710632368278214, "learning_rate": 3.569220695013531e-08, "loss": 0.3379, "step": 2939 }, { "epoch": 4.9411764705882355, "grad_norm": 0.07169790605000607, "learning_rate": 3.373712176246713e-08, "loss": 0.3301, "step": 2940 }, { "epoch": 4.942857142857143, "grad_norm": 0.07211388537081478, "learning_rate": 3.1837078864755954e-08, "loss": 0.3407, "step": 2941 }, { "epoch": 4.944537815126051, "grad_norm": 0.06989112920575895, "learning_rate": 2.9992080873775254e-08, "loss": 0.3383, "step": 2942 }, { "epoch": 4.946218487394958, "grad_norm": 0.12864777198640115, "learning_rate": 2.820213033048802e-08, "loss": 0.3408, "step": 2943 }, { "epoch": 4.947899159663866, "grad_norm": 0.07000872887671095, "learning_rate": 2.6467229700051222e-08, "loss": 0.3385, "step": 2944 }, { "epoch": 4.949579831932773, "grad_norm": 0.0697728240749645, "learning_rate": 2.4787381371802476e-08, "loss": 0.3368, "step": 2945 }, { "epoch": 4.951260504201681, "grad_norm": 0.07081180272604388, "learning_rate": 2.316258765926005e-08, "loss": 0.3322, "step": 2946 }, { "epoch": 4.952941176470588, "grad_norm": 0.07311689270116577, "learning_rate": 2.1592850800113975e-08, "loss": 0.3358, "step": 2947 }, { "epoch": 4.954621848739496, "grad_norm": 0.0732525774894796, "learning_rate": 2.0078172956248255e-08, "loss": 0.3377, "step": 2948 }, { "epoch": 4.9563025210084035, "grad_norm": 0.07941234689388833, "learning_rate": 1.861855621369202e-08, "loss": 0.3327, "step": 2949 }, { "epoch": 4.957983193277311, "grad_norm": 0.07139369472537992, "learning_rate": 1.7214002582668365e-08, "loss": 0.3341, "step": 2950 }, { "epoch": 4.959663865546219, "grad_norm": 0.07227317574962128, "learning_rate": 1.5864513997549958e-08, "loss": 0.3405, "step": 2951 }, { "epoch": 4.961344537815126, "grad_norm": 0.06885653400464055, "learning_rate": 1.4570092316876783e-08, "loss": 0.3306, "step": 2952 }, { "epoch": 4.963025210084034, "grad_norm": 0.0738303627292344, "learning_rate": 1.3330739323351716e-08, "loss": 0.3311, "step": 2953 }, { "epoch": 4.964705882352941, "grad_norm": 0.06758400770769002, "learning_rate": 1.2146456723836075e-08, "loss": 0.3296, "step": 2954 }, { "epoch": 4.966386554621849, "grad_norm": 0.0729582458411108, "learning_rate": 1.1017246149345184e-08, "loss": 0.3384, "step": 2955 }, { "epoch": 4.968067226890756, "grad_norm": 0.06909386772847433, "learning_rate": 9.943109155048369e-09, "loss": 0.3303, "step": 2956 }, { "epoch": 4.969747899159664, "grad_norm": 0.0690584494292919, "learning_rate": 8.92404722026896e-09, "loss": 0.3268, "step": 2957 }, { "epoch": 4.9714285714285715, "grad_norm": 0.07589642559633065, "learning_rate": 7.960061748475412e-09, "loss": 0.3376, "step": 2958 }, { "epoch": 4.973109243697479, "grad_norm": 0.14049107817429116, "learning_rate": 7.051154067294619e-09, "loss": 0.3411, "step": 2959 }, { "epoch": 4.974789915966387, "grad_norm": 0.08475798088349623, "learning_rate": 6.197325428480838e-09, "loss": 0.3318, "step": 2960 }, { "epoch": 4.976470588235294, "grad_norm": 0.06893230227754317, "learning_rate": 5.398577007951211e-09, "loss": 0.3238, "step": 2961 }, { "epoch": 4.978151260504202, "grad_norm": 0.06999238726298442, "learning_rate": 4.654909905750238e-09, "loss": 0.3347, "step": 2962 }, { "epoch": 4.979831932773109, "grad_norm": 0.07129456676481473, "learning_rate": 3.966325146076422e-09, "loss": 0.3333, "step": 2963 }, { "epoch": 4.981512605042017, "grad_norm": 0.06920976080464546, "learning_rate": 3.3328236772600663e-09, "loss": 0.3357, "step": 2964 }, { "epoch": 4.983193277310924, "grad_norm": 0.07161935851125115, "learning_rate": 2.754406371767715e-09, "loss": 0.3386, "step": 2965 }, { "epoch": 4.984873949579832, "grad_norm": 0.07096482858861825, "learning_rate": 2.231074026206592e-09, "loss": 0.3369, "step": 2966 }, { "epoch": 4.9865546218487395, "grad_norm": 0.07175003060650498, "learning_rate": 1.7628273613290448e-09, "loss": 0.3342, "step": 2967 }, { "epoch": 4.988235294117647, "grad_norm": 0.07050688849834598, "learning_rate": 1.3496670220014552e-09, "loss": 0.3323, "step": 2968 }, { "epoch": 4.989915966386555, "grad_norm": 0.07512967967234267, "learning_rate": 9.91593577239769e-10, "loss": 0.3341, "step": 2969 }, { "epoch": 4.991596638655462, "grad_norm": 0.07021506905185929, "learning_rate": 6.886075201961718e-10, "loss": 0.3385, "step": 2970 }, { "epoch": 4.99327731092437, "grad_norm": 0.06977304149025215, "learning_rate": 4.4070926814132607e-10, "loss": 0.3261, "step": 2971 }, { "epoch": 4.994957983193277, "grad_norm": 0.0703826450949334, "learning_rate": 2.478991624865756e-10, "loss": 0.3271, "step": 2972 }, { "epoch": 4.996638655462185, "grad_norm": 0.07122637160392331, "learning_rate": 1.1017746877506341e-10, "loss": 0.3337, "step": 2973 }, { "epoch": 4.998319327731092, "grad_norm": 0.06879395237047256, "learning_rate": 2.7544376677290928e-11, "loss": 0.3299, "step": 2974 }, { "epoch": 5.0, "grad_norm": 0.07709157049953634, "learning_rate": 0.0, "loss": 0.328, "step": 2975 }, { "epoch": 5.0, "step": 2975, "total_flos": 4.99090232180736e+16, "train_loss": 0.0, "train_runtime": 11.202, "train_samples_per_second": 135839.226, "train_steps_per_second": 265.579 } ], "logging_steps": 1, "max_steps": 2975, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.99090232180736e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }