{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 5520, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00018115942028985507, "grad_norm": 155.45926107494319, "learning_rate": 6.024096385542168e-09, "loss": 3.2778, "step": 1 }, { "epoch": 0.00036231884057971015, "grad_norm": 146.53525241554215, "learning_rate": 1.2048192771084337e-08, "loss": 2.96, "step": 2 }, { "epoch": 0.0005434782608695652, "grad_norm": 166.08084514994493, "learning_rate": 1.8072289156626504e-08, "loss": 3.2065, "step": 3 }, { "epoch": 0.0007246376811594203, "grad_norm": 167.59524060451398, "learning_rate": 2.4096385542168673e-08, "loss": 3.436, "step": 4 }, { "epoch": 0.0009057971014492754, "grad_norm": 162.79759647875963, "learning_rate": 3.012048192771084e-08, "loss": 3.1548, "step": 5 }, { "epoch": 0.0010869565217391304, "grad_norm": 155.23379157235624, "learning_rate": 3.614457831325301e-08, "loss": 3.3721, "step": 6 }, { "epoch": 0.0012681159420289854, "grad_norm": 165.5495705406923, "learning_rate": 4.2168674698795174e-08, "loss": 3.2246, "step": 7 }, { "epoch": 0.0014492753623188406, "grad_norm": 156.2303696546355, "learning_rate": 4.8192771084337347e-08, "loss": 3.3623, "step": 8 }, { "epoch": 0.0016304347826086956, "grad_norm": 161.74945993469353, "learning_rate": 5.421686746987952e-08, "loss": 3.2163, "step": 9 }, { "epoch": 0.0018115942028985507, "grad_norm": 155.0623773701399, "learning_rate": 6.024096385542168e-08, "loss": 3.4561, "step": 10 }, { "epoch": 0.0019927536231884057, "grad_norm": 161.30079568597247, "learning_rate": 6.626506024096386e-08, "loss": 3.6455, "step": 11 }, { "epoch": 0.002173913043478261, "grad_norm": 156.25269673838352, "learning_rate": 7.228915662650602e-08, "loss": 3.2124, "step": 12 }, { "epoch": 0.002355072463768116, "grad_norm": 162.03906983237863, "learning_rate": 7.83132530120482e-08, "loss": 3.3145, "step": 13 }, { "epoch": 0.002536231884057971, "grad_norm": 166.9528940623657, "learning_rate": 8.433734939759035e-08, "loss": 3.2222, "step": 14 }, { "epoch": 0.002717391304347826, "grad_norm": 167.9591489096079, "learning_rate": 9.036144578313253e-08, "loss": 3.105, "step": 15 }, { "epoch": 0.002898550724637681, "grad_norm": 168.23589443984898, "learning_rate": 9.638554216867469e-08, "loss": 2.9004, "step": 16 }, { "epoch": 0.0030797101449275364, "grad_norm": 165.85795575431166, "learning_rate": 1.0240963855421686e-07, "loss": 3.2441, "step": 17 }, { "epoch": 0.003260869565217391, "grad_norm": 165.04155349261808, "learning_rate": 1.0843373493975904e-07, "loss": 3.0679, "step": 18 }, { "epoch": 0.0034420289855072463, "grad_norm": 170.34587941618398, "learning_rate": 1.1445783132530119e-07, "loss": 3.3999, "step": 19 }, { "epoch": 0.0036231884057971015, "grad_norm": 160.2615797007729, "learning_rate": 1.2048192771084337e-07, "loss": 2.8306, "step": 20 }, { "epoch": 0.0038043478260869567, "grad_norm": 162.039582212009, "learning_rate": 1.2650602409638554e-07, "loss": 2.9673, "step": 21 }, { "epoch": 0.003985507246376811, "grad_norm": 176.08714589279865, "learning_rate": 1.3253012048192773e-07, "loss": 3.3125, "step": 22 }, { "epoch": 0.004166666666666667, "grad_norm": 174.85115283084608, "learning_rate": 1.3855421686746987e-07, "loss": 3.0752, "step": 23 }, { "epoch": 0.004347826086956522, "grad_norm": 191.75937464009905, "learning_rate": 1.4457831325301203e-07, "loss": 3.3062, "step": 24 }, { "epoch": 0.004528985507246377, "grad_norm": 180.97084764424042, "learning_rate": 1.5060240963855423e-07, "loss": 2.9961, "step": 25 }, { "epoch": 0.004710144927536232, "grad_norm": 186.813096280298, "learning_rate": 1.566265060240964e-07, "loss": 2.9917, "step": 26 }, { "epoch": 0.004891304347826087, "grad_norm": 171.4489308986305, "learning_rate": 1.6265060240963853e-07, "loss": 2.645, "step": 27 }, { "epoch": 0.005072463768115942, "grad_norm": 169.187919981961, "learning_rate": 1.686746987951807e-07, "loss": 2.7319, "step": 28 }, { "epoch": 0.005253623188405797, "grad_norm": 171.130930708941, "learning_rate": 1.746987951807229e-07, "loss": 2.5078, "step": 29 }, { "epoch": 0.005434782608695652, "grad_norm": 164.74902247102506, "learning_rate": 1.8072289156626505e-07, "loss": 2.4355, "step": 30 }, { "epoch": 0.005615942028985507, "grad_norm": 172.31334333571056, "learning_rate": 1.8674698795180722e-07, "loss": 2.478, "step": 31 }, { "epoch": 0.005797101449275362, "grad_norm": 149.52183581815365, "learning_rate": 1.9277108433734939e-07, "loss": 2.4834, "step": 32 }, { "epoch": 0.005978260869565218, "grad_norm": 173.68692872709474, "learning_rate": 1.9879518072289155e-07, "loss": 2.5942, "step": 33 }, { "epoch": 0.006159420289855073, "grad_norm": 171.72660128591411, "learning_rate": 2.0481927710843372e-07, "loss": 2.5977, "step": 34 }, { "epoch": 0.006340579710144928, "grad_norm": 177.23977883062634, "learning_rate": 2.108433734939759e-07, "loss": 2.666, "step": 35 }, { "epoch": 0.006521739130434782, "grad_norm": 191.88320532811127, "learning_rate": 2.1686746987951808e-07, "loss": 2.9033, "step": 36 }, { "epoch": 0.0067028985507246374, "grad_norm": 170.88276296688394, "learning_rate": 2.2289156626506022e-07, "loss": 2.6719, "step": 37 }, { "epoch": 0.006884057971014493, "grad_norm": 149.19705339355804, "learning_rate": 2.2891566265060238e-07, "loss": 1.8291, "step": 38 }, { "epoch": 0.007065217391304348, "grad_norm": 153.45683505992974, "learning_rate": 2.3493975903614457e-07, "loss": 1.7681, "step": 39 }, { "epoch": 0.007246376811594203, "grad_norm": 149.0156445006739, "learning_rate": 2.4096385542168674e-07, "loss": 1.6543, "step": 40 }, { "epoch": 0.007427536231884058, "grad_norm": 135.04326091666607, "learning_rate": 2.469879518072289e-07, "loss": 1.5229, "step": 41 }, { "epoch": 0.007608695652173913, "grad_norm": 133.04893540033066, "learning_rate": 2.5301204819277107e-07, "loss": 1.4431, "step": 42 }, { "epoch": 0.0077898550724637685, "grad_norm": 132.1471804123453, "learning_rate": 2.5903614457831324e-07, "loss": 1.4153, "step": 43 }, { "epoch": 0.007971014492753623, "grad_norm": 131.80069966825823, "learning_rate": 2.6506024096385546e-07, "loss": 1.5205, "step": 44 }, { "epoch": 0.008152173913043478, "grad_norm": 138.90404312299518, "learning_rate": 2.7108433734939757e-07, "loss": 1.4946, "step": 45 }, { "epoch": 0.008333333333333333, "grad_norm": 126.72977971234664, "learning_rate": 2.7710843373493974e-07, "loss": 1.4854, "step": 46 }, { "epoch": 0.008514492753623188, "grad_norm": 133.6903705502816, "learning_rate": 2.8313253012048195e-07, "loss": 1.4331, "step": 47 }, { "epoch": 0.008695652173913044, "grad_norm": 107.52537178733128, "learning_rate": 2.8915662650602407e-07, "loss": 1.2329, "step": 48 }, { "epoch": 0.008876811594202899, "grad_norm": 116.93755567385571, "learning_rate": 2.9518072289156623e-07, "loss": 1.2612, "step": 49 }, { "epoch": 0.009057971014492754, "grad_norm": 109.63231545954613, "learning_rate": 3.0120481927710845e-07, "loss": 1.2361, "step": 50 }, { "epoch": 0.00923913043478261, "grad_norm": 79.48921062100135, "learning_rate": 3.0722891566265056e-07, "loss": 1.0171, "step": 51 }, { "epoch": 0.009420289855072464, "grad_norm": 38.81750692142498, "learning_rate": 3.132530120481928e-07, "loss": 0.8708, "step": 52 }, { "epoch": 0.00960144927536232, "grad_norm": 68.41113509995378, "learning_rate": 3.192771084337349e-07, "loss": 1.0142, "step": 53 }, { "epoch": 0.009782608695652175, "grad_norm": 69.43876122106217, "learning_rate": 3.2530120481927706e-07, "loss": 0.9929, "step": 54 }, { "epoch": 0.009963768115942028, "grad_norm": 72.7311351452876, "learning_rate": 3.313253012048193e-07, "loss": 1.0474, "step": 55 }, { "epoch": 0.010144927536231883, "grad_norm": 67.84585170203647, "learning_rate": 3.373493975903614e-07, "loss": 1.0488, "step": 56 }, { "epoch": 0.010326086956521738, "grad_norm": 55.83187351478142, "learning_rate": 3.433734939759036e-07, "loss": 0.8875, "step": 57 }, { "epoch": 0.010507246376811594, "grad_norm": 67.99864051190457, "learning_rate": 3.493975903614458e-07, "loss": 1.0031, "step": 58 }, { "epoch": 0.010688405797101449, "grad_norm": 54.07576863846796, "learning_rate": 3.554216867469879e-07, "loss": 0.8564, "step": 59 }, { "epoch": 0.010869565217391304, "grad_norm": 52.809527375230566, "learning_rate": 3.614457831325301e-07, "loss": 0.7832, "step": 60 }, { "epoch": 0.01105072463768116, "grad_norm": 39.022030058357664, "learning_rate": 3.674698795180723e-07, "loss": 0.7292, "step": 61 }, { "epoch": 0.011231884057971014, "grad_norm": 34.14652534748319, "learning_rate": 3.7349397590361444e-07, "loss": 0.7489, "step": 62 }, { "epoch": 0.01141304347826087, "grad_norm": 18.239066099044848, "learning_rate": 3.795180722891566e-07, "loss": 0.692, "step": 63 }, { "epoch": 0.011594202898550725, "grad_norm": 13.40286627581432, "learning_rate": 3.8554216867469877e-07, "loss": 0.6915, "step": 64 }, { "epoch": 0.01177536231884058, "grad_norm": 19.66059978230837, "learning_rate": 3.9156626506024094e-07, "loss": 0.7246, "step": 65 }, { "epoch": 0.011956521739130435, "grad_norm": 13.357821616072009, "learning_rate": 3.975903614457831e-07, "loss": 0.676, "step": 66 }, { "epoch": 0.01213768115942029, "grad_norm": 23.978021954471387, "learning_rate": 4.036144578313253e-07, "loss": 0.6628, "step": 67 }, { "epoch": 0.012318840579710146, "grad_norm": 34.6756530637303, "learning_rate": 4.0963855421686744e-07, "loss": 0.6841, "step": 68 }, { "epoch": 0.0125, "grad_norm": 28.61333355137964, "learning_rate": 4.156626506024096e-07, "loss": 0.6216, "step": 69 }, { "epoch": 0.012681159420289856, "grad_norm": 26.677689329855557, "learning_rate": 4.216867469879518e-07, "loss": 0.6619, "step": 70 }, { "epoch": 0.01286231884057971, "grad_norm": 22.19223137338174, "learning_rate": 4.2771084337349393e-07, "loss": 0.6655, "step": 71 }, { "epoch": 0.013043478260869565, "grad_norm": 21.79590029398801, "learning_rate": 4.3373493975903615e-07, "loss": 0.6466, "step": 72 }, { "epoch": 0.01322463768115942, "grad_norm": 9.856574571433779, "learning_rate": 4.3975903614457827e-07, "loss": 0.649, "step": 73 }, { "epoch": 0.013405797101449275, "grad_norm": 9.445511619298856, "learning_rate": 4.4578313253012043e-07, "loss": 0.6648, "step": 74 }, { "epoch": 0.01358695652173913, "grad_norm": 10.053471572311627, "learning_rate": 4.5180722891566265e-07, "loss": 0.6207, "step": 75 }, { "epoch": 0.013768115942028985, "grad_norm": 9.903442777888861, "learning_rate": 4.5783132530120476e-07, "loss": 0.6351, "step": 76 }, { "epoch": 0.01394927536231884, "grad_norm": 8.29476593406457, "learning_rate": 4.63855421686747e-07, "loss": 0.5902, "step": 77 }, { "epoch": 0.014130434782608696, "grad_norm": 13.48145931778345, "learning_rate": 4.6987951807228915e-07, "loss": 0.5927, "step": 78 }, { "epoch": 0.01431159420289855, "grad_norm": 8.231585996611404, "learning_rate": 4.7590361445783126e-07, "loss": 0.5648, "step": 79 }, { "epoch": 0.014492753623188406, "grad_norm": 9.1626932531575, "learning_rate": 4.819277108433735e-07, "loss": 0.5475, "step": 80 }, { "epoch": 0.014673913043478261, "grad_norm": 10.168836300766785, "learning_rate": 4.879518072289156e-07, "loss": 0.5139, "step": 81 }, { "epoch": 0.014855072463768116, "grad_norm": 9.158146838509008, "learning_rate": 4.939759036144578e-07, "loss": 0.5251, "step": 82 }, { "epoch": 0.015036231884057972, "grad_norm": 7.381762153704901, "learning_rate": 5e-07, "loss": 0.6141, "step": 83 }, { "epoch": 0.015217391304347827, "grad_norm": 6.28495104475644, "learning_rate": 5.060240963855421e-07, "loss": 0.5575, "step": 84 }, { "epoch": 0.015398550724637682, "grad_norm": 6.046197754805433, "learning_rate": 5.120481927710843e-07, "loss": 0.5264, "step": 85 }, { "epoch": 0.015579710144927537, "grad_norm": 11.499601951009717, "learning_rate": 5.180722891566265e-07, "loss": 0.4857, "step": 86 }, { "epoch": 0.01576086956521739, "grad_norm": 7.747865070902684, "learning_rate": 5.240963855421686e-07, "loss": 0.5394, "step": 87 }, { "epoch": 0.015942028985507246, "grad_norm": 6.741894867585539, "learning_rate": 5.301204819277109e-07, "loss": 0.5691, "step": 88 }, { "epoch": 0.0161231884057971, "grad_norm": 16.55724194012545, "learning_rate": 5.36144578313253e-07, "loss": 0.6082, "step": 89 }, { "epoch": 0.016304347826086956, "grad_norm": 7.216912783357828, "learning_rate": 5.421686746987951e-07, "loss": 0.558, "step": 90 }, { "epoch": 0.01648550724637681, "grad_norm": 6.683798937927288, "learning_rate": 5.481927710843374e-07, "loss": 0.5511, "step": 91 }, { "epoch": 0.016666666666666666, "grad_norm": 6.3427743897580715, "learning_rate": 5.542168674698795e-07, "loss": 0.5498, "step": 92 }, { "epoch": 0.01684782608695652, "grad_norm": 7.035954184887222, "learning_rate": 5.602409638554216e-07, "loss": 0.5524, "step": 93 }, { "epoch": 0.017028985507246377, "grad_norm": 8.266621192800468, "learning_rate": 5.662650602409639e-07, "loss": 0.522, "step": 94 }, { "epoch": 0.017210144927536232, "grad_norm": 7.569481363939274, "learning_rate": 5.72289156626506e-07, "loss": 0.5659, "step": 95 }, { "epoch": 0.017391304347826087, "grad_norm": 23.300962393338285, "learning_rate": 5.783132530120481e-07, "loss": 0.5211, "step": 96 }, { "epoch": 0.017572463768115942, "grad_norm": 7.741231260250192, "learning_rate": 5.843373493975904e-07, "loss": 0.5302, "step": 97 }, { "epoch": 0.017753623188405798, "grad_norm": 9.211361854353234, "learning_rate": 5.903614457831325e-07, "loss": 0.5004, "step": 98 }, { "epoch": 0.017934782608695653, "grad_norm": 8.848024831725077, "learning_rate": 5.963855421686746e-07, "loss": 0.5242, "step": 99 }, { "epoch": 0.018115942028985508, "grad_norm": 9.209307129604856, "learning_rate": 6.024096385542169e-07, "loss": 0.4996, "step": 100 }, { "epoch": 0.018115942028985508, "eval_loss": 0.5314062237739563, "eval_runtime": 9.6882, "eval_samples_per_second": 51.609, "eval_steps_per_second": 0.103, "step": 100 }, { "epoch": 0.018297101449275363, "grad_norm": 9.374402909979962, "learning_rate": 6.084337349397591e-07, "loss": 0.4619, "step": 101 }, { "epoch": 0.01847826086956522, "grad_norm": 18.94663963081601, "learning_rate": 6.144578313253011e-07, "loss": 0.5286, "step": 102 }, { "epoch": 0.018659420289855073, "grad_norm": 11.927461303799193, "learning_rate": 6.204819277108434e-07, "loss": 0.5277, "step": 103 }, { "epoch": 0.01884057971014493, "grad_norm": 11.418326931268298, "learning_rate": 6.265060240963856e-07, "loss": 0.5803, "step": 104 }, { "epoch": 0.019021739130434784, "grad_norm": 7.289335842869234, "learning_rate": 6.325301204819276e-07, "loss": 0.4612, "step": 105 }, { "epoch": 0.01920289855072464, "grad_norm": 13.189116520311075, "learning_rate": 6.385542168674698e-07, "loss": 0.5441, "step": 106 }, { "epoch": 0.019384057971014494, "grad_norm": 11.451763312237627, "learning_rate": 6.445783132530121e-07, "loss": 0.5208, "step": 107 }, { "epoch": 0.01956521739130435, "grad_norm": 6.2912922072094535, "learning_rate": 6.506024096385541e-07, "loss": 0.5115, "step": 108 }, { "epoch": 0.019746376811594205, "grad_norm": 13.702007675182921, "learning_rate": 6.566265060240963e-07, "loss": 0.5343, "step": 109 }, { "epoch": 0.019927536231884056, "grad_norm": 5.401659969002201, "learning_rate": 6.626506024096386e-07, "loss": 0.4936, "step": 110 }, { "epoch": 0.02010869565217391, "grad_norm": 6.777443622517859, "learning_rate": 6.686746987951807e-07, "loss": 0.5105, "step": 111 }, { "epoch": 0.020289855072463767, "grad_norm": 5.894655570480427, "learning_rate": 6.746987951807228e-07, "loss": 0.4784, "step": 112 }, { "epoch": 0.020471014492753622, "grad_norm": 10.45271767331002, "learning_rate": 6.807228915662651e-07, "loss": 0.5002, "step": 113 }, { "epoch": 0.020652173913043477, "grad_norm": 15.563610610032606, "learning_rate": 6.867469879518072e-07, "loss": 0.5154, "step": 114 }, { "epoch": 0.020833333333333332, "grad_norm": 7.9179192602020505, "learning_rate": 6.927710843373493e-07, "loss": 0.4873, "step": 115 }, { "epoch": 0.021014492753623187, "grad_norm": 8.436961625904862, "learning_rate": 6.987951807228916e-07, "loss": 0.554, "step": 116 }, { "epoch": 0.021195652173913043, "grad_norm": 8.404752827874326, "learning_rate": 7.048192771084337e-07, "loss": 0.4906, "step": 117 }, { "epoch": 0.021376811594202898, "grad_norm": 6.01808853532882, "learning_rate": 7.108433734939758e-07, "loss": 0.5338, "step": 118 }, { "epoch": 0.021557971014492753, "grad_norm": 8.583538396465919, "learning_rate": 7.168674698795181e-07, "loss": 0.5269, "step": 119 }, { "epoch": 0.021739130434782608, "grad_norm": 8.179015465558333, "learning_rate": 7.228915662650602e-07, "loss": 0.505, "step": 120 }, { "epoch": 0.021920289855072463, "grad_norm": 7.816411195873641, "learning_rate": 7.289156626506024e-07, "loss": 0.4822, "step": 121 }, { "epoch": 0.02210144927536232, "grad_norm": 7.723181238831617, "learning_rate": 7.349397590361446e-07, "loss": 0.4962, "step": 122 }, { "epoch": 0.022282608695652174, "grad_norm": 6.877138873147807, "learning_rate": 7.409638554216867e-07, "loss": 0.4381, "step": 123 }, { "epoch": 0.02246376811594203, "grad_norm": 6.81832569687752, "learning_rate": 7.469879518072289e-07, "loss": 0.4794, "step": 124 }, { "epoch": 0.022644927536231884, "grad_norm": 5.676680125317034, "learning_rate": 7.53012048192771e-07, "loss": 0.5306, "step": 125 }, { "epoch": 0.02282608695652174, "grad_norm": 5.00121652261405, "learning_rate": 7.590361445783132e-07, "loss": 0.5134, "step": 126 }, { "epoch": 0.023007246376811594, "grad_norm": 5.01429666835957, "learning_rate": 7.650602409638554e-07, "loss": 0.4814, "step": 127 }, { "epoch": 0.02318840579710145, "grad_norm": 6.608334533851778, "learning_rate": 7.710843373493975e-07, "loss": 0.5015, "step": 128 }, { "epoch": 0.023369565217391305, "grad_norm": 6.914300961015702, "learning_rate": 7.771084337349397e-07, "loss": 0.4758, "step": 129 }, { "epoch": 0.02355072463768116, "grad_norm": 6.24984997664612, "learning_rate": 7.831325301204819e-07, "loss": 0.4501, "step": 130 }, { "epoch": 0.023731884057971015, "grad_norm": 5.112118214192944, "learning_rate": 7.891566265060241e-07, "loss": 0.4746, "step": 131 }, { "epoch": 0.02391304347826087, "grad_norm": 8.387817660357083, "learning_rate": 7.951807228915662e-07, "loss": 0.5029, "step": 132 }, { "epoch": 0.024094202898550725, "grad_norm": 7.980049913886308, "learning_rate": 8.012048192771084e-07, "loss": 0.4542, "step": 133 }, { "epoch": 0.02427536231884058, "grad_norm": 14.421928860935134, "learning_rate": 8.072289156626506e-07, "loss": 0.4762, "step": 134 }, { "epoch": 0.024456521739130436, "grad_norm": 6.524127827626026, "learning_rate": 8.132530120481927e-07, "loss": 0.4901, "step": 135 }, { "epoch": 0.02463768115942029, "grad_norm": 8.971696806473568, "learning_rate": 8.192771084337349e-07, "loss": 0.5189, "step": 136 }, { "epoch": 0.024818840579710146, "grad_norm": 7.043008284400751, "learning_rate": 8.253012048192771e-07, "loss": 0.4894, "step": 137 }, { "epoch": 0.025, "grad_norm": 4.834709782757821, "learning_rate": 8.313253012048192e-07, "loss": 0.442, "step": 138 }, { "epoch": 0.025181159420289857, "grad_norm": 9.895461132716854, "learning_rate": 8.373493975903614e-07, "loss": 0.4325, "step": 139 }, { "epoch": 0.025362318840579712, "grad_norm": 7.4036240261540245, "learning_rate": 8.433734939759036e-07, "loss": 0.4438, "step": 140 }, { "epoch": 0.025543478260869567, "grad_norm": 4.548397494814773, "learning_rate": 8.493975903614458e-07, "loss": 0.4884, "step": 141 }, { "epoch": 0.02572463768115942, "grad_norm": 4.152457111490965, "learning_rate": 8.554216867469879e-07, "loss": 0.4636, "step": 142 }, { "epoch": 0.025905797101449274, "grad_norm": 5.924137019684747, "learning_rate": 8.614457831325301e-07, "loss": 0.4912, "step": 143 }, { "epoch": 0.02608695652173913, "grad_norm": 12.975084883779902, "learning_rate": 8.674698795180723e-07, "loss": 0.5563, "step": 144 }, { "epoch": 0.026268115942028984, "grad_norm": 6.1368116621228115, "learning_rate": 8.734939759036144e-07, "loss": 0.4487, "step": 145 }, { "epoch": 0.02644927536231884, "grad_norm": 5.155799288498959, "learning_rate": 8.795180722891565e-07, "loss": 0.5308, "step": 146 }, { "epoch": 0.026630434782608695, "grad_norm": 5.118362961694278, "learning_rate": 8.855421686746988e-07, "loss": 0.4598, "step": 147 }, { "epoch": 0.02681159420289855, "grad_norm": 8.632679047110537, "learning_rate": 8.915662650602409e-07, "loss": 0.5283, "step": 148 }, { "epoch": 0.026992753623188405, "grad_norm": 7.180786024663195, "learning_rate": 8.97590361445783e-07, "loss": 0.4962, "step": 149 }, { "epoch": 0.02717391304347826, "grad_norm": 4.547074967752511, "learning_rate": 9.036144578313253e-07, "loss": 0.4877, "step": 150 }, { "epoch": 0.027355072463768115, "grad_norm": 6.909285053575947, "learning_rate": 9.096385542168675e-07, "loss": 0.5031, "step": 151 }, { "epoch": 0.02753623188405797, "grad_norm": 8.46572825149969, "learning_rate": 9.156626506024095e-07, "loss": 0.5369, "step": 152 }, { "epoch": 0.027717391304347826, "grad_norm": 17.21557975553529, "learning_rate": 9.216867469879518e-07, "loss": 0.444, "step": 153 }, { "epoch": 0.02789855072463768, "grad_norm": 7.958798793828718, "learning_rate": 9.27710843373494e-07, "loss": 0.5594, "step": 154 }, { "epoch": 0.028079710144927536, "grad_norm": 20.089943951856156, "learning_rate": 9.33734939759036e-07, "loss": 0.4508, "step": 155 }, { "epoch": 0.02826086956521739, "grad_norm": 8.348366765620753, "learning_rate": 9.397590361445783e-07, "loss": 0.5292, "step": 156 }, { "epoch": 0.028442028985507246, "grad_norm": 5.368109540960397, "learning_rate": 9.457831325301205e-07, "loss": 0.5422, "step": 157 }, { "epoch": 0.0286231884057971, "grad_norm": 5.388950045227162, "learning_rate": 9.518072289156625e-07, "loss": 0.4666, "step": 158 }, { "epoch": 0.028804347826086957, "grad_norm": 5.053872505552171, "learning_rate": 9.57831325301205e-07, "loss": 0.4701, "step": 159 }, { "epoch": 0.028985507246376812, "grad_norm": 5.421594424612676, "learning_rate": 9.63855421686747e-07, "loss": 0.52, "step": 160 }, { "epoch": 0.029166666666666667, "grad_norm": 4.1264872603776785, "learning_rate": 9.69879518072289e-07, "loss": 0.4995, "step": 161 }, { "epoch": 0.029347826086956522, "grad_norm": 5.044524575984205, "learning_rate": 9.759036144578313e-07, "loss": 0.5116, "step": 162 }, { "epoch": 0.029528985507246377, "grad_norm": 5.499238571136398, "learning_rate": 9.819277108433734e-07, "loss": 0.5065, "step": 163 }, { "epoch": 0.029710144927536233, "grad_norm": 4.698303937944457, "learning_rate": 9.879518072289156e-07, "loss": 0.4603, "step": 164 }, { "epoch": 0.029891304347826088, "grad_norm": 5.9825894126873065, "learning_rate": 9.93975903614458e-07, "loss": 0.5424, "step": 165 }, { "epoch": 0.030072463768115943, "grad_norm": 7.165804565484212, "learning_rate": 1e-06, "loss": 0.5096, "step": 166 }, { "epoch": 0.030253623188405798, "grad_norm": 4.681956950030581, "learning_rate": 9.999999139238154e-07, "loss": 0.4768, "step": 167 }, { "epoch": 0.030434782608695653, "grad_norm": 5.049888174912839, "learning_rate": 9.999996556952915e-07, "loss": 0.4945, "step": 168 }, { "epoch": 0.03061594202898551, "grad_norm": 4.8280242174132315, "learning_rate": 9.99999225314517e-07, "loss": 0.501, "step": 169 }, { "epoch": 0.030797101449275364, "grad_norm": 6.038857119417974, "learning_rate": 9.999986227816403e-07, "loss": 0.4701, "step": 170 }, { "epoch": 0.03097826086956522, "grad_norm": 9.780202990037095, "learning_rate": 9.999978480968688e-07, "loss": 0.515, "step": 171 }, { "epoch": 0.031159420289855074, "grad_norm": 7.9245280562881755, "learning_rate": 9.999969012604688e-07, "loss": 0.4884, "step": 172 }, { "epoch": 0.03134057971014493, "grad_norm": 6.229806982396466, "learning_rate": 9.99995782272767e-07, "loss": 0.5007, "step": 173 }, { "epoch": 0.03152173913043478, "grad_norm": 5.74944160350927, "learning_rate": 9.999944911341482e-07, "loss": 0.4702, "step": 174 }, { "epoch": 0.03170289855072464, "grad_norm": 8.494426280655142, "learning_rate": 9.999930278450572e-07, "loss": 0.4605, "step": 175 }, { "epoch": 0.03188405797101449, "grad_norm": 17.83791204432153, "learning_rate": 9.999913924059976e-07, "loss": 0.5116, "step": 176 }, { "epoch": 0.03206521739130435, "grad_norm": 3.6510004514342236, "learning_rate": 9.999895848175326e-07, "loss": 0.4397, "step": 177 }, { "epoch": 0.0322463768115942, "grad_norm": 6.016683146970344, "learning_rate": 9.999876050802845e-07, "loss": 0.4323, "step": 178 }, { "epoch": 0.03242753623188406, "grad_norm": 5.310990129162454, "learning_rate": 9.99985453194935e-07, "loss": 0.462, "step": 179 }, { "epoch": 0.03260869565217391, "grad_norm": 7.131501884340848, "learning_rate": 9.999831291622249e-07, "loss": 0.4573, "step": 180 }, { "epoch": 0.03278985507246377, "grad_norm": 5.1447709167957365, "learning_rate": 9.999806329829546e-07, "loss": 0.4775, "step": 181 }, { "epoch": 0.03297101449275362, "grad_norm": 3.4309908977880976, "learning_rate": 9.999779646579833e-07, "loss": 0.4683, "step": 182 }, { "epoch": 0.03315217391304348, "grad_norm": 3.8563425479027713, "learning_rate": 9.9997512418823e-07, "loss": 0.4828, "step": 183 }, { "epoch": 0.03333333333333333, "grad_norm": 7.324398994673771, "learning_rate": 9.999721115746724e-07, "loss": 0.4888, "step": 184 }, { "epoch": 0.03351449275362319, "grad_norm": 5.109586000472538, "learning_rate": 9.999689268183479e-07, "loss": 0.432, "step": 185 }, { "epoch": 0.03369565217391304, "grad_norm": 8.132923394746593, "learning_rate": 9.999655699203529e-07, "loss": 0.5387, "step": 186 }, { "epoch": 0.0338768115942029, "grad_norm": 12.603147876277006, "learning_rate": 9.999620408818434e-07, "loss": 0.4939, "step": 187 }, { "epoch": 0.034057971014492754, "grad_norm": 4.1141953695608, "learning_rate": 9.999583397040342e-07, "loss": 0.4778, "step": 188 }, { "epoch": 0.034239130434782605, "grad_norm": 5.898156883814606, "learning_rate": 9.999544663881998e-07, "loss": 0.5166, "step": 189 }, { "epoch": 0.034420289855072464, "grad_norm": 5.274643328578065, "learning_rate": 9.99950420935674e-07, "loss": 0.4391, "step": 190 }, { "epoch": 0.034601449275362316, "grad_norm": 3.9149201160873317, "learning_rate": 9.999462033478495e-07, "loss": 0.4356, "step": 191 }, { "epoch": 0.034782608695652174, "grad_norm": 3.8837631578712903, "learning_rate": 9.999418136261781e-07, "loss": 0.4534, "step": 192 }, { "epoch": 0.034963768115942026, "grad_norm": 7.8220231109937695, "learning_rate": 9.999372517721716e-07, "loss": 0.4681, "step": 193 }, { "epoch": 0.035144927536231885, "grad_norm": 4.092954063858875, "learning_rate": 9.999325177874004e-07, "loss": 0.4421, "step": 194 }, { "epoch": 0.035326086956521736, "grad_norm": 6.792191106956772, "learning_rate": 9.99927611673495e-07, "loss": 0.4451, "step": 195 }, { "epoch": 0.035507246376811595, "grad_norm": 4.001072586649773, "learning_rate": 9.99922533432144e-07, "loss": 0.4827, "step": 196 }, { "epoch": 0.03568840579710145, "grad_norm": 12.554730407999608, "learning_rate": 9.99917283065096e-07, "loss": 0.4956, "step": 197 }, { "epoch": 0.035869565217391305, "grad_norm": 11.02545191723406, "learning_rate": 9.999118605741587e-07, "loss": 0.5145, "step": 198 }, { "epoch": 0.03605072463768116, "grad_norm": 5.486883030122579, "learning_rate": 9.999062659611993e-07, "loss": 0.4768, "step": 199 }, { "epoch": 0.036231884057971016, "grad_norm": 3.863833273779658, "learning_rate": 9.99900499228144e-07, "loss": 0.4772, "step": 200 }, { "epoch": 0.036231884057971016, "eval_loss": 0.4799531102180481, "eval_runtime": 9.7837, "eval_samples_per_second": 51.106, "eval_steps_per_second": 0.102, "step": 200 }, { "epoch": 0.03641304347826087, "grad_norm": 4.203418376242059, "learning_rate": 9.998945603769783e-07, "loss": 0.4934, "step": 201 }, { "epoch": 0.036594202898550726, "grad_norm": 11.39032778557563, "learning_rate": 9.998884494097466e-07, "loss": 0.4871, "step": 202 }, { "epoch": 0.03677536231884058, "grad_norm": 9.118931686244139, "learning_rate": 9.998821663285535e-07, "loss": 0.4519, "step": 203 }, { "epoch": 0.03695652173913044, "grad_norm": 4.48001367048963, "learning_rate": 9.998757111355617e-07, "loss": 0.4495, "step": 204 }, { "epoch": 0.03713768115942029, "grad_norm": 6.399610205483096, "learning_rate": 9.998690838329946e-07, "loss": 0.4878, "step": 205 }, { "epoch": 0.03731884057971015, "grad_norm": 6.947629181854651, "learning_rate": 9.998622844231333e-07, "loss": 0.4778, "step": 206 }, { "epoch": 0.0375, "grad_norm": 4.04023671442893, "learning_rate": 9.99855312908319e-07, "loss": 0.4459, "step": 207 }, { "epoch": 0.03768115942028986, "grad_norm": 5.948010825711479, "learning_rate": 9.998481692909519e-07, "loss": 0.488, "step": 208 }, { "epoch": 0.03786231884057971, "grad_norm": 5.4261911391284725, "learning_rate": 9.998408535734921e-07, "loss": 0.4771, "step": 209 }, { "epoch": 0.03804347826086957, "grad_norm": 4.358401844220579, "learning_rate": 9.99833365758458e-07, "loss": 0.4382, "step": 210 }, { "epoch": 0.03822463768115942, "grad_norm": 3.5567647760960823, "learning_rate": 9.99825705848428e-07, "loss": 0.5084, "step": 211 }, { "epoch": 0.03840579710144928, "grad_norm": 8.107530278422496, "learning_rate": 9.99817873846039e-07, "loss": 0.5029, "step": 212 }, { "epoch": 0.03858695652173913, "grad_norm": 3.6972001074897296, "learning_rate": 9.99809869753988e-07, "loss": 0.4844, "step": 213 }, { "epoch": 0.03876811594202899, "grad_norm": 4.530244950059987, "learning_rate": 9.998016935750306e-07, "loss": 0.4925, "step": 214 }, { "epoch": 0.03894927536231884, "grad_norm": 3.4162458991098856, "learning_rate": 9.99793345311982e-07, "loss": 0.4503, "step": 215 }, { "epoch": 0.0391304347826087, "grad_norm": 8.119593317890072, "learning_rate": 9.997848249677165e-07, "loss": 0.4741, "step": 216 }, { "epoch": 0.03931159420289855, "grad_norm": 4.235818762951556, "learning_rate": 9.99776132545168e-07, "loss": 0.4564, "step": 217 }, { "epoch": 0.03949275362318841, "grad_norm": 4.100374620065906, "learning_rate": 9.997672680473288e-07, "loss": 0.5018, "step": 218 }, { "epoch": 0.03967391304347826, "grad_norm": 3.5455145575604514, "learning_rate": 9.997582314772513e-07, "loss": 0.45, "step": 219 }, { "epoch": 0.03985507246376811, "grad_norm": 7.115455836069538, "learning_rate": 9.997490228380469e-07, "loss": 0.4753, "step": 220 }, { "epoch": 0.04003623188405797, "grad_norm": 4.038268060372091, "learning_rate": 9.99739642132886e-07, "loss": 0.452, "step": 221 }, { "epoch": 0.04021739130434782, "grad_norm": 4.58804149646508, "learning_rate": 9.997300893649985e-07, "loss": 0.4331, "step": 222 }, { "epoch": 0.04039855072463768, "grad_norm": 4.889877840343678, "learning_rate": 9.997203645376735e-07, "loss": 0.4084, "step": 223 }, { "epoch": 0.04057971014492753, "grad_norm": 3.7746173579882063, "learning_rate": 9.997104676542592e-07, "loss": 0.4845, "step": 224 }, { "epoch": 0.04076086956521739, "grad_norm": 4.878456578233542, "learning_rate": 9.997003987181633e-07, "loss": 0.486, "step": 225 }, { "epoch": 0.040942028985507244, "grad_norm": 8.464404261598398, "learning_rate": 9.996901577328524e-07, "loss": 0.4651, "step": 226 }, { "epoch": 0.0411231884057971, "grad_norm": 3.239182945202472, "learning_rate": 9.996797447018527e-07, "loss": 0.4742, "step": 227 }, { "epoch": 0.041304347826086954, "grad_norm": 6.575912040753638, "learning_rate": 9.996691596287494e-07, "loss": 0.4789, "step": 228 }, { "epoch": 0.04148550724637681, "grad_norm": 4.654026176911899, "learning_rate": 9.99658402517187e-07, "loss": 0.4991, "step": 229 }, { "epoch": 0.041666666666666664, "grad_norm": 13.698454104683812, "learning_rate": 9.996474733708688e-07, "loss": 0.4452, "step": 230 }, { "epoch": 0.04184782608695652, "grad_norm": 11.32330868708894, "learning_rate": 9.996363721935584e-07, "loss": 0.4279, "step": 231 }, { "epoch": 0.042028985507246375, "grad_norm": 4.4802404094538115, "learning_rate": 9.996250989890777e-07, "loss": 0.4308, "step": 232 }, { "epoch": 0.04221014492753623, "grad_norm": 5.584918400826132, "learning_rate": 9.996136537613081e-07, "loss": 0.4337, "step": 233 }, { "epoch": 0.042391304347826085, "grad_norm": 3.523418864145712, "learning_rate": 9.996020365141904e-07, "loss": 0.4679, "step": 234 }, { "epoch": 0.042572463768115944, "grad_norm": 4.150982567104567, "learning_rate": 9.995902472517244e-07, "loss": 0.4253, "step": 235 }, { "epoch": 0.042753623188405795, "grad_norm": 5.031011676263364, "learning_rate": 9.99578285977969e-07, "loss": 0.4541, "step": 236 }, { "epoch": 0.042934782608695654, "grad_norm": 10.171250203654354, "learning_rate": 9.995661526970429e-07, "loss": 0.4103, "step": 237 }, { "epoch": 0.043115942028985506, "grad_norm": 3.145143864333559, "learning_rate": 9.995538474131233e-07, "loss": 0.4147, "step": 238 }, { "epoch": 0.043297101449275364, "grad_norm": 3.3572508567730774, "learning_rate": 9.995413701304472e-07, "loss": 0.4592, "step": 239 }, { "epoch": 0.043478260869565216, "grad_norm": 5.400613603597964, "learning_rate": 9.995287208533102e-07, "loss": 0.4142, "step": 240 }, { "epoch": 0.043659420289855075, "grad_norm": 6.903415810833302, "learning_rate": 9.995158995860681e-07, "loss": 0.4605, "step": 241 }, { "epoch": 0.04384057971014493, "grad_norm": 3.2645045771492427, "learning_rate": 9.995029063331348e-07, "loss": 0.4274, "step": 242 }, { "epoch": 0.044021739130434785, "grad_norm": 3.7995981788280875, "learning_rate": 9.994897410989843e-07, "loss": 0.4455, "step": 243 }, { "epoch": 0.04420289855072464, "grad_norm": 3.49231316391393, "learning_rate": 9.994764038881494e-07, "loss": 0.476, "step": 244 }, { "epoch": 0.044384057971014496, "grad_norm": 5.007822367640591, "learning_rate": 9.994628947052218e-07, "loss": 0.436, "step": 245 }, { "epoch": 0.04456521739130435, "grad_norm": 11.82610182564281, "learning_rate": 9.994492135548532e-07, "loss": 0.5, "step": 246 }, { "epoch": 0.044746376811594206, "grad_norm": 7.8903958784074275, "learning_rate": 9.99435360441754e-07, "loss": 0.4843, "step": 247 }, { "epoch": 0.04492753623188406, "grad_norm": 6.512054775710009, "learning_rate": 9.994213353706937e-07, "loss": 0.4386, "step": 248 }, { "epoch": 0.045108695652173916, "grad_norm": 3.593400698737775, "learning_rate": 9.994071383465015e-07, "loss": 0.4912, "step": 249 }, { "epoch": 0.04528985507246377, "grad_norm": 3.70358783415324, "learning_rate": 9.99392769374065e-07, "loss": 0.4406, "step": 250 }, { "epoch": 0.04547101449275362, "grad_norm": 5.803109189517661, "learning_rate": 9.99378228458332e-07, "loss": 0.4911, "step": 251 }, { "epoch": 0.04565217391304348, "grad_norm": 3.6893076225191366, "learning_rate": 9.99363515604309e-07, "loss": 0.4008, "step": 252 }, { "epoch": 0.04583333333333333, "grad_norm": 11.30397560941352, "learning_rate": 9.993486308170612e-07, "loss": 0.4109, "step": 253 }, { "epoch": 0.04601449275362319, "grad_norm": 12.350789542199472, "learning_rate": 9.993335741017142e-07, "loss": 0.469, "step": 254 }, { "epoch": 0.04619565217391304, "grad_norm": 5.098644742786573, "learning_rate": 9.993183454634518e-07, "loss": 0.4467, "step": 255 }, { "epoch": 0.0463768115942029, "grad_norm": 13.048604609016728, "learning_rate": 9.99302944907517e-07, "loss": 0.4725, "step": 256 }, { "epoch": 0.04655797101449275, "grad_norm": 3.6081675063214647, "learning_rate": 9.992873724392125e-07, "loss": 0.4821, "step": 257 }, { "epoch": 0.04673913043478261, "grad_norm": 8.134011732152958, "learning_rate": 9.992716280639e-07, "loss": 0.5192, "step": 258 }, { "epoch": 0.04692028985507246, "grad_norm": 4.312079872375456, "learning_rate": 9.992557117870004e-07, "loss": 0.4406, "step": 259 }, { "epoch": 0.04710144927536232, "grad_norm": 3.0720013735899614, "learning_rate": 9.992396236139938e-07, "loss": 0.3787, "step": 260 }, { "epoch": 0.04728260869565217, "grad_norm": 3.197982546937872, "learning_rate": 9.992233635504192e-07, "loss": 0.405, "step": 261 }, { "epoch": 0.04746376811594203, "grad_norm": 4.466568402155062, "learning_rate": 9.992069316018753e-07, "loss": 0.476, "step": 262 }, { "epoch": 0.04764492753623188, "grad_norm": 9.687932325929705, "learning_rate": 9.991903277740194e-07, "loss": 0.3793, "step": 263 }, { "epoch": 0.04782608695652174, "grad_norm": 4.472993031097122, "learning_rate": 9.991735520725686e-07, "loss": 0.4799, "step": 264 }, { "epoch": 0.04800724637681159, "grad_norm": 5.149212903988031, "learning_rate": 9.991566045032987e-07, "loss": 0.4429, "step": 265 }, { "epoch": 0.04818840579710145, "grad_norm": 10.002158108751402, "learning_rate": 9.991394850720447e-07, "loss": 0.4584, "step": 266 }, { "epoch": 0.0483695652173913, "grad_norm": 5.019567737873869, "learning_rate": 9.991221937847009e-07, "loss": 0.3803, "step": 267 }, { "epoch": 0.04855072463768116, "grad_norm": 3.2146775101458225, "learning_rate": 9.991047306472212e-07, "loss": 0.3998, "step": 268 }, { "epoch": 0.04873188405797101, "grad_norm": 3.589919460415493, "learning_rate": 9.990870956656177e-07, "loss": 0.4702, "step": 269 }, { "epoch": 0.04891304347826087, "grad_norm": 3.2269059369290596, "learning_rate": 9.990692888459624e-07, "loss": 0.4026, "step": 270 }, { "epoch": 0.04909420289855072, "grad_norm": 4.6381853298897076, "learning_rate": 9.990513101943865e-07, "loss": 0.3651, "step": 271 }, { "epoch": 0.04927536231884058, "grad_norm": 3.5029065849997374, "learning_rate": 9.990331597170799e-07, "loss": 0.4736, "step": 272 }, { "epoch": 0.049456521739130434, "grad_norm": 3.468095053203953, "learning_rate": 9.990148374202918e-07, "loss": 0.4406, "step": 273 }, { "epoch": 0.04963768115942029, "grad_norm": 5.608119264163545, "learning_rate": 9.98996343310331e-07, "loss": 0.4315, "step": 274 }, { "epoch": 0.049818840579710144, "grad_norm": 4.481775347083576, "learning_rate": 9.989776773935647e-07, "loss": 0.4048, "step": 275 }, { "epoch": 0.05, "grad_norm": 8.42617168385361, "learning_rate": 9.9895883967642e-07, "loss": 0.4717, "step": 276 }, { "epoch": 0.050181159420289854, "grad_norm": 2.9730415433744795, "learning_rate": 9.989398301653827e-07, "loss": 0.4453, "step": 277 }, { "epoch": 0.05036231884057971, "grad_norm": 6.367426887077506, "learning_rate": 9.989206488669977e-07, "loss": 0.5167, "step": 278 }, { "epoch": 0.050543478260869565, "grad_norm": 3.3475584911089813, "learning_rate": 9.989012957878696e-07, "loss": 0.4429, "step": 279 }, { "epoch": 0.050724637681159424, "grad_norm": 9.549679012326568, "learning_rate": 9.988817709346613e-07, "loss": 0.4263, "step": 280 }, { "epoch": 0.050905797101449275, "grad_norm": 14.673903217076143, "learning_rate": 9.988620743140954e-07, "loss": 0.4167, "step": 281 }, { "epoch": 0.051086956521739134, "grad_norm": 8.83576126339579, "learning_rate": 9.98842205932954e-07, "loss": 0.4107, "step": 282 }, { "epoch": 0.051268115942028986, "grad_norm": 3.721884330371865, "learning_rate": 9.988221657980773e-07, "loss": 0.3893, "step": 283 }, { "epoch": 0.05144927536231884, "grad_norm": 3.1958082516461084, "learning_rate": 9.988019539163656e-07, "loss": 0.4677, "step": 284 }, { "epoch": 0.051630434782608696, "grad_norm": 4.774367249930716, "learning_rate": 9.987815702947778e-07, "loss": 0.4857, "step": 285 }, { "epoch": 0.05181159420289855, "grad_norm": 4.653595638505687, "learning_rate": 9.987610149403318e-07, "loss": 0.3866, "step": 286 }, { "epoch": 0.051992753623188406, "grad_norm": 4.4555986130321665, "learning_rate": 9.987402878601054e-07, "loss": 0.4225, "step": 287 }, { "epoch": 0.05217391304347826, "grad_norm": 5.074289715952272, "learning_rate": 9.98719389061235e-07, "loss": 0.394, "step": 288 }, { "epoch": 0.05235507246376812, "grad_norm": 21.000755364790056, "learning_rate": 9.986983185509154e-07, "loss": 0.5396, "step": 289 }, { "epoch": 0.05253623188405797, "grad_norm": 8.756603860656229, "learning_rate": 9.986770763364022e-07, "loss": 0.4343, "step": 290 }, { "epoch": 0.05271739130434783, "grad_norm": 13.006270890305858, "learning_rate": 9.98655662425009e-07, "loss": 0.5182, "step": 291 }, { "epoch": 0.05289855072463768, "grad_norm": 4.322906945976464, "learning_rate": 9.986340768241082e-07, "loss": 0.4838, "step": 292 }, { "epoch": 0.05307971014492754, "grad_norm": 7.013753420131705, "learning_rate": 9.986123195411325e-07, "loss": 0.4889, "step": 293 }, { "epoch": 0.05326086956521739, "grad_norm": 5.939365992478394, "learning_rate": 9.985903905835724e-07, "loss": 0.4141, "step": 294 }, { "epoch": 0.05344202898550725, "grad_norm": 6.907585093152365, "learning_rate": 9.985682899589786e-07, "loss": 0.3719, "step": 295 }, { "epoch": 0.0536231884057971, "grad_norm": 4.789625418427414, "learning_rate": 9.985460176749603e-07, "loss": 0.4395, "step": 296 }, { "epoch": 0.05380434782608696, "grad_norm": 9.530957504007826, "learning_rate": 9.985235737391859e-07, "loss": 0.47, "step": 297 }, { "epoch": 0.05398550724637681, "grad_norm": 14.37981639887827, "learning_rate": 9.985009581593832e-07, "loss": 0.4323, "step": 298 }, { "epoch": 0.05416666666666667, "grad_norm": 6.161679840080418, "learning_rate": 9.984781709433385e-07, "loss": 0.4067, "step": 299 }, { "epoch": 0.05434782608695652, "grad_norm": 11.887943982222478, "learning_rate": 9.984552120988977e-07, "loss": 0.4812, "step": 300 }, { "epoch": 0.05434782608695652, "eval_loss": 0.47120311856269836, "eval_runtime": 9.8004, "eval_samples_per_second": 51.018, "eval_steps_per_second": 0.102, "step": 300 }, { "epoch": 0.05452898550724638, "grad_norm": 6.5873591367134985, "learning_rate": 9.984320816339657e-07, "loss": 0.3944, "step": 301 }, { "epoch": 0.05471014492753623, "grad_norm": 12.355267798919755, "learning_rate": 9.984087795565062e-07, "loss": 0.4695, "step": 302 }, { "epoch": 0.05489130434782609, "grad_norm": 12.589460413890302, "learning_rate": 9.983853058745427e-07, "loss": 0.4464, "step": 303 }, { "epoch": 0.05507246376811594, "grad_norm": 3.258325196650175, "learning_rate": 9.983616605961567e-07, "loss": 0.4586, "step": 304 }, { "epoch": 0.0552536231884058, "grad_norm": 3.1999294224444133, "learning_rate": 9.983378437294898e-07, "loss": 0.4337, "step": 305 }, { "epoch": 0.05543478260869565, "grad_norm": 4.610310287741306, "learning_rate": 9.983138552827421e-07, "loss": 0.4576, "step": 306 }, { "epoch": 0.05561594202898551, "grad_norm": 17.25045959911115, "learning_rate": 9.982896952641729e-07, "loss": 0.4327, "step": 307 }, { "epoch": 0.05579710144927536, "grad_norm": 3.2771407388093863, "learning_rate": 9.982653636821009e-07, "loss": 0.4128, "step": 308 }, { "epoch": 0.05597826086956522, "grad_norm": 12.474300239271598, "learning_rate": 9.98240860544903e-07, "loss": 0.4292, "step": 309 }, { "epoch": 0.05615942028985507, "grad_norm": 9.765697228028497, "learning_rate": 9.982161858610164e-07, "loss": 0.4678, "step": 310 }, { "epoch": 0.05634057971014493, "grad_norm": 6.223457294323614, "learning_rate": 9.981913396389363e-07, "loss": 0.4363, "step": 311 }, { "epoch": 0.05652173913043478, "grad_norm": 11.23907673982082, "learning_rate": 9.981663218872176e-07, "loss": 0.4471, "step": 312 }, { "epoch": 0.05670289855072464, "grad_norm": 6.29386904352475, "learning_rate": 9.981411326144739e-07, "loss": 0.3714, "step": 313 }, { "epoch": 0.05688405797101449, "grad_norm": 4.454392684265817, "learning_rate": 9.981157718293778e-07, "loss": 0.431, "step": 314 }, { "epoch": 0.057065217391304345, "grad_norm": 6.7689194707428815, "learning_rate": 9.980902395406614e-07, "loss": 0.3973, "step": 315 }, { "epoch": 0.0572463768115942, "grad_norm": 3.8662898008650397, "learning_rate": 9.980645357571155e-07, "loss": 0.4624, "step": 316 }, { "epoch": 0.057427536231884055, "grad_norm": 11.61237583189014, "learning_rate": 9.980386604875901e-07, "loss": 0.4101, "step": 317 }, { "epoch": 0.057608695652173914, "grad_norm": 18.647937963415455, "learning_rate": 9.980126137409943e-07, "loss": 0.4664, "step": 318 }, { "epoch": 0.057789855072463765, "grad_norm": 20.235344240487514, "learning_rate": 9.979863955262958e-07, "loss": 0.485, "step": 319 }, { "epoch": 0.057971014492753624, "grad_norm": 8.673295240220018, "learning_rate": 9.979600058525218e-07, "loss": 0.5213, "step": 320 }, { "epoch": 0.058152173913043476, "grad_norm": 5.0332827313105, "learning_rate": 9.979334447287583e-07, "loss": 0.4302, "step": 321 }, { "epoch": 0.058333333333333334, "grad_norm": 2.938752515020906, "learning_rate": 9.979067121641508e-07, "loss": 0.4212, "step": 322 }, { "epoch": 0.058514492753623186, "grad_norm": 3.923096115868624, "learning_rate": 9.97879808167903e-07, "loss": 0.4178, "step": 323 }, { "epoch": 0.058695652173913045, "grad_norm": 13.5987032976378, "learning_rate": 9.978527327492782e-07, "loss": 0.5078, "step": 324 }, { "epoch": 0.058876811594202896, "grad_norm": 5.213367702287492, "learning_rate": 9.978254859175989e-07, "loss": 0.3997, "step": 325 }, { "epoch": 0.059057971014492755, "grad_norm": 18.126153719639184, "learning_rate": 9.977980676822457e-07, "loss": 0.5056, "step": 326 }, { "epoch": 0.05923913043478261, "grad_norm": 8.946691114858897, "learning_rate": 9.977704780526595e-07, "loss": 0.4235, "step": 327 }, { "epoch": 0.059420289855072465, "grad_norm": 6.093794338617538, "learning_rate": 9.97742717038339e-07, "loss": 0.4073, "step": 328 }, { "epoch": 0.05960144927536232, "grad_norm": 3.4112452861358578, "learning_rate": 9.977147846488427e-07, "loss": 0.4564, "step": 329 }, { "epoch": 0.059782608695652176, "grad_norm": 7.300564120321209, "learning_rate": 9.976866808937879e-07, "loss": 0.4362, "step": 330 }, { "epoch": 0.05996376811594203, "grad_norm": 5.495126326007882, "learning_rate": 9.976584057828507e-07, "loss": 0.4456, "step": 331 }, { "epoch": 0.060144927536231886, "grad_norm": 12.94818234024048, "learning_rate": 9.976299593257665e-07, "loss": 0.3757, "step": 332 }, { "epoch": 0.06032608695652174, "grad_norm": 5.127082909766672, "learning_rate": 9.976013415323294e-07, "loss": 0.4197, "step": 333 }, { "epoch": 0.060507246376811596, "grad_norm": 7.707380300916236, "learning_rate": 9.975725524123928e-07, "loss": 0.5027, "step": 334 }, { "epoch": 0.06068840579710145, "grad_norm": 2.991295798355372, "learning_rate": 9.975435919758688e-07, "loss": 0.4419, "step": 335 }, { "epoch": 0.06086956521739131, "grad_norm": 11.079079182236235, "learning_rate": 9.975144602327288e-07, "loss": 0.4434, "step": 336 }, { "epoch": 0.06105072463768116, "grad_norm": 18.89291545985812, "learning_rate": 9.974851571930028e-07, "loss": 0.467, "step": 337 }, { "epoch": 0.06123188405797102, "grad_norm": 9.552257156822742, "learning_rate": 9.9745568286678e-07, "loss": 0.4831, "step": 338 }, { "epoch": 0.06141304347826087, "grad_norm": 14.262346669578724, "learning_rate": 9.974260372642085e-07, "loss": 0.485, "step": 339 }, { "epoch": 0.06159420289855073, "grad_norm": 4.106830280026516, "learning_rate": 9.973962203954958e-07, "loss": 0.4163, "step": 340 }, { "epoch": 0.06177536231884058, "grad_norm": 6.518748191177843, "learning_rate": 9.973662322709075e-07, "loss": 0.4545, "step": 341 }, { "epoch": 0.06195652173913044, "grad_norm": 6.564197662443871, "learning_rate": 9.973360729007689e-07, "loss": 0.4459, "step": 342 }, { "epoch": 0.06213768115942029, "grad_norm": 5.044300770936432, "learning_rate": 9.97305742295464e-07, "loss": 0.5336, "step": 343 }, { "epoch": 0.06231884057971015, "grad_norm": 3.7795514206422443, "learning_rate": 9.972752404654356e-07, "loss": 0.394, "step": 344 }, { "epoch": 0.0625, "grad_norm": 3.8497984211877716, "learning_rate": 9.972445674211858e-07, "loss": 0.4195, "step": 345 }, { "epoch": 0.06268115942028986, "grad_norm": 3.8332303461266717, "learning_rate": 9.972137231732755e-07, "loss": 0.3574, "step": 346 }, { "epoch": 0.0628623188405797, "grad_norm": 4.271922254758665, "learning_rate": 9.971827077323246e-07, "loss": 0.4225, "step": 347 }, { "epoch": 0.06304347826086956, "grad_norm": 7.1585652686951535, "learning_rate": 9.971515211090116e-07, "loss": 0.4087, "step": 348 }, { "epoch": 0.06322463768115942, "grad_norm": 15.57564963359638, "learning_rate": 9.971201633140745e-07, "loss": 0.4055, "step": 349 }, { "epoch": 0.06340579710144928, "grad_norm": 5.448346024226663, "learning_rate": 9.970886343583096e-07, "loss": 0.4016, "step": 350 }, { "epoch": 0.06358695652173912, "grad_norm": 13.117456422021675, "learning_rate": 9.970569342525725e-07, "loss": 0.4028, "step": 351 }, { "epoch": 0.06376811594202898, "grad_norm": 6.0202611923497455, "learning_rate": 9.97025063007778e-07, "loss": 0.4211, "step": 352 }, { "epoch": 0.06394927536231884, "grad_norm": 6.870526620490523, "learning_rate": 9.969930206348993e-07, "loss": 0.4077, "step": 353 }, { "epoch": 0.0641304347826087, "grad_norm": 10.515467615290369, "learning_rate": 9.969608071449688e-07, "loss": 0.4629, "step": 354 }, { "epoch": 0.06431159420289854, "grad_norm": 11.80631388821243, "learning_rate": 9.969284225490778e-07, "loss": 0.4268, "step": 355 }, { "epoch": 0.0644927536231884, "grad_norm": 7.6392809525616725, "learning_rate": 9.968958668583764e-07, "loss": 0.4094, "step": 356 }, { "epoch": 0.06467391304347826, "grad_norm": 9.29797531345794, "learning_rate": 9.968631400840736e-07, "loss": 0.4138, "step": 357 }, { "epoch": 0.06485507246376812, "grad_norm": 6.846604831154082, "learning_rate": 9.968302422374377e-07, "loss": 0.4102, "step": 358 }, { "epoch": 0.06503623188405797, "grad_norm": 6.0701576869940315, "learning_rate": 9.967971733297954e-07, "loss": 0.5007, "step": 359 }, { "epoch": 0.06521739130434782, "grad_norm": 3.411764228446719, "learning_rate": 9.967639333725321e-07, "loss": 0.4359, "step": 360 }, { "epoch": 0.06539855072463768, "grad_norm": 6.689856685176495, "learning_rate": 9.96730522377093e-07, "loss": 0.465, "step": 361 }, { "epoch": 0.06557971014492754, "grad_norm": 4.401029651137588, "learning_rate": 9.966969403549816e-07, "loss": 0.4526, "step": 362 }, { "epoch": 0.06576086956521739, "grad_norm": 8.21724962171842, "learning_rate": 9.9666318731776e-07, "loss": 0.4962, "step": 363 }, { "epoch": 0.06594202898550725, "grad_norm": 3.065975264812797, "learning_rate": 9.9662926327705e-07, "loss": 0.3774, "step": 364 }, { "epoch": 0.0661231884057971, "grad_norm": 3.1515456547780785, "learning_rate": 9.965951682445316e-07, "loss": 0.4674, "step": 365 }, { "epoch": 0.06630434782608696, "grad_norm": 8.873169062696592, "learning_rate": 9.965609022319436e-07, "loss": 0.4257, "step": 366 }, { "epoch": 0.06648550724637681, "grad_norm": 7.077801766150442, "learning_rate": 9.965264652510844e-07, "loss": 0.4055, "step": 367 }, { "epoch": 0.06666666666666667, "grad_norm": 2.9996172220762496, "learning_rate": 9.964918573138104e-07, "loss": 0.3944, "step": 368 }, { "epoch": 0.06684782608695652, "grad_norm": 10.369047989122329, "learning_rate": 9.964570784320377e-07, "loss": 0.4819, "step": 369 }, { "epoch": 0.06702898550724638, "grad_norm": 4.9460142568376355, "learning_rate": 9.964221286177406e-07, "loss": 0.394, "step": 370 }, { "epoch": 0.06721014492753623, "grad_norm": 3.893525051628319, "learning_rate": 9.963870078829525e-07, "loss": 0.4529, "step": 371 }, { "epoch": 0.06739130434782609, "grad_norm": 4.304044234640589, "learning_rate": 9.963517162397657e-07, "loss": 0.3751, "step": 372 }, { "epoch": 0.06757246376811595, "grad_norm": 7.732223667642914, "learning_rate": 9.963162537003312e-07, "loss": 0.3972, "step": 373 }, { "epoch": 0.0677536231884058, "grad_norm": 6.0130996151181595, "learning_rate": 9.96280620276859e-07, "loss": 0.3921, "step": 374 }, { "epoch": 0.06793478260869565, "grad_norm": 8.460173035380654, "learning_rate": 9.962448159816177e-07, "loss": 0.3807, "step": 375 }, { "epoch": 0.06811594202898551, "grad_norm": 4.819996735574024, "learning_rate": 9.962088408269352e-07, "loss": 0.3499, "step": 376 }, { "epoch": 0.06829710144927537, "grad_norm": 3.3718245778120006, "learning_rate": 9.961726948251974e-07, "loss": 0.4545, "step": 377 }, { "epoch": 0.06847826086956521, "grad_norm": 7.821014083891689, "learning_rate": 9.9613637798885e-07, "loss": 0.4446, "step": 378 }, { "epoch": 0.06865942028985507, "grad_norm": 6.309749251328278, "learning_rate": 9.960998903303972e-07, "loss": 0.4193, "step": 379 }, { "epoch": 0.06884057971014493, "grad_norm": 6.135018571957307, "learning_rate": 9.960632318624013e-07, "loss": 0.4197, "step": 380 }, { "epoch": 0.06902173913043479, "grad_norm": 2.797929551699773, "learning_rate": 9.960264025974843e-07, "loss": 0.4111, "step": 381 }, { "epoch": 0.06920289855072463, "grad_norm": 4.3193744861577255, "learning_rate": 9.959894025483267e-07, "loss": 0.4464, "step": 382 }, { "epoch": 0.06938405797101449, "grad_norm": 3.112028280208757, "learning_rate": 9.959522317276677e-07, "loss": 0.3816, "step": 383 }, { "epoch": 0.06956521739130435, "grad_norm": 5.335998953603253, "learning_rate": 9.959148901483054e-07, "loss": 0.4342, "step": 384 }, { "epoch": 0.06974637681159421, "grad_norm": 9.249965647740233, "learning_rate": 9.95877377823097e-07, "loss": 0.4338, "step": 385 }, { "epoch": 0.06992753623188405, "grad_norm": 3.925470899440518, "learning_rate": 9.958396947649576e-07, "loss": 0.3627, "step": 386 }, { "epoch": 0.07010869565217391, "grad_norm": 3.9316397537793297, "learning_rate": 9.95801840986862e-07, "loss": 0.428, "step": 387 }, { "epoch": 0.07028985507246377, "grad_norm": 4.057789454179014, "learning_rate": 9.957638165018436e-07, "loss": 0.4597, "step": 388 }, { "epoch": 0.07047101449275363, "grad_norm": 5.255851869069346, "learning_rate": 9.957256213229941e-07, "loss": 0.4545, "step": 389 }, { "epoch": 0.07065217391304347, "grad_norm": 13.445809799619013, "learning_rate": 9.956872554634643e-07, "loss": 0.3859, "step": 390 }, { "epoch": 0.07083333333333333, "grad_norm": 7.045721748771448, "learning_rate": 9.95648718936464e-07, "loss": 0.4236, "step": 391 }, { "epoch": 0.07101449275362319, "grad_norm": 10.569858991413426, "learning_rate": 9.95610011755261e-07, "loss": 0.3715, "step": 392 }, { "epoch": 0.07119565217391305, "grad_norm": 5.582768581984532, "learning_rate": 9.95571133933183e-07, "loss": 0.4978, "step": 393 }, { "epoch": 0.0713768115942029, "grad_norm": 8.552602155127621, "learning_rate": 9.955320854836154e-07, "loss": 0.3998, "step": 394 }, { "epoch": 0.07155797101449275, "grad_norm": 4.5124503893903185, "learning_rate": 9.954928664200028e-07, "loss": 0.4288, "step": 395 }, { "epoch": 0.07173913043478261, "grad_norm": 5.905639449788327, "learning_rate": 9.954534767558488e-07, "loss": 0.4193, "step": 396 }, { "epoch": 0.07192028985507247, "grad_norm": 4.068999111492898, "learning_rate": 9.954139165047153e-07, "loss": 0.4124, "step": 397 }, { "epoch": 0.07210144927536231, "grad_norm": 7.630135633731, "learning_rate": 9.953741856802226e-07, "loss": 0.4202, "step": 398 }, { "epoch": 0.07228260869565217, "grad_norm": 5.424395768705924, "learning_rate": 9.95334284296051e-07, "loss": 0.3836, "step": 399 }, { "epoch": 0.07246376811594203, "grad_norm": 7.323502213352483, "learning_rate": 9.952942123659383e-07, "loss": 0.4171, "step": 400 }, { "epoch": 0.07246376811594203, "eval_loss": 0.439328134059906, "eval_runtime": 9.758, "eval_samples_per_second": 51.24, "eval_steps_per_second": 0.102, "step": 400 }, { "epoch": 0.07264492753623189, "grad_norm": 9.340926912974801, "learning_rate": 9.952539699036817e-07, "loss": 0.4515, "step": 401 }, { "epoch": 0.07282608695652174, "grad_norm": 7.860902436954148, "learning_rate": 9.952135569231364e-07, "loss": 0.4265, "step": 402 }, { "epoch": 0.0730072463768116, "grad_norm": 12.500932961139199, "learning_rate": 9.951729734382173e-07, "loss": 0.45, "step": 403 }, { "epoch": 0.07318840579710145, "grad_norm": 6.523900108438604, "learning_rate": 9.95132219462897e-07, "loss": 0.4467, "step": 404 }, { "epoch": 0.07336956521739131, "grad_norm": 6.27576837518242, "learning_rate": 9.950912950112078e-07, "loss": 0.457, "step": 405 }, { "epoch": 0.07355072463768116, "grad_norm": 7.767324684551404, "learning_rate": 9.9505020009724e-07, "loss": 0.4264, "step": 406 }, { "epoch": 0.07373188405797101, "grad_norm": 6.471263780811469, "learning_rate": 9.950089347351424e-07, "loss": 0.3779, "step": 407 }, { "epoch": 0.07391304347826087, "grad_norm": 3.4477486851014305, "learning_rate": 9.949674989391235e-07, "loss": 0.3806, "step": 408 }, { "epoch": 0.07409420289855072, "grad_norm": 3.253994445320859, "learning_rate": 9.949258927234493e-07, "loss": 0.4117, "step": 409 }, { "epoch": 0.07427536231884058, "grad_norm": 3.0371762069799297, "learning_rate": 9.948841161024452e-07, "loss": 0.4017, "step": 410 }, { "epoch": 0.07445652173913044, "grad_norm": 3.53851217966538, "learning_rate": 9.948421690904953e-07, "loss": 0.3984, "step": 411 }, { "epoch": 0.0746376811594203, "grad_norm": 10.225663070573427, "learning_rate": 9.94800051702042e-07, "loss": 0.4229, "step": 412 }, { "epoch": 0.07481884057971014, "grad_norm": 11.11255647953521, "learning_rate": 9.947577639515862e-07, "loss": 0.4225, "step": 413 }, { "epoch": 0.075, "grad_norm": 6.892373928834471, "learning_rate": 9.947153058536882e-07, "loss": 0.4576, "step": 414 }, { "epoch": 0.07518115942028986, "grad_norm": 3.0793916374965153, "learning_rate": 9.946726774229664e-07, "loss": 0.4047, "step": 415 }, { "epoch": 0.07536231884057971, "grad_norm": 5.650987984564611, "learning_rate": 9.94629878674098e-07, "loss": 0.4232, "step": 416 }, { "epoch": 0.07554347826086956, "grad_norm": 5.125240624974368, "learning_rate": 9.945869096218188e-07, "loss": 0.3933, "step": 417 }, { "epoch": 0.07572463768115942, "grad_norm": 6.004904831867239, "learning_rate": 9.94543770280923e-07, "loss": 0.4163, "step": 418 }, { "epoch": 0.07590579710144928, "grad_norm": 12.202913838238823, "learning_rate": 9.945004606662642e-07, "loss": 0.4258, "step": 419 }, { "epoch": 0.07608695652173914, "grad_norm": 19.806067834435986, "learning_rate": 9.944569807927534e-07, "loss": 0.4641, "step": 420 }, { "epoch": 0.07626811594202898, "grad_norm": 21.45097326185226, "learning_rate": 9.944133306753616e-07, "loss": 0.4027, "step": 421 }, { "epoch": 0.07644927536231884, "grad_norm": 15.711150416386833, "learning_rate": 9.943695103291175e-07, "loss": 0.408, "step": 422 }, { "epoch": 0.0766304347826087, "grad_norm": 5.636549116664538, "learning_rate": 9.943255197691085e-07, "loss": 0.4625, "step": 423 }, { "epoch": 0.07681159420289856, "grad_norm": 6.729199348614858, "learning_rate": 9.94281359010481e-07, "loss": 0.4002, "step": 424 }, { "epoch": 0.0769927536231884, "grad_norm": 3.8159797431684983, "learning_rate": 9.942370280684396e-07, "loss": 0.4036, "step": 425 }, { "epoch": 0.07717391304347826, "grad_norm": 6.753286773647914, "learning_rate": 9.941925269582477e-07, "loss": 0.3701, "step": 426 }, { "epoch": 0.07735507246376812, "grad_norm": 11.310994484797527, "learning_rate": 9.94147855695227e-07, "loss": 0.4587, "step": 427 }, { "epoch": 0.07753623188405798, "grad_norm": 11.5073236225397, "learning_rate": 9.941030142947586e-07, "loss": 0.4494, "step": 428 }, { "epoch": 0.07771739130434782, "grad_norm": 5.733456623019978, "learning_rate": 9.94058002772281e-07, "loss": 0.4662, "step": 429 }, { "epoch": 0.07789855072463768, "grad_norm": 3.0184337786908553, "learning_rate": 9.940128211432923e-07, "loss": 0.3815, "step": 430 }, { "epoch": 0.07807971014492754, "grad_norm": 3.7515337294113276, "learning_rate": 9.939674694233487e-07, "loss": 0.4439, "step": 431 }, { "epoch": 0.0782608695652174, "grad_norm": 3.6615852173475627, "learning_rate": 9.939219476280648e-07, "loss": 0.4047, "step": 432 }, { "epoch": 0.07844202898550724, "grad_norm": 3.7865592767186, "learning_rate": 9.93876255773114e-07, "loss": 0.4135, "step": 433 }, { "epoch": 0.0786231884057971, "grad_norm": 4.890425711346586, "learning_rate": 9.938303938742284e-07, "loss": 0.4216, "step": 434 }, { "epoch": 0.07880434782608696, "grad_norm": 8.781852176695326, "learning_rate": 9.937843619471984e-07, "loss": 0.4204, "step": 435 }, { "epoch": 0.07898550724637682, "grad_norm": 5.451014653615925, "learning_rate": 9.93738160007873e-07, "loss": 0.4648, "step": 436 }, { "epoch": 0.07916666666666666, "grad_norm": 6.440770007617795, "learning_rate": 9.936917880721596e-07, "loss": 0.4434, "step": 437 }, { "epoch": 0.07934782608695652, "grad_norm": 5.793893214872514, "learning_rate": 9.936452461560242e-07, "loss": 0.4867, "step": 438 }, { "epoch": 0.07952898550724638, "grad_norm": 3.1069520285809995, "learning_rate": 9.93598534275492e-07, "loss": 0.3624, "step": 439 }, { "epoch": 0.07971014492753623, "grad_norm": 5.282723070651829, "learning_rate": 9.935516524466456e-07, "loss": 0.4459, "step": 440 }, { "epoch": 0.07989130434782608, "grad_norm": 3.8561441294534524, "learning_rate": 9.935046006856269e-07, "loss": 0.4055, "step": 441 }, { "epoch": 0.08007246376811594, "grad_norm": 11.283086680324008, "learning_rate": 9.934573790086355e-07, "loss": 0.3904, "step": 442 }, { "epoch": 0.0802536231884058, "grad_norm": 5.699597939314377, "learning_rate": 9.93409987431931e-07, "loss": 0.4396, "step": 443 }, { "epoch": 0.08043478260869565, "grad_norm": 6.436041850723825, "learning_rate": 9.933624259718295e-07, "loss": 0.3696, "step": 444 }, { "epoch": 0.0806159420289855, "grad_norm": 5.295092388071156, "learning_rate": 9.933146946447075e-07, "loss": 0.3792, "step": 445 }, { "epoch": 0.08079710144927536, "grad_norm": 4.352238066478703, "learning_rate": 9.932667934669985e-07, "loss": 0.4491, "step": 446 }, { "epoch": 0.08097826086956522, "grad_norm": 7.410705472114366, "learning_rate": 9.932187224551955e-07, "loss": 0.3228, "step": 447 }, { "epoch": 0.08115942028985507, "grad_norm": 11.511784915461405, "learning_rate": 9.931704816258494e-07, "loss": 0.395, "step": 448 }, { "epoch": 0.08134057971014493, "grad_norm": 11.001562864344784, "learning_rate": 9.931220709955698e-07, "loss": 0.4397, "step": 449 }, { "epoch": 0.08152173913043478, "grad_norm": 3.949317430969973, "learning_rate": 9.930734905810248e-07, "loss": 0.3653, "step": 450 }, { "epoch": 0.08170289855072464, "grad_norm": 6.613952449363454, "learning_rate": 9.930247403989407e-07, "loss": 0.4393, "step": 451 }, { "epoch": 0.08188405797101449, "grad_norm": 13.77915475688408, "learning_rate": 9.929758204661026e-07, "loss": 0.426, "step": 452 }, { "epoch": 0.08206521739130435, "grad_norm": 19.507550047347742, "learning_rate": 9.929267307993535e-07, "loss": 0.4501, "step": 453 }, { "epoch": 0.0822463768115942, "grad_norm": 5.963643178822908, "learning_rate": 9.928774714155956e-07, "loss": 0.3923, "step": 454 }, { "epoch": 0.08242753623188406, "grad_norm": 15.061735947604319, "learning_rate": 9.928280423317889e-07, "loss": 0.4199, "step": 455 }, { "epoch": 0.08260869565217391, "grad_norm": 6.680897612163597, "learning_rate": 9.927784435649522e-07, "loss": 0.4019, "step": 456 }, { "epoch": 0.08278985507246377, "grad_norm": 4.3251429861266795, "learning_rate": 9.927286751321625e-07, "loss": 0.4664, "step": 457 }, { "epoch": 0.08297101449275363, "grad_norm": 10.960962341505919, "learning_rate": 9.926787370505555e-07, "loss": 0.4075, "step": 458 }, { "epoch": 0.08315217391304348, "grad_norm": 7.749145911295322, "learning_rate": 9.926286293373247e-07, "loss": 0.407, "step": 459 }, { "epoch": 0.08333333333333333, "grad_norm": 3.729712142380696, "learning_rate": 9.925783520097232e-07, "loss": 0.4122, "step": 460 }, { "epoch": 0.08351449275362319, "grad_norm": 3.8514934585283225, "learning_rate": 9.925279050850607e-07, "loss": 0.3975, "step": 461 }, { "epoch": 0.08369565217391305, "grad_norm": 4.7496888468099785, "learning_rate": 9.92477288580707e-07, "loss": 0.4564, "step": 462 }, { "epoch": 0.0838768115942029, "grad_norm": 3.709577026722062, "learning_rate": 9.924265025140895e-07, "loss": 0.4185, "step": 463 }, { "epoch": 0.08405797101449275, "grad_norm": 9.222652780805598, "learning_rate": 9.92375546902694e-07, "loss": 0.4202, "step": 464 }, { "epoch": 0.08423913043478261, "grad_norm": 5.765529327125465, "learning_rate": 9.923244217640648e-07, "loss": 0.4355, "step": 465 }, { "epoch": 0.08442028985507247, "grad_norm": 5.914602480754362, "learning_rate": 9.922731271158043e-07, "loss": 0.3654, "step": 466 }, { "epoch": 0.08460144927536233, "grad_norm": 3.079931421155299, "learning_rate": 9.922216629755738e-07, "loss": 0.4221, "step": 467 }, { "epoch": 0.08478260869565217, "grad_norm": 4.747874203838646, "learning_rate": 9.921700293610927e-07, "loss": 0.4155, "step": 468 }, { "epoch": 0.08496376811594203, "grad_norm": 3.88032264069236, "learning_rate": 9.921182262901385e-07, "loss": 0.4491, "step": 469 }, { "epoch": 0.08514492753623189, "grad_norm": 5.187752169289457, "learning_rate": 9.92066253780547e-07, "loss": 0.3927, "step": 470 }, { "epoch": 0.08532608695652173, "grad_norm": 8.771015796895298, "learning_rate": 9.920141118502132e-07, "loss": 0.4647, "step": 471 }, { "epoch": 0.08550724637681159, "grad_norm": 6.553980309219007, "learning_rate": 9.919618005170894e-07, "loss": 0.4016, "step": 472 }, { "epoch": 0.08568840579710145, "grad_norm": 3.4027620169061112, "learning_rate": 9.919093197991866e-07, "loss": 0.4774, "step": 473 }, { "epoch": 0.08586956521739131, "grad_norm": 4.949986845985805, "learning_rate": 9.918566697145744e-07, "loss": 0.3714, "step": 474 }, { "epoch": 0.08605072463768115, "grad_norm": 3.6963389796294885, "learning_rate": 9.918038502813803e-07, "loss": 0.4652, "step": 475 }, { "epoch": 0.08623188405797101, "grad_norm": 6.434617160328624, "learning_rate": 9.917508615177903e-07, "loss": 0.392, "step": 476 }, { "epoch": 0.08641304347826087, "grad_norm": 3.4434552084107035, "learning_rate": 9.91697703442049e-07, "loss": 0.4035, "step": 477 }, { "epoch": 0.08659420289855073, "grad_norm": 6.467189772046257, "learning_rate": 9.916443760724582e-07, "loss": 0.3875, "step": 478 }, { "epoch": 0.08677536231884057, "grad_norm": 4.818885264761668, "learning_rate": 9.915908794273796e-07, "loss": 0.451, "step": 479 }, { "epoch": 0.08695652173913043, "grad_norm": 5.608456207161903, "learning_rate": 9.915372135252317e-07, "loss": 0.439, "step": 480 }, { "epoch": 0.08713768115942029, "grad_norm": 4.436576067151408, "learning_rate": 9.914833783844926e-07, "loss": 0.4002, "step": 481 }, { "epoch": 0.08731884057971015, "grad_norm": 7.373402547890791, "learning_rate": 9.914293740236974e-07, "loss": 0.4265, "step": 482 }, { "epoch": 0.0875, "grad_norm": 6.5813453782574145, "learning_rate": 9.913752004614404e-07, "loss": 0.4747, "step": 483 }, { "epoch": 0.08768115942028985, "grad_norm": 4.546582872445812, "learning_rate": 9.913208577163736e-07, "loss": 0.3824, "step": 484 }, { "epoch": 0.08786231884057971, "grad_norm": 3.514233704435464, "learning_rate": 9.912663458072077e-07, "loss": 0.3755, "step": 485 }, { "epoch": 0.08804347826086957, "grad_norm": 6.692949143335827, "learning_rate": 9.91211664752711e-07, "loss": 0.4484, "step": 486 }, { "epoch": 0.08822463768115942, "grad_norm": 8.375534585210136, "learning_rate": 9.91156814571711e-07, "loss": 0.4218, "step": 487 }, { "epoch": 0.08840579710144927, "grad_norm": 3.356234671918528, "learning_rate": 9.911017952830926e-07, "loss": 0.3743, "step": 488 }, { "epoch": 0.08858695652173913, "grad_norm": 3.0297919611763287, "learning_rate": 9.91046606905799e-07, "loss": 0.4025, "step": 489 }, { "epoch": 0.08876811594202899, "grad_norm": 12.139794903176407, "learning_rate": 9.90991249458832e-07, "loss": 0.4568, "step": 490 }, { "epoch": 0.08894927536231884, "grad_norm": 14.89649793798213, "learning_rate": 9.909357229612516e-07, "loss": 0.4219, "step": 491 }, { "epoch": 0.0891304347826087, "grad_norm": 9.39213140678959, "learning_rate": 9.908800274321757e-07, "loss": 0.4205, "step": 492 }, { "epoch": 0.08931159420289855, "grad_norm": 11.847798168359438, "learning_rate": 9.908241628907806e-07, "loss": 0.3959, "step": 493 }, { "epoch": 0.08949275362318841, "grad_norm": 4.464857190048925, "learning_rate": 9.907681293563004e-07, "loss": 0.414, "step": 494 }, { "epoch": 0.08967391304347826, "grad_norm": 3.6050304449575177, "learning_rate": 9.90711926848028e-07, "loss": 0.4686, "step": 495 }, { "epoch": 0.08985507246376812, "grad_norm": 4.346200294431187, "learning_rate": 9.906555553853142e-07, "loss": 0.3239, "step": 496 }, { "epoch": 0.09003623188405797, "grad_norm": 5.373120289089078, "learning_rate": 9.90599014987568e-07, "loss": 0.4218, "step": 497 }, { "epoch": 0.09021739130434783, "grad_norm": 13.948418387535892, "learning_rate": 9.905423056742561e-07, "loss": 0.4207, "step": 498 }, { "epoch": 0.09039855072463768, "grad_norm": 9.299621889157287, "learning_rate": 9.904854274649045e-07, "loss": 0.4248, "step": 499 }, { "epoch": 0.09057971014492754, "grad_norm": 3.7046528030287362, "learning_rate": 9.90428380379096e-07, "loss": 0.4031, "step": 500 }, { "epoch": 0.09057971014492754, "eval_loss": 0.4216562509536743, "eval_runtime": 9.762, "eval_samples_per_second": 51.219, "eval_steps_per_second": 0.102, "step": 500 }, { "epoch": 0.0907608695652174, "grad_norm": 4.909743322977319, "learning_rate": 9.903711644364726e-07, "loss": 0.3991, "step": 501 }, { "epoch": 0.09094202898550724, "grad_norm": 3.2100999274972066, "learning_rate": 9.903137796567338e-07, "loss": 0.3815, "step": 502 }, { "epoch": 0.0911231884057971, "grad_norm": 8.237659705884834, "learning_rate": 9.902562260596374e-07, "loss": 0.3948, "step": 503 }, { "epoch": 0.09130434782608696, "grad_norm": 4.880081596683351, "learning_rate": 9.901985036649997e-07, "loss": 0.3484, "step": 504 }, { "epoch": 0.09148550724637682, "grad_norm": 8.958099383680779, "learning_rate": 9.901406124926945e-07, "loss": 0.3733, "step": 505 }, { "epoch": 0.09166666666666666, "grad_norm": 8.110912513386152, "learning_rate": 9.90082552562654e-07, "loss": 0.3782, "step": 506 }, { "epoch": 0.09184782608695652, "grad_norm": 6.328915402744954, "learning_rate": 9.900243238948686e-07, "loss": 0.4155, "step": 507 }, { "epoch": 0.09202898550724638, "grad_norm": 7.345956243956656, "learning_rate": 9.899659265093867e-07, "loss": 0.4436, "step": 508 }, { "epoch": 0.09221014492753624, "grad_norm": 7.499421591266258, "learning_rate": 9.89907360426315e-07, "loss": 0.3823, "step": 509 }, { "epoch": 0.09239130434782608, "grad_norm": 8.795202176113806, "learning_rate": 9.898486256658176e-07, "loss": 0.4067, "step": 510 }, { "epoch": 0.09257246376811594, "grad_norm": 8.561886093558764, "learning_rate": 9.897897222481176e-07, "loss": 0.4363, "step": 511 }, { "epoch": 0.0927536231884058, "grad_norm": 3.4906015491203295, "learning_rate": 9.897306501934954e-07, "loss": 0.4357, "step": 512 }, { "epoch": 0.09293478260869566, "grad_norm": 3.8086664027306103, "learning_rate": 9.8967140952229e-07, "loss": 0.4391, "step": 513 }, { "epoch": 0.0931159420289855, "grad_norm": 4.021423759166613, "learning_rate": 9.896120002548984e-07, "loss": 0.428, "step": 514 }, { "epoch": 0.09329710144927536, "grad_norm": 3.5751648538740204, "learning_rate": 9.895524224117751e-07, "loss": 0.4229, "step": 515 }, { "epoch": 0.09347826086956522, "grad_norm": 4.377740980156112, "learning_rate": 9.894926760134332e-07, "loss": 0.4141, "step": 516 }, { "epoch": 0.09365942028985508, "grad_norm": 6.746029498524841, "learning_rate": 9.894327610804437e-07, "loss": 0.4437, "step": 517 }, { "epoch": 0.09384057971014492, "grad_norm": 4.662321750690026, "learning_rate": 9.893726776334357e-07, "loss": 0.3762, "step": 518 }, { "epoch": 0.09402173913043478, "grad_norm": 6.028252363662997, "learning_rate": 9.89312425693096e-07, "loss": 0.381, "step": 519 }, { "epoch": 0.09420289855072464, "grad_norm": 10.820973639970484, "learning_rate": 9.892520052801696e-07, "loss": 0.4092, "step": 520 }, { "epoch": 0.0943840579710145, "grad_norm": 4.994053097632154, "learning_rate": 9.891914164154597e-07, "loss": 0.3728, "step": 521 }, { "epoch": 0.09456521739130434, "grad_norm": 4.335073532593652, "learning_rate": 9.891306591198273e-07, "loss": 0.4326, "step": 522 }, { "epoch": 0.0947463768115942, "grad_norm": 4.839108409033974, "learning_rate": 9.890697334141917e-07, "loss": 0.3738, "step": 523 }, { "epoch": 0.09492753623188406, "grad_norm": 3.822368363444714, "learning_rate": 9.890086393195293e-07, "loss": 0.3946, "step": 524 }, { "epoch": 0.09510869565217392, "grad_norm": 7.7936536704105, "learning_rate": 9.889473768568756e-07, "loss": 0.3494, "step": 525 }, { "epoch": 0.09528985507246376, "grad_norm": 3.274470581444187, "learning_rate": 9.888859460473233e-07, "loss": 0.349, "step": 526 }, { "epoch": 0.09547101449275362, "grad_norm": 3.521029986233931, "learning_rate": 9.888243469120232e-07, "loss": 0.3986, "step": 527 }, { "epoch": 0.09565217391304348, "grad_norm": 4.716810008159726, "learning_rate": 9.887625794721847e-07, "loss": 0.3723, "step": 528 }, { "epoch": 0.09583333333333334, "grad_norm": 3.1238151651325814, "learning_rate": 9.88700643749074e-07, "loss": 0.3514, "step": 529 }, { "epoch": 0.09601449275362318, "grad_norm": 9.71183723127655, "learning_rate": 9.886385397640164e-07, "loss": 0.4037, "step": 530 }, { "epoch": 0.09619565217391304, "grad_norm": 5.939099150935516, "learning_rate": 9.885762675383942e-07, "loss": 0.4487, "step": 531 }, { "epoch": 0.0963768115942029, "grad_norm": 3.8273230631587127, "learning_rate": 9.88513827093648e-07, "loss": 0.4105, "step": 532 }, { "epoch": 0.09655797101449276, "grad_norm": 5.485473164960536, "learning_rate": 9.884512184512767e-07, "loss": 0.4277, "step": 533 }, { "epoch": 0.0967391304347826, "grad_norm": 5.031933316795367, "learning_rate": 9.883884416328366e-07, "loss": 0.39, "step": 534 }, { "epoch": 0.09692028985507246, "grad_norm": 3.7205972464354407, "learning_rate": 9.883254966599419e-07, "loss": 0.3806, "step": 535 }, { "epoch": 0.09710144927536232, "grad_norm": 3.7845619210949937, "learning_rate": 9.882623835542648e-07, "loss": 0.4013, "step": 536 }, { "epoch": 0.09728260869565217, "grad_norm": 4.897714959472158, "learning_rate": 9.881991023375361e-07, "loss": 0.3812, "step": 537 }, { "epoch": 0.09746376811594203, "grad_norm": 4.205051144146041, "learning_rate": 9.88135653031543e-07, "loss": 0.391, "step": 538 }, { "epoch": 0.09764492753623188, "grad_norm": 5.9123946130328635, "learning_rate": 9.88072035658132e-07, "loss": 0.4523, "step": 539 }, { "epoch": 0.09782608695652174, "grad_norm": 4.035804154739938, "learning_rate": 9.88008250239206e-07, "loss": 0.4333, "step": 540 }, { "epoch": 0.09800724637681159, "grad_norm": 3.6662815956216366, "learning_rate": 9.879442967967277e-07, "loss": 0.4395, "step": 541 }, { "epoch": 0.09818840579710145, "grad_norm": 3.853913747890982, "learning_rate": 9.87880175352716e-07, "loss": 0.3721, "step": 542 }, { "epoch": 0.0983695652173913, "grad_norm": 4.9243426979249, "learning_rate": 9.87815885929248e-07, "loss": 0.4215, "step": 543 }, { "epoch": 0.09855072463768116, "grad_norm": 4.836214182798005, "learning_rate": 9.877514285484596e-07, "loss": 0.4345, "step": 544 }, { "epoch": 0.09873188405797101, "grad_norm": 2.6599651591981437, "learning_rate": 9.876868032325431e-07, "loss": 0.3733, "step": 545 }, { "epoch": 0.09891304347826087, "grad_norm": 3.89849000510434, "learning_rate": 9.876220100037495e-07, "loss": 0.4503, "step": 546 }, { "epoch": 0.09909420289855073, "grad_norm": 5.7650794262787395, "learning_rate": 9.875570488843877e-07, "loss": 0.415, "step": 547 }, { "epoch": 0.09927536231884058, "grad_norm": 5.045400385597417, "learning_rate": 9.874919198968238e-07, "loss": 0.3721, "step": 548 }, { "epoch": 0.09945652173913043, "grad_norm": 10.845790348299131, "learning_rate": 9.874266230634817e-07, "loss": 0.3974, "step": 549 }, { "epoch": 0.09963768115942029, "grad_norm": 5.820459631963228, "learning_rate": 9.87361158406844e-07, "loss": 0.4004, "step": 550 }, { "epoch": 0.09981884057971015, "grad_norm": 4.982527003568789, "learning_rate": 9.872955259494507e-07, "loss": 0.4426, "step": 551 }, { "epoch": 0.1, "grad_norm": 4.09022491278035, "learning_rate": 9.872297257138986e-07, "loss": 0.4067, "step": 552 }, { "epoch": 0.10018115942028985, "grad_norm": 4.0414344654466925, "learning_rate": 9.871637577228435e-07, "loss": 0.403, "step": 553 }, { "epoch": 0.10036231884057971, "grad_norm": 5.408776258682077, "learning_rate": 9.870976219989983e-07, "loss": 0.392, "step": 554 }, { "epoch": 0.10054347826086957, "grad_norm": 6.118416413008888, "learning_rate": 9.87031318565134e-07, "loss": 0.3667, "step": 555 }, { "epoch": 0.10072463768115943, "grad_norm": 3.713821558799002, "learning_rate": 9.86964847444079e-07, "loss": 0.4355, "step": 556 }, { "epoch": 0.10090579710144927, "grad_norm": 5.084458387294805, "learning_rate": 9.868982086587198e-07, "loss": 0.3884, "step": 557 }, { "epoch": 0.10108695652173913, "grad_norm": 5.474832946669612, "learning_rate": 9.868314022320004e-07, "loss": 0.4503, "step": 558 }, { "epoch": 0.10126811594202899, "grad_norm": 8.965183447895956, "learning_rate": 9.867644281869225e-07, "loss": 0.4244, "step": 559 }, { "epoch": 0.10144927536231885, "grad_norm": 6.668905041203743, "learning_rate": 9.866972865465458e-07, "loss": 0.4434, "step": 560 }, { "epoch": 0.10163043478260869, "grad_norm": 9.970457044149805, "learning_rate": 9.866299773339872e-07, "loss": 0.3823, "step": 561 }, { "epoch": 0.10181159420289855, "grad_norm": 4.516665327587777, "learning_rate": 9.865625005724218e-07, "loss": 0.4406, "step": 562 }, { "epoch": 0.10199275362318841, "grad_norm": 4.297869573489257, "learning_rate": 9.86494856285082e-07, "loss": 0.3742, "step": 563 }, { "epoch": 0.10217391304347827, "grad_norm": 2.9766661533359096, "learning_rate": 9.86427044495258e-07, "loss": 0.402, "step": 564 }, { "epoch": 0.10235507246376811, "grad_norm": 2.9515335020993776, "learning_rate": 9.86359065226298e-07, "loss": 0.3802, "step": 565 }, { "epoch": 0.10253623188405797, "grad_norm": 6.602180704091807, "learning_rate": 9.862909185016075e-07, "loss": 0.4214, "step": 566 }, { "epoch": 0.10271739130434783, "grad_norm": 6.746499100917246, "learning_rate": 9.862226043446495e-07, "loss": 0.3825, "step": 567 }, { "epoch": 0.10289855072463767, "grad_norm": 3.6099067207943136, "learning_rate": 9.861541227789453e-07, "loss": 0.3969, "step": 568 }, { "epoch": 0.10307971014492753, "grad_norm": 13.761913336067417, "learning_rate": 9.860854738280729e-07, "loss": 0.4191, "step": 569 }, { "epoch": 0.10326086956521739, "grad_norm": 6.489451433994243, "learning_rate": 9.860166575156687e-07, "loss": 0.3791, "step": 570 }, { "epoch": 0.10344202898550725, "grad_norm": 4.314055565227107, "learning_rate": 9.859476738654267e-07, "loss": 0.4182, "step": 571 }, { "epoch": 0.1036231884057971, "grad_norm": 4.69387661324399, "learning_rate": 9.85878522901098e-07, "loss": 0.4029, "step": 572 }, { "epoch": 0.10380434782608695, "grad_norm": 10.001203485806512, "learning_rate": 9.858092046464918e-07, "loss": 0.4319, "step": 573 }, { "epoch": 0.10398550724637681, "grad_norm": 10.494122030255477, "learning_rate": 9.857397191254745e-07, "loss": 0.3797, "step": 574 }, { "epoch": 0.10416666666666667, "grad_norm": 5.345093980495571, "learning_rate": 9.856700663619706e-07, "loss": 0.4002, "step": 575 }, { "epoch": 0.10434782608695652, "grad_norm": 4.908460973089521, "learning_rate": 9.856002463799618e-07, "loss": 0.3571, "step": 576 }, { "epoch": 0.10452898550724637, "grad_norm": 2.7921889624281984, "learning_rate": 9.85530259203487e-07, "loss": 0.3745, "step": 577 }, { "epoch": 0.10471014492753623, "grad_norm": 14.436192603938819, "learning_rate": 9.854601048566439e-07, "loss": 0.4241, "step": 578 }, { "epoch": 0.10489130434782609, "grad_norm": 2.92397445519826, "learning_rate": 9.853897833635862e-07, "loss": 0.4128, "step": 579 }, { "epoch": 0.10507246376811594, "grad_norm": 10.900420049048746, "learning_rate": 9.853192947485264e-07, "loss": 0.3661, "step": 580 }, { "epoch": 0.1052536231884058, "grad_norm": 14.626108636879042, "learning_rate": 9.852486390357338e-07, "loss": 0.3942, "step": 581 }, { "epoch": 0.10543478260869565, "grad_norm": 8.30319118243175, "learning_rate": 9.851778162495356e-07, "loss": 0.3694, "step": 582 }, { "epoch": 0.10561594202898551, "grad_norm": 8.169036934257266, "learning_rate": 9.851068264143167e-07, "loss": 0.4099, "step": 583 }, { "epoch": 0.10579710144927536, "grad_norm": 2.9906093830436498, "learning_rate": 9.850356695545188e-07, "loss": 0.3566, "step": 584 }, { "epoch": 0.10597826086956522, "grad_norm": 3.6001680612909825, "learning_rate": 9.849643456946418e-07, "loss": 0.4521, "step": 585 }, { "epoch": 0.10615942028985507, "grad_norm": 6.085782026961147, "learning_rate": 9.848928548592427e-07, "loss": 0.3563, "step": 586 }, { "epoch": 0.10634057971014493, "grad_norm": 7.688400779311696, "learning_rate": 9.848211970729363e-07, "loss": 0.4225, "step": 587 }, { "epoch": 0.10652173913043478, "grad_norm": 5.300904388674724, "learning_rate": 9.847493723603945e-07, "loss": 0.4058, "step": 588 }, { "epoch": 0.10670289855072464, "grad_norm": 11.21739541512806, "learning_rate": 9.846773807463472e-07, "loss": 0.4172, "step": 589 }, { "epoch": 0.1068840579710145, "grad_norm": 8.766216878129292, "learning_rate": 9.84605222255581e-07, "loss": 0.3654, "step": 590 }, { "epoch": 0.10706521739130435, "grad_norm": 12.331651847214227, "learning_rate": 9.845328969129408e-07, "loss": 0.4144, "step": 591 }, { "epoch": 0.1072463768115942, "grad_norm": 6.724710853490635, "learning_rate": 9.844604047433284e-07, "loss": 0.3847, "step": 592 }, { "epoch": 0.10742753623188406, "grad_norm": 3.7769319461712776, "learning_rate": 9.84387745771703e-07, "loss": 0.3605, "step": 593 }, { "epoch": 0.10760869565217392, "grad_norm": 9.274282451844847, "learning_rate": 9.84314920023082e-07, "loss": 0.4225, "step": 594 }, { "epoch": 0.10778985507246377, "grad_norm": 9.08703773612549, "learning_rate": 9.842419275225393e-07, "loss": 0.3561, "step": 595 }, { "epoch": 0.10797101449275362, "grad_norm": 7.237535536958595, "learning_rate": 9.841687682952065e-07, "loss": 0.3491, "step": 596 }, { "epoch": 0.10815217391304348, "grad_norm": 5.977113474938249, "learning_rate": 9.840954423662725e-07, "loss": 0.4388, "step": 597 }, { "epoch": 0.10833333333333334, "grad_norm": 8.476019821951454, "learning_rate": 9.840219497609843e-07, "loss": 0.4062, "step": 598 }, { "epoch": 0.10851449275362318, "grad_norm": 3.530986362196853, "learning_rate": 9.839482905046454e-07, "loss": 0.4208, "step": 599 }, { "epoch": 0.10869565217391304, "grad_norm": 5.094473008842111, "learning_rate": 9.83874464622617e-07, "loss": 0.3959, "step": 600 }, { "epoch": 0.10869565217391304, "eval_loss": 0.3958437442779541, "eval_runtime": 9.7363, "eval_samples_per_second": 51.354, "eval_steps_per_second": 0.103, "step": 600 }, { "epoch": 0.1088768115942029, "grad_norm": 3.2452463718213727, "learning_rate": 9.83800472140318e-07, "loss": 0.3601, "step": 601 }, { "epoch": 0.10905797101449276, "grad_norm": 3.0100855619960862, "learning_rate": 9.837263130832238e-07, "loss": 0.3766, "step": 602 }, { "epoch": 0.1092391304347826, "grad_norm": 2.8678179697818305, "learning_rate": 9.836519874768683e-07, "loss": 0.3745, "step": 603 }, { "epoch": 0.10942028985507246, "grad_norm": 5.053696469470357, "learning_rate": 9.835774953468418e-07, "loss": 0.496, "step": 604 }, { "epoch": 0.10960144927536232, "grad_norm": 4.415771420539539, "learning_rate": 9.835028367187925e-07, "loss": 0.4315, "step": 605 }, { "epoch": 0.10978260869565218, "grad_norm": 3.9164017173426124, "learning_rate": 9.834280116184256e-07, "loss": 0.4481, "step": 606 }, { "epoch": 0.10996376811594202, "grad_norm": 8.193746453862184, "learning_rate": 9.833530200715036e-07, "loss": 0.4083, "step": 607 }, { "epoch": 0.11014492753623188, "grad_norm": 9.998514741027849, "learning_rate": 9.832778621038468e-07, "loss": 0.4246, "step": 608 }, { "epoch": 0.11032608695652174, "grad_norm": 3.60639495102559, "learning_rate": 9.832025377413323e-07, "loss": 0.3658, "step": 609 }, { "epoch": 0.1105072463768116, "grad_norm": 3.8901053542554647, "learning_rate": 9.831270470098945e-07, "loss": 0.4045, "step": 610 }, { "epoch": 0.11068840579710144, "grad_norm": 5.487699115996111, "learning_rate": 9.830513899355256e-07, "loss": 0.3884, "step": 611 }, { "epoch": 0.1108695652173913, "grad_norm": 7.847929804798982, "learning_rate": 9.82975566544274e-07, "loss": 0.3587, "step": 612 }, { "epoch": 0.11105072463768116, "grad_norm": 4.105159274722128, "learning_rate": 9.828995768622466e-07, "loss": 0.3392, "step": 613 }, { "epoch": 0.11123188405797102, "grad_norm": 6.811306174094375, "learning_rate": 9.828234209156068e-07, "loss": 0.4077, "step": 614 }, { "epoch": 0.11141304347826086, "grad_norm": 7.662751539071657, "learning_rate": 9.827470987305755e-07, "loss": 0.3582, "step": 615 }, { "epoch": 0.11159420289855072, "grad_norm": 3.7154738441060866, "learning_rate": 9.82670610333431e-07, "loss": 0.4308, "step": 616 }, { "epoch": 0.11177536231884058, "grad_norm": 3.5686291897697036, "learning_rate": 9.825939557505084e-07, "loss": 0.3893, "step": 617 }, { "epoch": 0.11195652173913044, "grad_norm": 6.22065233119125, "learning_rate": 9.825171350082e-07, "loss": 0.4276, "step": 618 }, { "epoch": 0.11213768115942029, "grad_norm": 10.433056372259445, "learning_rate": 9.82440148132956e-07, "loss": 0.4078, "step": 619 }, { "epoch": 0.11231884057971014, "grad_norm": 2.9469994737947705, "learning_rate": 9.82362995151283e-07, "loss": 0.3905, "step": 620 }, { "epoch": 0.1125, "grad_norm": 3.082673000388614, "learning_rate": 9.822856760897455e-07, "loss": 0.4229, "step": 621 }, { "epoch": 0.11268115942028986, "grad_norm": 14.433871663160714, "learning_rate": 9.822081909749645e-07, "loss": 0.3834, "step": 622 }, { "epoch": 0.1128623188405797, "grad_norm": 3.179853208322323, "learning_rate": 9.821305398336185e-07, "loss": 0.3336, "step": 623 }, { "epoch": 0.11304347826086956, "grad_norm": 4.095122390888196, "learning_rate": 9.820527226924434e-07, "loss": 0.3553, "step": 624 }, { "epoch": 0.11322463768115942, "grad_norm": 3.8474876482605014, "learning_rate": 9.81974739578232e-07, "loss": 0.3805, "step": 625 }, { "epoch": 0.11340579710144928, "grad_norm": 2.877984972194894, "learning_rate": 9.818965905178338e-07, "loss": 0.3715, "step": 626 }, { "epoch": 0.11358695652173913, "grad_norm": 3.0809366208593034, "learning_rate": 9.818182755381564e-07, "loss": 0.3597, "step": 627 }, { "epoch": 0.11376811594202899, "grad_norm": 9.166389852537947, "learning_rate": 9.817397946661637e-07, "loss": 0.4059, "step": 628 }, { "epoch": 0.11394927536231884, "grad_norm": 7.8759684402183465, "learning_rate": 9.816611479288771e-07, "loss": 0.4109, "step": 629 }, { "epoch": 0.11413043478260869, "grad_norm": 11.728571521660111, "learning_rate": 9.815823353533754e-07, "loss": 0.4954, "step": 630 }, { "epoch": 0.11431159420289855, "grad_norm": 3.206979668453135, "learning_rate": 9.815033569667936e-07, "loss": 0.3672, "step": 631 }, { "epoch": 0.1144927536231884, "grad_norm": 7.017852524469355, "learning_rate": 9.814242127963249e-07, "loss": 0.4023, "step": 632 }, { "epoch": 0.11467391304347826, "grad_norm": 4.072937382431754, "learning_rate": 9.813449028692183e-07, "loss": 0.4221, "step": 633 }, { "epoch": 0.11485507246376811, "grad_norm": 6.597252865370013, "learning_rate": 9.812654272127811e-07, "loss": 0.3888, "step": 634 }, { "epoch": 0.11503623188405797, "grad_norm": 10.301603098662682, "learning_rate": 9.811857858543774e-07, "loss": 0.4195, "step": 635 }, { "epoch": 0.11521739130434783, "grad_norm": 4.660264186912435, "learning_rate": 9.811059788214272e-07, "loss": 0.4183, "step": 636 }, { "epoch": 0.11539855072463769, "grad_norm": 5.642509998661616, "learning_rate": 9.81026006141409e-07, "loss": 0.4374, "step": 637 }, { "epoch": 0.11557971014492753, "grad_norm": 6.220978507427946, "learning_rate": 9.80945867841858e-07, "loss": 0.3696, "step": 638 }, { "epoch": 0.11576086956521739, "grad_norm": 6.348755432588962, "learning_rate": 9.80865563950366e-07, "loss": 0.3975, "step": 639 }, { "epoch": 0.11594202898550725, "grad_norm": 2.584135286875402, "learning_rate": 9.807850944945816e-07, "loss": 0.3565, "step": 640 }, { "epoch": 0.1161231884057971, "grad_norm": 11.783637917498103, "learning_rate": 9.807044595022115e-07, "loss": 0.351, "step": 641 }, { "epoch": 0.11630434782608695, "grad_norm": 8.196457156054365, "learning_rate": 9.806236590010183e-07, "loss": 0.4321, "step": 642 }, { "epoch": 0.11648550724637681, "grad_norm": 4.471093443157233, "learning_rate": 9.805426930188219e-07, "loss": 0.3306, "step": 643 }, { "epoch": 0.11666666666666667, "grad_norm": 3.377125441232469, "learning_rate": 9.804615615834994e-07, "loss": 0.3226, "step": 644 }, { "epoch": 0.11684782608695653, "grad_norm": 13.3291823205937, "learning_rate": 9.803802647229851e-07, "loss": 0.4011, "step": 645 }, { "epoch": 0.11702898550724637, "grad_norm": 8.118372845401215, "learning_rate": 9.802988024652691e-07, "loss": 0.3384, "step": 646 }, { "epoch": 0.11721014492753623, "grad_norm": 3.806946572896715, "learning_rate": 9.802171748384e-07, "loss": 0.3943, "step": 647 }, { "epoch": 0.11739130434782609, "grad_norm": 3.446292255646783, "learning_rate": 9.801353818704825e-07, "loss": 0.4333, "step": 648 }, { "epoch": 0.11757246376811595, "grad_norm": 10.789095146819566, "learning_rate": 9.800534235896777e-07, "loss": 0.3832, "step": 649 }, { "epoch": 0.11775362318840579, "grad_norm": 5.649727144288448, "learning_rate": 9.799713000242048e-07, "loss": 0.3973, "step": 650 }, { "epoch": 0.11793478260869565, "grad_norm": 4.459102989692808, "learning_rate": 9.79889011202339e-07, "loss": 0.353, "step": 651 }, { "epoch": 0.11811594202898551, "grad_norm": 4.759071213763818, "learning_rate": 9.79806557152413e-07, "loss": 0.3931, "step": 652 }, { "epoch": 0.11829710144927537, "grad_norm": 6.921810591021103, "learning_rate": 9.797239379028162e-07, "loss": 0.3712, "step": 653 }, { "epoch": 0.11847826086956521, "grad_norm": 3.440311046248852, "learning_rate": 9.796411534819944e-07, "loss": 0.3821, "step": 654 }, { "epoch": 0.11865942028985507, "grad_norm": 13.185666667653342, "learning_rate": 9.795582039184508e-07, "loss": 0.3495, "step": 655 }, { "epoch": 0.11884057971014493, "grad_norm": 11.383458188725191, "learning_rate": 9.794750892407455e-07, "loss": 0.4305, "step": 656 }, { "epoch": 0.11902173913043479, "grad_norm": 3.580592974471932, "learning_rate": 9.79391809477495e-07, "loss": 0.3491, "step": 657 }, { "epoch": 0.11920289855072463, "grad_norm": 9.90590132292987, "learning_rate": 9.79308364657373e-07, "loss": 0.4762, "step": 658 }, { "epoch": 0.11938405797101449, "grad_norm": 14.952403412799473, "learning_rate": 9.792247548091105e-07, "loss": 0.4676, "step": 659 }, { "epoch": 0.11956521739130435, "grad_norm": 3.556782685122642, "learning_rate": 9.79140979961494e-07, "loss": 0.3681, "step": 660 }, { "epoch": 0.1197463768115942, "grad_norm": 3.356889662500131, "learning_rate": 9.79057040143368e-07, "loss": 0.4087, "step": 661 }, { "epoch": 0.11992753623188405, "grad_norm": 11.935608592160225, "learning_rate": 9.789729353836333e-07, "loss": 0.4225, "step": 662 }, { "epoch": 0.12010869565217391, "grad_norm": 13.74587458686005, "learning_rate": 9.788886657112473e-07, "loss": 0.3547, "step": 663 }, { "epoch": 0.12028985507246377, "grad_norm": 12.814818226606935, "learning_rate": 9.78804231155225e-07, "loss": 0.428, "step": 664 }, { "epoch": 0.12047101449275362, "grad_norm": 16.426466256408123, "learning_rate": 9.787196317446368e-07, "loss": 0.3838, "step": 665 }, { "epoch": 0.12065217391304348, "grad_norm": 6.089270057942982, "learning_rate": 9.786348675086115e-07, "loss": 0.4162, "step": 666 }, { "epoch": 0.12083333333333333, "grad_norm": 6.513792221842381, "learning_rate": 9.785499384763336e-07, "loss": 0.3676, "step": 667 }, { "epoch": 0.12101449275362319, "grad_norm": 5.519976041785401, "learning_rate": 9.784648446770442e-07, "loss": 0.3644, "step": 668 }, { "epoch": 0.12119565217391304, "grad_norm": 9.847255679651372, "learning_rate": 9.78379586140042e-07, "loss": 0.4197, "step": 669 }, { "epoch": 0.1213768115942029, "grad_norm": 9.779224060063939, "learning_rate": 9.782941628946817e-07, "loss": 0.3552, "step": 670 }, { "epoch": 0.12155797101449275, "grad_norm": 5.233732751639689, "learning_rate": 9.782085749703747e-07, "loss": 0.3907, "step": 671 }, { "epoch": 0.12173913043478261, "grad_norm": 3.2881841013666295, "learning_rate": 9.781228223965897e-07, "loss": 0.3704, "step": 672 }, { "epoch": 0.12192028985507246, "grad_norm": 3.1476049070256273, "learning_rate": 9.780369052028514e-07, "loss": 0.3734, "step": 673 }, { "epoch": 0.12210144927536232, "grad_norm": 4.262774040056997, "learning_rate": 9.779508234187418e-07, "loss": 0.4518, "step": 674 }, { "epoch": 0.12228260869565218, "grad_norm": 8.404520746520697, "learning_rate": 9.778645770738989e-07, "loss": 0.3849, "step": 675 }, { "epoch": 0.12246376811594203, "grad_norm": 3.435724116865664, "learning_rate": 9.777781661980183e-07, "loss": 0.3946, "step": 676 }, { "epoch": 0.12264492753623188, "grad_norm": 3.3178740566243046, "learning_rate": 9.77691590820851e-07, "loss": 0.4237, "step": 677 }, { "epoch": 0.12282608695652174, "grad_norm": 7.137062377583599, "learning_rate": 9.776048509722058e-07, "loss": 0.4645, "step": 678 }, { "epoch": 0.1230072463768116, "grad_norm": 3.3447094611975925, "learning_rate": 9.775179466819473e-07, "loss": 0.3505, "step": 679 }, { "epoch": 0.12318840579710146, "grad_norm": 9.45090635641566, "learning_rate": 9.774308779799973e-07, "loss": 0.3764, "step": 680 }, { "epoch": 0.1233695652173913, "grad_norm": 8.990675798337948, "learning_rate": 9.77343644896334e-07, "loss": 0.3374, "step": 681 }, { "epoch": 0.12355072463768116, "grad_norm": 6.0427782176287135, "learning_rate": 9.77256247460992e-07, "loss": 0.4135, "step": 682 }, { "epoch": 0.12373188405797102, "grad_norm": 4.548460165953483, "learning_rate": 9.771686857040628e-07, "loss": 0.3986, "step": 683 }, { "epoch": 0.12391304347826088, "grad_norm": 4.814153264451756, "learning_rate": 9.770809596556941e-07, "loss": 0.463, "step": 684 }, { "epoch": 0.12409420289855072, "grad_norm": 4.316694317097807, "learning_rate": 9.769930693460905e-07, "loss": 0.3973, "step": 685 }, { "epoch": 0.12427536231884058, "grad_norm": 7.435108970514274, "learning_rate": 9.769050148055132e-07, "loss": 0.3527, "step": 686 }, { "epoch": 0.12445652173913044, "grad_norm": 6.476662969387697, "learning_rate": 9.768167960642797e-07, "loss": 0.3514, "step": 687 }, { "epoch": 0.1246376811594203, "grad_norm": 2.9049206699213035, "learning_rate": 9.76728413152764e-07, "loss": 0.4276, "step": 688 }, { "epoch": 0.12481884057971014, "grad_norm": 4.764847311211572, "learning_rate": 9.766398661013971e-07, "loss": 0.4558, "step": 689 }, { "epoch": 0.125, "grad_norm": 6.407684827808415, "learning_rate": 9.765511549406656e-07, "loss": 0.3802, "step": 690 }, { "epoch": 0.12518115942028984, "grad_norm": 3.7028837829026564, "learning_rate": 9.764622797011137e-07, "loss": 0.4103, "step": 691 }, { "epoch": 0.12536231884057972, "grad_norm": 7.71378838626683, "learning_rate": 9.763732404133413e-07, "loss": 0.363, "step": 692 }, { "epoch": 0.12554347826086956, "grad_norm": 9.643257822231876, "learning_rate": 9.762840371080053e-07, "loss": 0.4159, "step": 693 }, { "epoch": 0.1257246376811594, "grad_norm": 4.696104032789181, "learning_rate": 9.761946698158184e-07, "loss": 0.3883, "step": 694 }, { "epoch": 0.12590579710144928, "grad_norm": 3.2731710565805168, "learning_rate": 9.761051385675505e-07, "loss": 0.4235, "step": 695 }, { "epoch": 0.12608695652173912, "grad_norm": 2.7622798539691713, "learning_rate": 9.760154433940277e-07, "loss": 0.3998, "step": 696 }, { "epoch": 0.126268115942029, "grad_norm": 7.475118099531793, "learning_rate": 9.759255843261321e-07, "loss": 0.3657, "step": 697 }, { "epoch": 0.12644927536231884, "grad_norm": 3.0033639726547863, "learning_rate": 9.75835561394803e-07, "loss": 0.3854, "step": 698 }, { "epoch": 0.1266304347826087, "grad_norm": 3.8583730298714145, "learning_rate": 9.757453746310356e-07, "loss": 0.4194, "step": 699 }, { "epoch": 0.12681159420289856, "grad_norm": 5.064421534590891, "learning_rate": 9.756550240658813e-07, "loss": 0.386, "step": 700 }, { "epoch": 0.12681159420289856, "eval_loss": 0.3960312604904175, "eval_runtime": 9.762, "eval_samples_per_second": 51.219, "eval_steps_per_second": 0.102, "step": 700 }, { "epoch": 0.1269927536231884, "grad_norm": 6.6325477871274945, "learning_rate": 9.755645097304487e-07, "loss": 0.3879, "step": 701 }, { "epoch": 0.12717391304347825, "grad_norm": 13.7923980110677, "learning_rate": 9.754738316559021e-07, "loss": 0.4185, "step": 702 }, { "epoch": 0.12735507246376812, "grad_norm": 3.700581849731215, "learning_rate": 9.753829898734625e-07, "loss": 0.3622, "step": 703 }, { "epoch": 0.12753623188405797, "grad_norm": 3.0346633543449992, "learning_rate": 9.752919844144072e-07, "loss": 0.3396, "step": 704 }, { "epoch": 0.12771739130434784, "grad_norm": 6.3041659774770595, "learning_rate": 9.752008153100694e-07, "loss": 0.4014, "step": 705 }, { "epoch": 0.12789855072463768, "grad_norm": 6.063651744567198, "learning_rate": 9.751094825918396e-07, "loss": 0.4123, "step": 706 }, { "epoch": 0.12807971014492753, "grad_norm": 9.294585763947483, "learning_rate": 9.750179862911636e-07, "loss": 0.3395, "step": 707 }, { "epoch": 0.1282608695652174, "grad_norm": 3.7473147595140923, "learning_rate": 9.749263264395442e-07, "loss": 0.3564, "step": 708 }, { "epoch": 0.12844202898550725, "grad_norm": 6.465478677553752, "learning_rate": 9.748345030685405e-07, "loss": 0.2896, "step": 709 }, { "epoch": 0.1286231884057971, "grad_norm": 4.18430460390063, "learning_rate": 9.747425162097675e-07, "loss": 0.3494, "step": 710 }, { "epoch": 0.12880434782608696, "grad_norm": 8.4926540296536, "learning_rate": 9.74650365894897e-07, "loss": 0.4205, "step": 711 }, { "epoch": 0.1289855072463768, "grad_norm": 4.976275165228336, "learning_rate": 9.745580521556565e-07, "loss": 0.362, "step": 712 }, { "epoch": 0.12916666666666668, "grad_norm": 3.5694099166930733, "learning_rate": 9.7446557502383e-07, "loss": 0.4035, "step": 713 }, { "epoch": 0.12934782608695652, "grad_norm": 8.431100018244468, "learning_rate": 9.74372934531258e-07, "loss": 0.4276, "step": 714 }, { "epoch": 0.12952898550724637, "grad_norm": 5.324249127684661, "learning_rate": 9.74280130709837e-07, "loss": 0.369, "step": 715 }, { "epoch": 0.12971014492753624, "grad_norm": 6.61440037267252, "learning_rate": 9.741871635915198e-07, "loss": 0.4131, "step": 716 }, { "epoch": 0.1298913043478261, "grad_norm": 8.301797080913259, "learning_rate": 9.740940332083157e-07, "loss": 0.4225, "step": 717 }, { "epoch": 0.13007246376811593, "grad_norm": 8.86277659785471, "learning_rate": 9.740007395922894e-07, "loss": 0.3135, "step": 718 }, { "epoch": 0.1302536231884058, "grad_norm": 4.698305190045604, "learning_rate": 9.739072827755625e-07, "loss": 0.4027, "step": 719 }, { "epoch": 0.13043478260869565, "grad_norm": 2.789284161666489, "learning_rate": 9.738136627903128e-07, "loss": 0.3113, "step": 720 }, { "epoch": 0.13061594202898552, "grad_norm": 3.926952487330974, "learning_rate": 9.737198796687741e-07, "loss": 0.354, "step": 721 }, { "epoch": 0.13079710144927537, "grad_norm": 7.832881188785057, "learning_rate": 9.736259334432365e-07, "loss": 0.3597, "step": 722 }, { "epoch": 0.1309782608695652, "grad_norm": 3.475532568110265, "learning_rate": 9.735318241460455e-07, "loss": 0.3256, "step": 723 }, { "epoch": 0.13115942028985508, "grad_norm": 6.701946998152024, "learning_rate": 9.73437551809604e-07, "loss": 0.3619, "step": 724 }, { "epoch": 0.13134057971014493, "grad_norm": 4.738607910166096, "learning_rate": 9.733431164663704e-07, "loss": 0.4333, "step": 725 }, { "epoch": 0.13152173913043477, "grad_norm": 3.7634470648572504, "learning_rate": 9.732485181488587e-07, "loss": 0.421, "step": 726 }, { "epoch": 0.13170289855072465, "grad_norm": 4.2927344771472855, "learning_rate": 9.731537568896402e-07, "loss": 0.3489, "step": 727 }, { "epoch": 0.1318840579710145, "grad_norm": 13.340906086846939, "learning_rate": 9.730588327213413e-07, "loss": 0.3699, "step": 728 }, { "epoch": 0.13206521739130433, "grad_norm": 4.351896014543642, "learning_rate": 9.729637456766448e-07, "loss": 0.3864, "step": 729 }, { "epoch": 0.1322463768115942, "grad_norm": 5.070168107936124, "learning_rate": 9.728684957882897e-07, "loss": 0.3193, "step": 730 }, { "epoch": 0.13242753623188405, "grad_norm": 9.97750769002732, "learning_rate": 9.727730830890711e-07, "loss": 0.3527, "step": 731 }, { "epoch": 0.13260869565217392, "grad_norm": 3.5442634744989308, "learning_rate": 9.7267750761184e-07, "loss": 0.3431, "step": 732 }, { "epoch": 0.13278985507246377, "grad_norm": 3.1279346551227056, "learning_rate": 9.725817693895033e-07, "loss": 0.3572, "step": 733 }, { "epoch": 0.13297101449275361, "grad_norm": 5.618630634188929, "learning_rate": 9.724858684550242e-07, "loss": 0.4297, "step": 734 }, { "epoch": 0.1331521739130435, "grad_norm": 3.2992168776312347, "learning_rate": 9.72389804841422e-07, "loss": 0.3508, "step": 735 }, { "epoch": 0.13333333333333333, "grad_norm": 3.4024123655505205, "learning_rate": 9.722935785817719e-07, "loss": 0.3826, "step": 736 }, { "epoch": 0.13351449275362318, "grad_norm": 3.3014334098875877, "learning_rate": 9.721971897092047e-07, "loss": 0.3617, "step": 737 }, { "epoch": 0.13369565217391305, "grad_norm": 3.6500039654226373, "learning_rate": 9.721006382569078e-07, "loss": 0.3816, "step": 738 }, { "epoch": 0.1338768115942029, "grad_norm": 14.450200674257177, "learning_rate": 9.720039242581244e-07, "loss": 0.3958, "step": 739 }, { "epoch": 0.13405797101449277, "grad_norm": 6.404906931479888, "learning_rate": 9.719070477461534e-07, "loss": 0.3394, "step": 740 }, { "epoch": 0.1342391304347826, "grad_norm": 5.9014248150966155, "learning_rate": 9.7181000875435e-07, "loss": 0.3561, "step": 741 }, { "epoch": 0.13442028985507246, "grad_norm": 5.323203958593394, "learning_rate": 9.717128073161251e-07, "loss": 0.3605, "step": 742 }, { "epoch": 0.13460144927536233, "grad_norm": 5.167294648938972, "learning_rate": 9.716154434649454e-07, "loss": 0.4105, "step": 743 }, { "epoch": 0.13478260869565217, "grad_norm": 8.858319730185437, "learning_rate": 9.715179172343342e-07, "loss": 0.3992, "step": 744 }, { "epoch": 0.13496376811594202, "grad_norm": 4.038283642494259, "learning_rate": 9.7142022865787e-07, "loss": 0.4045, "step": 745 }, { "epoch": 0.1351449275362319, "grad_norm": 6.217163933503129, "learning_rate": 9.713223777691873e-07, "loss": 0.345, "step": 746 }, { "epoch": 0.13532608695652174, "grad_norm": 6.6243645812344765, "learning_rate": 9.712243646019768e-07, "loss": 0.3281, "step": 747 }, { "epoch": 0.1355072463768116, "grad_norm": 3.6858113189654027, "learning_rate": 9.71126189189985e-07, "loss": 0.3591, "step": 748 }, { "epoch": 0.13568840579710145, "grad_norm": 5.855006578912425, "learning_rate": 9.710278515670138e-07, "loss": 0.363, "step": 749 }, { "epoch": 0.1358695652173913, "grad_norm": 4.718939998893757, "learning_rate": 9.709293517669216e-07, "loss": 0.4333, "step": 750 }, { "epoch": 0.13605072463768117, "grad_norm": 3.990509621420453, "learning_rate": 9.708306898236224e-07, "loss": 0.3584, "step": 751 }, { "epoch": 0.13623188405797101, "grad_norm": 6.355724151961903, "learning_rate": 9.707318657710856e-07, "loss": 0.3691, "step": 752 }, { "epoch": 0.13641304347826086, "grad_norm": 5.630874686294518, "learning_rate": 9.706328796433372e-07, "loss": 0.3609, "step": 753 }, { "epoch": 0.13659420289855073, "grad_norm": 15.440653376058837, "learning_rate": 9.705337314744584e-07, "loss": 0.3926, "step": 754 }, { "epoch": 0.13677536231884058, "grad_norm": 8.163687939741473, "learning_rate": 9.704344212985864e-07, "loss": 0.3619, "step": 755 }, { "epoch": 0.13695652173913042, "grad_norm": 8.200076470458544, "learning_rate": 9.703349491499141e-07, "loss": 0.3517, "step": 756 }, { "epoch": 0.1371376811594203, "grad_norm": 9.886028220333985, "learning_rate": 9.702353150626905e-07, "loss": 0.3894, "step": 757 }, { "epoch": 0.13731884057971014, "grad_norm": 11.179868795711787, "learning_rate": 9.701355190712198e-07, "loss": 0.3624, "step": 758 }, { "epoch": 0.1375, "grad_norm": 3.7661974639531945, "learning_rate": 9.700355612098625e-07, "loss": 0.3106, "step": 759 }, { "epoch": 0.13768115942028986, "grad_norm": 10.171930780588786, "learning_rate": 9.699354415130342e-07, "loss": 0.3062, "step": 760 }, { "epoch": 0.1378623188405797, "grad_norm": 5.922694462163911, "learning_rate": 9.698351600152069e-07, "loss": 0.3886, "step": 761 }, { "epoch": 0.13804347826086957, "grad_norm": 11.546057025043572, "learning_rate": 9.69734716750908e-07, "loss": 0.451, "step": 762 }, { "epoch": 0.13822463768115942, "grad_norm": 8.915262455626223, "learning_rate": 9.696341117547203e-07, "loss": 0.4023, "step": 763 }, { "epoch": 0.13840579710144926, "grad_norm": 4.161229591208603, "learning_rate": 9.695333450612826e-07, "loss": 0.3854, "step": 764 }, { "epoch": 0.13858695652173914, "grad_norm": 11.600860504527866, "learning_rate": 9.694324167052897e-07, "loss": 0.3711, "step": 765 }, { "epoch": 0.13876811594202898, "grad_norm": 4.0145484078765765, "learning_rate": 9.693313267214916e-07, "loss": 0.3898, "step": 766 }, { "epoch": 0.13894927536231885, "grad_norm": 5.804797337669499, "learning_rate": 9.692300751446939e-07, "loss": 0.3209, "step": 767 }, { "epoch": 0.1391304347826087, "grad_norm": 4.193754548751051, "learning_rate": 9.691286620097578e-07, "loss": 0.3179, "step": 768 }, { "epoch": 0.13931159420289854, "grad_norm": 3.475967173310117, "learning_rate": 9.69027087351601e-07, "loss": 0.3642, "step": 769 }, { "epoch": 0.13949275362318841, "grad_norm": 7.257326955939186, "learning_rate": 9.689253512051953e-07, "loss": 0.3701, "step": 770 }, { "epoch": 0.13967391304347826, "grad_norm": 5.009573319900192, "learning_rate": 9.688234536055697e-07, "loss": 0.4319, "step": 771 }, { "epoch": 0.1398550724637681, "grad_norm": 4.11774493284656, "learning_rate": 9.687213945878077e-07, "loss": 0.3671, "step": 772 }, { "epoch": 0.14003623188405798, "grad_norm": 4.00577694651643, "learning_rate": 9.686191741870484e-07, "loss": 0.4042, "step": 773 }, { "epoch": 0.14021739130434782, "grad_norm": 4.933949750613941, "learning_rate": 9.685167924384874e-07, "loss": 0.3898, "step": 774 }, { "epoch": 0.1403985507246377, "grad_norm": 6.733188882480495, "learning_rate": 9.684142493773746e-07, "loss": 0.3627, "step": 775 }, { "epoch": 0.14057971014492754, "grad_norm": 6.449572526510951, "learning_rate": 9.683115450390166e-07, "loss": 0.406, "step": 776 }, { "epoch": 0.14076086956521738, "grad_norm": 5.138868832834836, "learning_rate": 9.682086794587746e-07, "loss": 0.4077, "step": 777 }, { "epoch": 0.14094202898550726, "grad_norm": 5.662777115952582, "learning_rate": 9.681056526720659e-07, "loss": 0.3893, "step": 778 }, { "epoch": 0.1411231884057971, "grad_norm": 4.011876697773631, "learning_rate": 9.68002464714363e-07, "loss": 0.3967, "step": 779 }, { "epoch": 0.14130434782608695, "grad_norm": 5.1476725760419635, "learning_rate": 9.67899115621194e-07, "loss": 0.3622, "step": 780 }, { "epoch": 0.14148550724637682, "grad_norm": 6.817405599928272, "learning_rate": 9.677956054281427e-07, "loss": 0.3489, "step": 781 }, { "epoch": 0.14166666666666666, "grad_norm": 7.3995868163142395, "learning_rate": 9.676919341708478e-07, "loss": 0.4357, "step": 782 }, { "epoch": 0.14184782608695654, "grad_norm": 6.939671348813399, "learning_rate": 9.675881018850042e-07, "loss": 0.3524, "step": 783 }, { "epoch": 0.14202898550724638, "grad_norm": 12.293898606053197, "learning_rate": 9.674841086063615e-07, "loss": 0.3965, "step": 784 }, { "epoch": 0.14221014492753623, "grad_norm": 10.188664348028176, "learning_rate": 9.673799543707253e-07, "loss": 0.3198, "step": 785 }, { "epoch": 0.1423913043478261, "grad_norm": 16.162971647431426, "learning_rate": 9.672756392139562e-07, "loss": 0.4337, "step": 786 }, { "epoch": 0.14257246376811594, "grad_norm": 3.8420235768561644, "learning_rate": 9.671711631719705e-07, "loss": 0.3297, "step": 787 }, { "epoch": 0.1427536231884058, "grad_norm": 6.522246431494951, "learning_rate": 9.670665262807401e-07, "loss": 0.3169, "step": 788 }, { "epoch": 0.14293478260869566, "grad_norm": 6.425771496097432, "learning_rate": 9.669617285762915e-07, "loss": 0.295, "step": 789 }, { "epoch": 0.1431159420289855, "grad_norm": 3.7260951294235203, "learning_rate": 9.668567700947073e-07, "loss": 0.4178, "step": 790 }, { "epoch": 0.14329710144927535, "grad_norm": 5.104163619378291, "learning_rate": 9.66751650872125e-07, "loss": 0.4174, "step": 791 }, { "epoch": 0.14347826086956522, "grad_norm": 7.84633879459535, "learning_rate": 9.666463709447378e-07, "loss": 0.3659, "step": 792 }, { "epoch": 0.14365942028985507, "grad_norm": 7.278148656240213, "learning_rate": 9.665409303487942e-07, "loss": 0.4153, "step": 793 }, { "epoch": 0.14384057971014494, "grad_norm": 6.103875019351923, "learning_rate": 9.664353291205977e-07, "loss": 0.35, "step": 794 }, { "epoch": 0.14402173913043478, "grad_norm": 3.403446085823657, "learning_rate": 9.663295672965072e-07, "loss": 0.3642, "step": 795 }, { "epoch": 0.14420289855072463, "grad_norm": 3.5099159116141974, "learning_rate": 9.662236449129376e-07, "loss": 0.34, "step": 796 }, { "epoch": 0.1443840579710145, "grad_norm": 3.9585793014698125, "learning_rate": 9.661175620063577e-07, "loss": 0.4159, "step": 797 }, { "epoch": 0.14456521739130435, "grad_norm": 3.279449429804172, "learning_rate": 9.660113186132929e-07, "loss": 0.3469, "step": 798 }, { "epoch": 0.1447463768115942, "grad_norm": 10.946868427677378, "learning_rate": 9.659049147703229e-07, "loss": 0.3452, "step": 799 }, { "epoch": 0.14492753623188406, "grad_norm": 5.565521881030789, "learning_rate": 9.657983505140832e-07, "loss": 0.3445, "step": 800 }, { "epoch": 0.14492753623188406, "eval_loss": 0.38487499952316284, "eval_runtime": 9.836, "eval_samples_per_second": 50.834, "eval_steps_per_second": 0.102, "step": 800 }, { "epoch": 0.1451086956521739, "grad_norm": 4.548090245672361, "learning_rate": 9.656916258812644e-07, "loss": 0.3264, "step": 801 }, { "epoch": 0.14528985507246378, "grad_norm": 3.60462194109557, "learning_rate": 9.655847409086127e-07, "loss": 0.3765, "step": 802 }, { "epoch": 0.14547101449275363, "grad_norm": 6.9962945480349745, "learning_rate": 9.654776956329282e-07, "loss": 0.417, "step": 803 }, { "epoch": 0.14565217391304347, "grad_norm": 4.42724840563812, "learning_rate": 9.65370490091068e-07, "loss": 0.3067, "step": 804 }, { "epoch": 0.14583333333333334, "grad_norm": 5.433112866664569, "learning_rate": 9.65263124319943e-07, "loss": 0.3631, "step": 805 }, { "epoch": 0.1460144927536232, "grad_norm": 4.506366773451285, "learning_rate": 9.651555983565197e-07, "loss": 0.3542, "step": 806 }, { "epoch": 0.14619565217391303, "grad_norm": 3.549703730721839, "learning_rate": 9.650479122378202e-07, "loss": 0.3972, "step": 807 }, { "epoch": 0.1463768115942029, "grad_norm": 3.9561570455500323, "learning_rate": 9.649400660009209e-07, "loss": 0.4058, "step": 808 }, { "epoch": 0.14655797101449275, "grad_norm": 13.666271787920849, "learning_rate": 9.648320596829538e-07, "loss": 0.43, "step": 809 }, { "epoch": 0.14673913043478262, "grad_norm": 7.1884870677029244, "learning_rate": 9.647238933211064e-07, "loss": 0.4365, "step": 810 }, { "epoch": 0.14692028985507247, "grad_norm": 9.467113610615664, "learning_rate": 9.646155669526204e-07, "loss": 0.3666, "step": 811 }, { "epoch": 0.1471014492753623, "grad_norm": 4.560384820550004, "learning_rate": 9.645070806147936e-07, "loss": 0.4078, "step": 812 }, { "epoch": 0.14728260869565218, "grad_norm": 7.542947790041231, "learning_rate": 9.643984343449777e-07, "loss": 0.3817, "step": 813 }, { "epoch": 0.14746376811594203, "grad_norm": 4.889270994146063, "learning_rate": 9.642896281805805e-07, "loss": 0.412, "step": 814 }, { "epoch": 0.14764492753623187, "grad_norm": 5.810871439198549, "learning_rate": 9.641806621590647e-07, "loss": 0.3763, "step": 815 }, { "epoch": 0.14782608695652175, "grad_norm": 3.2506554566136145, "learning_rate": 9.640715363179477e-07, "loss": 0.3561, "step": 816 }, { "epoch": 0.1480072463768116, "grad_norm": 5.478664739910058, "learning_rate": 9.639622506948017e-07, "loss": 0.3551, "step": 817 }, { "epoch": 0.14818840579710144, "grad_norm": 5.054596974684607, "learning_rate": 9.638528053272544e-07, "loss": 0.4258, "step": 818 }, { "epoch": 0.1483695652173913, "grad_norm": 6.284451731268105, "learning_rate": 9.637432002529886e-07, "loss": 0.3923, "step": 819 }, { "epoch": 0.14855072463768115, "grad_norm": 5.097815918738833, "learning_rate": 9.636334355097417e-07, "loss": 0.372, "step": 820 }, { "epoch": 0.14873188405797103, "grad_norm": 8.42513399020794, "learning_rate": 9.635235111353061e-07, "loss": 0.3941, "step": 821 }, { "epoch": 0.14891304347826087, "grad_norm": 5.125452534191936, "learning_rate": 9.634134271675294e-07, "loss": 0.3848, "step": 822 }, { "epoch": 0.14909420289855072, "grad_norm": 4.9518559143653205, "learning_rate": 9.633031836443142e-07, "loss": 0.3407, "step": 823 }, { "epoch": 0.1492753623188406, "grad_norm": 7.988320363279841, "learning_rate": 9.631927806036175e-07, "loss": 0.4225, "step": 824 }, { "epoch": 0.14945652173913043, "grad_norm": 10.514387579134466, "learning_rate": 9.630822180834518e-07, "loss": 0.42, "step": 825 }, { "epoch": 0.14963768115942028, "grad_norm": 4.008870447693685, "learning_rate": 9.629714961218845e-07, "loss": 0.3538, "step": 826 }, { "epoch": 0.14981884057971015, "grad_norm": 3.2600794804770348, "learning_rate": 9.628606147570374e-07, "loss": 0.3674, "step": 827 }, { "epoch": 0.15, "grad_norm": 4.660418037004727, "learning_rate": 9.627495740270874e-07, "loss": 0.3875, "step": 828 }, { "epoch": 0.15018115942028987, "grad_norm": 14.658238136033177, "learning_rate": 9.626383739702668e-07, "loss": 0.4232, "step": 829 }, { "epoch": 0.1503623188405797, "grad_norm": 3.427152235192319, "learning_rate": 9.625270146248616e-07, "loss": 0.3716, "step": 830 }, { "epoch": 0.15054347826086956, "grad_norm": 7.104916016105561, "learning_rate": 9.624154960292141e-07, "loss": 0.3761, "step": 831 }, { "epoch": 0.15072463768115943, "grad_norm": 5.268346352114768, "learning_rate": 9.623038182217202e-07, "loss": 0.3954, "step": 832 }, { "epoch": 0.15090579710144927, "grad_norm": 3.909921849942659, "learning_rate": 9.621919812408313e-07, "loss": 0.3466, "step": 833 }, { "epoch": 0.15108695652173912, "grad_norm": 5.313064486280673, "learning_rate": 9.620799851250534e-07, "loss": 0.4255, "step": 834 }, { "epoch": 0.151268115942029, "grad_norm": 5.658689697252792, "learning_rate": 9.61967829912947e-07, "loss": 0.411, "step": 835 }, { "epoch": 0.15144927536231884, "grad_norm": 7.133698128780928, "learning_rate": 9.618555156431283e-07, "loss": 0.4258, "step": 836 }, { "epoch": 0.1516304347826087, "grad_norm": 10.710861565044695, "learning_rate": 9.61743042354267e-07, "loss": 0.3623, "step": 837 }, { "epoch": 0.15181159420289855, "grad_norm": 5.549993138938895, "learning_rate": 9.616304100850883e-07, "loss": 0.3879, "step": 838 }, { "epoch": 0.1519927536231884, "grad_norm": 2.988074991973997, "learning_rate": 9.615176188743724e-07, "loss": 0.3518, "step": 839 }, { "epoch": 0.15217391304347827, "grad_norm": 4.49008750002056, "learning_rate": 9.614046687609537e-07, "loss": 0.3054, "step": 840 }, { "epoch": 0.15235507246376812, "grad_norm": 6.63240051746998, "learning_rate": 9.61291559783721e-07, "loss": 0.3423, "step": 841 }, { "epoch": 0.15253623188405796, "grad_norm": 9.673311143114644, "learning_rate": 9.611782919816188e-07, "loss": 0.3448, "step": 842 }, { "epoch": 0.15271739130434783, "grad_norm": 3.749240635946091, "learning_rate": 9.610648653936456e-07, "loss": 0.3544, "step": 843 }, { "epoch": 0.15289855072463768, "grad_norm": 4.295774831111311, "learning_rate": 9.609512800588547e-07, "loss": 0.3632, "step": 844 }, { "epoch": 0.15307971014492755, "grad_norm": 6.363918337016414, "learning_rate": 9.608375360163539e-07, "loss": 0.3277, "step": 845 }, { "epoch": 0.1532608695652174, "grad_norm": 8.437863753748266, "learning_rate": 9.60723633305306e-07, "loss": 0.3885, "step": 846 }, { "epoch": 0.15344202898550724, "grad_norm": 3.670816263383733, "learning_rate": 9.606095719649283e-07, "loss": 0.3575, "step": 847 }, { "epoch": 0.1536231884057971, "grad_norm": 4.570738702853096, "learning_rate": 9.604953520344925e-07, "loss": 0.3544, "step": 848 }, { "epoch": 0.15380434782608696, "grad_norm": 13.828584658268664, "learning_rate": 9.603809735533252e-07, "loss": 0.4651, "step": 849 }, { "epoch": 0.1539855072463768, "grad_norm": 11.632307488888198, "learning_rate": 9.602664365608073e-07, "loss": 0.3982, "step": 850 }, { "epoch": 0.15416666666666667, "grad_norm": 10.83300354975015, "learning_rate": 9.601517410963744e-07, "loss": 0.3533, "step": 851 }, { "epoch": 0.15434782608695652, "grad_norm": 6.58319396975539, "learning_rate": 9.600368871995171e-07, "loss": 0.3647, "step": 852 }, { "epoch": 0.15452898550724636, "grad_norm": 6.30529673226947, "learning_rate": 9.599218749097795e-07, "loss": 0.3892, "step": 853 }, { "epoch": 0.15471014492753624, "grad_norm": 3.516847429110062, "learning_rate": 9.598067042667615e-07, "loss": 0.3497, "step": 854 }, { "epoch": 0.15489130434782608, "grad_norm": 7.011751073550093, "learning_rate": 9.596913753101164e-07, "loss": 0.3427, "step": 855 }, { "epoch": 0.15507246376811595, "grad_norm": 15.093237775139633, "learning_rate": 9.595758880795528e-07, "loss": 0.3937, "step": 856 }, { "epoch": 0.1552536231884058, "grad_norm": 7.4426286572501406, "learning_rate": 9.594602426148333e-07, "loss": 0.3569, "step": 857 }, { "epoch": 0.15543478260869564, "grad_norm": 3.432755358759048, "learning_rate": 9.593444389557754e-07, "loss": 0.3318, "step": 858 }, { "epoch": 0.15561594202898552, "grad_norm": 4.003942739573997, "learning_rate": 9.592284771422508e-07, "loss": 0.3702, "step": 859 }, { "epoch": 0.15579710144927536, "grad_norm": 4.348993560705254, "learning_rate": 9.591123572141855e-07, "loss": 0.3199, "step": 860 }, { "epoch": 0.1559782608695652, "grad_norm": 4.255913600894319, "learning_rate": 9.589960792115604e-07, "loss": 0.3644, "step": 861 }, { "epoch": 0.15615942028985508, "grad_norm": 3.7138425667155657, "learning_rate": 9.588796431744104e-07, "loss": 0.416, "step": 862 }, { "epoch": 0.15634057971014492, "grad_norm": 2.9089075514409983, "learning_rate": 9.587630491428251e-07, "loss": 0.3494, "step": 863 }, { "epoch": 0.1565217391304348, "grad_norm": 9.313295612231036, "learning_rate": 9.586462971569484e-07, "loss": 0.3903, "step": 864 }, { "epoch": 0.15670289855072464, "grad_norm": 18.45482805203893, "learning_rate": 9.585293872569784e-07, "loss": 0.3486, "step": 865 }, { "epoch": 0.15688405797101448, "grad_norm": 9.596823852595307, "learning_rate": 9.584123194831676e-07, "loss": 0.3402, "step": 866 }, { "epoch": 0.15706521739130436, "grad_norm": 20.850007743178377, "learning_rate": 9.582950938758235e-07, "loss": 0.4471, "step": 867 }, { "epoch": 0.1572463768115942, "grad_norm": 7.402207466198938, "learning_rate": 9.58177710475307e-07, "loss": 0.3485, "step": 868 }, { "epoch": 0.15742753623188405, "grad_norm": 3.2430225734636577, "learning_rate": 9.58060169322034e-07, "loss": 0.3873, "step": 869 }, { "epoch": 0.15760869565217392, "grad_norm": 3.14898584025622, "learning_rate": 9.579424704564742e-07, "loss": 0.3349, "step": 870 }, { "epoch": 0.15778985507246376, "grad_norm": 6.113644444204073, "learning_rate": 9.57824613919152e-07, "loss": 0.3682, "step": 871 }, { "epoch": 0.15797101449275364, "grad_norm": 10.038499417889854, "learning_rate": 9.577065997506462e-07, "loss": 0.4148, "step": 872 }, { "epoch": 0.15815217391304348, "grad_norm": 3.632432549337236, "learning_rate": 9.575884279915893e-07, "loss": 0.3898, "step": 873 }, { "epoch": 0.15833333333333333, "grad_norm": 3.457209725662949, "learning_rate": 9.574700986826686e-07, "loss": 0.34, "step": 874 }, { "epoch": 0.1585144927536232, "grad_norm": 3.77759273533897, "learning_rate": 9.573516118646255e-07, "loss": 0.3785, "step": 875 }, { "epoch": 0.15869565217391304, "grad_norm": 7.5671753374998145, "learning_rate": 9.572329675782554e-07, "loss": 0.4427, "step": 876 }, { "epoch": 0.1588768115942029, "grad_norm": 6.527966930844897, "learning_rate": 9.571141658644079e-07, "loss": 0.4471, "step": 877 }, { "epoch": 0.15905797101449276, "grad_norm": 8.468264485855864, "learning_rate": 9.569952067639876e-07, "loss": 0.3697, "step": 878 }, { "epoch": 0.1592391304347826, "grad_norm": 6.4731431205325425, "learning_rate": 9.568760903179522e-07, "loss": 0.3671, "step": 879 }, { "epoch": 0.15942028985507245, "grad_norm": 4.506365643974041, "learning_rate": 9.56756816567314e-07, "loss": 0.3502, "step": 880 }, { "epoch": 0.15960144927536232, "grad_norm": 5.409892690271166, "learning_rate": 9.5663738555314e-07, "loss": 0.3485, "step": 881 }, { "epoch": 0.15978260869565217, "grad_norm": 3.655959457052786, "learning_rate": 9.565177973165503e-07, "loss": 0.3249, "step": 882 }, { "epoch": 0.15996376811594204, "grad_norm": 10.263410742979417, "learning_rate": 9.5639805189872e-07, "loss": 0.3403, "step": 883 }, { "epoch": 0.16014492753623188, "grad_norm": 3.960020975701844, "learning_rate": 9.562781493408781e-07, "loss": 0.335, "step": 884 }, { "epoch": 0.16032608695652173, "grad_norm": 11.049851127898126, "learning_rate": 9.561580896843075e-07, "loss": 0.4307, "step": 885 }, { "epoch": 0.1605072463768116, "grad_norm": 8.49611291863617, "learning_rate": 9.560378729703453e-07, "loss": 0.3697, "step": 886 }, { "epoch": 0.16068840579710145, "grad_norm": 5.791306770435816, "learning_rate": 9.559174992403825e-07, "loss": 0.3317, "step": 887 }, { "epoch": 0.1608695652173913, "grad_norm": 7.657815216224209, "learning_rate": 9.557969685358646e-07, "loss": 0.3938, "step": 888 }, { "epoch": 0.16105072463768116, "grad_norm": 6.810366563771853, "learning_rate": 9.55676280898291e-07, "loss": 0.373, "step": 889 }, { "epoch": 0.161231884057971, "grad_norm": 3.072323261181124, "learning_rate": 9.555554363692146e-07, "loss": 0.3734, "step": 890 }, { "epoch": 0.16141304347826088, "grad_norm": 9.5510734788038, "learning_rate": 9.55434434990243e-07, "loss": 0.3297, "step": 891 }, { "epoch": 0.16159420289855073, "grad_norm": 5.637895209116408, "learning_rate": 9.553132768030377e-07, "loss": 0.3337, "step": 892 }, { "epoch": 0.16177536231884057, "grad_norm": 5.00893520226379, "learning_rate": 9.551919618493137e-07, "loss": 0.3373, "step": 893 }, { "epoch": 0.16195652173913044, "grad_norm": 3.3584080604749693, "learning_rate": 9.55070490170841e-07, "loss": 0.3539, "step": 894 }, { "epoch": 0.1621376811594203, "grad_norm": 5.186086911355004, "learning_rate": 9.549488618094417e-07, "loss": 0.3447, "step": 895 }, { "epoch": 0.16231884057971013, "grad_norm": 4.71172306752271, "learning_rate": 9.54827076806994e-07, "loss": 0.3913, "step": 896 }, { "epoch": 0.1625, "grad_norm": 6.159846189423231, "learning_rate": 9.547051352054288e-07, "loss": 0.3284, "step": 897 }, { "epoch": 0.16268115942028985, "grad_norm": 3.5153420547325296, "learning_rate": 9.54583037046731e-07, "loss": 0.3806, "step": 898 }, { "epoch": 0.16286231884057972, "grad_norm": 3.5131824704383665, "learning_rate": 9.544607823729397e-07, "loss": 0.3962, "step": 899 }, { "epoch": 0.16304347826086957, "grad_norm": 9.511103902268252, "learning_rate": 9.543383712261477e-07, "loss": 0.3837, "step": 900 }, { "epoch": 0.16304347826086957, "eval_loss": 0.37748438119888306, "eval_runtime": 9.8032, "eval_samples_per_second": 51.004, "eval_steps_per_second": 0.102, "step": 900 }, { "epoch": 0.1632246376811594, "grad_norm": 5.314558362677815, "learning_rate": 9.542158036485017e-07, "loss": 0.408, "step": 901 }, { "epoch": 0.16340579710144928, "grad_norm": 5.2986074149954625, "learning_rate": 9.540930796822025e-07, "loss": 0.3248, "step": 902 }, { "epoch": 0.16358695652173913, "grad_norm": 8.373249151123064, "learning_rate": 9.539701993695047e-07, "loss": 0.3659, "step": 903 }, { "epoch": 0.16376811594202897, "grad_norm": 3.9471347645813473, "learning_rate": 9.538471627527159e-07, "loss": 0.3451, "step": 904 }, { "epoch": 0.16394927536231885, "grad_norm": 8.60106479616005, "learning_rate": 9.537239698741989e-07, "loss": 0.3569, "step": 905 }, { "epoch": 0.1641304347826087, "grad_norm": 6.160742255884759, "learning_rate": 9.536006207763689e-07, "loss": 0.3965, "step": 906 }, { "epoch": 0.16431159420289856, "grad_norm": 5.851640936244484, "learning_rate": 9.534771155016963e-07, "loss": 0.3629, "step": 907 }, { "epoch": 0.1644927536231884, "grad_norm": 6.746005514478953, "learning_rate": 9.533534540927039e-07, "loss": 0.4047, "step": 908 }, { "epoch": 0.16467391304347825, "grad_norm": 7.654547234650914, "learning_rate": 9.532296365919695e-07, "loss": 0.4122, "step": 909 }, { "epoch": 0.16485507246376813, "grad_norm": 3.908851215894925, "learning_rate": 9.531056630421237e-07, "loss": 0.395, "step": 910 }, { "epoch": 0.16503623188405797, "grad_norm": 11.687031888304489, "learning_rate": 9.529815334858513e-07, "loss": 0.404, "step": 911 }, { "epoch": 0.16521739130434782, "grad_norm": 7.180870776017081, "learning_rate": 9.528572479658906e-07, "loss": 0.3594, "step": 912 }, { "epoch": 0.1653985507246377, "grad_norm": 7.414159772457074, "learning_rate": 9.527328065250337e-07, "loss": 0.3004, "step": 913 }, { "epoch": 0.16557971014492753, "grad_norm": 10.31741860357107, "learning_rate": 9.526082092061265e-07, "loss": 0.3714, "step": 914 }, { "epoch": 0.16576086956521738, "grad_norm": 5.360065023395594, "learning_rate": 9.524834560520683e-07, "loss": 0.4095, "step": 915 }, { "epoch": 0.16594202898550725, "grad_norm": 11.373999458413381, "learning_rate": 9.523585471058122e-07, "loss": 0.3418, "step": 916 }, { "epoch": 0.1661231884057971, "grad_norm": 6.350662250121829, "learning_rate": 9.522334824103652e-07, "loss": 0.4067, "step": 917 }, { "epoch": 0.16630434782608697, "grad_norm": 3.4483211330563015, "learning_rate": 9.521082620087874e-07, "loss": 0.3484, "step": 918 }, { "epoch": 0.1664855072463768, "grad_norm": 7.433083990164779, "learning_rate": 9.519828859441927e-07, "loss": 0.3609, "step": 919 }, { "epoch": 0.16666666666666666, "grad_norm": 4.742072955737512, "learning_rate": 9.51857354259749e-07, "loss": 0.3688, "step": 920 }, { "epoch": 0.16684782608695653, "grad_norm": 3.486922317106545, "learning_rate": 9.517316669986773e-07, "loss": 0.3536, "step": 921 }, { "epoch": 0.16702898550724637, "grad_norm": 10.190529498816694, "learning_rate": 9.516058242042523e-07, "loss": 0.347, "step": 922 }, { "epoch": 0.16721014492753622, "grad_norm": 5.361601627973088, "learning_rate": 9.514798259198023e-07, "loss": 0.3692, "step": 923 }, { "epoch": 0.1673913043478261, "grad_norm": 9.072925983194018, "learning_rate": 9.51353672188709e-07, "loss": 0.383, "step": 924 }, { "epoch": 0.16757246376811594, "grad_norm": 3.944725667584065, "learning_rate": 9.512273630544076e-07, "loss": 0.3909, "step": 925 }, { "epoch": 0.1677536231884058, "grad_norm": 6.363206727826493, "learning_rate": 9.511008985603874e-07, "loss": 0.3341, "step": 926 }, { "epoch": 0.16793478260869565, "grad_norm": 9.491061479422129, "learning_rate": 9.509742787501905e-07, "loss": 0.4163, "step": 927 }, { "epoch": 0.1681159420289855, "grad_norm": 4.787527839018586, "learning_rate": 9.508475036674126e-07, "loss": 0.3746, "step": 928 }, { "epoch": 0.16829710144927537, "grad_norm": 3.197899797504528, "learning_rate": 9.50720573355703e-07, "loss": 0.3116, "step": 929 }, { "epoch": 0.16847826086956522, "grad_norm": 5.140509743780928, "learning_rate": 9.505934878587645e-07, "loss": 0.4061, "step": 930 }, { "epoch": 0.16865942028985506, "grad_norm": 15.002553681035751, "learning_rate": 9.504662472203531e-07, "loss": 0.3604, "step": 931 }, { "epoch": 0.16884057971014493, "grad_norm": 5.96837292583248, "learning_rate": 9.503388514842785e-07, "loss": 0.3799, "step": 932 }, { "epoch": 0.16902173913043478, "grad_norm": 7.788969683101942, "learning_rate": 9.502113006944035e-07, "loss": 0.3005, "step": 933 }, { "epoch": 0.16920289855072465, "grad_norm": 4.044286329813409, "learning_rate": 9.500835948946445e-07, "loss": 0.3247, "step": 934 }, { "epoch": 0.1693840579710145, "grad_norm": 8.016480224087877, "learning_rate": 9.499557341289712e-07, "loss": 0.3712, "step": 935 }, { "epoch": 0.16956521739130434, "grad_norm": 4.700508069097732, "learning_rate": 9.498277184414069e-07, "loss": 0.3466, "step": 936 }, { "epoch": 0.1697463768115942, "grad_norm": 5.442230755902823, "learning_rate": 9.496995478760277e-07, "loss": 0.3927, "step": 937 }, { "epoch": 0.16992753623188406, "grad_norm": 8.915551465170877, "learning_rate": 9.495712224769634e-07, "loss": 0.4575, "step": 938 }, { "epoch": 0.1701086956521739, "grad_norm": 4.151027924138207, "learning_rate": 9.494427422883973e-07, "loss": 0.356, "step": 939 }, { "epoch": 0.17028985507246377, "grad_norm": 5.718246214997718, "learning_rate": 9.493141073545653e-07, "loss": 0.3824, "step": 940 }, { "epoch": 0.17047101449275362, "grad_norm": 3.685446969118357, "learning_rate": 9.491853177197573e-07, "loss": 0.335, "step": 941 }, { "epoch": 0.17065217391304346, "grad_norm": 6.412322728638104, "learning_rate": 9.490563734283162e-07, "loss": 0.3681, "step": 942 }, { "epoch": 0.17083333333333334, "grad_norm": 3.4483154072954916, "learning_rate": 9.48927274524638e-07, "loss": 0.3582, "step": 943 }, { "epoch": 0.17101449275362318, "grad_norm": 3.198315352212821, "learning_rate": 9.487980210531721e-07, "loss": 0.3226, "step": 944 }, { "epoch": 0.17119565217391305, "grad_norm": 4.118986349729483, "learning_rate": 9.486686130584211e-07, "loss": 0.3557, "step": 945 }, { "epoch": 0.1713768115942029, "grad_norm": 5.52801025546669, "learning_rate": 9.485390505849409e-07, "loss": 0.338, "step": 946 }, { "epoch": 0.17155797101449274, "grad_norm": 3.5767430582142485, "learning_rate": 9.484093336773402e-07, "loss": 0.3743, "step": 947 }, { "epoch": 0.17173913043478262, "grad_norm": 2.8978836810437807, "learning_rate": 9.482794623802813e-07, "loss": 0.3447, "step": 948 }, { "epoch": 0.17192028985507246, "grad_norm": 3.1757229252687202, "learning_rate": 9.481494367384797e-07, "loss": 0.3636, "step": 949 }, { "epoch": 0.1721014492753623, "grad_norm": 4.031839709657643, "learning_rate": 9.480192567967035e-07, "loss": 0.343, "step": 950 }, { "epoch": 0.17228260869565218, "grad_norm": 12.512170983104669, "learning_rate": 9.478889225997744e-07, "loss": 0.3585, "step": 951 }, { "epoch": 0.17246376811594202, "grad_norm": 4.353936774623856, "learning_rate": 9.477584341925672e-07, "loss": 0.4444, "step": 952 }, { "epoch": 0.1726449275362319, "grad_norm": 9.019787013891907, "learning_rate": 9.476277916200095e-07, "loss": 0.3483, "step": 953 }, { "epoch": 0.17282608695652174, "grad_norm": 4.660307011902687, "learning_rate": 9.474969949270824e-07, "loss": 0.4307, "step": 954 }, { "epoch": 0.17300724637681159, "grad_norm": 3.3725580448123145, "learning_rate": 9.473660441588195e-07, "loss": 0.3358, "step": 955 }, { "epoch": 0.17318840579710146, "grad_norm": 3.777973545569461, "learning_rate": 9.472349393603079e-07, "loss": 0.4268, "step": 956 }, { "epoch": 0.1733695652173913, "grad_norm": 8.176022737349475, "learning_rate": 9.471036805766879e-07, "loss": 0.353, "step": 957 }, { "epoch": 0.17355072463768115, "grad_norm": 7.009417024695021, "learning_rate": 9.46972267853152e-07, "loss": 0.3792, "step": 958 }, { "epoch": 0.17373188405797102, "grad_norm": 3.5393012585686616, "learning_rate": 9.468407012349465e-07, "loss": 0.4186, "step": 959 }, { "epoch": 0.17391304347826086, "grad_norm": 3.0345096292351688, "learning_rate": 9.467089807673705e-07, "loss": 0.3476, "step": 960 }, { "epoch": 0.17409420289855074, "grad_norm": 3.81454820468709, "learning_rate": 9.465771064957758e-07, "loss": 0.4319, "step": 961 }, { "epoch": 0.17427536231884058, "grad_norm": 5.993156035622376, "learning_rate": 9.464450784655674e-07, "loss": 0.3436, "step": 962 }, { "epoch": 0.17445652173913043, "grad_norm": 3.6909003706862955, "learning_rate": 9.463128967222032e-07, "loss": 0.3018, "step": 963 }, { "epoch": 0.1746376811594203, "grad_norm": 14.570507151586428, "learning_rate": 9.461805613111939e-07, "loss": 0.4432, "step": 964 }, { "epoch": 0.17481884057971014, "grad_norm": 5.674599814277299, "learning_rate": 9.460480722781035e-07, "loss": 0.3471, "step": 965 }, { "epoch": 0.175, "grad_norm": 6.035949212202583, "learning_rate": 9.459154296685484e-07, "loss": 0.308, "step": 966 }, { "epoch": 0.17518115942028986, "grad_norm": 3.208638660816449, "learning_rate": 9.457826335281978e-07, "loss": 0.3614, "step": 967 }, { "epoch": 0.1753623188405797, "grad_norm": 3.5617227454126246, "learning_rate": 9.456496839027745e-07, "loss": 0.4059, "step": 968 }, { "epoch": 0.17554347826086958, "grad_norm": 5.793243700113749, "learning_rate": 9.455165808380534e-07, "loss": 0.3929, "step": 969 }, { "epoch": 0.17572463768115942, "grad_norm": 7.775133784202099, "learning_rate": 9.453833243798628e-07, "loss": 0.3406, "step": 970 }, { "epoch": 0.17590579710144927, "grad_norm": 5.260090966172478, "learning_rate": 9.452499145740831e-07, "loss": 0.3907, "step": 971 }, { "epoch": 0.17608695652173914, "grad_norm": 4.829127408307273, "learning_rate": 9.451163514666483e-07, "loss": 0.3336, "step": 972 }, { "epoch": 0.17626811594202899, "grad_norm": 3.8242458029708852, "learning_rate": 9.449826351035448e-07, "loss": 0.3755, "step": 973 }, { "epoch": 0.17644927536231883, "grad_norm": 8.500633066793206, "learning_rate": 9.448487655308115e-07, "loss": 0.3964, "step": 974 }, { "epoch": 0.1766304347826087, "grad_norm": 3.442260634987885, "learning_rate": 9.447147427945406e-07, "loss": 0.3905, "step": 975 }, { "epoch": 0.17681159420289855, "grad_norm": 3.5481962394012223, "learning_rate": 9.445805669408765e-07, "loss": 0.3429, "step": 976 }, { "epoch": 0.1769927536231884, "grad_norm": 3.9489484074927526, "learning_rate": 9.444462380160168e-07, "loss": 0.3096, "step": 977 }, { "epoch": 0.17717391304347826, "grad_norm": 3.774687773569502, "learning_rate": 9.443117560662115e-07, "loss": 0.3696, "step": 978 }, { "epoch": 0.1773550724637681, "grad_norm": 3.557360706695933, "learning_rate": 9.441771211377636e-07, "loss": 0.3287, "step": 979 }, { "epoch": 0.17753623188405798, "grad_norm": 3.8417257399064124, "learning_rate": 9.440423332770281e-07, "loss": 0.319, "step": 980 }, { "epoch": 0.17771739130434783, "grad_norm": 4.566565366730673, "learning_rate": 9.439073925304134e-07, "loss": 0.3058, "step": 981 }, { "epoch": 0.17789855072463767, "grad_norm": 7.3976658023800965, "learning_rate": 9.437722989443802e-07, "loss": 0.4134, "step": 982 }, { "epoch": 0.17807971014492754, "grad_norm": 4.627771585188553, "learning_rate": 9.436370525654418e-07, "loss": 0.4205, "step": 983 }, { "epoch": 0.1782608695652174, "grad_norm": 3.3030755440117505, "learning_rate": 9.435016534401643e-07, "loss": 0.3332, "step": 984 }, { "epoch": 0.17844202898550723, "grad_norm": 8.834543020429823, "learning_rate": 9.43366101615166e-07, "loss": 0.3401, "step": 985 }, { "epoch": 0.1786231884057971, "grad_norm": 8.402093992430165, "learning_rate": 9.432303971371183e-07, "loss": 0.3159, "step": 986 }, { "epoch": 0.17880434782608695, "grad_norm": 10.264054501425852, "learning_rate": 9.430945400527448e-07, "loss": 0.3946, "step": 987 }, { "epoch": 0.17898550724637682, "grad_norm": 3.954948520536284, "learning_rate": 9.429585304088217e-07, "loss": 0.3259, "step": 988 }, { "epoch": 0.17916666666666667, "grad_norm": 3.8447824092246874, "learning_rate": 9.428223682521778e-07, "loss": 0.4027, "step": 989 }, { "epoch": 0.1793478260869565, "grad_norm": 5.012065500188494, "learning_rate": 9.426860536296946e-07, "loss": 0.3368, "step": 990 }, { "epoch": 0.17952898550724639, "grad_norm": 4.421023883537979, "learning_rate": 9.425495865883053e-07, "loss": 0.3147, "step": 991 }, { "epoch": 0.17971014492753623, "grad_norm": 7.9101491709946625, "learning_rate": 9.424129671749966e-07, "loss": 0.4053, "step": 992 }, { "epoch": 0.17989130434782608, "grad_norm": 4.053598227442787, "learning_rate": 9.422761954368073e-07, "loss": 0.3586, "step": 993 }, { "epoch": 0.18007246376811595, "grad_norm": 5.957033916844329, "learning_rate": 9.421392714208281e-07, "loss": 0.3587, "step": 994 }, { "epoch": 0.1802536231884058, "grad_norm": 4.195549867293666, "learning_rate": 9.42002195174203e-07, "loss": 0.4107, "step": 995 }, { "epoch": 0.18043478260869567, "grad_norm": 4.509708808919222, "learning_rate": 9.418649667441278e-07, "loss": 0.36, "step": 996 }, { "epoch": 0.1806159420289855, "grad_norm": 4.679967495725039, "learning_rate": 9.417275861778509e-07, "loss": 0.3694, "step": 997 }, { "epoch": 0.18079710144927535, "grad_norm": 6.081344390427841, "learning_rate": 9.415900535226733e-07, "loss": 0.3435, "step": 998 }, { "epoch": 0.18097826086956523, "grad_norm": 4.464610848455029, "learning_rate": 9.414523688259477e-07, "loss": 0.343, "step": 999 }, { "epoch": 0.18115942028985507, "grad_norm": 5.465069141893093, "learning_rate": 9.413145321350801e-07, "loss": 0.3831, "step": 1000 }, { "epoch": 0.18115942028985507, "eval_loss": 0.3557968735694885, "eval_runtime": 9.7981, "eval_samples_per_second": 51.03, "eval_steps_per_second": 0.102, "step": 1000 }, { "epoch": 0.18134057971014492, "grad_norm": 4.1988974120391225, "learning_rate": 9.411765434975281e-07, "loss": 0.3582, "step": 1001 }, { "epoch": 0.1815217391304348, "grad_norm": 5.676730742098176, "learning_rate": 9.410384029608016e-07, "loss": 0.3672, "step": 1002 }, { "epoch": 0.18170289855072463, "grad_norm": 5.374866950407323, "learning_rate": 9.409001105724634e-07, "loss": 0.3848, "step": 1003 }, { "epoch": 0.18188405797101448, "grad_norm": 5.128296812499538, "learning_rate": 9.407616663801283e-07, "loss": 0.4016, "step": 1004 }, { "epoch": 0.18206521739130435, "grad_norm": 3.31719734897798, "learning_rate": 9.406230704314628e-07, "loss": 0.3253, "step": 1005 }, { "epoch": 0.1822463768115942, "grad_norm": 6.785966995929203, "learning_rate": 9.404843227741867e-07, "loss": 0.4005, "step": 1006 }, { "epoch": 0.18242753623188407, "grad_norm": 6.838749556127302, "learning_rate": 9.403454234560711e-07, "loss": 0.3651, "step": 1007 }, { "epoch": 0.1826086956521739, "grad_norm": 6.560158606171007, "learning_rate": 9.402063725249396e-07, "loss": 0.3732, "step": 1008 }, { "epoch": 0.18278985507246376, "grad_norm": 3.86840262552788, "learning_rate": 9.400671700286685e-07, "loss": 0.3217, "step": 1009 }, { "epoch": 0.18297101449275363, "grad_norm": 4.2179882987082165, "learning_rate": 9.399278160151858e-07, "loss": 0.3052, "step": 1010 }, { "epoch": 0.18315217391304348, "grad_norm": 3.64792183603231, "learning_rate": 9.397883105324713e-07, "loss": 0.3477, "step": 1011 }, { "epoch": 0.18333333333333332, "grad_norm": 6.892344215352792, "learning_rate": 9.396486536285579e-07, "loss": 0.4508, "step": 1012 }, { "epoch": 0.1835144927536232, "grad_norm": 14.641369583306055, "learning_rate": 9.395088453515301e-07, "loss": 0.4146, "step": 1013 }, { "epoch": 0.18369565217391304, "grad_norm": 15.832537940991099, "learning_rate": 9.393688857495243e-07, "loss": 0.34, "step": 1014 }, { "epoch": 0.1838768115942029, "grad_norm": 4.3144035402251575, "learning_rate": 9.392287748707292e-07, "loss": 0.4022, "step": 1015 }, { "epoch": 0.18405797101449275, "grad_norm": 6.261068339056287, "learning_rate": 9.39088512763386e-07, "loss": 0.3387, "step": 1016 }, { "epoch": 0.1842391304347826, "grad_norm": 4.056070089493664, "learning_rate": 9.389480994757873e-07, "loss": 0.3511, "step": 1017 }, { "epoch": 0.18442028985507247, "grad_norm": 6.733169082851925, "learning_rate": 9.388075350562783e-07, "loss": 0.405, "step": 1018 }, { "epoch": 0.18460144927536232, "grad_norm": 3.8826643978883304, "learning_rate": 9.386668195532557e-07, "loss": 0.3667, "step": 1019 }, { "epoch": 0.18478260869565216, "grad_norm": 3.5398140355329204, "learning_rate": 9.385259530151688e-07, "loss": 0.355, "step": 1020 }, { "epoch": 0.18496376811594203, "grad_norm": 3.2992884292362694, "learning_rate": 9.383849354905184e-07, "loss": 0.3199, "step": 1021 }, { "epoch": 0.18514492753623188, "grad_norm": 3.158293490792938, "learning_rate": 9.382437670278578e-07, "loss": 0.3208, "step": 1022 }, { "epoch": 0.18532608695652175, "grad_norm": 7.364388737807517, "learning_rate": 9.381024476757915e-07, "loss": 0.3758, "step": 1023 }, { "epoch": 0.1855072463768116, "grad_norm": 8.152792736078732, "learning_rate": 9.379609774829769e-07, "loss": 0.3214, "step": 1024 }, { "epoch": 0.18568840579710144, "grad_norm": 4.680932173783523, "learning_rate": 9.378193564981225e-07, "loss": 0.3177, "step": 1025 }, { "epoch": 0.1858695652173913, "grad_norm": 3.2460016757868186, "learning_rate": 9.376775847699894e-07, "loss": 0.3685, "step": 1026 }, { "epoch": 0.18605072463768116, "grad_norm": 3.181593046712226, "learning_rate": 9.375356623473899e-07, "loss": 0.3076, "step": 1027 }, { "epoch": 0.186231884057971, "grad_norm": 4.144203924103408, "learning_rate": 9.373935892791889e-07, "loss": 0.3441, "step": 1028 }, { "epoch": 0.18641304347826088, "grad_norm": 8.878600569479174, "learning_rate": 9.372513656143026e-07, "loss": 0.3803, "step": 1029 }, { "epoch": 0.18659420289855072, "grad_norm": 3.300176819811975, "learning_rate": 9.371089914016995e-07, "loss": 0.334, "step": 1030 }, { "epoch": 0.1867753623188406, "grad_norm": 4.256776263086497, "learning_rate": 9.369664666903996e-07, "loss": 0.3546, "step": 1031 }, { "epoch": 0.18695652173913044, "grad_norm": 6.922059186282991, "learning_rate": 9.368237915294748e-07, "loss": 0.3782, "step": 1032 }, { "epoch": 0.18713768115942028, "grad_norm": 7.345206267770565, "learning_rate": 9.366809659680488e-07, "loss": 0.3101, "step": 1033 }, { "epoch": 0.18731884057971016, "grad_norm": 3.7060120071990137, "learning_rate": 9.365379900552972e-07, "loss": 0.3377, "step": 1034 }, { "epoch": 0.1875, "grad_norm": 9.258034061223405, "learning_rate": 9.363948638404472e-07, "loss": 0.3306, "step": 1035 }, { "epoch": 0.18768115942028984, "grad_norm": 3.756822751714663, "learning_rate": 9.36251587372778e-07, "loss": 0.3321, "step": 1036 }, { "epoch": 0.18786231884057972, "grad_norm": 3.873198801213808, "learning_rate": 9.361081607016202e-07, "loss": 0.3748, "step": 1037 }, { "epoch": 0.18804347826086956, "grad_norm": 4.118267251180074, "learning_rate": 9.359645838763564e-07, "loss": 0.3533, "step": 1038 }, { "epoch": 0.1882246376811594, "grad_norm": 3.516923362026741, "learning_rate": 9.358208569464208e-07, "loss": 0.32, "step": 1039 }, { "epoch": 0.18840579710144928, "grad_norm": 7.744720378017167, "learning_rate": 9.35676979961299e-07, "loss": 0.3836, "step": 1040 }, { "epoch": 0.18858695652173912, "grad_norm": 5.030928842442586, "learning_rate": 9.355329529705288e-07, "loss": 0.3219, "step": 1041 }, { "epoch": 0.188768115942029, "grad_norm": 3.538556785759717, "learning_rate": 9.353887760236994e-07, "loss": 0.3073, "step": 1042 }, { "epoch": 0.18894927536231884, "grad_norm": 4.596728290588321, "learning_rate": 9.352444491704513e-07, "loss": 0.3112, "step": 1043 }, { "epoch": 0.1891304347826087, "grad_norm": 3.870053971875424, "learning_rate": 9.350999724604772e-07, "loss": 0.3183, "step": 1044 }, { "epoch": 0.18931159420289856, "grad_norm": 7.761349154182568, "learning_rate": 9.349553459435211e-07, "loss": 0.3843, "step": 1045 }, { "epoch": 0.1894927536231884, "grad_norm": 8.152976149577444, "learning_rate": 9.348105696693785e-07, "loss": 0.3494, "step": 1046 }, { "epoch": 0.18967391304347825, "grad_norm": 12.263657947687328, "learning_rate": 9.346656436878965e-07, "loss": 0.3679, "step": 1047 }, { "epoch": 0.18985507246376812, "grad_norm": 3.4252123995475636, "learning_rate": 9.34520568048974e-07, "loss": 0.3684, "step": 1048 }, { "epoch": 0.19003623188405797, "grad_norm": 7.322953184670889, "learning_rate": 9.343753428025611e-07, "loss": 0.3858, "step": 1049 }, { "epoch": 0.19021739130434784, "grad_norm": 3.720919985019423, "learning_rate": 9.342299679986596e-07, "loss": 0.352, "step": 1050 }, { "epoch": 0.19039855072463768, "grad_norm": 5.309245134565914, "learning_rate": 9.340844436873226e-07, "loss": 0.3127, "step": 1051 }, { "epoch": 0.19057971014492753, "grad_norm": 6.088834503363983, "learning_rate": 9.33938769918655e-07, "loss": 0.3327, "step": 1052 }, { "epoch": 0.1907608695652174, "grad_norm": 5.244885531702647, "learning_rate": 9.337929467428128e-07, "loss": 0.3776, "step": 1053 }, { "epoch": 0.19094202898550725, "grad_norm": 5.222800044527979, "learning_rate": 9.336469742100037e-07, "loss": 0.3658, "step": 1054 }, { "epoch": 0.1911231884057971, "grad_norm": 4.565827218524475, "learning_rate": 9.335008523704867e-07, "loss": 0.315, "step": 1055 }, { "epoch": 0.19130434782608696, "grad_norm": 4.38386894383268, "learning_rate": 9.333545812745723e-07, "loss": 0.3693, "step": 1056 }, { "epoch": 0.1914855072463768, "grad_norm": 3.3435317732837455, "learning_rate": 9.332081609726224e-07, "loss": 0.3228, "step": 1057 }, { "epoch": 0.19166666666666668, "grad_norm": 6.5740815579589205, "learning_rate": 9.330615915150498e-07, "loss": 0.3694, "step": 1058 }, { "epoch": 0.19184782608695652, "grad_norm": 9.56767830970739, "learning_rate": 9.329148729523195e-07, "loss": 0.3828, "step": 1059 }, { "epoch": 0.19202898550724637, "grad_norm": 3.3478168389268785, "learning_rate": 9.327680053349474e-07, "loss": 0.329, "step": 1060 }, { "epoch": 0.19221014492753624, "grad_norm": 4.839261044548201, "learning_rate": 9.326209887135004e-07, "loss": 0.2738, "step": 1061 }, { "epoch": 0.1923913043478261, "grad_norm": 5.4335721948293365, "learning_rate": 9.324738231385971e-07, "loss": 0.336, "step": 1062 }, { "epoch": 0.19257246376811593, "grad_norm": 3.43564934617691, "learning_rate": 9.323265086609076e-07, "loss": 0.3337, "step": 1063 }, { "epoch": 0.1927536231884058, "grad_norm": 5.187357822085098, "learning_rate": 9.321790453311527e-07, "loss": 0.3221, "step": 1064 }, { "epoch": 0.19293478260869565, "grad_norm": 3.687748786376032, "learning_rate": 9.320314332001047e-07, "loss": 0.3907, "step": 1065 }, { "epoch": 0.19311594202898552, "grad_norm": 3.601776619294207, "learning_rate": 9.318836723185872e-07, "loss": 0.3178, "step": 1066 }, { "epoch": 0.19329710144927537, "grad_norm": 3.4853584536899236, "learning_rate": 9.317357627374751e-07, "loss": 0.3132, "step": 1067 }, { "epoch": 0.1934782608695652, "grad_norm": 6.302551817302995, "learning_rate": 9.315877045076943e-07, "loss": 0.3544, "step": 1068 }, { "epoch": 0.19365942028985508, "grad_norm": 3.759177455225637, "learning_rate": 9.31439497680222e-07, "loss": 0.3145, "step": 1069 }, { "epoch": 0.19384057971014493, "grad_norm": 3.36728008038697, "learning_rate": 9.312911423060863e-07, "loss": 0.3589, "step": 1070 }, { "epoch": 0.19402173913043477, "grad_norm": 4.154088418490633, "learning_rate": 9.31142638436367e-07, "loss": 0.342, "step": 1071 }, { "epoch": 0.19420289855072465, "grad_norm": 4.255679049260611, "learning_rate": 9.309939861221943e-07, "loss": 0.3596, "step": 1072 }, { "epoch": 0.1943840579710145, "grad_norm": 3.6152782589923005, "learning_rate": 9.308451854147501e-07, "loss": 0.3298, "step": 1073 }, { "epoch": 0.19456521739130433, "grad_norm": 6.809714669231455, "learning_rate": 9.306962363652673e-07, "loss": 0.3673, "step": 1074 }, { "epoch": 0.1947463768115942, "grad_norm": 7.817390681594236, "learning_rate": 9.305471390250294e-07, "loss": 0.3857, "step": 1075 }, { "epoch": 0.19492753623188405, "grad_norm": 8.462995843106821, "learning_rate": 9.303978934453718e-07, "loss": 0.382, "step": 1076 }, { "epoch": 0.19510869565217392, "grad_norm": 6.192447694817275, "learning_rate": 9.302484996776801e-07, "loss": 0.348, "step": 1077 }, { "epoch": 0.19528985507246377, "grad_norm": 4.879352825100935, "learning_rate": 9.300989577733915e-07, "loss": 0.3686, "step": 1078 }, { "epoch": 0.19547101449275361, "grad_norm": 9.443225979075029, "learning_rate": 9.299492677839938e-07, "loss": 0.3484, "step": 1079 }, { "epoch": 0.1956521739130435, "grad_norm": 7.902031406191935, "learning_rate": 9.297994297610261e-07, "loss": 0.3103, "step": 1080 }, { "epoch": 0.19583333333333333, "grad_norm": 16.296221093472447, "learning_rate": 9.296494437560782e-07, "loss": 0.3459, "step": 1081 }, { "epoch": 0.19601449275362318, "grad_norm": 11.415756234070233, "learning_rate": 9.294993098207913e-07, "loss": 0.3062, "step": 1082 }, { "epoch": 0.19619565217391305, "grad_norm": 10.317470423482591, "learning_rate": 9.293490280068567e-07, "loss": 0.3843, "step": 1083 }, { "epoch": 0.1963768115942029, "grad_norm": 4.599444632856712, "learning_rate": 9.291985983660178e-07, "loss": 0.3362, "step": 1084 }, { "epoch": 0.19655797101449277, "grad_norm": 5.607029721056904, "learning_rate": 9.290480209500678e-07, "loss": 0.403, "step": 1085 }, { "epoch": 0.1967391304347826, "grad_norm": 6.211688839085225, "learning_rate": 9.288972958108512e-07, "loss": 0.3273, "step": 1086 }, { "epoch": 0.19692028985507246, "grad_norm": 3.478190361462845, "learning_rate": 9.287464230002635e-07, "loss": 0.3742, "step": 1087 }, { "epoch": 0.19710144927536233, "grad_norm": 5.812794593798648, "learning_rate": 9.28595402570251e-07, "loss": 0.3475, "step": 1088 }, { "epoch": 0.19728260869565217, "grad_norm": 3.72849361710505, "learning_rate": 9.284442345728107e-07, "loss": 0.2964, "step": 1089 }, { "epoch": 0.19746376811594202, "grad_norm": 5.949450789308594, "learning_rate": 9.282929190599904e-07, "loss": 0.3248, "step": 1090 }, { "epoch": 0.1976449275362319, "grad_norm": 5.259186209578336, "learning_rate": 9.281414560838888e-07, "loss": 0.3251, "step": 1091 }, { "epoch": 0.19782608695652174, "grad_norm": 7.072859407675492, "learning_rate": 9.279898456966551e-07, "loss": 0.2894, "step": 1092 }, { "epoch": 0.1980072463768116, "grad_norm": 15.20510906931546, "learning_rate": 9.278380879504899e-07, "loss": 0.3811, "step": 1093 }, { "epoch": 0.19818840579710145, "grad_norm": 7.818422015322425, "learning_rate": 9.276861828976437e-07, "loss": 0.3401, "step": 1094 }, { "epoch": 0.1983695652173913, "grad_norm": 3.3437589662257152, "learning_rate": 9.275341305904185e-07, "loss": 0.3343, "step": 1095 }, { "epoch": 0.19855072463768117, "grad_norm": 6.749822156460748, "learning_rate": 9.273819310811664e-07, "loss": 0.3954, "step": 1096 }, { "epoch": 0.19873188405797101, "grad_norm": 5.763810971844492, "learning_rate": 9.272295844222902e-07, "loss": 0.3429, "step": 1097 }, { "epoch": 0.19891304347826086, "grad_norm": 3.767825869199621, "learning_rate": 9.27077090666244e-07, "loss": 0.3553, "step": 1098 }, { "epoch": 0.19909420289855073, "grad_norm": 11.354287594239699, "learning_rate": 9.26924449865532e-07, "loss": 0.3663, "step": 1099 }, { "epoch": 0.19927536231884058, "grad_norm": 3.7643877260071297, "learning_rate": 9.267716620727091e-07, "loss": 0.3576, "step": 1100 }, { "epoch": 0.19927536231884058, "eval_loss": 0.3427187502384186, "eval_runtime": 9.7492, "eval_samples_per_second": 51.286, "eval_steps_per_second": 0.103, "step": 1100 }, { "epoch": 0.19945652173913042, "grad_norm": 6.128790262274886, "learning_rate": 9.266187273403808e-07, "loss": 0.3613, "step": 1101 }, { "epoch": 0.1996376811594203, "grad_norm": 11.424083402715372, "learning_rate": 9.264656457212034e-07, "loss": 0.4133, "step": 1102 }, { "epoch": 0.19981884057971014, "grad_norm": 3.842995725930491, "learning_rate": 9.263124172678835e-07, "loss": 0.3736, "step": 1103 }, { "epoch": 0.2, "grad_norm": 5.506934004661734, "learning_rate": 9.261590420331784e-07, "loss": 0.3165, "step": 1104 }, { "epoch": 0.20018115942028986, "grad_norm": 3.8890109901948615, "learning_rate": 9.26005520069896e-07, "loss": 0.3175, "step": 1105 }, { "epoch": 0.2003623188405797, "grad_norm": 3.589317743829576, "learning_rate": 9.258518514308944e-07, "loss": 0.3052, "step": 1106 }, { "epoch": 0.20054347826086957, "grad_norm": 5.520151848132921, "learning_rate": 9.256980361690827e-07, "loss": 0.3307, "step": 1107 }, { "epoch": 0.20072463768115942, "grad_norm": 10.784318716483138, "learning_rate": 9.255440743374203e-07, "loss": 0.3625, "step": 1108 }, { "epoch": 0.20090579710144926, "grad_norm": 4.166893125999619, "learning_rate": 9.253899659889168e-07, "loss": 0.3052, "step": 1109 }, { "epoch": 0.20108695652173914, "grad_norm": 11.740812290864191, "learning_rate": 9.252357111766323e-07, "loss": 0.3665, "step": 1110 }, { "epoch": 0.20126811594202898, "grad_norm": 6.716346974200295, "learning_rate": 9.250813099536778e-07, "loss": 0.3451, "step": 1111 }, { "epoch": 0.20144927536231885, "grad_norm": 4.498535643674493, "learning_rate": 9.249267623732141e-07, "loss": 0.3234, "step": 1112 }, { "epoch": 0.2016304347826087, "grad_norm": 8.40625660849219, "learning_rate": 9.247720684884528e-07, "loss": 0.3078, "step": 1113 }, { "epoch": 0.20181159420289854, "grad_norm": 5.2609762947034335, "learning_rate": 9.246172283526557e-07, "loss": 0.3471, "step": 1114 }, { "epoch": 0.20199275362318841, "grad_norm": 11.903469401576388, "learning_rate": 9.24462242019135e-07, "loss": 0.3791, "step": 1115 }, { "epoch": 0.20217391304347826, "grad_norm": 4.398252956752326, "learning_rate": 9.243071095412534e-07, "loss": 0.3436, "step": 1116 }, { "epoch": 0.2023550724637681, "grad_norm": 8.785207159846003, "learning_rate": 9.241518309724233e-07, "loss": 0.2999, "step": 1117 }, { "epoch": 0.20253623188405798, "grad_norm": 4.357553444883357, "learning_rate": 9.239964063661083e-07, "loss": 0.3042, "step": 1118 }, { "epoch": 0.20271739130434782, "grad_norm": 12.05948814062256, "learning_rate": 9.238408357758216e-07, "loss": 0.317, "step": 1119 }, { "epoch": 0.2028985507246377, "grad_norm": 6.7722231202873795, "learning_rate": 9.236851192551269e-07, "loss": 0.338, "step": 1120 }, { "epoch": 0.20307971014492754, "grad_norm": 6.220502547650929, "learning_rate": 9.235292568576383e-07, "loss": 0.3484, "step": 1121 }, { "epoch": 0.20326086956521738, "grad_norm": 4.044774034395157, "learning_rate": 9.233732486370198e-07, "loss": 0.3544, "step": 1122 }, { "epoch": 0.20344202898550726, "grad_norm": 11.663325350898504, "learning_rate": 9.232170946469858e-07, "loss": 0.3819, "step": 1123 }, { "epoch": 0.2036231884057971, "grad_norm": 16.325034677045284, "learning_rate": 9.230607949413007e-07, "loss": 0.4426, "step": 1124 }, { "epoch": 0.20380434782608695, "grad_norm": 4.667412193785131, "learning_rate": 9.229043495737796e-07, "loss": 0.3227, "step": 1125 }, { "epoch": 0.20398550724637682, "grad_norm": 10.525774925483608, "learning_rate": 9.227477585982871e-07, "loss": 0.3619, "step": 1126 }, { "epoch": 0.20416666666666666, "grad_norm": 5.219206604324908, "learning_rate": 9.225910220687383e-07, "loss": 0.3131, "step": 1127 }, { "epoch": 0.20434782608695654, "grad_norm": 6.393631117302292, "learning_rate": 9.224341400390983e-07, "loss": 0.373, "step": 1128 }, { "epoch": 0.20452898550724638, "grad_norm": 4.014103010525397, "learning_rate": 9.222771125633825e-07, "loss": 0.3151, "step": 1129 }, { "epoch": 0.20471014492753623, "grad_norm": 3.6917550993065613, "learning_rate": 9.221199396956558e-07, "loss": 0.3845, "step": 1130 }, { "epoch": 0.2048913043478261, "grad_norm": 3.346092054285078, "learning_rate": 9.21962621490034e-07, "loss": 0.3389, "step": 1131 }, { "epoch": 0.20507246376811594, "grad_norm": 8.044549538911658, "learning_rate": 9.218051580006823e-07, "loss": 0.3457, "step": 1132 }, { "epoch": 0.2052536231884058, "grad_norm": 3.5806257550617664, "learning_rate": 9.216475492818161e-07, "loss": 0.3435, "step": 1133 }, { "epoch": 0.20543478260869566, "grad_norm": 3.568645542258272, "learning_rate": 9.21489795387701e-07, "loss": 0.3588, "step": 1134 }, { "epoch": 0.2056159420289855, "grad_norm": 4.71861334120439, "learning_rate": 9.213318963726522e-07, "loss": 0.3979, "step": 1135 }, { "epoch": 0.20579710144927535, "grad_norm": 3.6290599462987667, "learning_rate": 9.211738522910351e-07, "loss": 0.2969, "step": 1136 }, { "epoch": 0.20597826086956522, "grad_norm": 5.7895554874818, "learning_rate": 9.210156631972652e-07, "loss": 0.3373, "step": 1137 }, { "epoch": 0.20615942028985507, "grad_norm": 3.6089215727395283, "learning_rate": 9.208573291458075e-07, "loss": 0.3846, "step": 1138 }, { "epoch": 0.20634057971014494, "grad_norm": 9.414747750175813, "learning_rate": 9.206988501911774e-07, "loss": 0.3043, "step": 1139 }, { "epoch": 0.20652173913043478, "grad_norm": 5.021675194345018, "learning_rate": 9.205402263879399e-07, "loss": 0.2858, "step": 1140 }, { "epoch": 0.20670289855072463, "grad_norm": 8.691479291707761, "learning_rate": 9.203814577907099e-07, "loss": 0.3541, "step": 1141 }, { "epoch": 0.2068840579710145, "grad_norm": 6.702702835093648, "learning_rate": 9.20222544454152e-07, "loss": 0.3394, "step": 1142 }, { "epoch": 0.20706521739130435, "grad_norm": 4.306005758339415, "learning_rate": 9.200634864329813e-07, "loss": 0.3832, "step": 1143 }, { "epoch": 0.2072463768115942, "grad_norm": 5.707737634342709, "learning_rate": 9.199042837819617e-07, "loss": 0.312, "step": 1144 }, { "epoch": 0.20742753623188406, "grad_norm": 6.739596517891878, "learning_rate": 9.197449365559076e-07, "loss": 0.3229, "step": 1145 }, { "epoch": 0.2076086956521739, "grad_norm": 5.032932932874201, "learning_rate": 9.195854448096831e-07, "loss": 0.4449, "step": 1146 }, { "epoch": 0.20778985507246378, "grad_norm": 14.342779741181634, "learning_rate": 9.19425808598202e-07, "loss": 0.387, "step": 1147 }, { "epoch": 0.20797101449275363, "grad_norm": 7.487756939525193, "learning_rate": 9.192660279764278e-07, "loss": 0.3715, "step": 1148 }, { "epoch": 0.20815217391304347, "grad_norm": 5.961515436167527, "learning_rate": 9.191061029993734e-07, "loss": 0.3369, "step": 1149 }, { "epoch": 0.20833333333333334, "grad_norm": 3.510495429788573, "learning_rate": 9.189460337221021e-07, "loss": 0.3005, "step": 1150 }, { "epoch": 0.2085144927536232, "grad_norm": 3.714024418691465, "learning_rate": 9.187858201997264e-07, "loss": 0.285, "step": 1151 }, { "epoch": 0.20869565217391303, "grad_norm": 6.725784531538931, "learning_rate": 9.186254624874085e-07, "loss": 0.3585, "step": 1152 }, { "epoch": 0.2088768115942029, "grad_norm": 6.605587077891203, "learning_rate": 9.184649606403604e-07, "loss": 0.3703, "step": 1153 }, { "epoch": 0.20905797101449275, "grad_norm": 3.5370045653615296, "learning_rate": 9.183043147138436e-07, "loss": 0.3839, "step": 1154 }, { "epoch": 0.20923913043478262, "grad_norm": 6.352120373584736, "learning_rate": 9.181435247631693e-07, "loss": 0.3477, "step": 1155 }, { "epoch": 0.20942028985507247, "grad_norm": 7.113565388564078, "learning_rate": 9.179825908436983e-07, "loss": 0.3512, "step": 1156 }, { "epoch": 0.2096014492753623, "grad_norm": 6.148093784747192, "learning_rate": 9.178215130108407e-07, "loss": 0.3801, "step": 1157 }, { "epoch": 0.20978260869565218, "grad_norm": 4.117767830767374, "learning_rate": 9.176602913200567e-07, "loss": 0.3722, "step": 1158 }, { "epoch": 0.20996376811594203, "grad_norm": 7.290988329095725, "learning_rate": 9.174989258268551e-07, "loss": 0.3242, "step": 1159 }, { "epoch": 0.21014492753623187, "grad_norm": 3.4110712136513355, "learning_rate": 9.173374165867955e-07, "loss": 0.3235, "step": 1160 }, { "epoch": 0.21032608695652175, "grad_norm": 4.201425450924089, "learning_rate": 9.171757636554859e-07, "loss": 0.3658, "step": 1161 }, { "epoch": 0.2105072463768116, "grad_norm": 7.851814670231926, "learning_rate": 9.170139670885841e-07, "loss": 0.3036, "step": 1162 }, { "epoch": 0.21068840579710144, "grad_norm": 7.337011470424802, "learning_rate": 9.168520269417978e-07, "loss": 0.3761, "step": 1163 }, { "epoch": 0.2108695652173913, "grad_norm": 9.634430631387612, "learning_rate": 9.166899432708835e-07, "loss": 0.3451, "step": 1164 }, { "epoch": 0.21105072463768115, "grad_norm": 4.375900947739495, "learning_rate": 9.165277161316473e-07, "loss": 0.3838, "step": 1165 }, { "epoch": 0.21123188405797103, "grad_norm": 5.230338603681001, "learning_rate": 9.16365345579945e-07, "loss": 0.3036, "step": 1166 }, { "epoch": 0.21141304347826087, "grad_norm": 6.95064796883076, "learning_rate": 9.162028316716815e-07, "loss": 0.3689, "step": 1167 }, { "epoch": 0.21159420289855072, "grad_norm": 5.169954445208204, "learning_rate": 9.160401744628108e-07, "loss": 0.3853, "step": 1168 }, { "epoch": 0.2117753623188406, "grad_norm": 6.078275299631071, "learning_rate": 9.15877374009337e-07, "loss": 0.3278, "step": 1169 }, { "epoch": 0.21195652173913043, "grad_norm": 4.771621676358347, "learning_rate": 9.157144303673128e-07, "loss": 0.3604, "step": 1170 }, { "epoch": 0.21213768115942028, "grad_norm": 11.24121228358597, "learning_rate": 9.155513435928405e-07, "loss": 0.3915, "step": 1171 }, { "epoch": 0.21231884057971015, "grad_norm": 3.498503875826368, "learning_rate": 9.153881137420717e-07, "loss": 0.3715, "step": 1172 }, { "epoch": 0.2125, "grad_norm": 3.6629310625294367, "learning_rate": 9.152247408712073e-07, "loss": 0.3061, "step": 1173 }, { "epoch": 0.21268115942028987, "grad_norm": 6.637769743333661, "learning_rate": 9.15061225036497e-07, "loss": 0.3265, "step": 1174 }, { "epoch": 0.2128623188405797, "grad_norm": 3.595438248559532, "learning_rate": 9.148975662942404e-07, "loss": 0.3375, "step": 1175 }, { "epoch": 0.21304347826086956, "grad_norm": 4.042233150619379, "learning_rate": 9.14733764700786e-07, "loss": 0.3159, "step": 1176 }, { "epoch": 0.21322463768115943, "grad_norm": 8.690905380385374, "learning_rate": 9.145698203125313e-07, "loss": 0.3509, "step": 1177 }, { "epoch": 0.21340579710144927, "grad_norm": 3.5465642413707124, "learning_rate": 9.144057331859232e-07, "loss": 0.3578, "step": 1178 }, { "epoch": 0.21358695652173912, "grad_norm": 3.077868937570869, "learning_rate": 9.142415033774577e-07, "loss": 0.3231, "step": 1179 }, { "epoch": 0.213768115942029, "grad_norm": 3.1355293571853315, "learning_rate": 9.140771309436798e-07, "loss": 0.2754, "step": 1180 }, { "epoch": 0.21394927536231884, "grad_norm": 5.896185940355914, "learning_rate": 9.139126159411838e-07, "loss": 0.3365, "step": 1181 }, { "epoch": 0.2141304347826087, "grad_norm": 5.491191469508045, "learning_rate": 9.137479584266132e-07, "loss": 0.3892, "step": 1182 }, { "epoch": 0.21431159420289855, "grad_norm": 6.511034906567878, "learning_rate": 9.135831584566597e-07, "loss": 0.3632, "step": 1183 }, { "epoch": 0.2144927536231884, "grad_norm": 6.513532927129578, "learning_rate": 9.134182160880655e-07, "loss": 0.427, "step": 1184 }, { "epoch": 0.21467391304347827, "grad_norm": 3.1819913425535016, "learning_rate": 9.132531313776207e-07, "loss": 0.2913, "step": 1185 }, { "epoch": 0.21485507246376812, "grad_norm": 9.642877373841666, "learning_rate": 9.130879043821645e-07, "loss": 0.3699, "step": 1186 }, { "epoch": 0.21503623188405796, "grad_norm": 4.5689952729653065, "learning_rate": 9.129225351585856e-07, "loss": 0.361, "step": 1187 }, { "epoch": 0.21521739130434783, "grad_norm": 3.5447655527158717, "learning_rate": 9.127570237638214e-07, "loss": 0.3415, "step": 1188 }, { "epoch": 0.21539855072463768, "grad_norm": 5.129574453628034, "learning_rate": 9.125913702548583e-07, "loss": 0.3459, "step": 1189 }, { "epoch": 0.21557971014492755, "grad_norm": 3.5775440093323105, "learning_rate": 9.124255746887314e-07, "loss": 0.285, "step": 1190 }, { "epoch": 0.2157608695652174, "grad_norm": 4.653005461091044, "learning_rate": 9.122596371225253e-07, "loss": 0.2984, "step": 1191 }, { "epoch": 0.21594202898550724, "grad_norm": 4.472719753949903, "learning_rate": 9.120935576133726e-07, "loss": 0.3945, "step": 1192 }, { "epoch": 0.2161231884057971, "grad_norm": 4.606381116902868, "learning_rate": 9.119273362184554e-07, "loss": 0.365, "step": 1193 }, { "epoch": 0.21630434782608696, "grad_norm": 6.334844303794688, "learning_rate": 9.117609729950047e-07, "loss": 0.3936, "step": 1194 }, { "epoch": 0.2164855072463768, "grad_norm": 5.284068988281052, "learning_rate": 9.115944680003001e-07, "loss": 0.301, "step": 1195 }, { "epoch": 0.21666666666666667, "grad_norm": 7.116168200697639, "learning_rate": 9.1142782129167e-07, "loss": 0.3527, "step": 1196 }, { "epoch": 0.21684782608695652, "grad_norm": 10.075623742982382, "learning_rate": 9.112610329264915e-07, "loss": 0.3031, "step": 1197 }, { "epoch": 0.21702898550724636, "grad_norm": 4.8289485313476685, "learning_rate": 9.110941029621908e-07, "loss": 0.3308, "step": 1198 }, { "epoch": 0.21721014492753624, "grad_norm": 3.954519819691023, "learning_rate": 9.109270314562427e-07, "loss": 0.3737, "step": 1199 }, { "epoch": 0.21739130434782608, "grad_norm": 4.387893849234945, "learning_rate": 9.107598184661707e-07, "loss": 0.4003, "step": 1200 }, { "epoch": 0.21739130434782608, "eval_loss": 0.34571874141693115, "eval_runtime": 9.8889, "eval_samples_per_second": 50.562, "eval_steps_per_second": 0.101, "step": 1200 }, { "epoch": 0.21757246376811595, "grad_norm": 3.5971048581940814, "learning_rate": 9.105924640495468e-07, "loss": 0.3813, "step": 1201 }, { "epoch": 0.2177536231884058, "grad_norm": 5.389401264316595, "learning_rate": 9.104249682639922e-07, "loss": 0.3159, "step": 1202 }, { "epoch": 0.21793478260869564, "grad_norm": 4.8130975732853045, "learning_rate": 9.102573311671764e-07, "loss": 0.3953, "step": 1203 }, { "epoch": 0.21811594202898552, "grad_norm": 6.5490419402611, "learning_rate": 9.100895528168177e-07, "loss": 0.3469, "step": 1204 }, { "epoch": 0.21829710144927536, "grad_norm": 5.080686226241223, "learning_rate": 9.099216332706828e-07, "loss": 0.3453, "step": 1205 }, { "epoch": 0.2184782608695652, "grad_norm": 3.7277484103384673, "learning_rate": 9.097535725865875e-07, "loss": 0.3433, "step": 1206 }, { "epoch": 0.21865942028985508, "grad_norm": 4.003709876317981, "learning_rate": 9.095853708223955e-07, "loss": 0.3447, "step": 1207 }, { "epoch": 0.21884057971014492, "grad_norm": 3.7323889864612765, "learning_rate": 9.094170280360198e-07, "loss": 0.4109, "step": 1208 }, { "epoch": 0.2190217391304348, "grad_norm": 9.602557538583484, "learning_rate": 9.092485442854214e-07, "loss": 0.4127, "step": 1209 }, { "epoch": 0.21920289855072464, "grad_norm": 7.298048008276583, "learning_rate": 9.0907991962861e-07, "loss": 0.3828, "step": 1210 }, { "epoch": 0.21938405797101448, "grad_norm": 7.943662289260129, "learning_rate": 9.089111541236444e-07, "loss": 0.3148, "step": 1211 }, { "epoch": 0.21956521739130436, "grad_norm": 6.083209857329283, "learning_rate": 9.087422478286308e-07, "loss": 0.372, "step": 1212 }, { "epoch": 0.2197463768115942, "grad_norm": 19.988960217370533, "learning_rate": 9.085732008017245e-07, "loss": 0.3387, "step": 1213 }, { "epoch": 0.21992753623188405, "grad_norm": 6.151747595618726, "learning_rate": 9.084040131011295e-07, "loss": 0.3114, "step": 1214 }, { "epoch": 0.22010869565217392, "grad_norm": 4.490543143811417, "learning_rate": 9.082346847850974e-07, "loss": 0.3727, "step": 1215 }, { "epoch": 0.22028985507246376, "grad_norm": 5.233150505420256, "learning_rate": 9.080652159119294e-07, "loss": 0.3642, "step": 1216 }, { "epoch": 0.22047101449275364, "grad_norm": 9.732369632834613, "learning_rate": 9.078956065399739e-07, "loss": 0.3257, "step": 1217 }, { "epoch": 0.22065217391304348, "grad_norm": 10.272121018431124, "learning_rate": 9.077258567276286e-07, "loss": 0.3812, "step": 1218 }, { "epoch": 0.22083333333333333, "grad_norm": 3.5933635529642474, "learning_rate": 9.075559665333389e-07, "loss": 0.4311, "step": 1219 }, { "epoch": 0.2210144927536232, "grad_norm": 4.327113704145465, "learning_rate": 9.073859360155989e-07, "loss": 0.3447, "step": 1220 }, { "epoch": 0.22119565217391304, "grad_norm": 8.95674866628867, "learning_rate": 9.072157652329509e-07, "loss": 0.3631, "step": 1221 }, { "epoch": 0.2213768115942029, "grad_norm": 3.2292604916952086, "learning_rate": 9.070454542439854e-07, "loss": 0.3502, "step": 1222 }, { "epoch": 0.22155797101449276, "grad_norm": 7.583161740693873, "learning_rate": 9.068750031073414e-07, "loss": 0.3164, "step": 1223 }, { "epoch": 0.2217391304347826, "grad_norm": 3.0738902557289536, "learning_rate": 9.067044118817062e-07, "loss": 0.3077, "step": 1224 }, { "epoch": 0.22192028985507245, "grad_norm": 4.161213231321821, "learning_rate": 9.065336806258148e-07, "loss": 0.361, "step": 1225 }, { "epoch": 0.22210144927536232, "grad_norm": 6.967930997922288, "learning_rate": 9.06362809398451e-07, "loss": 0.3038, "step": 1226 }, { "epoch": 0.22228260869565217, "grad_norm": 5.623370037391552, "learning_rate": 9.061917982584466e-07, "loss": 0.4154, "step": 1227 }, { "epoch": 0.22246376811594204, "grad_norm": 7.558402566314015, "learning_rate": 9.060206472646814e-07, "loss": 0.4089, "step": 1228 }, { "epoch": 0.22264492753623188, "grad_norm": 9.141692572595682, "learning_rate": 9.058493564760836e-07, "loss": 0.4023, "step": 1229 }, { "epoch": 0.22282608695652173, "grad_norm": 4.196488676053457, "learning_rate": 9.056779259516294e-07, "loss": 0.3033, "step": 1230 }, { "epoch": 0.2230072463768116, "grad_norm": 4.294348077674157, "learning_rate": 9.055063557503433e-07, "loss": 0.3704, "step": 1231 }, { "epoch": 0.22318840579710145, "grad_norm": 6.530010340490466, "learning_rate": 9.053346459312974e-07, "loss": 0.3223, "step": 1232 }, { "epoch": 0.2233695652173913, "grad_norm": 3.8651163122606316, "learning_rate": 9.051627965536123e-07, "loss": 0.4018, "step": 1233 }, { "epoch": 0.22355072463768116, "grad_norm": 4.103493321918278, "learning_rate": 9.049908076764569e-07, "loss": 0.3248, "step": 1234 }, { "epoch": 0.223731884057971, "grad_norm": 10.96942520906358, "learning_rate": 9.048186793590475e-07, "loss": 0.3412, "step": 1235 }, { "epoch": 0.22391304347826088, "grad_norm": 6.126306015723247, "learning_rate": 9.046464116606487e-07, "loss": 0.3866, "step": 1236 }, { "epoch": 0.22409420289855073, "grad_norm": 4.143063120786398, "learning_rate": 9.04474004640573e-07, "loss": 0.3171, "step": 1237 }, { "epoch": 0.22427536231884057, "grad_norm": 3.492663278134232, "learning_rate": 9.043014583581812e-07, "loss": 0.2607, "step": 1238 }, { "epoch": 0.22445652173913044, "grad_norm": 3.939432702923544, "learning_rate": 9.041287728728816e-07, "loss": 0.3498, "step": 1239 }, { "epoch": 0.2246376811594203, "grad_norm": 5.335678788094838, "learning_rate": 9.039559482441307e-07, "loss": 0.2717, "step": 1240 }, { "epoch": 0.22481884057971013, "grad_norm": 4.165989925810196, "learning_rate": 9.037829845314328e-07, "loss": 0.3733, "step": 1241 }, { "epoch": 0.225, "grad_norm": 11.20480507288977, "learning_rate": 9.036098817943402e-07, "loss": 0.3663, "step": 1242 }, { "epoch": 0.22518115942028985, "grad_norm": 5.938461493367334, "learning_rate": 9.034366400924529e-07, "loss": 0.3672, "step": 1243 }, { "epoch": 0.22536231884057972, "grad_norm": 8.216875696420953, "learning_rate": 9.03263259485419e-07, "loss": 0.3799, "step": 1244 }, { "epoch": 0.22554347826086957, "grad_norm": 7.1361281645982, "learning_rate": 9.03089740032934e-07, "loss": 0.294, "step": 1245 }, { "epoch": 0.2257246376811594, "grad_norm": 3.2356137509866305, "learning_rate": 9.029160817947419e-07, "loss": 0.2933, "step": 1246 }, { "epoch": 0.22590579710144928, "grad_norm": 5.649658822053839, "learning_rate": 9.027422848306336e-07, "loss": 0.3068, "step": 1247 }, { "epoch": 0.22608695652173913, "grad_norm": 5.312495746898288, "learning_rate": 9.025683492004483e-07, "loss": 0.3898, "step": 1248 }, { "epoch": 0.22626811594202897, "grad_norm": 5.50819495308927, "learning_rate": 9.023942749640731e-07, "loss": 0.3415, "step": 1249 }, { "epoch": 0.22644927536231885, "grad_norm": 3.9040870967515784, "learning_rate": 9.022200621814425e-07, "loss": 0.3728, "step": 1250 }, { "epoch": 0.2266304347826087, "grad_norm": 4.93548118832667, "learning_rate": 9.020457109125386e-07, "loss": 0.3264, "step": 1251 }, { "epoch": 0.22681159420289856, "grad_norm": 3.4831176329452336, "learning_rate": 9.018712212173915e-07, "loss": 0.3494, "step": 1252 }, { "epoch": 0.2269927536231884, "grad_norm": 3.762130987872424, "learning_rate": 9.01696593156079e-07, "loss": 0.3581, "step": 1253 }, { "epoch": 0.22717391304347825, "grad_norm": 3.960631701449595, "learning_rate": 9.01521826788726e-07, "loss": 0.3625, "step": 1254 }, { "epoch": 0.22735507246376813, "grad_norm": 6.354525962480676, "learning_rate": 9.013469221755057e-07, "loss": 0.3446, "step": 1255 }, { "epoch": 0.22753623188405797, "grad_norm": 7.427381252438664, "learning_rate": 9.011718793766384e-07, "loss": 0.3044, "step": 1256 }, { "epoch": 0.22771739130434782, "grad_norm": 11.772935437533649, "learning_rate": 9.009966984523923e-07, "loss": 0.3279, "step": 1257 }, { "epoch": 0.2278985507246377, "grad_norm": 5.047475807930224, "learning_rate": 9.008213794630829e-07, "loss": 0.2904, "step": 1258 }, { "epoch": 0.22807971014492753, "grad_norm": 5.431777163692616, "learning_rate": 9.006459224690734e-07, "loss": 0.3078, "step": 1259 }, { "epoch": 0.22826086956521738, "grad_norm": 8.175128606886476, "learning_rate": 9.004703275307746e-07, "loss": 0.3635, "step": 1260 }, { "epoch": 0.22844202898550725, "grad_norm": 6.837442307314524, "learning_rate": 9.002945947086445e-07, "loss": 0.307, "step": 1261 }, { "epoch": 0.2286231884057971, "grad_norm": 9.695243546120613, "learning_rate": 9.001187240631889e-07, "loss": 0.326, "step": 1262 }, { "epoch": 0.22880434782608697, "grad_norm": 3.7129745044453446, "learning_rate": 8.999427156549606e-07, "loss": 0.2894, "step": 1263 }, { "epoch": 0.2289855072463768, "grad_norm": 8.327224382648378, "learning_rate": 8.997665695445606e-07, "loss": 0.3881, "step": 1264 }, { "epoch": 0.22916666666666666, "grad_norm": 7.117905841961984, "learning_rate": 8.995902857926363e-07, "loss": 0.3668, "step": 1265 }, { "epoch": 0.22934782608695653, "grad_norm": 3.4377048315831282, "learning_rate": 8.994138644598834e-07, "loss": 0.2948, "step": 1266 }, { "epoch": 0.22952898550724637, "grad_norm": 4.12329980743616, "learning_rate": 8.992373056070446e-07, "loss": 0.3214, "step": 1267 }, { "epoch": 0.22971014492753622, "grad_norm": 3.8999094160745753, "learning_rate": 8.990606092949098e-07, "loss": 0.3579, "step": 1268 }, { "epoch": 0.2298913043478261, "grad_norm": 4.181327819322105, "learning_rate": 8.988837755843164e-07, "loss": 0.2867, "step": 1269 }, { "epoch": 0.23007246376811594, "grad_norm": 9.358893804692244, "learning_rate": 8.987068045361492e-07, "loss": 0.3965, "step": 1270 }, { "epoch": 0.2302536231884058, "grad_norm": 10.158978678505177, "learning_rate": 8.9852969621134e-07, "loss": 0.2748, "step": 1271 }, { "epoch": 0.23043478260869565, "grad_norm": 7.262494787791029, "learning_rate": 8.983524506708681e-07, "loss": 0.3918, "step": 1272 }, { "epoch": 0.2306159420289855, "grad_norm": 3.3771311901426024, "learning_rate": 8.9817506797576e-07, "loss": 0.3373, "step": 1273 }, { "epoch": 0.23079710144927537, "grad_norm": 5.646504676227212, "learning_rate": 8.979975481870895e-07, "loss": 0.3661, "step": 1274 }, { "epoch": 0.23097826086956522, "grad_norm": 6.455236003510512, "learning_rate": 8.978198913659774e-07, "loss": 0.3181, "step": 1275 }, { "epoch": 0.23115942028985506, "grad_norm": 8.34846904885038, "learning_rate": 8.976420975735917e-07, "loss": 0.3246, "step": 1276 }, { "epoch": 0.23134057971014493, "grad_norm": 9.142471943183933, "learning_rate": 8.974641668711478e-07, "loss": 0.4117, "step": 1277 }, { "epoch": 0.23152173913043478, "grad_norm": 4.302318983661124, "learning_rate": 8.972860993199081e-07, "loss": 0.3284, "step": 1278 }, { "epoch": 0.23170289855072465, "grad_norm": 4.896572809261174, "learning_rate": 8.971078949811819e-07, "loss": 0.3486, "step": 1279 }, { "epoch": 0.2318840579710145, "grad_norm": 7.513370874886547, "learning_rate": 8.969295539163258e-07, "loss": 0.3539, "step": 1280 }, { "epoch": 0.23206521739130434, "grad_norm": 5.216826327162976, "learning_rate": 8.967510761867439e-07, "loss": 0.3019, "step": 1281 }, { "epoch": 0.2322463768115942, "grad_norm": 10.561040987414916, "learning_rate": 8.965724618538864e-07, "loss": 0.3461, "step": 1282 }, { "epoch": 0.23242753623188406, "grad_norm": 6.928651931060982, "learning_rate": 8.963937109792514e-07, "loss": 0.3109, "step": 1283 }, { "epoch": 0.2326086956521739, "grad_norm": 3.5977461335210723, "learning_rate": 8.962148236243834e-07, "loss": 0.3292, "step": 1284 }, { "epoch": 0.23278985507246377, "grad_norm": 4.195109345427405, "learning_rate": 8.960357998508745e-07, "loss": 0.4093, "step": 1285 }, { "epoch": 0.23297101449275362, "grad_norm": 5.683806680561408, "learning_rate": 8.958566397203632e-07, "loss": 0.3856, "step": 1286 }, { "epoch": 0.23315217391304346, "grad_norm": 4.826928924661649, "learning_rate": 8.956773432945353e-07, "loss": 0.3077, "step": 1287 }, { "epoch": 0.23333333333333334, "grad_norm": 4.606270305212832, "learning_rate": 8.954979106351232e-07, "loss": 0.3164, "step": 1288 }, { "epoch": 0.23351449275362318, "grad_norm": 4.93460270541993, "learning_rate": 8.953183418039065e-07, "loss": 0.314, "step": 1289 }, { "epoch": 0.23369565217391305, "grad_norm": 3.246901983250382, "learning_rate": 8.951386368627118e-07, "loss": 0.3032, "step": 1290 }, { "epoch": 0.2338768115942029, "grad_norm": 5.779498570238692, "learning_rate": 8.949587958734122e-07, "loss": 0.3142, "step": 1291 }, { "epoch": 0.23405797101449274, "grad_norm": 8.125378740331204, "learning_rate": 8.947788188979279e-07, "loss": 0.3669, "step": 1292 }, { "epoch": 0.23423913043478262, "grad_norm": 4.227889042202217, "learning_rate": 8.945987059982256e-07, "loss": 0.3128, "step": 1293 }, { "epoch": 0.23442028985507246, "grad_norm": 16.149319615221966, "learning_rate": 8.944184572363193e-07, "loss": 0.3649, "step": 1294 }, { "epoch": 0.2346014492753623, "grad_norm": 5.333794372543125, "learning_rate": 8.942380726742693e-07, "loss": 0.3146, "step": 1295 }, { "epoch": 0.23478260869565218, "grad_norm": 9.186710127409288, "learning_rate": 8.940575523741832e-07, "loss": 0.3411, "step": 1296 }, { "epoch": 0.23496376811594202, "grad_norm": 9.451298148970924, "learning_rate": 8.938768963982144e-07, "loss": 0.356, "step": 1297 }, { "epoch": 0.2351449275362319, "grad_norm": 7.616715147955477, "learning_rate": 8.936961048085641e-07, "loss": 0.3349, "step": 1298 }, { "epoch": 0.23532608695652174, "grad_norm": 7.3813470652580895, "learning_rate": 8.935151776674794e-07, "loss": 0.4008, "step": 1299 }, { "epoch": 0.23550724637681159, "grad_norm": 16.072109056540143, "learning_rate": 8.933341150372546e-07, "loss": 0.3997, "step": 1300 }, { "epoch": 0.23550724637681159, "eval_loss": 0.34568750858306885, "eval_runtime": 9.768, "eval_samples_per_second": 51.188, "eval_steps_per_second": 0.102, "step": 1300 }, { "epoch": 0.23568840579710146, "grad_norm": 8.18685560719828, "learning_rate": 8.931529169802304e-07, "loss": 0.2744, "step": 1301 }, { "epoch": 0.2358695652173913, "grad_norm": 6.465716870823023, "learning_rate": 8.929715835587941e-07, "loss": 0.333, "step": 1302 }, { "epoch": 0.23605072463768115, "grad_norm": 8.482431902401542, "learning_rate": 8.927901148353796e-07, "loss": 0.3171, "step": 1303 }, { "epoch": 0.23623188405797102, "grad_norm": 5.230415572187631, "learning_rate": 8.926085108724674e-07, "loss": 0.3434, "step": 1304 }, { "epoch": 0.23641304347826086, "grad_norm": 5.573864729548042, "learning_rate": 8.924267717325848e-07, "loss": 0.3569, "step": 1305 }, { "epoch": 0.23659420289855074, "grad_norm": 5.622625663198217, "learning_rate": 8.922448974783052e-07, "loss": 0.4255, "step": 1306 }, { "epoch": 0.23677536231884058, "grad_norm": 5.2447050884088515, "learning_rate": 8.92062888172249e-07, "loss": 0.3503, "step": 1307 }, { "epoch": 0.23695652173913043, "grad_norm": 6.259870194613778, "learning_rate": 8.918807438770828e-07, "loss": 0.3027, "step": 1308 }, { "epoch": 0.2371376811594203, "grad_norm": 13.288604019758068, "learning_rate": 8.916984646555197e-07, "loss": 0.395, "step": 1309 }, { "epoch": 0.23731884057971014, "grad_norm": 6.59668440470176, "learning_rate": 8.915160505703192e-07, "loss": 0.3228, "step": 1310 }, { "epoch": 0.2375, "grad_norm": 9.884336737057396, "learning_rate": 8.913335016842876e-07, "loss": 0.3704, "step": 1311 }, { "epoch": 0.23768115942028986, "grad_norm": 2.8813813729617523, "learning_rate": 8.911508180602771e-07, "loss": 0.3084, "step": 1312 }, { "epoch": 0.2378623188405797, "grad_norm": 4.282031086280285, "learning_rate": 8.909679997611868e-07, "loss": 0.3296, "step": 1313 }, { "epoch": 0.23804347826086958, "grad_norm": 3.2672018028003778, "learning_rate": 8.907850468499614e-07, "loss": 0.3535, "step": 1314 }, { "epoch": 0.23822463768115942, "grad_norm": 7.19297103709532, "learning_rate": 8.90601959389593e-07, "loss": 0.3431, "step": 1315 }, { "epoch": 0.23840579710144927, "grad_norm": 6.197478698744432, "learning_rate": 8.904187374431193e-07, "loss": 0.3391, "step": 1316 }, { "epoch": 0.23858695652173914, "grad_norm": 6.2460577721160595, "learning_rate": 8.902353810736245e-07, "loss": 0.32, "step": 1317 }, { "epoch": 0.23876811594202899, "grad_norm": 5.8844281610349105, "learning_rate": 8.900518903442389e-07, "loss": 0.3134, "step": 1318 }, { "epoch": 0.23894927536231883, "grad_norm": 6.4498932788279575, "learning_rate": 8.898682653181393e-07, "loss": 0.3516, "step": 1319 }, { "epoch": 0.2391304347826087, "grad_norm": 3.490721115063493, "learning_rate": 8.89684506058549e-07, "loss": 0.3572, "step": 1320 }, { "epoch": 0.23931159420289855, "grad_norm": 8.669454284194035, "learning_rate": 8.895006126287366e-07, "loss": 0.3188, "step": 1321 }, { "epoch": 0.2394927536231884, "grad_norm": 9.137855564274608, "learning_rate": 8.893165850920179e-07, "loss": 0.3631, "step": 1322 }, { "epoch": 0.23967391304347826, "grad_norm": 7.586913383241209, "learning_rate": 8.891324235117543e-07, "loss": 0.4019, "step": 1323 }, { "epoch": 0.2398550724637681, "grad_norm": 3.4782370530772093, "learning_rate": 8.889481279513536e-07, "loss": 0.3165, "step": 1324 }, { "epoch": 0.24003623188405798, "grad_norm": 5.828463556759546, "learning_rate": 8.887636984742694e-07, "loss": 0.4064, "step": 1325 }, { "epoch": 0.24021739130434783, "grad_norm": 7.846689692953546, "learning_rate": 8.885791351440019e-07, "loss": 0.351, "step": 1326 }, { "epoch": 0.24039855072463767, "grad_norm": 5.331304685578559, "learning_rate": 8.883944380240971e-07, "loss": 0.3246, "step": 1327 }, { "epoch": 0.24057971014492754, "grad_norm": 11.152378530603334, "learning_rate": 8.882096071781471e-07, "loss": 0.3244, "step": 1328 }, { "epoch": 0.2407608695652174, "grad_norm": 3.961230665090909, "learning_rate": 8.880246426697899e-07, "loss": 0.3408, "step": 1329 }, { "epoch": 0.24094202898550723, "grad_norm": 3.0672571747120023, "learning_rate": 8.878395445627096e-07, "loss": 0.2731, "step": 1330 }, { "epoch": 0.2411231884057971, "grad_norm": 5.452977269629534, "learning_rate": 8.876543129206367e-07, "loss": 0.3356, "step": 1331 }, { "epoch": 0.24130434782608695, "grad_norm": 5.2650355665238955, "learning_rate": 8.874689478073469e-07, "loss": 0.3108, "step": 1332 }, { "epoch": 0.24148550724637682, "grad_norm": 9.959647030914498, "learning_rate": 8.872834492866628e-07, "loss": 0.3843, "step": 1333 }, { "epoch": 0.24166666666666667, "grad_norm": 6.934439359648218, "learning_rate": 8.870978174224518e-07, "loss": 0.3543, "step": 1334 }, { "epoch": 0.2418478260869565, "grad_norm": 9.316571712068209, "learning_rate": 8.869120522786284e-07, "loss": 0.388, "step": 1335 }, { "epoch": 0.24202898550724639, "grad_norm": 3.301519857759256, "learning_rate": 8.867261539191521e-07, "loss": 0.3263, "step": 1336 }, { "epoch": 0.24221014492753623, "grad_norm": 4.572883545706603, "learning_rate": 8.865401224080285e-07, "loss": 0.3425, "step": 1337 }, { "epoch": 0.24239130434782608, "grad_norm": 3.460192979791663, "learning_rate": 8.863539578093095e-07, "loss": 0.3369, "step": 1338 }, { "epoch": 0.24257246376811595, "grad_norm": 4.543863120297665, "learning_rate": 8.861676601870922e-07, "loss": 0.2846, "step": 1339 }, { "epoch": 0.2427536231884058, "grad_norm": 6.8038943778211465, "learning_rate": 8.859812296055198e-07, "loss": 0.3261, "step": 1340 }, { "epoch": 0.24293478260869567, "grad_norm": 4.1486455698856615, "learning_rate": 8.857946661287812e-07, "loss": 0.3614, "step": 1341 }, { "epoch": 0.2431159420289855, "grad_norm": 5.066788304426683, "learning_rate": 8.856079698211109e-07, "loss": 0.3289, "step": 1342 }, { "epoch": 0.24329710144927535, "grad_norm": 2.930288002248603, "learning_rate": 8.854211407467898e-07, "loss": 0.2845, "step": 1343 }, { "epoch": 0.24347826086956523, "grad_norm": 4.994460981204123, "learning_rate": 8.852341789701439e-07, "loss": 0.4155, "step": 1344 }, { "epoch": 0.24365942028985507, "grad_norm": 6.28643347535595, "learning_rate": 8.850470845555447e-07, "loss": 0.3741, "step": 1345 }, { "epoch": 0.24384057971014492, "grad_norm": 7.528701749209174, "learning_rate": 8.848598575674099e-07, "loss": 0.4038, "step": 1346 }, { "epoch": 0.2440217391304348, "grad_norm": 5.138227496202631, "learning_rate": 8.846724980702026e-07, "loss": 0.3188, "step": 1347 }, { "epoch": 0.24420289855072463, "grad_norm": 3.773449850025705, "learning_rate": 8.844850061284317e-07, "loss": 0.3418, "step": 1348 }, { "epoch": 0.24438405797101448, "grad_norm": 6.675721193558435, "learning_rate": 8.842973818066515e-07, "loss": 0.3306, "step": 1349 }, { "epoch": 0.24456521739130435, "grad_norm": 10.627721576771867, "learning_rate": 8.841096251694618e-07, "loss": 0.3078, "step": 1350 }, { "epoch": 0.2447463768115942, "grad_norm": 6.955005601683814, "learning_rate": 8.839217362815081e-07, "loss": 0.3235, "step": 1351 }, { "epoch": 0.24492753623188407, "grad_norm": 4.15898697119472, "learning_rate": 8.837337152074818e-07, "loss": 0.317, "step": 1352 }, { "epoch": 0.2451086956521739, "grad_norm": 4.054679389135684, "learning_rate": 8.83545562012119e-07, "loss": 0.3682, "step": 1353 }, { "epoch": 0.24528985507246376, "grad_norm": 7.587696624323504, "learning_rate": 8.83357276760202e-07, "loss": 0.3259, "step": 1354 }, { "epoch": 0.24547101449275363, "grad_norm": 6.120132345277472, "learning_rate": 8.831688595165583e-07, "loss": 0.3007, "step": 1355 }, { "epoch": 0.24565217391304348, "grad_norm": 3.7569121835614046, "learning_rate": 8.829803103460607e-07, "loss": 0.3274, "step": 1356 }, { "epoch": 0.24583333333333332, "grad_norm": 4.661618593778217, "learning_rate": 8.827916293136275e-07, "loss": 0.3701, "step": 1357 }, { "epoch": 0.2460144927536232, "grad_norm": 7.192015103330489, "learning_rate": 8.826028164842228e-07, "loss": 0.3249, "step": 1358 }, { "epoch": 0.24619565217391304, "grad_norm": 5.832425782161701, "learning_rate": 8.824138719228556e-07, "loss": 0.3143, "step": 1359 }, { "epoch": 0.2463768115942029, "grad_norm": 4.257475375062232, "learning_rate": 8.822247956945803e-07, "loss": 0.2422, "step": 1360 }, { "epoch": 0.24655797101449275, "grad_norm": 10.83770867029943, "learning_rate": 8.820355878644968e-07, "loss": 0.4091, "step": 1361 }, { "epoch": 0.2467391304347826, "grad_norm": 4.778022238706081, "learning_rate": 8.818462484977502e-07, "loss": 0.3737, "step": 1362 }, { "epoch": 0.24692028985507247, "grad_norm": 4.56655115812147, "learning_rate": 8.816567776595312e-07, "loss": 0.3578, "step": 1363 }, { "epoch": 0.24710144927536232, "grad_norm": 6.3888702589878426, "learning_rate": 8.814671754150754e-07, "loss": 0.3076, "step": 1364 }, { "epoch": 0.24728260869565216, "grad_norm": 8.767249853037171, "learning_rate": 8.812774418296633e-07, "loss": 0.3739, "step": 1365 }, { "epoch": 0.24746376811594203, "grad_norm": 6.242333960535613, "learning_rate": 8.810875769686217e-07, "loss": 0.3472, "step": 1366 }, { "epoch": 0.24764492753623188, "grad_norm": 9.666806036687468, "learning_rate": 8.808975808973218e-07, "loss": 0.3757, "step": 1367 }, { "epoch": 0.24782608695652175, "grad_norm": 6.506336349139156, "learning_rate": 8.807074536811798e-07, "loss": 0.356, "step": 1368 }, { "epoch": 0.2480072463768116, "grad_norm": 4.092291263586452, "learning_rate": 8.805171953856578e-07, "loss": 0.3098, "step": 1369 }, { "epoch": 0.24818840579710144, "grad_norm": 4.800265389206893, "learning_rate": 8.803268060762626e-07, "loss": 0.3216, "step": 1370 }, { "epoch": 0.2483695652173913, "grad_norm": 9.205497769879106, "learning_rate": 8.80136285818546e-07, "loss": 0.3859, "step": 1371 }, { "epoch": 0.24855072463768116, "grad_norm": 3.7409099979489184, "learning_rate": 8.799456346781051e-07, "loss": 0.2962, "step": 1372 }, { "epoch": 0.248731884057971, "grad_norm": 4.667003183725607, "learning_rate": 8.797548527205818e-07, "loss": 0.329, "step": 1373 }, { "epoch": 0.24891304347826088, "grad_norm": 9.636347282381372, "learning_rate": 8.795639400116636e-07, "loss": 0.3747, "step": 1374 }, { "epoch": 0.24909420289855072, "grad_norm": 10.690178820862956, "learning_rate": 8.793728966170824e-07, "loss": 0.3871, "step": 1375 }, { "epoch": 0.2492753623188406, "grad_norm": 4.02348810073237, "learning_rate": 8.791817226026152e-07, "loss": 0.3476, "step": 1376 }, { "epoch": 0.24945652173913044, "grad_norm": 7.015420659830804, "learning_rate": 8.789904180340843e-07, "loss": 0.3312, "step": 1377 }, { "epoch": 0.24963768115942028, "grad_norm": 7.122428647887501, "learning_rate": 8.78798982977357e-07, "loss": 0.3777, "step": 1378 }, { "epoch": 0.24981884057971016, "grad_norm": 3.781332296034064, "learning_rate": 8.786074174983451e-07, "loss": 0.3469, "step": 1379 }, { "epoch": 0.25, "grad_norm": 6.51224209405785, "learning_rate": 8.784157216630053e-07, "loss": 0.2774, "step": 1380 }, { "epoch": 0.25018115942028984, "grad_norm": 3.8409980872379723, "learning_rate": 8.782238955373396e-07, "loss": 0.3129, "step": 1381 }, { "epoch": 0.2503623188405797, "grad_norm": 6.62076412471385, "learning_rate": 8.780319391873947e-07, "loss": 0.2931, "step": 1382 }, { "epoch": 0.2505434782608696, "grad_norm": 4.753718622113229, "learning_rate": 8.778398526792619e-07, "loss": 0.2701, "step": 1383 }, { "epoch": 0.25072463768115943, "grad_norm": 6.038729566210822, "learning_rate": 8.776476360790775e-07, "loss": 0.2765, "step": 1384 }, { "epoch": 0.2509057971014493, "grad_norm": 7.094377821118904, "learning_rate": 8.774552894530227e-07, "loss": 0.3313, "step": 1385 }, { "epoch": 0.2510869565217391, "grad_norm": 4.415781042475571, "learning_rate": 8.772628128673233e-07, "loss": 0.3672, "step": 1386 }, { "epoch": 0.25126811594202897, "grad_norm": 4.518381347143787, "learning_rate": 8.770702063882501e-07, "loss": 0.2967, "step": 1387 }, { "epoch": 0.2514492753623188, "grad_norm": 3.8771159358588765, "learning_rate": 8.768774700821182e-07, "loss": 0.3756, "step": 1388 }, { "epoch": 0.2516304347826087, "grad_norm": 8.563823161926644, "learning_rate": 8.766846040152875e-07, "loss": 0.3777, "step": 1389 }, { "epoch": 0.25181159420289856, "grad_norm": 5.497199992795406, "learning_rate": 8.764916082541631e-07, "loss": 0.3438, "step": 1390 }, { "epoch": 0.2519927536231884, "grad_norm": 4.20332765962466, "learning_rate": 8.76298482865194e-07, "loss": 0.3364, "step": 1391 }, { "epoch": 0.25217391304347825, "grad_norm": 5.173376962279245, "learning_rate": 8.761052279148742e-07, "loss": 0.3392, "step": 1392 }, { "epoch": 0.2523550724637681, "grad_norm": 8.350085792266798, "learning_rate": 8.759118434697426e-07, "loss": 0.313, "step": 1393 }, { "epoch": 0.252536231884058, "grad_norm": 6.651744425298431, "learning_rate": 8.757183295963822e-07, "loss": 0.2971, "step": 1394 }, { "epoch": 0.25271739130434784, "grad_norm": 3.918365506340284, "learning_rate": 8.755246863614205e-07, "loss": 0.3379, "step": 1395 }, { "epoch": 0.2528985507246377, "grad_norm": 3.777983201187088, "learning_rate": 8.753309138315301e-07, "loss": 0.3045, "step": 1396 }, { "epoch": 0.2530797101449275, "grad_norm": 4.360621862342518, "learning_rate": 8.751370120734278e-07, "loss": 0.3767, "step": 1397 }, { "epoch": 0.2532608695652174, "grad_norm": 3.742192485163964, "learning_rate": 8.749429811538747e-07, "loss": 0.2821, "step": 1398 }, { "epoch": 0.2534420289855073, "grad_norm": 4.352832978771017, "learning_rate": 8.747488211396767e-07, "loss": 0.3097, "step": 1399 }, { "epoch": 0.2536231884057971, "grad_norm": 5.536884522151105, "learning_rate": 8.745545320976842e-07, "loss": 0.3435, "step": 1400 }, { "epoch": 0.2536231884057971, "eval_loss": 0.33445313572883606, "eval_runtime": 9.7801, "eval_samples_per_second": 51.124, "eval_steps_per_second": 0.102, "step": 1400 }, { "epoch": 0.25380434782608696, "grad_norm": 5.400364690622893, "learning_rate": 8.743601140947913e-07, "loss": 0.3466, "step": 1401 }, { "epoch": 0.2539855072463768, "grad_norm": 3.2657391415928276, "learning_rate": 8.741655671979376e-07, "loss": 0.2774, "step": 1402 }, { "epoch": 0.25416666666666665, "grad_norm": 7.22819625773207, "learning_rate": 8.739708914741061e-07, "loss": 0.3492, "step": 1403 }, { "epoch": 0.2543478260869565, "grad_norm": 6.45319667061321, "learning_rate": 8.737760869903247e-07, "loss": 0.3256, "step": 1404 }, { "epoch": 0.2545289855072464, "grad_norm": 6.030233811979014, "learning_rate": 8.735811538136658e-07, "loss": 0.3067, "step": 1405 }, { "epoch": 0.25471014492753624, "grad_norm": 10.12705551702608, "learning_rate": 8.733860920112454e-07, "loss": 0.3061, "step": 1406 }, { "epoch": 0.2548913043478261, "grad_norm": 4.538490626905695, "learning_rate": 8.731909016502246e-07, "loss": 0.2935, "step": 1407 }, { "epoch": 0.25507246376811593, "grad_norm": 8.729687840643468, "learning_rate": 8.72995582797808e-07, "loss": 0.4079, "step": 1408 }, { "epoch": 0.2552536231884058, "grad_norm": 3.8079831634083083, "learning_rate": 8.728001355212449e-07, "loss": 0.3504, "step": 1409 }, { "epoch": 0.2554347826086957, "grad_norm": 10.674909153311495, "learning_rate": 8.726045598878288e-07, "loss": 0.3547, "step": 1410 }, { "epoch": 0.2556159420289855, "grad_norm": 3.860182279531041, "learning_rate": 8.724088559648974e-07, "loss": 0.3016, "step": 1411 }, { "epoch": 0.25579710144927537, "grad_norm": 5.922315898188411, "learning_rate": 8.722130238198322e-07, "loss": 0.3296, "step": 1412 }, { "epoch": 0.2559782608695652, "grad_norm": 4.622691813487052, "learning_rate": 8.720170635200594e-07, "loss": 0.2657, "step": 1413 }, { "epoch": 0.25615942028985506, "grad_norm": 16.349822303303963, "learning_rate": 8.718209751330491e-07, "loss": 0.3943, "step": 1414 }, { "epoch": 0.2563405797101449, "grad_norm": 5.153830498466211, "learning_rate": 8.716247587263153e-07, "loss": 0.347, "step": 1415 }, { "epoch": 0.2565217391304348, "grad_norm": 4.988211826887706, "learning_rate": 8.714284143674162e-07, "loss": 0.3018, "step": 1416 }, { "epoch": 0.25670289855072465, "grad_norm": 4.837196743173714, "learning_rate": 8.712319421239541e-07, "loss": 0.3177, "step": 1417 }, { "epoch": 0.2568840579710145, "grad_norm": 3.7804238674886537, "learning_rate": 8.710353420635754e-07, "loss": 0.3129, "step": 1418 }, { "epoch": 0.25706521739130433, "grad_norm": 4.151605183838069, "learning_rate": 8.708386142539705e-07, "loss": 0.3663, "step": 1419 }, { "epoch": 0.2572463768115942, "grad_norm": 4.875564503388854, "learning_rate": 8.706417587628737e-07, "loss": 0.3683, "step": 1420 }, { "epoch": 0.2574275362318841, "grad_norm": 3.8614735515668897, "learning_rate": 8.704447756580631e-07, "loss": 0.3386, "step": 1421 }, { "epoch": 0.2576086956521739, "grad_norm": 8.498932272802971, "learning_rate": 8.702476650073611e-07, "loss": 0.3494, "step": 1422 }, { "epoch": 0.25778985507246377, "grad_norm": 4.193900444081362, "learning_rate": 8.700504268786338e-07, "loss": 0.3281, "step": 1423 }, { "epoch": 0.2579710144927536, "grad_norm": 4.631827800441487, "learning_rate": 8.698530613397912e-07, "loss": 0.312, "step": 1424 }, { "epoch": 0.25815217391304346, "grad_norm": 6.276391739029858, "learning_rate": 8.696555684587872e-07, "loss": 0.3261, "step": 1425 }, { "epoch": 0.25833333333333336, "grad_norm": 4.120197921507397, "learning_rate": 8.694579483036194e-07, "loss": 0.3121, "step": 1426 }, { "epoch": 0.2585144927536232, "grad_norm": 4.9825472713501835, "learning_rate": 8.692602009423296e-07, "loss": 0.3355, "step": 1427 }, { "epoch": 0.25869565217391305, "grad_norm": 5.043230537479501, "learning_rate": 8.690623264430028e-07, "loss": 0.3173, "step": 1428 }, { "epoch": 0.2588768115942029, "grad_norm": 5.093064812812383, "learning_rate": 8.688643248737686e-07, "loss": 0.3303, "step": 1429 }, { "epoch": 0.25905797101449274, "grad_norm": 8.815620314056591, "learning_rate": 8.686661963027995e-07, "loss": 0.296, "step": 1430 }, { "epoch": 0.2592391304347826, "grad_norm": 4.862947599875481, "learning_rate": 8.684679407983122e-07, "loss": 0.307, "step": 1431 }, { "epoch": 0.2594202898550725, "grad_norm": 3.535632912298651, "learning_rate": 8.682695584285671e-07, "loss": 0.3596, "step": 1432 }, { "epoch": 0.25960144927536233, "grad_norm": 4.97525373354084, "learning_rate": 8.680710492618682e-07, "loss": 0.2819, "step": 1433 }, { "epoch": 0.2597826086956522, "grad_norm": 5.947201326926296, "learning_rate": 8.678724133665629e-07, "loss": 0.3759, "step": 1434 }, { "epoch": 0.259963768115942, "grad_norm": 3.101348659685582, "learning_rate": 8.676736508110428e-07, "loss": 0.3224, "step": 1435 }, { "epoch": 0.26014492753623186, "grad_norm": 4.420082967897086, "learning_rate": 8.674747616637426e-07, "loss": 0.392, "step": 1436 }, { "epoch": 0.26032608695652176, "grad_norm": 4.660980675047265, "learning_rate": 8.67275745993141e-07, "loss": 0.3455, "step": 1437 }, { "epoch": 0.2605072463768116, "grad_norm": 7.488751325359456, "learning_rate": 8.670766038677597e-07, "loss": 0.386, "step": 1438 }, { "epoch": 0.26068840579710145, "grad_norm": 3.1463366824675285, "learning_rate": 8.668773353561645e-07, "loss": 0.3355, "step": 1439 }, { "epoch": 0.2608695652173913, "grad_norm": 6.845061398395753, "learning_rate": 8.666779405269644e-07, "loss": 0.3263, "step": 1440 }, { "epoch": 0.26105072463768114, "grad_norm": 3.499376675959107, "learning_rate": 8.66478419448812e-07, "loss": 0.2943, "step": 1441 }, { "epoch": 0.26123188405797104, "grad_norm": 8.645841686681946, "learning_rate": 8.662787721904034e-07, "loss": 0.3257, "step": 1442 }, { "epoch": 0.2614130434782609, "grad_norm": 3.343405746176046, "learning_rate": 8.66078998820478e-07, "loss": 0.3149, "step": 1443 }, { "epoch": 0.26159420289855073, "grad_norm": 8.822584109198377, "learning_rate": 8.658790994078189e-07, "loss": 0.2464, "step": 1444 }, { "epoch": 0.2617753623188406, "grad_norm": 5.3212478975305295, "learning_rate": 8.656790740212523e-07, "loss": 0.35, "step": 1445 }, { "epoch": 0.2619565217391304, "grad_norm": 5.783678413914947, "learning_rate": 8.654789227296478e-07, "loss": 0.3188, "step": 1446 }, { "epoch": 0.26213768115942027, "grad_norm": 4.678612259950098, "learning_rate": 8.652786456019186e-07, "loss": 0.2831, "step": 1447 }, { "epoch": 0.26231884057971017, "grad_norm": 7.487509081249481, "learning_rate": 8.65078242707021e-07, "loss": 0.3231, "step": 1448 }, { "epoch": 0.2625, "grad_norm": 5.755727898886248, "learning_rate": 8.648777141139547e-07, "loss": 0.4016, "step": 1449 }, { "epoch": 0.26268115942028986, "grad_norm": 4.898748510106668, "learning_rate": 8.646770598917625e-07, "loss": 0.2902, "step": 1450 }, { "epoch": 0.2628623188405797, "grad_norm": 5.939272544142747, "learning_rate": 8.644762801095307e-07, "loss": 0.3433, "step": 1451 }, { "epoch": 0.26304347826086955, "grad_norm": 4.626659024000853, "learning_rate": 8.642753748363888e-07, "loss": 0.2995, "step": 1452 }, { "epoch": 0.26322463768115945, "grad_norm": 4.784240922151674, "learning_rate": 8.640743441415094e-07, "loss": 0.3231, "step": 1453 }, { "epoch": 0.2634057971014493, "grad_norm": 3.2423440213722174, "learning_rate": 8.638731880941082e-07, "loss": 0.3152, "step": 1454 }, { "epoch": 0.26358695652173914, "grad_norm": 6.95278933074151, "learning_rate": 8.636719067634443e-07, "loss": 0.3665, "step": 1455 }, { "epoch": 0.263768115942029, "grad_norm": 9.038201053118094, "learning_rate": 8.634705002188198e-07, "loss": 0.3295, "step": 1456 }, { "epoch": 0.2639492753623188, "grad_norm": 3.421819507641361, "learning_rate": 8.6326896852958e-07, "loss": 0.3324, "step": 1457 }, { "epoch": 0.26413043478260867, "grad_norm": 5.2937695358451915, "learning_rate": 8.63067311765113e-07, "loss": 0.3452, "step": 1458 }, { "epoch": 0.26431159420289857, "grad_norm": 8.100207003002685, "learning_rate": 8.628655299948503e-07, "loss": 0.3225, "step": 1459 }, { "epoch": 0.2644927536231884, "grad_norm": 7.159499700433832, "learning_rate": 8.626636232882664e-07, "loss": 0.3453, "step": 1460 }, { "epoch": 0.26467391304347826, "grad_norm": 9.163800296896483, "learning_rate": 8.624615917148787e-07, "loss": 0.4613, "step": 1461 }, { "epoch": 0.2648550724637681, "grad_norm": 5.506078709741328, "learning_rate": 8.622594353442474e-07, "loss": 0.3197, "step": 1462 }, { "epoch": 0.26503623188405795, "grad_norm": 3.6101260134565663, "learning_rate": 8.620571542459762e-07, "loss": 0.296, "step": 1463 }, { "epoch": 0.26521739130434785, "grad_norm": 4.552938679140964, "learning_rate": 8.618547484897114e-07, "loss": 0.316, "step": 1464 }, { "epoch": 0.2653985507246377, "grad_norm": 9.398824399031948, "learning_rate": 8.616522181451422e-07, "loss": 0.3369, "step": 1465 }, { "epoch": 0.26557971014492754, "grad_norm": 7.186287709111952, "learning_rate": 8.614495632820007e-07, "loss": 0.3726, "step": 1466 }, { "epoch": 0.2657608695652174, "grad_norm": 6.8184139732106654, "learning_rate": 8.61246783970062e-07, "loss": 0.2958, "step": 1467 }, { "epoch": 0.26594202898550723, "grad_norm": 8.371268031885386, "learning_rate": 8.61043880279144e-07, "loss": 0.3099, "step": 1468 }, { "epoch": 0.26612318840579713, "grad_norm": 4.732164096957703, "learning_rate": 8.608408522791071e-07, "loss": 0.3632, "step": 1469 }, { "epoch": 0.266304347826087, "grad_norm": 4.443176508975629, "learning_rate": 8.606377000398553e-07, "loss": 0.374, "step": 1470 }, { "epoch": 0.2664855072463768, "grad_norm": 3.681126511164227, "learning_rate": 8.604344236313345e-07, "loss": 0.3141, "step": 1471 }, { "epoch": 0.26666666666666666, "grad_norm": 4.274297686313551, "learning_rate": 8.602310231235342e-07, "loss": 0.2937, "step": 1472 }, { "epoch": 0.2668478260869565, "grad_norm": 4.3333896107038035, "learning_rate": 8.600274985864855e-07, "loss": 0.3642, "step": 1473 }, { "epoch": 0.26702898550724635, "grad_norm": 4.320485459535167, "learning_rate": 8.598238500902632e-07, "loss": 0.346, "step": 1474 }, { "epoch": 0.26721014492753625, "grad_norm": 4.591650893445888, "learning_rate": 8.596200777049845e-07, "loss": 0.26, "step": 1475 }, { "epoch": 0.2673913043478261, "grad_norm": 4.558259956964872, "learning_rate": 8.594161815008092e-07, "loss": 0.368, "step": 1476 }, { "epoch": 0.26757246376811594, "grad_norm": 5.87948073014778, "learning_rate": 8.592121615479397e-07, "loss": 0.3641, "step": 1477 }, { "epoch": 0.2677536231884058, "grad_norm": 3.9598472438298113, "learning_rate": 8.590080179166209e-07, "loss": 0.326, "step": 1478 }, { "epoch": 0.26793478260869563, "grad_norm": 3.5104250005481608, "learning_rate": 8.588037506771404e-07, "loss": 0.2586, "step": 1479 }, { "epoch": 0.26811594202898553, "grad_norm": 7.862466418120342, "learning_rate": 8.585993598998286e-07, "loss": 0.2453, "step": 1480 }, { "epoch": 0.2682971014492754, "grad_norm": 4.068702686785692, "learning_rate": 8.583948456550583e-07, "loss": 0.3664, "step": 1481 }, { "epoch": 0.2684782608695652, "grad_norm": 3.0069357984330343, "learning_rate": 8.581902080132442e-07, "loss": 0.2846, "step": 1482 }, { "epoch": 0.26865942028985507, "grad_norm": 10.261968846160247, "learning_rate": 8.579854470448446e-07, "loss": 0.3463, "step": 1483 }, { "epoch": 0.2688405797101449, "grad_norm": 4.023365986941748, "learning_rate": 8.577805628203592e-07, "loss": 0.2749, "step": 1484 }, { "epoch": 0.26902173913043476, "grad_norm": 6.422016011862551, "learning_rate": 8.57575555410331e-07, "loss": 0.3655, "step": 1485 }, { "epoch": 0.26920289855072466, "grad_norm": 3.630918413404799, "learning_rate": 8.573704248853447e-07, "loss": 0.3401, "step": 1486 }, { "epoch": 0.2693840579710145, "grad_norm": 6.10577633680755, "learning_rate": 8.57165171316028e-07, "loss": 0.3, "step": 1487 }, { "epoch": 0.26956521739130435, "grad_norm": 2.8549903913483243, "learning_rate": 8.569597947730505e-07, "loss": 0.2357, "step": 1488 }, { "epoch": 0.2697463768115942, "grad_norm": 5.195580665775834, "learning_rate": 8.567542953271241e-07, "loss": 0.3193, "step": 1489 }, { "epoch": 0.26992753623188404, "grad_norm": 4.181596565359129, "learning_rate": 8.565486730490037e-07, "loss": 0.3762, "step": 1490 }, { "epoch": 0.27010869565217394, "grad_norm": 3.913500666841408, "learning_rate": 8.563429280094859e-07, "loss": 0.3608, "step": 1491 }, { "epoch": 0.2702898550724638, "grad_norm": 3.767915834614901, "learning_rate": 8.561370602794095e-07, "loss": 0.3436, "step": 1492 }, { "epoch": 0.2704710144927536, "grad_norm": 3.9869985309541494, "learning_rate": 8.559310699296558e-07, "loss": 0.3287, "step": 1493 }, { "epoch": 0.27065217391304347, "grad_norm": 4.448264441443902, "learning_rate": 8.557249570311482e-07, "loss": 0.3796, "step": 1494 }, { "epoch": 0.2708333333333333, "grad_norm": 4.664663939809492, "learning_rate": 8.555187216548528e-07, "loss": 0.3147, "step": 1495 }, { "epoch": 0.2710144927536232, "grad_norm": 3.873535547826977, "learning_rate": 8.553123638717766e-07, "loss": 0.3565, "step": 1496 }, { "epoch": 0.27119565217391306, "grad_norm": 3.481105199991194, "learning_rate": 8.551058837529702e-07, "loss": 0.2922, "step": 1497 }, { "epoch": 0.2713768115942029, "grad_norm": 3.618053192370526, "learning_rate": 8.548992813695255e-07, "loss": 0.3515, "step": 1498 }, { "epoch": 0.27155797101449275, "grad_norm": 3.9211018799211153, "learning_rate": 8.546925567925767e-07, "loss": 0.3641, "step": 1499 }, { "epoch": 0.2717391304347826, "grad_norm": 6.608175380270077, "learning_rate": 8.544857100933e-07, "loss": 0.3319, "step": 1500 }, { "epoch": 0.2717391304347826, "eval_loss": 0.3291953206062317, "eval_runtime": 9.8043, "eval_samples_per_second": 50.998, "eval_steps_per_second": 0.102, "step": 1500 }, { "epoch": 0.27192028985507244, "grad_norm": 7.94603320231093, "learning_rate": 8.542787413429138e-07, "loss": 0.3598, "step": 1501 }, { "epoch": 0.27210144927536234, "grad_norm": 5.200603219014856, "learning_rate": 8.540716506126783e-07, "loss": 0.3063, "step": 1502 }, { "epoch": 0.2722826086956522, "grad_norm": 8.450787840873383, "learning_rate": 8.538644379738958e-07, "loss": 0.3376, "step": 1503 }, { "epoch": 0.27246376811594203, "grad_norm": 4.554374448992296, "learning_rate": 8.536571034979108e-07, "loss": 0.3246, "step": 1504 }, { "epoch": 0.2726449275362319, "grad_norm": 3.4097748206696927, "learning_rate": 8.534496472561093e-07, "loss": 0.3821, "step": 1505 }, { "epoch": 0.2728260869565217, "grad_norm": 3.9630753217047574, "learning_rate": 8.532420693199194e-07, "loss": 0.3277, "step": 1506 }, { "epoch": 0.2730072463768116, "grad_norm": 3.1081510117355036, "learning_rate": 8.530343697608116e-07, "loss": 0.337, "step": 1507 }, { "epoch": 0.27318840579710146, "grad_norm": 4.175615263612925, "learning_rate": 8.528265486502974e-07, "loss": 0.3282, "step": 1508 }, { "epoch": 0.2733695652173913, "grad_norm": 5.250703913777366, "learning_rate": 8.52618606059931e-07, "loss": 0.3531, "step": 1509 }, { "epoch": 0.27355072463768115, "grad_norm": 3.339386059847261, "learning_rate": 8.524105420613077e-07, "loss": 0.3207, "step": 1510 }, { "epoch": 0.273731884057971, "grad_norm": 3.600122407000914, "learning_rate": 8.52202356726065e-07, "loss": 0.2744, "step": 1511 }, { "epoch": 0.27391304347826084, "grad_norm": 5.473886600729019, "learning_rate": 8.51994050125882e-07, "loss": 0.3938, "step": 1512 }, { "epoch": 0.27409420289855074, "grad_norm": 5.937355561170181, "learning_rate": 8.5178562233248e-07, "loss": 0.3317, "step": 1513 }, { "epoch": 0.2742753623188406, "grad_norm": 4.362263198446328, "learning_rate": 8.515770734176211e-07, "loss": 0.3368, "step": 1514 }, { "epoch": 0.27445652173913043, "grad_norm": 3.8500946795265607, "learning_rate": 8.513684034531104e-07, "loss": 0.3235, "step": 1515 }, { "epoch": 0.2746376811594203, "grad_norm": 11.298926079489457, "learning_rate": 8.511596125107932e-07, "loss": 0.3558, "step": 1516 }, { "epoch": 0.2748188405797101, "grad_norm": 4.827466966068262, "learning_rate": 8.509507006625578e-07, "loss": 0.338, "step": 1517 }, { "epoch": 0.275, "grad_norm": 6.087367129538768, "learning_rate": 8.507416679803332e-07, "loss": 0.3025, "step": 1518 }, { "epoch": 0.27518115942028987, "grad_norm": 3.9381710819720124, "learning_rate": 8.505325145360907e-07, "loss": 0.3145, "step": 1519 }, { "epoch": 0.2753623188405797, "grad_norm": 5.254791838868577, "learning_rate": 8.503232404018423e-07, "loss": 0.3689, "step": 1520 }, { "epoch": 0.27554347826086956, "grad_norm": 3.7853871113110213, "learning_rate": 8.501138456496426e-07, "loss": 0.3392, "step": 1521 }, { "epoch": 0.2757246376811594, "grad_norm": 6.789476258260733, "learning_rate": 8.499043303515867e-07, "loss": 0.303, "step": 1522 }, { "epoch": 0.2759057971014493, "grad_norm": 7.540076645851566, "learning_rate": 8.496946945798123e-07, "loss": 0.2841, "step": 1523 }, { "epoch": 0.27608695652173915, "grad_norm": 3.7575288732410073, "learning_rate": 8.494849384064973e-07, "loss": 0.3352, "step": 1524 }, { "epoch": 0.276268115942029, "grad_norm": 3.852559693797952, "learning_rate": 8.492750619038624e-07, "loss": 0.3135, "step": 1525 }, { "epoch": 0.27644927536231884, "grad_norm": 5.145294947713052, "learning_rate": 8.490650651441688e-07, "loss": 0.3342, "step": 1526 }, { "epoch": 0.2766304347826087, "grad_norm": 4.2364768313691625, "learning_rate": 8.488549481997191e-07, "loss": 0.3107, "step": 1527 }, { "epoch": 0.2768115942028985, "grad_norm": 6.861576809365711, "learning_rate": 8.48644711142858e-07, "loss": 0.3089, "step": 1528 }, { "epoch": 0.2769927536231884, "grad_norm": 3.22061685004266, "learning_rate": 8.484343540459711e-07, "loss": 0.2904, "step": 1529 }, { "epoch": 0.27717391304347827, "grad_norm": 5.229224561068187, "learning_rate": 8.48223876981485e-07, "loss": 0.2915, "step": 1530 }, { "epoch": 0.2773550724637681, "grad_norm": 3.686300755968237, "learning_rate": 8.480132800218681e-07, "loss": 0.3597, "step": 1531 }, { "epoch": 0.27753623188405796, "grad_norm": 5.085514079163521, "learning_rate": 8.478025632396301e-07, "loss": 0.3779, "step": 1532 }, { "epoch": 0.2777173913043478, "grad_norm": 7.165670049387968, "learning_rate": 8.475917267073215e-07, "loss": 0.3167, "step": 1533 }, { "epoch": 0.2778985507246377, "grad_norm": 5.1829820708715415, "learning_rate": 8.473807704975346e-07, "loss": 0.3327, "step": 1534 }, { "epoch": 0.27807971014492755, "grad_norm": 3.88074841558038, "learning_rate": 8.471696946829024e-07, "loss": 0.3427, "step": 1535 }, { "epoch": 0.2782608695652174, "grad_norm": 5.907455533941769, "learning_rate": 8.469584993360994e-07, "loss": 0.3848, "step": 1536 }, { "epoch": 0.27844202898550724, "grad_norm": 7.617246279236586, "learning_rate": 8.467471845298413e-07, "loss": 0.3052, "step": 1537 }, { "epoch": 0.2786231884057971, "grad_norm": 5.092561470231212, "learning_rate": 8.465357503368845e-07, "loss": 0.322, "step": 1538 }, { "epoch": 0.27880434782608693, "grad_norm": 6.562172416925117, "learning_rate": 8.46324196830027e-07, "loss": 0.3582, "step": 1539 }, { "epoch": 0.27898550724637683, "grad_norm": 4.394735908966694, "learning_rate": 8.461125240821076e-07, "loss": 0.3876, "step": 1540 }, { "epoch": 0.2791666666666667, "grad_norm": 12.079006453677533, "learning_rate": 8.459007321660061e-07, "loss": 0.364, "step": 1541 }, { "epoch": 0.2793478260869565, "grad_norm": 9.01054595163063, "learning_rate": 8.456888211546438e-07, "loss": 0.3115, "step": 1542 }, { "epoch": 0.27952898550724636, "grad_norm": 3.529579067834169, "learning_rate": 8.454767911209824e-07, "loss": 0.3229, "step": 1543 }, { "epoch": 0.2797101449275362, "grad_norm": 3.949927686450316, "learning_rate": 8.452646421380249e-07, "loss": 0.3369, "step": 1544 }, { "epoch": 0.2798913043478261, "grad_norm": 4.419598461268908, "learning_rate": 8.450523742788153e-07, "loss": 0.3423, "step": 1545 }, { "epoch": 0.28007246376811595, "grad_norm": 3.424894427169211, "learning_rate": 8.448399876164382e-07, "loss": 0.2834, "step": 1546 }, { "epoch": 0.2802536231884058, "grad_norm": 7.784078738924731, "learning_rate": 8.446274822240196e-07, "loss": 0.342, "step": 1547 }, { "epoch": 0.28043478260869564, "grad_norm": 3.4297566168299993, "learning_rate": 8.444148581747259e-07, "loss": 0.3533, "step": 1548 }, { "epoch": 0.2806159420289855, "grad_norm": 3.4617069123342343, "learning_rate": 8.442021155417647e-07, "loss": 0.2811, "step": 1549 }, { "epoch": 0.2807971014492754, "grad_norm": 3.6861467071640037, "learning_rate": 8.439892543983844e-07, "loss": 0.3497, "step": 1550 }, { "epoch": 0.28097826086956523, "grad_norm": 3.202756169770153, "learning_rate": 8.437762748178738e-07, "loss": 0.2899, "step": 1551 }, { "epoch": 0.2811594202898551, "grad_norm": 3.436221346362621, "learning_rate": 8.43563176873563e-07, "loss": 0.347, "step": 1552 }, { "epoch": 0.2813405797101449, "grad_norm": 4.833697246701201, "learning_rate": 8.433499606388224e-07, "loss": 0.3221, "step": 1553 }, { "epoch": 0.28152173913043477, "grad_norm": 4.719470145881154, "learning_rate": 8.431366261870637e-07, "loss": 0.3654, "step": 1554 }, { "epoch": 0.2817028985507246, "grad_norm": 4.001786646781353, "learning_rate": 8.429231735917387e-07, "loss": 0.3574, "step": 1555 }, { "epoch": 0.2818840579710145, "grad_norm": 4.539308156263115, "learning_rate": 8.427096029263403e-07, "loss": 0.3565, "step": 1556 }, { "epoch": 0.28206521739130436, "grad_norm": 5.301696453308421, "learning_rate": 8.424959142644017e-07, "loss": 0.2933, "step": 1557 }, { "epoch": 0.2822463768115942, "grad_norm": 8.737154332858287, "learning_rate": 8.422821076794971e-07, "loss": 0.3705, "step": 1558 }, { "epoch": 0.28242753623188405, "grad_norm": 9.03406094631612, "learning_rate": 8.420681832452411e-07, "loss": 0.3431, "step": 1559 }, { "epoch": 0.2826086956521739, "grad_norm": 3.494253860898528, "learning_rate": 8.418541410352888e-07, "loss": 0.3123, "step": 1560 }, { "epoch": 0.2827898550724638, "grad_norm": 4.224727291054771, "learning_rate": 8.416399811233361e-07, "loss": 0.248, "step": 1561 }, { "epoch": 0.28297101449275364, "grad_norm": 4.412263190343274, "learning_rate": 8.41425703583119e-07, "loss": 0.3516, "step": 1562 }, { "epoch": 0.2831521739130435, "grad_norm": 5.191574023375802, "learning_rate": 8.412113084884146e-07, "loss": 0.3244, "step": 1563 }, { "epoch": 0.2833333333333333, "grad_norm": 5.575472293051576, "learning_rate": 8.4099679591304e-07, "loss": 0.3617, "step": 1564 }, { "epoch": 0.28351449275362317, "grad_norm": 4.455283192473403, "learning_rate": 8.407821659308528e-07, "loss": 0.3508, "step": 1565 }, { "epoch": 0.28369565217391307, "grad_norm": 4.303689466574255, "learning_rate": 8.405674186157511e-07, "loss": 0.3636, "step": 1566 }, { "epoch": 0.2838768115942029, "grad_norm": 4.309155995585994, "learning_rate": 8.403525540416738e-07, "loss": 0.3191, "step": 1567 }, { "epoch": 0.28405797101449276, "grad_norm": 3.152108543112069, "learning_rate": 8.401375722825995e-07, "loss": 0.2996, "step": 1568 }, { "epoch": 0.2842391304347826, "grad_norm": 5.603000767800198, "learning_rate": 8.399224734125473e-07, "loss": 0.2968, "step": 1569 }, { "epoch": 0.28442028985507245, "grad_norm": 3.958979161088896, "learning_rate": 8.397072575055771e-07, "loss": 0.3631, "step": 1570 }, { "epoch": 0.2846014492753623, "grad_norm": 6.141595615360145, "learning_rate": 8.394919246357883e-07, "loss": 0.3457, "step": 1571 }, { "epoch": 0.2847826086956522, "grad_norm": 4.820277840732952, "learning_rate": 8.392764748773214e-07, "loss": 0.3448, "step": 1572 }, { "epoch": 0.28496376811594204, "grad_norm": 3.9357781528625426, "learning_rate": 8.390609083043568e-07, "loss": 0.2584, "step": 1573 }, { "epoch": 0.2851449275362319, "grad_norm": 3.5161519053212777, "learning_rate": 8.388452249911149e-07, "loss": 0.3204, "step": 1574 }, { "epoch": 0.28532608695652173, "grad_norm": 7.282372463882652, "learning_rate": 8.386294250118565e-07, "loss": 0.325, "step": 1575 }, { "epoch": 0.2855072463768116, "grad_norm": 5.918162653341962, "learning_rate": 8.384135084408826e-07, "loss": 0.3262, "step": 1576 }, { "epoch": 0.2856884057971015, "grad_norm": 6.923760556303854, "learning_rate": 8.381974753525345e-07, "loss": 0.2956, "step": 1577 }, { "epoch": 0.2858695652173913, "grad_norm": 7.543933793802687, "learning_rate": 8.379813258211929e-07, "loss": 0.3173, "step": 1578 }, { "epoch": 0.28605072463768116, "grad_norm": 4.583192971574929, "learning_rate": 8.377650599212798e-07, "loss": 0.35, "step": 1579 }, { "epoch": 0.286231884057971, "grad_norm": 3.673374292807425, "learning_rate": 8.37548677727256e-07, "loss": 0.3445, "step": 1580 }, { "epoch": 0.28641304347826085, "grad_norm": 3.5752070642664178, "learning_rate": 8.373321793136232e-07, "loss": 0.2583, "step": 1581 }, { "epoch": 0.2865942028985507, "grad_norm": 9.241637813586905, "learning_rate": 8.371155647549226e-07, "loss": 0.335, "step": 1582 }, { "epoch": 0.2867753623188406, "grad_norm": 8.44960724751869, "learning_rate": 8.368988341257359e-07, "loss": 0.3398, "step": 1583 }, { "epoch": 0.28695652173913044, "grad_norm": 6.7555221206061535, "learning_rate": 8.366819875006843e-07, "loss": 0.2912, "step": 1584 }, { "epoch": 0.2871376811594203, "grad_norm": 5.143185181351389, "learning_rate": 8.364650249544291e-07, "loss": 0.2969, "step": 1585 }, { "epoch": 0.28731884057971013, "grad_norm": 3.6029792463124086, "learning_rate": 8.362479465616717e-07, "loss": 0.3134, "step": 1586 }, { "epoch": 0.2875, "grad_norm": 3.354229431447784, "learning_rate": 8.360307523971532e-07, "loss": 0.2607, "step": 1587 }, { "epoch": 0.2876811594202899, "grad_norm": 4.48795155613157, "learning_rate": 8.358134425356543e-07, "loss": 0.3406, "step": 1588 }, { "epoch": 0.2878623188405797, "grad_norm": 3.8339057208163276, "learning_rate": 8.355960170519962e-07, "loss": 0.3412, "step": 1589 }, { "epoch": 0.28804347826086957, "grad_norm": 4.661123094207706, "learning_rate": 8.353784760210392e-07, "loss": 0.3792, "step": 1590 }, { "epoch": 0.2882246376811594, "grad_norm": 12.802215822779953, "learning_rate": 8.351608195176839e-07, "loss": 0.3747, "step": 1591 }, { "epoch": 0.28840579710144926, "grad_norm": 8.422644914479166, "learning_rate": 8.349430476168704e-07, "loss": 0.431, "step": 1592 }, { "epoch": 0.28858695652173916, "grad_norm": 3.8533377070126567, "learning_rate": 8.347251603935788e-07, "loss": 0.3596, "step": 1593 }, { "epoch": 0.288768115942029, "grad_norm": 3.8344946606723975, "learning_rate": 8.345071579228282e-07, "loss": 0.3595, "step": 1594 }, { "epoch": 0.28894927536231885, "grad_norm": 5.176216822170738, "learning_rate": 8.342890402796783e-07, "loss": 0.3087, "step": 1595 }, { "epoch": 0.2891304347826087, "grad_norm": 4.634830361021857, "learning_rate": 8.340708075392281e-07, "loss": 0.2932, "step": 1596 }, { "epoch": 0.28931159420289854, "grad_norm": 2.980558253197547, "learning_rate": 8.338524597766159e-07, "loss": 0.2413, "step": 1597 }, { "epoch": 0.2894927536231884, "grad_norm": 3.331210909978132, "learning_rate": 8.336339970670198e-07, "loss": 0.3071, "step": 1598 }, { "epoch": 0.2896739130434783, "grad_norm": 5.5771896280633, "learning_rate": 8.33415419485658e-07, "loss": 0.3151, "step": 1599 }, { "epoch": 0.2898550724637681, "grad_norm": 4.64277194305046, "learning_rate": 8.331967271077874e-07, "loss": 0.3495, "step": 1600 }, { "epoch": 0.2898550724637681, "eval_loss": 0.3165625035762787, "eval_runtime": 9.7974, "eval_samples_per_second": 51.034, "eval_steps_per_second": 0.102, "step": 1600 }, { "epoch": 0.29003623188405797, "grad_norm": 6.378395618638732, "learning_rate": 8.32977920008705e-07, "loss": 0.3236, "step": 1601 }, { "epoch": 0.2902173913043478, "grad_norm": 3.305310306727567, "learning_rate": 8.327589982637469e-07, "loss": 0.3272, "step": 1602 }, { "epoch": 0.29039855072463766, "grad_norm": 3.2262682005236267, "learning_rate": 8.325399619482892e-07, "loss": 0.317, "step": 1603 }, { "epoch": 0.29057971014492756, "grad_norm": 4.148213141193882, "learning_rate": 8.32320811137747e-07, "loss": 0.3322, "step": 1604 }, { "epoch": 0.2907608695652174, "grad_norm": 10.652562906333356, "learning_rate": 8.321015459075749e-07, "loss": 0.3375, "step": 1605 }, { "epoch": 0.29094202898550725, "grad_norm": 5.3521498911331715, "learning_rate": 8.318821663332669e-07, "loss": 0.3535, "step": 1606 }, { "epoch": 0.2911231884057971, "grad_norm": 7.128114754485076, "learning_rate": 8.316626724903567e-07, "loss": 0.3358, "step": 1607 }, { "epoch": 0.29130434782608694, "grad_norm": 7.274594250395883, "learning_rate": 8.314430644544169e-07, "loss": 0.3214, "step": 1608 }, { "epoch": 0.2914855072463768, "grad_norm": 3.4403681115590103, "learning_rate": 8.312233423010595e-07, "loss": 0.3372, "step": 1609 }, { "epoch": 0.2916666666666667, "grad_norm": 6.147402016937184, "learning_rate": 8.310035061059362e-07, "loss": 0.2874, "step": 1610 }, { "epoch": 0.29184782608695653, "grad_norm": 7.155103900182853, "learning_rate": 8.307835559447371e-07, "loss": 0.3665, "step": 1611 }, { "epoch": 0.2920289855072464, "grad_norm": 7.400319149805051, "learning_rate": 8.305634918931927e-07, "loss": 0.3191, "step": 1612 }, { "epoch": 0.2922101449275362, "grad_norm": 3.997353071823464, "learning_rate": 8.303433140270717e-07, "loss": 0.3461, "step": 1613 }, { "epoch": 0.29239130434782606, "grad_norm": 9.75431790097079, "learning_rate": 8.301230224221825e-07, "loss": 0.2663, "step": 1614 }, { "epoch": 0.29257246376811596, "grad_norm": 3.448463681822264, "learning_rate": 8.299026171543723e-07, "loss": 0.3371, "step": 1615 }, { "epoch": 0.2927536231884058, "grad_norm": 10.201464537825748, "learning_rate": 8.29682098299528e-07, "loss": 0.3842, "step": 1616 }, { "epoch": 0.29293478260869565, "grad_norm": 5.533583721142776, "learning_rate": 8.294614659335754e-07, "loss": 0.3392, "step": 1617 }, { "epoch": 0.2931159420289855, "grad_norm": 7.100245978645747, "learning_rate": 8.29240720132479e-07, "loss": 0.4021, "step": 1618 }, { "epoch": 0.29329710144927534, "grad_norm": 3.29047254076525, "learning_rate": 8.290198609722425e-07, "loss": 0.3055, "step": 1619 }, { "epoch": 0.29347826086956524, "grad_norm": 12.697452764155445, "learning_rate": 8.28798888528909e-07, "loss": 0.2784, "step": 1620 }, { "epoch": 0.2936594202898551, "grad_norm": 8.577476673394132, "learning_rate": 8.285778028785604e-07, "loss": 0.2752, "step": 1621 }, { "epoch": 0.29384057971014493, "grad_norm": 8.891110236058598, "learning_rate": 8.283566040973173e-07, "loss": 0.3295, "step": 1622 }, { "epoch": 0.2940217391304348, "grad_norm": 6.278714573875438, "learning_rate": 8.281352922613397e-07, "loss": 0.3491, "step": 1623 }, { "epoch": 0.2942028985507246, "grad_norm": 3.6375562155142918, "learning_rate": 8.279138674468263e-07, "loss": 0.3343, "step": 1624 }, { "epoch": 0.29438405797101447, "grad_norm": 4.537387375519442, "learning_rate": 8.276923297300146e-07, "loss": 0.3178, "step": 1625 }, { "epoch": 0.29456521739130437, "grad_norm": 11.942034259448368, "learning_rate": 8.27470679187181e-07, "loss": 0.39, "step": 1626 }, { "epoch": 0.2947463768115942, "grad_norm": 14.695645928480674, "learning_rate": 8.272489158946412e-07, "loss": 0.3695, "step": 1627 }, { "epoch": 0.29492753623188406, "grad_norm": 11.078467592004337, "learning_rate": 8.27027039928749e-07, "loss": 0.3189, "step": 1628 }, { "epoch": 0.2951086956521739, "grad_norm": 11.260718473857697, "learning_rate": 8.268050513658976e-07, "loss": 0.3262, "step": 1629 }, { "epoch": 0.29528985507246375, "grad_norm": 4.7968953279817015, "learning_rate": 8.265829502825182e-07, "loss": 0.3589, "step": 1630 }, { "epoch": 0.29547101449275365, "grad_norm": 8.480942280628483, "learning_rate": 8.26360736755082e-07, "loss": 0.3444, "step": 1631 }, { "epoch": 0.2956521739130435, "grad_norm": 4.599040676192968, "learning_rate": 8.261384108600977e-07, "loss": 0.3277, "step": 1632 }, { "epoch": 0.29583333333333334, "grad_norm": 5.803326829663677, "learning_rate": 8.259159726741132e-07, "loss": 0.2633, "step": 1633 }, { "epoch": 0.2960144927536232, "grad_norm": 3.944907576685592, "learning_rate": 8.25693422273715e-07, "loss": 0.2755, "step": 1634 }, { "epoch": 0.296195652173913, "grad_norm": 7.6415934521684195, "learning_rate": 8.254707597355286e-07, "loss": 0.3148, "step": 1635 }, { "epoch": 0.29637681159420287, "grad_norm": 7.1530593049049855, "learning_rate": 8.252479851362176e-07, "loss": 0.3329, "step": 1636 }, { "epoch": 0.29655797101449277, "grad_norm": 7.3869089157210315, "learning_rate": 8.250250985524839e-07, "loss": 0.3277, "step": 1637 }, { "epoch": 0.2967391304347826, "grad_norm": 3.9304977458400305, "learning_rate": 8.24802100061069e-07, "loss": 0.3284, "step": 1638 }, { "epoch": 0.29692028985507246, "grad_norm": 4.319350273583407, "learning_rate": 8.245789897387521e-07, "loss": 0.2677, "step": 1639 }, { "epoch": 0.2971014492753623, "grad_norm": 5.486958289995254, "learning_rate": 8.243557676623509e-07, "loss": 0.3007, "step": 1640 }, { "epoch": 0.29728260869565215, "grad_norm": 4.093664414283139, "learning_rate": 8.241324339087224e-07, "loss": 0.2998, "step": 1641 }, { "epoch": 0.29746376811594205, "grad_norm": 4.432355806367156, "learning_rate": 8.239089885547608e-07, "loss": 0.2794, "step": 1642 }, { "epoch": 0.2976449275362319, "grad_norm": 4.1344808743219925, "learning_rate": 8.236854316774e-07, "loss": 0.3036, "step": 1643 }, { "epoch": 0.29782608695652174, "grad_norm": 6.009139098621791, "learning_rate": 8.234617633536113e-07, "loss": 0.3314, "step": 1644 }, { "epoch": 0.2980072463768116, "grad_norm": 7.239377871455703, "learning_rate": 8.232379836604048e-07, "loss": 0.4076, "step": 1645 }, { "epoch": 0.29818840579710143, "grad_norm": 4.582783674139785, "learning_rate": 8.230140926748291e-07, "loss": 0.3508, "step": 1646 }, { "epoch": 0.29836956521739133, "grad_norm": 7.506638240825678, "learning_rate": 8.227900904739709e-07, "loss": 0.3915, "step": 1647 }, { "epoch": 0.2985507246376812, "grad_norm": 10.230292626773988, "learning_rate": 8.225659771349551e-07, "loss": 0.2514, "step": 1648 }, { "epoch": 0.298731884057971, "grad_norm": 4.597719450326029, "learning_rate": 8.22341752734945e-07, "loss": 0.3474, "step": 1649 }, { "epoch": 0.29891304347826086, "grad_norm": 7.992863045230097, "learning_rate": 8.221174173511421e-07, "loss": 0.3326, "step": 1650 }, { "epoch": 0.2990942028985507, "grad_norm": 3.6998288338167984, "learning_rate": 8.218929710607863e-07, "loss": 0.3412, "step": 1651 }, { "epoch": 0.29927536231884055, "grad_norm": 9.78677207193033, "learning_rate": 8.216684139411551e-07, "loss": 0.3571, "step": 1652 }, { "epoch": 0.29945652173913045, "grad_norm": 4.754946772389717, "learning_rate": 8.214437460695651e-07, "loss": 0.3102, "step": 1653 }, { "epoch": 0.2996376811594203, "grad_norm": 3.6399284203865063, "learning_rate": 8.212189675233703e-07, "loss": 0.3074, "step": 1654 }, { "epoch": 0.29981884057971014, "grad_norm": 7.290067679634445, "learning_rate": 8.209940783799632e-07, "loss": 0.3187, "step": 1655 }, { "epoch": 0.3, "grad_norm": 4.814851215421622, "learning_rate": 8.207690787167736e-07, "loss": 0.2526, "step": 1656 }, { "epoch": 0.30018115942028983, "grad_norm": 4.5703032648245925, "learning_rate": 8.205439686112706e-07, "loss": 0.3178, "step": 1657 }, { "epoch": 0.30036231884057973, "grad_norm": 3.483001123055395, "learning_rate": 8.203187481409604e-07, "loss": 0.3109, "step": 1658 }, { "epoch": 0.3005434782608696, "grad_norm": 6.539740426528362, "learning_rate": 8.200934173833876e-07, "loss": 0.3038, "step": 1659 }, { "epoch": 0.3007246376811594, "grad_norm": 4.035415377972808, "learning_rate": 8.198679764161344e-07, "loss": 0.2564, "step": 1660 }, { "epoch": 0.30090579710144927, "grad_norm": 11.928054378232504, "learning_rate": 8.196424253168215e-07, "loss": 0.2992, "step": 1661 }, { "epoch": 0.3010869565217391, "grad_norm": 4.761747541137394, "learning_rate": 8.19416764163107e-07, "loss": 0.2985, "step": 1662 }, { "epoch": 0.30126811594202896, "grad_norm": 7.882641673254881, "learning_rate": 8.191909930326873e-07, "loss": 0.2681, "step": 1663 }, { "epoch": 0.30144927536231886, "grad_norm": 5.16498571414151, "learning_rate": 8.189651120032961e-07, "loss": 0.2982, "step": 1664 }, { "epoch": 0.3016304347826087, "grad_norm": 3.791597801034975, "learning_rate": 8.187391211527057e-07, "loss": 0.2672, "step": 1665 }, { "epoch": 0.30181159420289855, "grad_norm": 4.391914652669149, "learning_rate": 8.185130205587256e-07, "loss": 0.3719, "step": 1666 }, { "epoch": 0.3019927536231884, "grad_norm": 3.825751565978178, "learning_rate": 8.182868102992034e-07, "loss": 0.3201, "step": 1667 }, { "epoch": 0.30217391304347824, "grad_norm": 3.8365459038452423, "learning_rate": 8.180604904520243e-07, "loss": 0.3198, "step": 1668 }, { "epoch": 0.30235507246376814, "grad_norm": 9.954952894096023, "learning_rate": 8.178340610951113e-07, "loss": 0.3356, "step": 1669 }, { "epoch": 0.302536231884058, "grad_norm": 4.237608552840032, "learning_rate": 8.176075223064254e-07, "loss": 0.3057, "step": 1670 }, { "epoch": 0.3027173913043478, "grad_norm": 4.937242080309297, "learning_rate": 8.173808741639645e-07, "loss": 0.3831, "step": 1671 }, { "epoch": 0.30289855072463767, "grad_norm": 8.638907774574463, "learning_rate": 8.171541167457648e-07, "loss": 0.3585, "step": 1672 }, { "epoch": 0.3030797101449275, "grad_norm": 4.9368554415733295, "learning_rate": 8.169272501299e-07, "loss": 0.2738, "step": 1673 }, { "epoch": 0.3032608695652174, "grad_norm": 3.105638821588064, "learning_rate": 8.167002743944815e-07, "loss": 0.3069, "step": 1674 }, { "epoch": 0.30344202898550726, "grad_norm": 6.184566997979668, "learning_rate": 8.16473189617658e-07, "loss": 0.3126, "step": 1675 }, { "epoch": 0.3036231884057971, "grad_norm": 8.577425247463461, "learning_rate": 8.162459958776157e-07, "loss": 0.3297, "step": 1676 }, { "epoch": 0.30380434782608695, "grad_norm": 6.075670646524077, "learning_rate": 8.160186932525786e-07, "loss": 0.2979, "step": 1677 }, { "epoch": 0.3039855072463768, "grad_norm": 8.17065404475752, "learning_rate": 8.157912818208082e-07, "loss": 0.2567, "step": 1678 }, { "epoch": 0.30416666666666664, "grad_norm": 12.172849250228765, "learning_rate": 8.15563761660603e-07, "loss": 0.3384, "step": 1679 }, { "epoch": 0.30434782608695654, "grad_norm": 11.514993580298441, "learning_rate": 8.153361328502997e-07, "loss": 0.4114, "step": 1680 }, { "epoch": 0.3045289855072464, "grad_norm": 4.422004935489591, "learning_rate": 8.151083954682716e-07, "loss": 0.3861, "step": 1681 }, { "epoch": 0.30471014492753623, "grad_norm": 3.0950820256059632, "learning_rate": 8.148805495929301e-07, "loss": 0.3081, "step": 1682 }, { "epoch": 0.3048913043478261, "grad_norm": 4.2933449575939955, "learning_rate": 8.146525953027234e-07, "loss": 0.3385, "step": 1683 }, { "epoch": 0.3050724637681159, "grad_norm": 6.563174574341766, "learning_rate": 8.144245326761372e-07, "loss": 0.3494, "step": 1684 }, { "epoch": 0.3052536231884058, "grad_norm": 14.968938020714559, "learning_rate": 8.141963617916947e-07, "loss": 0.3132, "step": 1685 }, { "epoch": 0.30543478260869567, "grad_norm": 8.119941435405739, "learning_rate": 8.139680827279561e-07, "loss": 0.302, "step": 1686 }, { "epoch": 0.3056159420289855, "grad_norm": 7.514167445732151, "learning_rate": 8.13739695563519e-07, "loss": 0.2958, "step": 1687 }, { "epoch": 0.30579710144927535, "grad_norm": 4.8600159321965455, "learning_rate": 8.135112003770183e-07, "loss": 0.3024, "step": 1688 }, { "epoch": 0.3059782608695652, "grad_norm": 4.362134270632265, "learning_rate": 8.132825972471257e-07, "loss": 0.3058, "step": 1689 }, { "epoch": 0.3061594202898551, "grad_norm": 11.88542025847849, "learning_rate": 8.130538862525507e-07, "loss": 0.3456, "step": 1690 }, { "epoch": 0.30634057971014494, "grad_norm": 7.380431182586247, "learning_rate": 8.128250674720391e-07, "loss": 0.3011, "step": 1691 }, { "epoch": 0.3065217391304348, "grad_norm": 4.484294932241263, "learning_rate": 8.125961409843747e-07, "loss": 0.3658, "step": 1692 }, { "epoch": 0.30670289855072463, "grad_norm": 3.320116234268537, "learning_rate": 8.123671068683779e-07, "loss": 0.3053, "step": 1693 }, { "epoch": 0.3068840579710145, "grad_norm": 6.242270244530482, "learning_rate": 8.121379652029062e-07, "loss": 0.289, "step": 1694 }, { "epoch": 0.3070652173913043, "grad_norm": 8.685104064895292, "learning_rate": 8.119087160668541e-07, "loss": 0.3122, "step": 1695 }, { "epoch": 0.3072463768115942, "grad_norm": 5.625449806615163, "learning_rate": 8.116793595391531e-07, "loss": 0.3004, "step": 1696 }, { "epoch": 0.30742753623188407, "grad_norm": 4.83672194009164, "learning_rate": 8.11449895698772e-07, "loss": 0.3246, "step": 1697 }, { "epoch": 0.3076086956521739, "grad_norm": 4.951062129123521, "learning_rate": 8.11220324624716e-07, "loss": 0.3808, "step": 1698 }, { "epoch": 0.30778985507246376, "grad_norm": 7.803992820412519, "learning_rate": 8.109906463960278e-07, "loss": 0.3244, "step": 1699 }, { "epoch": 0.3079710144927536, "grad_norm": 6.018782723368235, "learning_rate": 8.107608610917864e-07, "loss": 0.4355, "step": 1700 }, { "epoch": 0.3079710144927536, "eval_loss": 0.3173125088214874, "eval_runtime": 9.8026, "eval_samples_per_second": 51.007, "eval_steps_per_second": 0.102, "step": 1700 }, { "epoch": 0.3081521739130435, "grad_norm": 5.492281426920849, "learning_rate": 8.105309687911081e-07, "loss": 0.3196, "step": 1701 }, { "epoch": 0.30833333333333335, "grad_norm": 5.719478406761022, "learning_rate": 8.10300969573146e-07, "loss": 0.3062, "step": 1702 }, { "epoch": 0.3085144927536232, "grad_norm": 7.930820493815263, "learning_rate": 8.100708635170899e-07, "loss": 0.282, "step": 1703 }, { "epoch": 0.30869565217391304, "grad_norm": 11.007865923193712, "learning_rate": 8.098406507021662e-07, "loss": 0.3151, "step": 1704 }, { "epoch": 0.3088768115942029, "grad_norm": 10.859344678807489, "learning_rate": 8.096103312076385e-07, "loss": 0.3986, "step": 1705 }, { "epoch": 0.3090579710144927, "grad_norm": 3.776643484700144, "learning_rate": 8.093799051128068e-07, "loss": 0.3228, "step": 1706 }, { "epoch": 0.3092391304347826, "grad_norm": 5.930200032897158, "learning_rate": 8.091493724970078e-07, "loss": 0.3097, "step": 1707 }, { "epoch": 0.3094202898550725, "grad_norm": 7.995811284308179, "learning_rate": 8.089187334396152e-07, "loss": 0.2922, "step": 1708 }, { "epoch": 0.3096014492753623, "grad_norm": 9.038408354147903, "learning_rate": 8.086879880200389e-07, "loss": 0.321, "step": 1709 }, { "epoch": 0.30978260869565216, "grad_norm": 5.328408142403583, "learning_rate": 8.084571363177257e-07, "loss": 0.319, "step": 1710 }, { "epoch": 0.309963768115942, "grad_norm": 3.5597320281458615, "learning_rate": 8.082261784121591e-07, "loss": 0.2508, "step": 1711 }, { "epoch": 0.3101449275362319, "grad_norm": 6.336031122568708, "learning_rate": 8.079951143828587e-07, "loss": 0.2932, "step": 1712 }, { "epoch": 0.31032608695652175, "grad_norm": 4.919364404287858, "learning_rate": 8.077639443093813e-07, "loss": 0.2613, "step": 1713 }, { "epoch": 0.3105072463768116, "grad_norm": 4.1022156965988765, "learning_rate": 8.075326682713195e-07, "loss": 0.3666, "step": 1714 }, { "epoch": 0.31068840579710144, "grad_norm": 3.8694318989791263, "learning_rate": 8.07301286348303e-07, "loss": 0.3367, "step": 1715 }, { "epoch": 0.3108695652173913, "grad_norm": 7.122481586009249, "learning_rate": 8.070697986199975e-07, "loss": 0.3001, "step": 1716 }, { "epoch": 0.3110507246376812, "grad_norm": 3.3830720501581975, "learning_rate": 8.068382051661054e-07, "loss": 0.3013, "step": 1717 }, { "epoch": 0.31123188405797103, "grad_norm": 10.629590540589108, "learning_rate": 8.066065060663655e-07, "loss": 0.3384, "step": 1718 }, { "epoch": 0.3114130434782609, "grad_norm": 3.894935965033015, "learning_rate": 8.063747014005528e-07, "loss": 0.3307, "step": 1719 }, { "epoch": 0.3115942028985507, "grad_norm": 8.494647196669545, "learning_rate": 8.061427912484787e-07, "loss": 0.3248, "step": 1720 }, { "epoch": 0.31177536231884057, "grad_norm": 7.12551551259516, "learning_rate": 8.059107756899912e-07, "loss": 0.3502, "step": 1721 }, { "epoch": 0.3119565217391304, "grad_norm": 5.32475208946155, "learning_rate": 8.056786548049741e-07, "loss": 0.2877, "step": 1722 }, { "epoch": 0.3121376811594203, "grad_norm": 6.967031117789461, "learning_rate": 8.054464286733478e-07, "loss": 0.3152, "step": 1723 }, { "epoch": 0.31231884057971016, "grad_norm": 7.555447942790169, "learning_rate": 8.05214097375069e-07, "loss": 0.3465, "step": 1724 }, { "epoch": 0.3125, "grad_norm": 4.070719397505282, "learning_rate": 8.049816609901303e-07, "loss": 0.3354, "step": 1725 }, { "epoch": 0.31268115942028984, "grad_norm": 8.384575921616161, "learning_rate": 8.047491195985605e-07, "loss": 0.2967, "step": 1726 }, { "epoch": 0.3128623188405797, "grad_norm": 4.611983231322097, "learning_rate": 8.04516473280425e-07, "loss": 0.3364, "step": 1727 }, { "epoch": 0.3130434782608696, "grad_norm": 5.853952889529148, "learning_rate": 8.04283722115825e-07, "loss": 0.3113, "step": 1728 }, { "epoch": 0.31322463768115943, "grad_norm": 5.027119828842899, "learning_rate": 8.040508661848977e-07, "loss": 0.3289, "step": 1729 }, { "epoch": 0.3134057971014493, "grad_norm": 3.270780677236944, "learning_rate": 8.038179055678165e-07, "loss": 0.3124, "step": 1730 }, { "epoch": 0.3135869565217391, "grad_norm": 4.127668886158838, "learning_rate": 8.035848403447909e-07, "loss": 0.3549, "step": 1731 }, { "epoch": 0.31376811594202897, "grad_norm": 4.087005380805331, "learning_rate": 8.033516705960663e-07, "loss": 0.3385, "step": 1732 }, { "epoch": 0.3139492753623188, "grad_norm": 3.327602869466973, "learning_rate": 8.031183964019243e-07, "loss": 0.3244, "step": 1733 }, { "epoch": 0.3141304347826087, "grad_norm": 10.313271954335411, "learning_rate": 8.028850178426822e-07, "loss": 0.3574, "step": 1734 }, { "epoch": 0.31431159420289856, "grad_norm": 5.520779553007381, "learning_rate": 8.026515349986935e-07, "loss": 0.3619, "step": 1735 }, { "epoch": 0.3144927536231884, "grad_norm": 3.953922544297295, "learning_rate": 8.02417947950347e-07, "loss": 0.3014, "step": 1736 }, { "epoch": 0.31467391304347825, "grad_norm": 6.692317222783169, "learning_rate": 8.021842567780684e-07, "loss": 0.3732, "step": 1737 }, { "epoch": 0.3148550724637681, "grad_norm": 8.65083240899913, "learning_rate": 8.019504615623183e-07, "loss": 0.3563, "step": 1738 }, { "epoch": 0.315036231884058, "grad_norm": 10.892243907432828, "learning_rate": 8.017165623835935e-07, "loss": 0.3206, "step": 1739 }, { "epoch": 0.31521739130434784, "grad_norm": 6.3045245615993535, "learning_rate": 8.014825593224268e-07, "loss": 0.2876, "step": 1740 }, { "epoch": 0.3153985507246377, "grad_norm": 3.8197231707198473, "learning_rate": 8.012484524593866e-07, "loss": 0.3541, "step": 1741 }, { "epoch": 0.3155797101449275, "grad_norm": 5.755044329091152, "learning_rate": 8.010142418750768e-07, "loss": 0.3152, "step": 1742 }, { "epoch": 0.3157608695652174, "grad_norm": 4.137010640554019, "learning_rate": 8.007799276501372e-07, "loss": 0.311, "step": 1743 }, { "epoch": 0.3159420289855073, "grad_norm": 5.767085388663738, "learning_rate": 8.005455098652435e-07, "loss": 0.3432, "step": 1744 }, { "epoch": 0.3161231884057971, "grad_norm": 12.840668638797792, "learning_rate": 8.003109886011066e-07, "loss": 0.413, "step": 1745 }, { "epoch": 0.31630434782608696, "grad_norm": 9.072168907881776, "learning_rate": 8.000763639384735e-07, "loss": 0.2955, "step": 1746 }, { "epoch": 0.3164855072463768, "grad_norm": 3.228234268362536, "learning_rate": 7.998416359581266e-07, "loss": 0.3198, "step": 1747 }, { "epoch": 0.31666666666666665, "grad_norm": 3.6393868339760074, "learning_rate": 7.996068047408837e-07, "loss": 0.2711, "step": 1748 }, { "epoch": 0.3168478260869565, "grad_norm": 2.909269469768695, "learning_rate": 7.993718703675983e-07, "loss": 0.3151, "step": 1749 }, { "epoch": 0.3170289855072464, "grad_norm": 12.590518173027503, "learning_rate": 7.991368329191595e-07, "loss": 0.3557, "step": 1750 }, { "epoch": 0.31721014492753624, "grad_norm": 9.625837073942225, "learning_rate": 7.989016924764918e-07, "loss": 0.3373, "step": 1751 }, { "epoch": 0.3173913043478261, "grad_norm": 12.204503330163293, "learning_rate": 7.986664491205552e-07, "loss": 0.297, "step": 1752 }, { "epoch": 0.31757246376811593, "grad_norm": 3.9730453944517707, "learning_rate": 7.98431102932345e-07, "loss": 0.2768, "step": 1753 }, { "epoch": 0.3177536231884058, "grad_norm": 4.878760706265584, "learning_rate": 7.98195653992892e-07, "loss": 0.3432, "step": 1754 }, { "epoch": 0.3179347826086957, "grad_norm": 4.7797206233418335, "learning_rate": 7.979601023832625e-07, "loss": 0.3926, "step": 1755 }, { "epoch": 0.3181159420289855, "grad_norm": 7.197706234302545, "learning_rate": 7.97724448184558e-07, "loss": 0.2906, "step": 1756 }, { "epoch": 0.31829710144927537, "grad_norm": 13.018178014540316, "learning_rate": 7.974886914779153e-07, "loss": 0.3726, "step": 1757 }, { "epoch": 0.3184782608695652, "grad_norm": 9.80128062782683, "learning_rate": 7.972528323445067e-07, "loss": 0.3029, "step": 1758 }, { "epoch": 0.31865942028985506, "grad_norm": 8.733887067577676, "learning_rate": 7.970168708655394e-07, "loss": 0.3651, "step": 1759 }, { "epoch": 0.3188405797101449, "grad_norm": 5.75649463387482, "learning_rate": 7.967808071222564e-07, "loss": 0.2818, "step": 1760 }, { "epoch": 0.3190217391304348, "grad_norm": 3.7894055734976093, "learning_rate": 7.96544641195935e-07, "loss": 0.3068, "step": 1761 }, { "epoch": 0.31920289855072465, "grad_norm": 8.2874706703031, "learning_rate": 7.963083731678888e-07, "loss": 0.3273, "step": 1762 }, { "epoch": 0.3193840579710145, "grad_norm": 4.744385932326128, "learning_rate": 7.960720031194656e-07, "loss": 0.3093, "step": 1763 }, { "epoch": 0.31956521739130433, "grad_norm": 6.046360752192413, "learning_rate": 7.95835531132049e-07, "loss": 0.3463, "step": 1764 }, { "epoch": 0.3197463768115942, "grad_norm": 6.25156012306365, "learning_rate": 7.955989572870574e-07, "loss": 0.4005, "step": 1765 }, { "epoch": 0.3199275362318841, "grad_norm": 4.033191064466662, "learning_rate": 7.953622816659442e-07, "loss": 0.3305, "step": 1766 }, { "epoch": 0.3201086956521739, "grad_norm": 3.4091109789621843, "learning_rate": 7.951255043501978e-07, "loss": 0.325, "step": 1767 }, { "epoch": 0.32028985507246377, "grad_norm": 6.969332580688918, "learning_rate": 7.948886254213423e-07, "loss": 0.3218, "step": 1768 }, { "epoch": 0.3204710144927536, "grad_norm": 3.1964983579917146, "learning_rate": 7.946516449609355e-07, "loss": 0.2884, "step": 1769 }, { "epoch": 0.32065217391304346, "grad_norm": 3.665297902530676, "learning_rate": 7.944145630505714e-07, "loss": 0.2778, "step": 1770 }, { "epoch": 0.32083333333333336, "grad_norm": 15.983651726363597, "learning_rate": 7.941773797718783e-07, "loss": 0.3463, "step": 1771 }, { "epoch": 0.3210144927536232, "grad_norm": 10.265141581605715, "learning_rate": 7.939400952065193e-07, "loss": 0.3379, "step": 1772 }, { "epoch": 0.32119565217391305, "grad_norm": 15.058386227801867, "learning_rate": 7.93702709436193e-07, "loss": 0.3262, "step": 1773 }, { "epoch": 0.3213768115942029, "grad_norm": 12.753059074024279, "learning_rate": 7.934652225426321e-07, "loss": 0.3325, "step": 1774 }, { "epoch": 0.32155797101449274, "grad_norm": 8.206695925369717, "learning_rate": 7.932276346076047e-07, "loss": 0.3304, "step": 1775 }, { "epoch": 0.3217391304347826, "grad_norm": 5.127611689367168, "learning_rate": 7.929899457129135e-07, "loss": 0.3076, "step": 1776 }, { "epoch": 0.3219202898550725, "grad_norm": 3.616167030994503, "learning_rate": 7.927521559403956e-07, "loss": 0.2938, "step": 1777 }, { "epoch": 0.32210144927536233, "grad_norm": 5.936154514974325, "learning_rate": 7.925142653719235e-07, "loss": 0.3397, "step": 1778 }, { "epoch": 0.3222826086956522, "grad_norm": 7.531415104527346, "learning_rate": 7.922762740894036e-07, "loss": 0.3241, "step": 1779 }, { "epoch": 0.322463768115942, "grad_norm": 12.151271145029897, "learning_rate": 7.92038182174778e-07, "loss": 0.3487, "step": 1780 }, { "epoch": 0.32264492753623186, "grad_norm": 5.623020406797692, "learning_rate": 7.917999897100222e-07, "loss": 0.3139, "step": 1781 }, { "epoch": 0.32282608695652176, "grad_norm": 10.863941871658541, "learning_rate": 7.915616967771477e-07, "loss": 0.3318, "step": 1782 }, { "epoch": 0.3230072463768116, "grad_norm": 9.024418909062998, "learning_rate": 7.913233034581994e-07, "loss": 0.2895, "step": 1783 }, { "epoch": 0.32318840579710145, "grad_norm": 3.9634898278216077, "learning_rate": 7.910848098352574e-07, "loss": 0.3504, "step": 1784 }, { "epoch": 0.3233695652173913, "grad_norm": 4.518014284501169, "learning_rate": 7.908462159904362e-07, "loss": 0.3021, "step": 1785 }, { "epoch": 0.32355072463768114, "grad_norm": 9.238802669532143, "learning_rate": 7.906075220058847e-07, "loss": 0.4299, "step": 1786 }, { "epoch": 0.32373188405797104, "grad_norm": 5.21929560669184, "learning_rate": 7.903687279637867e-07, "loss": 0.3364, "step": 1787 }, { "epoch": 0.3239130434782609, "grad_norm": 11.012728402839075, "learning_rate": 7.901298339463597e-07, "loss": 0.3175, "step": 1788 }, { "epoch": 0.32409420289855073, "grad_norm": 5.696682544109489, "learning_rate": 7.898908400358561e-07, "loss": 0.2815, "step": 1789 }, { "epoch": 0.3242753623188406, "grad_norm": 6.464259691848639, "learning_rate": 7.896517463145629e-07, "loss": 0.3231, "step": 1790 }, { "epoch": 0.3244565217391304, "grad_norm": 5.970634926153257, "learning_rate": 7.894125528648011e-07, "loss": 0.3132, "step": 1791 }, { "epoch": 0.32463768115942027, "grad_norm": 3.9456199455399763, "learning_rate": 7.891732597689259e-07, "loss": 0.3501, "step": 1792 }, { "epoch": 0.32481884057971017, "grad_norm": 4.132907835092769, "learning_rate": 7.889338671093273e-07, "loss": 0.3908, "step": 1793 }, { "epoch": 0.325, "grad_norm": 6.286989059880983, "learning_rate": 7.886943749684293e-07, "loss": 0.3026, "step": 1794 }, { "epoch": 0.32518115942028986, "grad_norm": 7.261166467637907, "learning_rate": 7.884547834286901e-07, "loss": 0.3568, "step": 1795 }, { "epoch": 0.3253623188405797, "grad_norm": 3.406809395394441, "learning_rate": 7.882150925726023e-07, "loss": 0.3052, "step": 1796 }, { "epoch": 0.32554347826086955, "grad_norm": 6.114238799476201, "learning_rate": 7.879753024826925e-07, "loss": 0.2653, "step": 1797 }, { "epoch": 0.32572463768115945, "grad_norm": 8.117039676780227, "learning_rate": 7.877354132415215e-07, "loss": 0.2944, "step": 1798 }, { "epoch": 0.3259057971014493, "grad_norm": 7.043234745915229, "learning_rate": 7.874954249316846e-07, "loss": 0.3492, "step": 1799 }, { "epoch": 0.32608695652173914, "grad_norm": 5.58362755560085, "learning_rate": 7.872553376358104e-07, "loss": 0.35, "step": 1800 }, { "epoch": 0.32608695652173914, "eval_loss": 0.3110625147819519, "eval_runtime": 9.8069, "eval_samples_per_second": 50.984, "eval_steps_per_second": 0.102, "step": 1800 }, { "epoch": 0.326268115942029, "grad_norm": 6.9229344023976935, "learning_rate": 7.870151514365626e-07, "loss": 0.3014, "step": 1801 }, { "epoch": 0.3264492753623188, "grad_norm": 7.5092665291583485, "learning_rate": 7.867748664166383e-07, "loss": 0.3281, "step": 1802 }, { "epoch": 0.32663043478260867, "grad_norm": 4.530904318891568, "learning_rate": 7.865344826587688e-07, "loss": 0.2728, "step": 1803 }, { "epoch": 0.32681159420289857, "grad_norm": 9.976531480698771, "learning_rate": 7.86294000245719e-07, "loss": 0.3434, "step": 1804 }, { "epoch": 0.3269927536231884, "grad_norm": 3.712576034526324, "learning_rate": 7.860534192602887e-07, "loss": 0.3596, "step": 1805 }, { "epoch": 0.32717391304347826, "grad_norm": 6.78122623043467, "learning_rate": 7.858127397853107e-07, "loss": 0.3255, "step": 1806 }, { "epoch": 0.3273550724637681, "grad_norm": 13.949449697003933, "learning_rate": 7.855719619036522e-07, "loss": 0.2817, "step": 1807 }, { "epoch": 0.32753623188405795, "grad_norm": 4.218279967689233, "learning_rate": 7.85331085698214e-07, "loss": 0.3364, "step": 1808 }, { "epoch": 0.32771739130434785, "grad_norm": 5.533274062204578, "learning_rate": 7.850901112519312e-07, "loss": 0.2823, "step": 1809 }, { "epoch": 0.3278985507246377, "grad_norm": 3.543552759773686, "learning_rate": 7.848490386477724e-07, "loss": 0.3599, "step": 1810 }, { "epoch": 0.32807971014492754, "grad_norm": 3.9866252451118527, "learning_rate": 7.846078679687398e-07, "loss": 0.337, "step": 1811 }, { "epoch": 0.3282608695652174, "grad_norm": 5.734366563873238, "learning_rate": 7.843665992978699e-07, "loss": 0.277, "step": 1812 }, { "epoch": 0.32844202898550723, "grad_norm": 5.419092947234047, "learning_rate": 7.841252327182324e-07, "loss": 0.3199, "step": 1813 }, { "epoch": 0.32862318840579713, "grad_norm": 9.236887425051977, "learning_rate": 7.838837683129311e-07, "loss": 0.3139, "step": 1814 }, { "epoch": 0.328804347826087, "grad_norm": 3.9756656590256862, "learning_rate": 7.836422061651031e-07, "loss": 0.329, "step": 1815 }, { "epoch": 0.3289855072463768, "grad_norm": 5.01291542310867, "learning_rate": 7.834005463579199e-07, "loss": 0.3097, "step": 1816 }, { "epoch": 0.32916666666666666, "grad_norm": 4.769191561751831, "learning_rate": 7.831587889745856e-07, "loss": 0.3507, "step": 1817 }, { "epoch": 0.3293478260869565, "grad_norm": 4.0984766101069985, "learning_rate": 7.829169340983388e-07, "loss": 0.266, "step": 1818 }, { "epoch": 0.32952898550724635, "grad_norm": 11.098333700488112, "learning_rate": 7.826749818124509e-07, "loss": 0.304, "step": 1819 }, { "epoch": 0.32971014492753625, "grad_norm": 4.8360526383389395, "learning_rate": 7.824329322002276e-07, "loss": 0.2941, "step": 1820 }, { "epoch": 0.3298913043478261, "grad_norm": 4.354136907590092, "learning_rate": 7.821907853450074e-07, "loss": 0.2926, "step": 1821 }, { "epoch": 0.33007246376811594, "grad_norm": 5.682014557605183, "learning_rate": 7.819485413301629e-07, "loss": 0.2977, "step": 1822 }, { "epoch": 0.3302536231884058, "grad_norm": 5.29047262698395, "learning_rate": 7.817062002390997e-07, "loss": 0.3418, "step": 1823 }, { "epoch": 0.33043478260869563, "grad_norm": 3.437535128783487, "learning_rate": 7.814637621552569e-07, "loss": 0.2943, "step": 1824 }, { "epoch": 0.33061594202898553, "grad_norm": 8.494373244559128, "learning_rate": 7.812212271621072e-07, "loss": 0.3455, "step": 1825 }, { "epoch": 0.3307971014492754, "grad_norm": 6.117527539614069, "learning_rate": 7.809785953431566e-07, "loss": 0.4307, "step": 1826 }, { "epoch": 0.3309782608695652, "grad_norm": 7.0242481198031, "learning_rate": 7.807358667819444e-07, "loss": 0.3699, "step": 1827 }, { "epoch": 0.33115942028985507, "grad_norm": 6.494935737259366, "learning_rate": 7.80493041562043e-07, "loss": 0.306, "step": 1828 }, { "epoch": 0.3313405797101449, "grad_norm": 11.040990824622417, "learning_rate": 7.802501197670584e-07, "loss": 0.3092, "step": 1829 }, { "epoch": 0.33152173913043476, "grad_norm": 6.140918427273407, "learning_rate": 7.800071014806298e-07, "loss": 0.2845, "step": 1830 }, { "epoch": 0.33170289855072466, "grad_norm": 5.782933828184184, "learning_rate": 7.797639867864292e-07, "loss": 0.2868, "step": 1831 }, { "epoch": 0.3318840579710145, "grad_norm": 4.55085839386319, "learning_rate": 7.795207757681625e-07, "loss": 0.32, "step": 1832 }, { "epoch": 0.33206521739130435, "grad_norm": 4.988593286291903, "learning_rate": 7.792774685095685e-07, "loss": 0.2611, "step": 1833 }, { "epoch": 0.3322463768115942, "grad_norm": 10.675872523095858, "learning_rate": 7.790340650944187e-07, "loss": 0.3535, "step": 1834 }, { "epoch": 0.33242753623188404, "grad_norm": 6.438853883030144, "learning_rate": 7.787905656065181e-07, "loss": 0.3236, "step": 1835 }, { "epoch": 0.33260869565217394, "grad_norm": 11.587803589616485, "learning_rate": 7.785469701297051e-07, "loss": 0.3597, "step": 1836 }, { "epoch": 0.3327898550724638, "grad_norm": 4.93510083383632, "learning_rate": 7.783032787478503e-07, "loss": 0.345, "step": 1837 }, { "epoch": 0.3329710144927536, "grad_norm": 5.870644635889845, "learning_rate": 7.78059491544858e-07, "loss": 0.3492, "step": 1838 }, { "epoch": 0.33315217391304347, "grad_norm": 6.756904392385173, "learning_rate": 7.778156086046653e-07, "loss": 0.4325, "step": 1839 }, { "epoch": 0.3333333333333333, "grad_norm": 3.6573303948484868, "learning_rate": 7.775716300112422e-07, "loss": 0.3203, "step": 1840 }, { "epoch": 0.3335144927536232, "grad_norm": 3.046149780740665, "learning_rate": 7.773275558485919e-07, "loss": 0.2639, "step": 1841 }, { "epoch": 0.33369565217391306, "grad_norm": 5.529985241268639, "learning_rate": 7.7708338620075e-07, "loss": 0.3153, "step": 1842 }, { "epoch": 0.3338768115942029, "grad_norm": 3.634591019962169, "learning_rate": 7.768391211517854e-07, "loss": 0.2825, "step": 1843 }, { "epoch": 0.33405797101449275, "grad_norm": 4.064523349135702, "learning_rate": 7.765947607857996e-07, "loss": 0.3234, "step": 1844 }, { "epoch": 0.3342391304347826, "grad_norm": 4.898462953096211, "learning_rate": 7.763503051869274e-07, "loss": 0.3493, "step": 1845 }, { "epoch": 0.33442028985507244, "grad_norm": 4.535616035004989, "learning_rate": 7.761057544393354e-07, "loss": 0.3633, "step": 1846 }, { "epoch": 0.33460144927536234, "grad_norm": 6.399007789665967, "learning_rate": 7.758611086272242e-07, "loss": 0.2986, "step": 1847 }, { "epoch": 0.3347826086956522, "grad_norm": 11.827980252735243, "learning_rate": 7.756163678348259e-07, "loss": 0.3161, "step": 1848 }, { "epoch": 0.33496376811594203, "grad_norm": 5.764356486799095, "learning_rate": 7.753715321464067e-07, "loss": 0.3814, "step": 1849 }, { "epoch": 0.3351449275362319, "grad_norm": 8.655970632077477, "learning_rate": 7.75126601646264e-07, "loss": 0.2574, "step": 1850 }, { "epoch": 0.3353260869565217, "grad_norm": 7.392371604385812, "learning_rate": 7.748815764187289e-07, "loss": 0.3307, "step": 1851 }, { "epoch": 0.3355072463768116, "grad_norm": 3.961886234950718, "learning_rate": 7.746364565481644e-07, "loss": 0.3492, "step": 1852 }, { "epoch": 0.33568840579710146, "grad_norm": 4.312124096310197, "learning_rate": 7.743912421189669e-07, "loss": 0.3745, "step": 1853 }, { "epoch": 0.3358695652173913, "grad_norm": 3.576929001616148, "learning_rate": 7.741459332155644e-07, "loss": 0.3312, "step": 1854 }, { "epoch": 0.33605072463768115, "grad_norm": 3.1350013576812588, "learning_rate": 7.739005299224184e-07, "loss": 0.3079, "step": 1855 }, { "epoch": 0.336231884057971, "grad_norm": 3.100110304948787, "learning_rate": 7.736550323240221e-07, "loss": 0.2841, "step": 1856 }, { "epoch": 0.33641304347826084, "grad_norm": 5.412568495165048, "learning_rate": 7.734094405049016e-07, "loss": 0.297, "step": 1857 }, { "epoch": 0.33659420289855074, "grad_norm": 4.829316799924912, "learning_rate": 7.731637545496152e-07, "loss": 0.3187, "step": 1858 }, { "epoch": 0.3367753623188406, "grad_norm": 4.069996412320471, "learning_rate": 7.729179745427539e-07, "loss": 0.3193, "step": 1859 }, { "epoch": 0.33695652173913043, "grad_norm": 6.896132936551222, "learning_rate": 7.726721005689407e-07, "loss": 0.2841, "step": 1860 }, { "epoch": 0.3371376811594203, "grad_norm": 3.4484648851852304, "learning_rate": 7.724261327128316e-07, "loss": 0.3051, "step": 1861 }, { "epoch": 0.3373188405797101, "grad_norm": 4.8216965517130195, "learning_rate": 7.72180071059114e-07, "loss": 0.3949, "step": 1862 }, { "epoch": 0.3375, "grad_norm": 3.5955961823535145, "learning_rate": 7.719339156925085e-07, "loss": 0.2961, "step": 1863 }, { "epoch": 0.33768115942028987, "grad_norm": 4.568770172107434, "learning_rate": 7.716876666977672e-07, "loss": 0.3697, "step": 1864 }, { "epoch": 0.3378623188405797, "grad_norm": 3.4887417154003866, "learning_rate": 7.714413241596752e-07, "loss": 0.337, "step": 1865 }, { "epoch": 0.33804347826086956, "grad_norm": 8.078526845128401, "learning_rate": 7.711948881630488e-07, "loss": 0.3098, "step": 1866 }, { "epoch": 0.3382246376811594, "grad_norm": 4.666920406554297, "learning_rate": 7.709483587927377e-07, "loss": 0.3748, "step": 1867 }, { "epoch": 0.3384057971014493, "grad_norm": 11.889240970106998, "learning_rate": 7.707017361336229e-07, "loss": 0.301, "step": 1868 }, { "epoch": 0.33858695652173915, "grad_norm": 7.052726826122178, "learning_rate": 7.704550202706177e-07, "loss": 0.2922, "step": 1869 }, { "epoch": 0.338768115942029, "grad_norm": 4.541414767469716, "learning_rate": 7.702082112886675e-07, "loss": 0.3466, "step": 1870 }, { "epoch": 0.33894927536231884, "grad_norm": 5.045076435203539, "learning_rate": 7.699613092727501e-07, "loss": 0.3337, "step": 1871 }, { "epoch": 0.3391304347826087, "grad_norm": 5.174490533205886, "learning_rate": 7.697143143078746e-07, "loss": 0.4119, "step": 1872 }, { "epoch": 0.3393115942028985, "grad_norm": 5.66541132872301, "learning_rate": 7.694672264790829e-07, "loss": 0.3312, "step": 1873 }, { "epoch": 0.3394927536231884, "grad_norm": 6.865984013490447, "learning_rate": 7.692200458714482e-07, "loss": 0.3042, "step": 1874 }, { "epoch": 0.33967391304347827, "grad_norm": 13.355950382047805, "learning_rate": 7.689727725700762e-07, "loss": 0.3695, "step": 1875 }, { "epoch": 0.3398550724637681, "grad_norm": 4.346972351647204, "learning_rate": 7.687254066601042e-07, "loss": 0.3828, "step": 1876 }, { "epoch": 0.34003623188405796, "grad_norm": 3.2301297780422895, "learning_rate": 7.684779482267015e-07, "loss": 0.2957, "step": 1877 }, { "epoch": 0.3402173913043478, "grad_norm": 3.729963802113808, "learning_rate": 7.68230397355069e-07, "loss": 0.2879, "step": 1878 }, { "epoch": 0.3403985507246377, "grad_norm": 11.284445167538793, "learning_rate": 7.679827541304399e-07, "loss": 0.3391, "step": 1879 }, { "epoch": 0.34057971014492755, "grad_norm": 7.993766447744876, "learning_rate": 7.677350186380787e-07, "loss": 0.2724, "step": 1880 }, { "epoch": 0.3407608695652174, "grad_norm": 7.059520062316174, "learning_rate": 7.67487190963282e-07, "loss": 0.323, "step": 1881 }, { "epoch": 0.34094202898550724, "grad_norm": 9.917544819971706, "learning_rate": 7.672392711913783e-07, "loss": 0.2579, "step": 1882 }, { "epoch": 0.3411231884057971, "grad_norm": 8.212302852980963, "learning_rate": 7.669912594077272e-07, "loss": 0.3157, "step": 1883 }, { "epoch": 0.34130434782608693, "grad_norm": 6.432886292746912, "learning_rate": 7.667431556977205e-07, "loss": 0.3038, "step": 1884 }, { "epoch": 0.34148550724637683, "grad_norm": 4.5740339102545216, "learning_rate": 7.664949601467814e-07, "loss": 0.2946, "step": 1885 }, { "epoch": 0.3416666666666667, "grad_norm": 6.278337582897395, "learning_rate": 7.66246672840365e-07, "loss": 0.2569, "step": 1886 }, { "epoch": 0.3418478260869565, "grad_norm": 5.553927723122158, "learning_rate": 7.659982938639573e-07, "loss": 0.3451, "step": 1887 }, { "epoch": 0.34202898550724636, "grad_norm": 9.631497175645482, "learning_rate": 7.657498233030769e-07, "loss": 0.3146, "step": 1888 }, { "epoch": 0.3422101449275362, "grad_norm": 4.92189232450789, "learning_rate": 7.655012612432732e-07, "loss": 0.2783, "step": 1889 }, { "epoch": 0.3423913043478261, "grad_norm": 3.8638210356351808, "learning_rate": 7.652526077701273e-07, "loss": 0.3392, "step": 1890 }, { "epoch": 0.34257246376811595, "grad_norm": 9.769706653601604, "learning_rate": 7.650038629692517e-07, "loss": 0.345, "step": 1891 }, { "epoch": 0.3427536231884058, "grad_norm": 2.985434391402221, "learning_rate": 7.647550269262904e-07, "loss": 0.2556, "step": 1892 }, { "epoch": 0.34293478260869564, "grad_norm": 7.128359932100153, "learning_rate": 7.64506099726919e-07, "loss": 0.3926, "step": 1893 }, { "epoch": 0.3431159420289855, "grad_norm": 4.712568135716269, "learning_rate": 7.642570814568442e-07, "loss": 0.2929, "step": 1894 }, { "epoch": 0.3432971014492754, "grad_norm": 4.870888807304561, "learning_rate": 7.64007972201804e-07, "loss": 0.2495, "step": 1895 }, { "epoch": 0.34347826086956523, "grad_norm": 11.396376975111291, "learning_rate": 7.637587720475683e-07, "loss": 0.3848, "step": 1896 }, { "epoch": 0.3436594202898551, "grad_norm": 3.87365561517141, "learning_rate": 7.635094810799376e-07, "loss": 0.3206, "step": 1897 }, { "epoch": 0.3438405797101449, "grad_norm": 3.1454057233550703, "learning_rate": 7.63260099384744e-07, "loss": 0.2676, "step": 1898 }, { "epoch": 0.34402173913043477, "grad_norm": 3.6565351839349693, "learning_rate": 7.63010627047851e-07, "loss": 0.3404, "step": 1899 }, { "epoch": 0.3442028985507246, "grad_norm": 3.3259769279543416, "learning_rate": 7.627610641551527e-07, "loss": 0.3035, "step": 1900 }, { "epoch": 0.3442028985507246, "eval_loss": 0.31725001335144043, "eval_runtime": 9.7375, "eval_samples_per_second": 51.348, "eval_steps_per_second": 0.103, "step": 1900 }, { "epoch": 0.3443840579710145, "grad_norm": 4.282098781605039, "learning_rate": 7.62511410792575e-07, "loss": 0.3334, "step": 1901 }, { "epoch": 0.34456521739130436, "grad_norm": 7.7128900444262545, "learning_rate": 7.62261667046075e-07, "loss": 0.3718, "step": 1902 }, { "epoch": 0.3447463768115942, "grad_norm": 5.686317933797614, "learning_rate": 7.620118330016402e-07, "loss": 0.3582, "step": 1903 }, { "epoch": 0.34492753623188405, "grad_norm": 9.517294360438747, "learning_rate": 7.6176190874529e-07, "loss": 0.3749, "step": 1904 }, { "epoch": 0.3451086956521739, "grad_norm": 7.779011917835557, "learning_rate": 7.615118943630743e-07, "loss": 0.3279, "step": 1905 }, { "epoch": 0.3452898550724638, "grad_norm": 5.391162533515678, "learning_rate": 7.612617899410743e-07, "loss": 0.3345, "step": 1906 }, { "epoch": 0.34547101449275364, "grad_norm": 8.667519242476674, "learning_rate": 7.61011595565402e-07, "loss": 0.2916, "step": 1907 }, { "epoch": 0.3456521739130435, "grad_norm": 5.248028694855879, "learning_rate": 7.607613113222009e-07, "loss": 0.291, "step": 1908 }, { "epoch": 0.3458333333333333, "grad_norm": 13.70835393462969, "learning_rate": 7.605109372976446e-07, "loss": 0.3652, "step": 1909 }, { "epoch": 0.34601449275362317, "grad_norm": 7.095388012091221, "learning_rate": 7.602604735779384e-07, "loss": 0.299, "step": 1910 }, { "epoch": 0.34619565217391307, "grad_norm": 3.3656288451090948, "learning_rate": 7.60009920249318e-07, "loss": 0.2843, "step": 1911 }, { "epoch": 0.3463768115942029, "grad_norm": 9.18839271346281, "learning_rate": 7.597592773980501e-07, "loss": 0.3467, "step": 1912 }, { "epoch": 0.34655797101449276, "grad_norm": 3.363641644804981, "learning_rate": 7.595085451104322e-07, "loss": 0.271, "step": 1913 }, { "epoch": 0.3467391304347826, "grad_norm": 5.443643354170259, "learning_rate": 7.592577234727927e-07, "loss": 0.3369, "step": 1914 }, { "epoch": 0.34692028985507245, "grad_norm": 3.9042244073527077, "learning_rate": 7.590068125714904e-07, "loss": 0.3802, "step": 1915 }, { "epoch": 0.3471014492753623, "grad_norm": 5.439914170267567, "learning_rate": 7.587558124929155e-07, "loss": 0.3301, "step": 1916 }, { "epoch": 0.3472826086956522, "grad_norm": 8.900399166788967, "learning_rate": 7.585047233234883e-07, "loss": 0.368, "step": 1917 }, { "epoch": 0.34746376811594204, "grad_norm": 6.041870917497792, "learning_rate": 7.582535451496601e-07, "loss": 0.3347, "step": 1918 }, { "epoch": 0.3476449275362319, "grad_norm": 4.568070340817429, "learning_rate": 7.580022780579127e-07, "loss": 0.3172, "step": 1919 }, { "epoch": 0.34782608695652173, "grad_norm": 4.441333780159891, "learning_rate": 7.577509221347584e-07, "loss": 0.3246, "step": 1920 }, { "epoch": 0.3480072463768116, "grad_norm": 7.3331851376781145, "learning_rate": 7.574994774667405e-07, "loss": 0.2966, "step": 1921 }, { "epoch": 0.3481884057971015, "grad_norm": 8.145608524730086, "learning_rate": 7.572479441404323e-07, "loss": 0.2691, "step": 1922 }, { "epoch": 0.3483695652173913, "grad_norm": 4.073117607302022, "learning_rate": 7.569963222424382e-07, "loss": 0.2984, "step": 1923 }, { "epoch": 0.34855072463768116, "grad_norm": 4.092988409608099, "learning_rate": 7.567446118593927e-07, "loss": 0.3324, "step": 1924 }, { "epoch": 0.348731884057971, "grad_norm": 3.619394052885244, "learning_rate": 7.564928130779608e-07, "loss": 0.3246, "step": 1925 }, { "epoch": 0.34891304347826085, "grad_norm": 4.843311112410277, "learning_rate": 7.56240925984838e-07, "loss": 0.3395, "step": 1926 }, { "epoch": 0.3490942028985507, "grad_norm": 3.299196083780203, "learning_rate": 7.559889506667502e-07, "loss": 0.3041, "step": 1927 }, { "epoch": 0.3492753623188406, "grad_norm": 4.51999885419715, "learning_rate": 7.557368872104539e-07, "loss": 0.3693, "step": 1928 }, { "epoch": 0.34945652173913044, "grad_norm": 6.250648374945436, "learning_rate": 7.554847357027358e-07, "loss": 0.2881, "step": 1929 }, { "epoch": 0.3496376811594203, "grad_norm": 9.058991677860087, "learning_rate": 7.552324962304126e-07, "loss": 0.3195, "step": 1930 }, { "epoch": 0.34981884057971013, "grad_norm": 9.306681764335439, "learning_rate": 7.549801688803314e-07, "loss": 0.3363, "step": 1931 }, { "epoch": 0.35, "grad_norm": 4.41249544527736, "learning_rate": 7.547277537393701e-07, "loss": 0.3355, "step": 1932 }, { "epoch": 0.3501811594202899, "grad_norm": 5.059114323913898, "learning_rate": 7.544752508944363e-07, "loss": 0.2837, "step": 1933 }, { "epoch": 0.3503623188405797, "grad_norm": 3.9975470441371685, "learning_rate": 7.54222660432468e-07, "loss": 0.3284, "step": 1934 }, { "epoch": 0.35054347826086957, "grad_norm": 3.667456386660029, "learning_rate": 7.53969982440433e-07, "loss": 0.3544, "step": 1935 }, { "epoch": 0.3507246376811594, "grad_norm": 5.467054414616333, "learning_rate": 7.537172170053296e-07, "loss": 0.354, "step": 1936 }, { "epoch": 0.35090579710144926, "grad_norm": 3.898467393603716, "learning_rate": 7.534643642141864e-07, "loss": 0.3335, "step": 1937 }, { "epoch": 0.35108695652173916, "grad_norm": 5.351133640830385, "learning_rate": 7.532114241540617e-07, "loss": 0.3429, "step": 1938 }, { "epoch": 0.351268115942029, "grad_norm": 6.713417884803827, "learning_rate": 7.529583969120439e-07, "loss": 0.3193, "step": 1939 }, { "epoch": 0.35144927536231885, "grad_norm": 3.6311873813636546, "learning_rate": 7.527052825752514e-07, "loss": 0.2878, "step": 1940 }, { "epoch": 0.3516304347826087, "grad_norm": 4.598739660373051, "learning_rate": 7.524520812308329e-07, "loss": 0.304, "step": 1941 }, { "epoch": 0.35181159420289854, "grad_norm": 4.7275618672068385, "learning_rate": 7.521987929659666e-07, "loss": 0.2905, "step": 1942 }, { "epoch": 0.3519927536231884, "grad_norm": 11.364381926193575, "learning_rate": 7.51945417867861e-07, "loss": 0.314, "step": 1943 }, { "epoch": 0.3521739130434783, "grad_norm": 3.45285174488261, "learning_rate": 7.516919560237543e-07, "loss": 0.3115, "step": 1944 }, { "epoch": 0.3523550724637681, "grad_norm": 6.88704782510566, "learning_rate": 7.514384075209145e-07, "loss": 0.3046, "step": 1945 }, { "epoch": 0.35253623188405797, "grad_norm": 3.461509491621105, "learning_rate": 7.511847724466398e-07, "loss": 0.3596, "step": 1946 }, { "epoch": 0.3527173913043478, "grad_norm": 7.224756575119816, "learning_rate": 7.509310508882576e-07, "loss": 0.2902, "step": 1947 }, { "epoch": 0.35289855072463766, "grad_norm": 6.139441570882219, "learning_rate": 7.506772429331258e-07, "loss": 0.3176, "step": 1948 }, { "epoch": 0.35307971014492756, "grad_norm": 5.864063586915534, "learning_rate": 7.504233486686315e-07, "loss": 0.2741, "step": 1949 }, { "epoch": 0.3532608695652174, "grad_norm": 4.737651456180436, "learning_rate": 7.501693681821917e-07, "loss": 0.3732, "step": 1950 }, { "epoch": 0.35344202898550725, "grad_norm": 7.098250544261399, "learning_rate": 7.499153015612531e-07, "loss": 0.3041, "step": 1951 }, { "epoch": 0.3536231884057971, "grad_norm": 11.461618543071948, "learning_rate": 7.49661148893292e-07, "loss": 0.3358, "step": 1952 }, { "epoch": 0.35380434782608694, "grad_norm": 13.918541046966165, "learning_rate": 7.494069102658144e-07, "loss": 0.3547, "step": 1953 }, { "epoch": 0.3539855072463768, "grad_norm": 3.946662794690365, "learning_rate": 7.491525857663561e-07, "loss": 0.2751, "step": 1954 }, { "epoch": 0.3541666666666667, "grad_norm": 3.320597497624449, "learning_rate": 7.488981754824818e-07, "loss": 0.3018, "step": 1955 }, { "epoch": 0.35434782608695653, "grad_norm": 5.79081248013009, "learning_rate": 7.486436795017865e-07, "loss": 0.3255, "step": 1956 }, { "epoch": 0.3545289855072464, "grad_norm": 5.377384462939565, "learning_rate": 7.483890979118941e-07, "loss": 0.3199, "step": 1957 }, { "epoch": 0.3547101449275362, "grad_norm": 3.9946503786788456, "learning_rate": 7.481344308004586e-07, "loss": 0.3405, "step": 1958 }, { "epoch": 0.35489130434782606, "grad_norm": 4.459599253562778, "learning_rate": 7.478796782551627e-07, "loss": 0.3222, "step": 1959 }, { "epoch": 0.35507246376811596, "grad_norm": 4.321830940291255, "learning_rate": 7.476248403637193e-07, "loss": 0.3451, "step": 1960 }, { "epoch": 0.3552536231884058, "grad_norm": 6.154671052614726, "learning_rate": 7.473699172138699e-07, "loss": 0.2844, "step": 1961 }, { "epoch": 0.35543478260869565, "grad_norm": 9.313270188556709, "learning_rate": 7.471149088933861e-07, "loss": 0.3265, "step": 1962 }, { "epoch": 0.3556159420289855, "grad_norm": 3.312867911782334, "learning_rate": 7.468598154900681e-07, "loss": 0.3312, "step": 1963 }, { "epoch": 0.35579710144927534, "grad_norm": 6.432823713029236, "learning_rate": 7.466046370917462e-07, "loss": 0.2874, "step": 1964 }, { "epoch": 0.35597826086956524, "grad_norm": 7.2127414323412005, "learning_rate": 7.463493737862792e-07, "loss": 0.3055, "step": 1965 }, { "epoch": 0.3561594202898551, "grad_norm": 4.439994348162171, "learning_rate": 7.460940256615556e-07, "loss": 0.3275, "step": 1966 }, { "epoch": 0.35634057971014493, "grad_norm": 8.970028985119088, "learning_rate": 7.458385928054929e-07, "loss": 0.3383, "step": 1967 }, { "epoch": 0.3565217391304348, "grad_norm": 3.712195228807908, "learning_rate": 7.455830753060379e-07, "loss": 0.2984, "step": 1968 }, { "epoch": 0.3567028985507246, "grad_norm": 4.318077828598915, "learning_rate": 7.453274732511666e-07, "loss": 0.3431, "step": 1969 }, { "epoch": 0.35688405797101447, "grad_norm": 3.8396660276512757, "learning_rate": 7.450717867288838e-07, "loss": 0.3336, "step": 1970 }, { "epoch": 0.35706521739130437, "grad_norm": 6.4177151398265195, "learning_rate": 7.448160158272235e-07, "loss": 0.3145, "step": 1971 }, { "epoch": 0.3572463768115942, "grad_norm": 3.787539417337925, "learning_rate": 7.445601606342493e-07, "loss": 0.3432, "step": 1972 }, { "epoch": 0.35742753623188406, "grad_norm": 3.486054169161485, "learning_rate": 7.443042212380527e-07, "loss": 0.314, "step": 1973 }, { "epoch": 0.3576086956521739, "grad_norm": 3.5757932929300806, "learning_rate": 7.440481977267555e-07, "loss": 0.3143, "step": 1974 }, { "epoch": 0.35778985507246375, "grad_norm": 3.3658739895347063, "learning_rate": 7.437920901885073e-07, "loss": 0.3223, "step": 1975 }, { "epoch": 0.35797101449275365, "grad_norm": 4.100429319835068, "learning_rate": 7.435358987114874e-07, "loss": 0.3652, "step": 1976 }, { "epoch": 0.3581521739130435, "grad_norm": 4.129137635609264, "learning_rate": 7.432796233839036e-07, "loss": 0.3182, "step": 1977 }, { "epoch": 0.35833333333333334, "grad_norm": 7.570761725939312, "learning_rate": 7.430232642939929e-07, "loss": 0.3354, "step": 1978 }, { "epoch": 0.3585144927536232, "grad_norm": 4.619318360142829, "learning_rate": 7.427668215300206e-07, "loss": 0.3604, "step": 1979 }, { "epoch": 0.358695652173913, "grad_norm": 7.679296847995771, "learning_rate": 7.425102951802817e-07, "loss": 0.2763, "step": 1980 }, { "epoch": 0.35887681159420287, "grad_norm": 5.04311052514982, "learning_rate": 7.422536853330991e-07, "loss": 0.3366, "step": 1981 }, { "epoch": 0.35905797101449277, "grad_norm": 3.981794977926401, "learning_rate": 7.419969920768248e-07, "loss": 0.2789, "step": 1982 }, { "epoch": 0.3592391304347826, "grad_norm": 4.216962557200483, "learning_rate": 7.417402154998393e-07, "loss": 0.2929, "step": 1983 }, { "epoch": 0.35942028985507246, "grad_norm": 4.1488859722145355, "learning_rate": 7.414833556905524e-07, "loss": 0.338, "step": 1984 }, { "epoch": 0.3596014492753623, "grad_norm": 4.934740187518044, "learning_rate": 7.41226412737402e-07, "loss": 0.286, "step": 1985 }, { "epoch": 0.35978260869565215, "grad_norm": 3.9615401399550736, "learning_rate": 7.409693867288547e-07, "loss": 0.3377, "step": 1986 }, { "epoch": 0.35996376811594205, "grad_norm": 10.247830981070214, "learning_rate": 7.407122777534058e-07, "loss": 0.3495, "step": 1987 }, { "epoch": 0.3601449275362319, "grad_norm": 4.051189792203562, "learning_rate": 7.40455085899579e-07, "loss": 0.364, "step": 1988 }, { "epoch": 0.36032608695652174, "grad_norm": 5.035068085203559, "learning_rate": 7.401978112559271e-07, "loss": 0.32, "step": 1989 }, { "epoch": 0.3605072463768116, "grad_norm": 7.652800777619602, "learning_rate": 7.399404539110304e-07, "loss": 0.3494, "step": 1990 }, { "epoch": 0.36068840579710143, "grad_norm": 12.323705836928221, "learning_rate": 7.396830139534988e-07, "loss": 0.4155, "step": 1991 }, { "epoch": 0.36086956521739133, "grad_norm": 3.422811390821941, "learning_rate": 7.394254914719697e-07, "loss": 0.3353, "step": 1992 }, { "epoch": 0.3610507246376812, "grad_norm": 5.878923700656869, "learning_rate": 7.391678865551096e-07, "loss": 0.3138, "step": 1993 }, { "epoch": 0.361231884057971, "grad_norm": 3.078172283599551, "learning_rate": 7.389101992916129e-07, "loss": 0.2726, "step": 1994 }, { "epoch": 0.36141304347826086, "grad_norm": 7.297137082836424, "learning_rate": 7.386524297702025e-07, "loss": 0.3261, "step": 1995 }, { "epoch": 0.3615942028985507, "grad_norm": 5.497105598108665, "learning_rate": 7.3839457807963e-07, "loss": 0.2926, "step": 1996 }, { "epoch": 0.36177536231884055, "grad_norm": 4.306504039927819, "learning_rate": 7.381366443086746e-07, "loss": 0.3333, "step": 1997 }, { "epoch": 0.36195652173913045, "grad_norm": 4.7975678003784745, "learning_rate": 7.378786285461441e-07, "loss": 0.3125, "step": 1998 }, { "epoch": 0.3621376811594203, "grad_norm": 3.8304666735680795, "learning_rate": 7.376205308808751e-07, "loss": 0.3451, "step": 1999 }, { "epoch": 0.36231884057971014, "grad_norm": 4.245393270464726, "learning_rate": 7.37362351401731e-07, "loss": 0.3298, "step": 2000 }, { "epoch": 0.36231884057971014, "eval_loss": 0.3128125071525574, "eval_runtime": 9.7582, "eval_samples_per_second": 51.239, "eval_steps_per_second": 0.102, "step": 2000 }, { "epoch": 0.3625, "grad_norm": 3.0270586053648496, "learning_rate": 7.371040901976049e-07, "loss": 0.2622, "step": 2001 }, { "epoch": 0.36268115942028983, "grad_norm": 3.8414369889881725, "learning_rate": 7.368457473574171e-07, "loss": 0.2963, "step": 2002 }, { "epoch": 0.36286231884057973, "grad_norm": 5.497425488685455, "learning_rate": 7.365873229701163e-07, "loss": 0.3499, "step": 2003 }, { "epoch": 0.3630434782608696, "grad_norm": 5.631438760538479, "learning_rate": 7.363288171246792e-07, "loss": 0.3811, "step": 2004 }, { "epoch": 0.3632246376811594, "grad_norm": 5.966784931572397, "learning_rate": 7.360702299101107e-07, "loss": 0.2722, "step": 2005 }, { "epoch": 0.36340579710144927, "grad_norm": 3.687933646458537, "learning_rate": 7.358115614154433e-07, "loss": 0.3185, "step": 2006 }, { "epoch": 0.3635869565217391, "grad_norm": 6.581873337166921, "learning_rate": 7.355528117297383e-07, "loss": 0.3166, "step": 2007 }, { "epoch": 0.36376811594202896, "grad_norm": 6.170943611267692, "learning_rate": 7.352939809420839e-07, "loss": 0.3138, "step": 2008 }, { "epoch": 0.36394927536231886, "grad_norm": 8.952168700024632, "learning_rate": 7.350350691415971e-07, "loss": 0.3372, "step": 2009 }, { "epoch": 0.3641304347826087, "grad_norm": 11.028699099346055, "learning_rate": 7.347760764174224e-07, "loss": 0.3206, "step": 2010 }, { "epoch": 0.36431159420289855, "grad_norm": 14.133079621169708, "learning_rate": 7.345170028587322e-07, "loss": 0.4056, "step": 2011 }, { "epoch": 0.3644927536231884, "grad_norm": 3.2778698499876784, "learning_rate": 7.342578485547266e-07, "loss": 0.3339, "step": 2012 }, { "epoch": 0.36467391304347824, "grad_norm": 6.553533744716087, "learning_rate": 7.339986135946341e-07, "loss": 0.3341, "step": 2013 }, { "epoch": 0.36485507246376814, "grad_norm": 4.9691193087276, "learning_rate": 7.337392980677099e-07, "loss": 0.3122, "step": 2014 }, { "epoch": 0.365036231884058, "grad_norm": 5.36577648784817, "learning_rate": 7.334799020632381e-07, "loss": 0.2962, "step": 2015 }, { "epoch": 0.3652173913043478, "grad_norm": 5.238857688780358, "learning_rate": 7.332204256705298e-07, "loss": 0.2906, "step": 2016 }, { "epoch": 0.36539855072463767, "grad_norm": 4.505102622026442, "learning_rate": 7.329608689789239e-07, "loss": 0.3501, "step": 2017 }, { "epoch": 0.3655797101449275, "grad_norm": 6.525304426787631, "learning_rate": 7.327012320777869e-07, "loss": 0.3028, "step": 2018 }, { "epoch": 0.3657608695652174, "grad_norm": 4.958368122970147, "learning_rate": 7.324415150565132e-07, "loss": 0.2903, "step": 2019 }, { "epoch": 0.36594202898550726, "grad_norm": 3.309716557522381, "learning_rate": 7.321817180045244e-07, "loss": 0.3021, "step": 2020 }, { "epoch": 0.3661231884057971, "grad_norm": 4.354441746693979, "learning_rate": 7.319218410112703e-07, "loss": 0.3676, "step": 2021 }, { "epoch": 0.36630434782608695, "grad_norm": 5.469423312657493, "learning_rate": 7.316618841662272e-07, "loss": 0.2757, "step": 2022 }, { "epoch": 0.3664855072463768, "grad_norm": 4.344599867273378, "learning_rate": 7.314018475588999e-07, "loss": 0.3359, "step": 2023 }, { "epoch": 0.36666666666666664, "grad_norm": 4.28121711090224, "learning_rate": 7.311417312788199e-07, "loss": 0.3145, "step": 2024 }, { "epoch": 0.36684782608695654, "grad_norm": 3.693167465141958, "learning_rate": 7.308815354155467e-07, "loss": 0.3191, "step": 2025 }, { "epoch": 0.3670289855072464, "grad_norm": 3.327928016514394, "learning_rate": 7.306212600586672e-07, "loss": 0.3119, "step": 2026 }, { "epoch": 0.36721014492753623, "grad_norm": 2.960953077757847, "learning_rate": 7.303609052977949e-07, "loss": 0.251, "step": 2027 }, { "epoch": 0.3673913043478261, "grad_norm": 5.6379815879269755, "learning_rate": 7.301004712225715e-07, "loss": 0.3292, "step": 2028 }, { "epoch": 0.3675724637681159, "grad_norm": 4.153505492287318, "learning_rate": 7.298399579226656e-07, "loss": 0.2939, "step": 2029 }, { "epoch": 0.3677536231884058, "grad_norm": 3.6907010172366044, "learning_rate": 7.295793654877731e-07, "loss": 0.3217, "step": 2030 }, { "epoch": 0.36793478260869567, "grad_norm": 6.660776135019311, "learning_rate": 7.293186940076175e-07, "loss": 0.3063, "step": 2031 }, { "epoch": 0.3681159420289855, "grad_norm": 8.058889248374719, "learning_rate": 7.290579435719489e-07, "loss": 0.2703, "step": 2032 }, { "epoch": 0.36829710144927535, "grad_norm": 4.568875412446538, "learning_rate": 7.287971142705449e-07, "loss": 0.3111, "step": 2033 }, { "epoch": 0.3684782608695652, "grad_norm": 5.285773026662832, "learning_rate": 7.285362061932106e-07, "loss": 0.3599, "step": 2034 }, { "epoch": 0.3686594202898551, "grad_norm": 4.559084104330743, "learning_rate": 7.282752194297774e-07, "loss": 0.3152, "step": 2035 }, { "epoch": 0.36884057971014494, "grad_norm": 4.003909428970958, "learning_rate": 7.280141540701048e-07, "loss": 0.3255, "step": 2036 }, { "epoch": 0.3690217391304348, "grad_norm": 6.536513211069695, "learning_rate": 7.277530102040787e-07, "loss": 0.3642, "step": 2037 }, { "epoch": 0.36920289855072463, "grad_norm": 3.4631942393304374, "learning_rate": 7.274917879216119e-07, "loss": 0.2774, "step": 2038 }, { "epoch": 0.3693840579710145, "grad_norm": 3.9895088509639742, "learning_rate": 7.272304873126446e-07, "loss": 0.3041, "step": 2039 }, { "epoch": 0.3695652173913043, "grad_norm": 3.8860915610883393, "learning_rate": 7.26969108467144e-07, "loss": 0.3497, "step": 2040 }, { "epoch": 0.3697463768115942, "grad_norm": 3.8852716416302155, "learning_rate": 7.267076514751038e-07, "loss": 0.33, "step": 2041 }, { "epoch": 0.36992753623188407, "grad_norm": 9.728558434089113, "learning_rate": 7.264461164265452e-07, "loss": 0.3249, "step": 2042 }, { "epoch": 0.3701086956521739, "grad_norm": 3.790370008020601, "learning_rate": 7.261845034115157e-07, "loss": 0.2957, "step": 2043 }, { "epoch": 0.37028985507246376, "grad_norm": 5.449015758986728, "learning_rate": 7.2592281252009e-07, "loss": 0.2465, "step": 2044 }, { "epoch": 0.3704710144927536, "grad_norm": 7.623878282007046, "learning_rate": 7.256610438423695e-07, "loss": 0.3091, "step": 2045 }, { "epoch": 0.3706521739130435, "grad_norm": 5.066301414351345, "learning_rate": 7.253991974684824e-07, "loss": 0.2812, "step": 2046 }, { "epoch": 0.37083333333333335, "grad_norm": 4.117911446775097, "learning_rate": 7.251372734885836e-07, "loss": 0.3156, "step": 2047 }, { "epoch": 0.3710144927536232, "grad_norm": 5.99825679691614, "learning_rate": 7.248752719928551e-07, "loss": 0.2898, "step": 2048 }, { "epoch": 0.37119565217391304, "grad_norm": 5.49201968945519, "learning_rate": 7.246131930715045e-07, "loss": 0.3816, "step": 2049 }, { "epoch": 0.3713768115942029, "grad_norm": 4.145233219363318, "learning_rate": 7.243510368147676e-07, "loss": 0.2585, "step": 2050 }, { "epoch": 0.3715579710144927, "grad_norm": 7.717990554087374, "learning_rate": 7.240888033129056e-07, "loss": 0.3192, "step": 2051 }, { "epoch": 0.3717391304347826, "grad_norm": 8.169066793495674, "learning_rate": 7.23826492656207e-07, "loss": 0.3848, "step": 2052 }, { "epoch": 0.3719202898550725, "grad_norm": 6.400391417922713, "learning_rate": 7.235641049349865e-07, "loss": 0.3492, "step": 2053 }, { "epoch": 0.3721014492753623, "grad_norm": 3.931827679268894, "learning_rate": 7.233016402395852e-07, "loss": 0.3179, "step": 2054 }, { "epoch": 0.37228260869565216, "grad_norm": 4.5048953586220355, "learning_rate": 7.230390986603712e-07, "loss": 0.3062, "step": 2055 }, { "epoch": 0.372463768115942, "grad_norm": 4.645952094260839, "learning_rate": 7.227764802877389e-07, "loss": 0.3712, "step": 2056 }, { "epoch": 0.3726449275362319, "grad_norm": 9.515911650410436, "learning_rate": 7.225137852121086e-07, "loss": 0.3331, "step": 2057 }, { "epoch": 0.37282608695652175, "grad_norm": 4.442014684996327, "learning_rate": 7.22251013523928e-07, "loss": 0.355, "step": 2058 }, { "epoch": 0.3730072463768116, "grad_norm": 9.383218319731174, "learning_rate": 7.219881653136704e-07, "loss": 0.3008, "step": 2059 }, { "epoch": 0.37318840579710144, "grad_norm": 7.457139813990069, "learning_rate": 7.217252406718355e-07, "loss": 0.309, "step": 2060 }, { "epoch": 0.3733695652173913, "grad_norm": 4.133345550164097, "learning_rate": 7.214622396889499e-07, "loss": 0.2967, "step": 2061 }, { "epoch": 0.3735507246376812, "grad_norm": 4.823880286761828, "learning_rate": 7.211991624555657e-07, "loss": 0.2686, "step": 2062 }, { "epoch": 0.37373188405797103, "grad_norm": 4.739082202303997, "learning_rate": 7.209360090622618e-07, "loss": 0.3135, "step": 2063 }, { "epoch": 0.3739130434782609, "grad_norm": 11.611386841223215, "learning_rate": 7.206727795996433e-07, "loss": 0.3756, "step": 2064 }, { "epoch": 0.3740942028985507, "grad_norm": 11.806555837798529, "learning_rate": 7.204094741583412e-07, "loss": 0.3278, "step": 2065 }, { "epoch": 0.37427536231884057, "grad_norm": 11.27800619214121, "learning_rate": 7.201460928290128e-07, "loss": 0.3615, "step": 2066 }, { "epoch": 0.3744565217391304, "grad_norm": 8.62118823112314, "learning_rate": 7.198826357023415e-07, "loss": 0.3564, "step": 2067 }, { "epoch": 0.3746376811594203, "grad_norm": 11.727212544956975, "learning_rate": 7.196191028690369e-07, "loss": 0.3348, "step": 2068 }, { "epoch": 0.37481884057971016, "grad_norm": 8.065242909197162, "learning_rate": 7.193554944198347e-07, "loss": 0.2963, "step": 2069 }, { "epoch": 0.375, "grad_norm": 4.153451890912459, "learning_rate": 7.190918104454963e-07, "loss": 0.3575, "step": 2070 }, { "epoch": 0.37518115942028984, "grad_norm": 3.568962249415379, "learning_rate": 7.188280510368096e-07, "loss": 0.3228, "step": 2071 }, { "epoch": 0.3753623188405797, "grad_norm": 4.0733749588043615, "learning_rate": 7.18564216284588e-07, "loss": 0.2936, "step": 2072 }, { "epoch": 0.3755434782608696, "grad_norm": 4.372754958251993, "learning_rate": 7.183003062796713e-07, "loss": 0.3703, "step": 2073 }, { "epoch": 0.37572463768115943, "grad_norm": 6.049029820514382, "learning_rate": 7.180363211129248e-07, "loss": 0.344, "step": 2074 }, { "epoch": 0.3759057971014493, "grad_norm": 15.136135414834653, "learning_rate": 7.177722608752398e-07, "loss": 0.3454, "step": 2075 }, { "epoch": 0.3760869565217391, "grad_norm": 3.398013820032196, "learning_rate": 7.175081256575335e-07, "loss": 0.2539, "step": 2076 }, { "epoch": 0.37626811594202897, "grad_norm": 8.8830238308733, "learning_rate": 7.17243915550749e-07, "loss": 0.3383, "step": 2077 }, { "epoch": 0.3764492753623188, "grad_norm": 9.92106288135896, "learning_rate": 7.169796306458551e-07, "loss": 0.3244, "step": 2078 }, { "epoch": 0.3766304347826087, "grad_norm": 7.768034758252296, "learning_rate": 7.167152710338462e-07, "loss": 0.3178, "step": 2079 }, { "epoch": 0.37681159420289856, "grad_norm": 8.926891587456375, "learning_rate": 7.164508368057428e-07, "loss": 0.2653, "step": 2080 }, { "epoch": 0.3769927536231884, "grad_norm": 4.380748384571386, "learning_rate": 7.161863280525907e-07, "loss": 0.3743, "step": 2081 }, { "epoch": 0.37717391304347825, "grad_norm": 5.274089189912098, "learning_rate": 7.159217448654614e-07, "loss": 0.2967, "step": 2082 }, { "epoch": 0.3773550724637681, "grad_norm": 4.3655903354976315, "learning_rate": 7.156570873354525e-07, "loss": 0.2751, "step": 2083 }, { "epoch": 0.377536231884058, "grad_norm": 3.696656201101535, "learning_rate": 7.153923555536865e-07, "loss": 0.2997, "step": 2084 }, { "epoch": 0.37771739130434784, "grad_norm": 8.018624634293852, "learning_rate": 7.151275496113119e-07, "loss": 0.3689, "step": 2085 }, { "epoch": 0.3778985507246377, "grad_norm": 8.433384008571464, "learning_rate": 7.148626695995027e-07, "loss": 0.3384, "step": 2086 }, { "epoch": 0.3780797101449275, "grad_norm": 7.272546899288065, "learning_rate": 7.145977156094584e-07, "loss": 0.2984, "step": 2087 }, { "epoch": 0.3782608695652174, "grad_norm": 10.852528127690967, "learning_rate": 7.143326877324037e-07, "loss": 0.3497, "step": 2088 }, { "epoch": 0.3784420289855073, "grad_norm": 13.87037781322862, "learning_rate": 7.140675860595892e-07, "loss": 0.3098, "step": 2089 }, { "epoch": 0.3786231884057971, "grad_norm": 12.301017897790347, "learning_rate": 7.138024106822904e-07, "loss": 0.3711, "step": 2090 }, { "epoch": 0.37880434782608696, "grad_norm": 7.917130556191393, "learning_rate": 7.135371616918088e-07, "loss": 0.2796, "step": 2091 }, { "epoch": 0.3789855072463768, "grad_norm": 3.9873882203685094, "learning_rate": 7.132718391794704e-07, "loss": 0.2603, "step": 2092 }, { "epoch": 0.37916666666666665, "grad_norm": 5.311367551899578, "learning_rate": 7.130064432366274e-07, "loss": 0.2735, "step": 2093 }, { "epoch": 0.3793478260869565, "grad_norm": 5.228149985285873, "learning_rate": 7.127409739546568e-07, "loss": 0.3029, "step": 2094 }, { "epoch": 0.3795289855072464, "grad_norm": 4.001570027615623, "learning_rate": 7.124754314249608e-07, "loss": 0.3132, "step": 2095 }, { "epoch": 0.37971014492753624, "grad_norm": 4.687844615547635, "learning_rate": 7.122098157389671e-07, "loss": 0.334, "step": 2096 }, { "epoch": 0.3798913043478261, "grad_norm": 6.1324919335590575, "learning_rate": 7.119441269881283e-07, "loss": 0.2968, "step": 2097 }, { "epoch": 0.38007246376811593, "grad_norm": 6.693705083215806, "learning_rate": 7.116783652639224e-07, "loss": 0.2996, "step": 2098 }, { "epoch": 0.3802536231884058, "grad_norm": 6.938585015310847, "learning_rate": 7.114125306578525e-07, "loss": 0.3004, "step": 2099 }, { "epoch": 0.3804347826086957, "grad_norm": 3.3305523385032423, "learning_rate": 7.111466232614465e-07, "loss": 0.3071, "step": 2100 }, { "epoch": 0.3804347826086957, "eval_loss": 0.2997343838214874, "eval_runtime": 9.772, "eval_samples_per_second": 51.167, "eval_steps_per_second": 0.102, "step": 2100 }, { "epoch": 0.3806159420289855, "grad_norm": 3.5937342311261893, "learning_rate": 7.108806431662577e-07, "loss": 0.3115, "step": 2101 }, { "epoch": 0.38079710144927537, "grad_norm": 5.733302048934132, "learning_rate": 7.106145904638642e-07, "loss": 0.3015, "step": 2102 }, { "epoch": 0.3809782608695652, "grad_norm": 3.275610822661634, "learning_rate": 7.103484652458693e-07, "loss": 0.3269, "step": 2103 }, { "epoch": 0.38115942028985506, "grad_norm": 3.4731681017381923, "learning_rate": 7.100822676039013e-07, "loss": 0.3345, "step": 2104 }, { "epoch": 0.3813405797101449, "grad_norm": 9.474435185831744, "learning_rate": 7.09815997629613e-07, "loss": 0.2991, "step": 2105 }, { "epoch": 0.3815217391304348, "grad_norm": 4.873748219443486, "learning_rate": 7.095496554146827e-07, "loss": 0.2983, "step": 2106 }, { "epoch": 0.38170289855072465, "grad_norm": 3.5377240835984316, "learning_rate": 7.092832410508132e-07, "loss": 0.3148, "step": 2107 }, { "epoch": 0.3818840579710145, "grad_norm": 4.693249675105682, "learning_rate": 7.090167546297321e-07, "loss": 0.3024, "step": 2108 }, { "epoch": 0.38206521739130433, "grad_norm": 3.6605558938989775, "learning_rate": 7.087501962431921e-07, "loss": 0.2876, "step": 2109 }, { "epoch": 0.3822463768115942, "grad_norm": 4.596755121653797, "learning_rate": 7.084835659829705e-07, "loss": 0.3019, "step": 2110 }, { "epoch": 0.3824275362318841, "grad_norm": 3.686897769046088, "learning_rate": 7.082168639408691e-07, "loss": 0.2493, "step": 2111 }, { "epoch": 0.3826086956521739, "grad_norm": 4.072969934127878, "learning_rate": 7.079500902087152e-07, "loss": 0.3324, "step": 2112 }, { "epoch": 0.38278985507246377, "grad_norm": 6.399363765959017, "learning_rate": 7.076832448783596e-07, "loss": 0.3882, "step": 2113 }, { "epoch": 0.3829710144927536, "grad_norm": 3.4042653262831815, "learning_rate": 7.07416328041679e-07, "loss": 0.2881, "step": 2114 }, { "epoch": 0.38315217391304346, "grad_norm": 4.078562472526981, "learning_rate": 7.071493397905739e-07, "loss": 0.2971, "step": 2115 }, { "epoch": 0.38333333333333336, "grad_norm": 4.152659854568918, "learning_rate": 7.068822802169696e-07, "loss": 0.2765, "step": 2116 }, { "epoch": 0.3835144927536232, "grad_norm": 5.370460060434913, "learning_rate": 7.066151494128158e-07, "loss": 0.3091, "step": 2117 }, { "epoch": 0.38369565217391305, "grad_norm": 4.093914646720985, "learning_rate": 7.063479474700875e-07, "loss": 0.3264, "step": 2118 }, { "epoch": 0.3838768115942029, "grad_norm": 5.681256537201822, "learning_rate": 7.060806744807828e-07, "loss": 0.2866, "step": 2119 }, { "epoch": 0.38405797101449274, "grad_norm": 6.630432321189139, "learning_rate": 7.058133305369256e-07, "loss": 0.3312, "step": 2120 }, { "epoch": 0.3842391304347826, "grad_norm": 3.9654397095708735, "learning_rate": 7.055459157305637e-07, "loss": 0.344, "step": 2121 }, { "epoch": 0.3844202898550725, "grad_norm": 11.058387699729824, "learning_rate": 7.052784301537688e-07, "loss": 0.3214, "step": 2122 }, { "epoch": 0.38460144927536233, "grad_norm": 9.341950315149177, "learning_rate": 7.05010873898638e-07, "loss": 0.3778, "step": 2123 }, { "epoch": 0.3847826086956522, "grad_norm": 10.937001758481303, "learning_rate": 7.047432470572918e-07, "loss": 0.3831, "step": 2124 }, { "epoch": 0.384963768115942, "grad_norm": 4.436601013218096, "learning_rate": 7.044755497218756e-07, "loss": 0.2963, "step": 2125 }, { "epoch": 0.38514492753623186, "grad_norm": 3.4628776872665328, "learning_rate": 7.042077819845588e-07, "loss": 0.2726, "step": 2126 }, { "epoch": 0.38532608695652176, "grad_norm": 10.3383413703353, "learning_rate": 7.039399439375352e-07, "loss": 0.3159, "step": 2127 }, { "epoch": 0.3855072463768116, "grad_norm": 5.6294194173317, "learning_rate": 7.036720356730225e-07, "loss": 0.3329, "step": 2128 }, { "epoch": 0.38568840579710145, "grad_norm": 2.9282256130093156, "learning_rate": 7.03404057283263e-07, "loss": 0.2479, "step": 2129 }, { "epoch": 0.3858695652173913, "grad_norm": 4.029948418252385, "learning_rate": 7.031360088605227e-07, "loss": 0.3576, "step": 2130 }, { "epoch": 0.38605072463768114, "grad_norm": 4.766907554519569, "learning_rate": 7.028678904970923e-07, "loss": 0.4047, "step": 2131 }, { "epoch": 0.38623188405797104, "grad_norm": 7.871698129751128, "learning_rate": 7.025997022852856e-07, "loss": 0.3535, "step": 2132 }, { "epoch": 0.3864130434782609, "grad_norm": 4.269659727960499, "learning_rate": 7.023314443174418e-07, "loss": 0.2599, "step": 2133 }, { "epoch": 0.38659420289855073, "grad_norm": 4.980339697472714, "learning_rate": 7.02063116685923e-07, "loss": 0.3186, "step": 2134 }, { "epoch": 0.3867753623188406, "grad_norm": 4.826318513402168, "learning_rate": 7.017947194831156e-07, "loss": 0.3271, "step": 2135 }, { "epoch": 0.3869565217391304, "grad_norm": 8.225636588927411, "learning_rate": 7.015262528014303e-07, "loss": 0.3381, "step": 2136 }, { "epoch": 0.38713768115942027, "grad_norm": 4.149261097093593, "learning_rate": 7.012577167333013e-07, "loss": 0.3179, "step": 2137 }, { "epoch": 0.38731884057971017, "grad_norm": 4.762094949943608, "learning_rate": 7.009891113711868e-07, "loss": 0.343, "step": 2138 }, { "epoch": 0.3875, "grad_norm": 10.001299244383034, "learning_rate": 7.00720436807569e-07, "loss": 0.3435, "step": 2139 }, { "epoch": 0.38768115942028986, "grad_norm": 6.122362279332189, "learning_rate": 7.004516931349535e-07, "loss": 0.3444, "step": 2140 }, { "epoch": 0.3878623188405797, "grad_norm": 5.673363310153358, "learning_rate": 7.001828804458707e-07, "loss": 0.2896, "step": 2141 }, { "epoch": 0.38804347826086955, "grad_norm": 4.365146468695218, "learning_rate": 6.999139988328735e-07, "loss": 0.2904, "step": 2142 }, { "epoch": 0.38822463768115945, "grad_norm": 4.952228499391306, "learning_rate": 6.996450483885392e-07, "loss": 0.3213, "step": 2143 }, { "epoch": 0.3884057971014493, "grad_norm": 4.283859416825511, "learning_rate": 6.993760292054689e-07, "loss": 0.3162, "step": 2144 }, { "epoch": 0.38858695652173914, "grad_norm": 5.565955989672552, "learning_rate": 6.991069413762871e-07, "loss": 0.2863, "step": 2145 }, { "epoch": 0.388768115942029, "grad_norm": 5.840585925510722, "learning_rate": 6.988377849936419e-07, "loss": 0.3013, "step": 2146 }, { "epoch": 0.3889492753623188, "grad_norm": 2.985844384205347, "learning_rate": 6.985685601502054e-07, "loss": 0.234, "step": 2147 }, { "epoch": 0.38913043478260867, "grad_norm": 7.52349361444468, "learning_rate": 6.982992669386726e-07, "loss": 0.3636, "step": 2148 }, { "epoch": 0.38931159420289857, "grad_norm": 8.179994075387192, "learning_rate": 6.980299054517627e-07, "loss": 0.3079, "step": 2149 }, { "epoch": 0.3894927536231884, "grad_norm": 7.3953456035424745, "learning_rate": 6.977604757822181e-07, "loss": 0.3696, "step": 2150 }, { "epoch": 0.38967391304347826, "grad_norm": 10.231994205623613, "learning_rate": 6.974909780228046e-07, "loss": 0.278, "step": 2151 }, { "epoch": 0.3898550724637681, "grad_norm": 3.6122227075145936, "learning_rate": 6.972214122663117e-07, "loss": 0.3023, "step": 2152 }, { "epoch": 0.39003623188405795, "grad_norm": 4.98461583138587, "learning_rate": 6.969517786055522e-07, "loss": 0.3162, "step": 2153 }, { "epoch": 0.39021739130434785, "grad_norm": 4.528627814814437, "learning_rate": 6.966820771333619e-07, "loss": 0.3128, "step": 2154 }, { "epoch": 0.3903985507246377, "grad_norm": 6.153493328734538, "learning_rate": 6.964123079426008e-07, "loss": 0.3225, "step": 2155 }, { "epoch": 0.39057971014492754, "grad_norm": 6.464676154023411, "learning_rate": 6.961424711261514e-07, "loss": 0.3465, "step": 2156 }, { "epoch": 0.3907608695652174, "grad_norm": 7.425681674557852, "learning_rate": 6.958725667769197e-07, "loss": 0.346, "step": 2157 }, { "epoch": 0.39094202898550723, "grad_norm": 3.821427599782369, "learning_rate": 6.956025949878353e-07, "loss": 0.2852, "step": 2158 }, { "epoch": 0.39112318840579713, "grad_norm": 9.394635561319111, "learning_rate": 6.953325558518507e-07, "loss": 0.2609, "step": 2159 }, { "epoch": 0.391304347826087, "grad_norm": 4.747575940249641, "learning_rate": 6.950624494619415e-07, "loss": 0.2659, "step": 2160 }, { "epoch": 0.3914855072463768, "grad_norm": 7.480201764047443, "learning_rate": 6.947922759111069e-07, "loss": 0.3117, "step": 2161 }, { "epoch": 0.39166666666666666, "grad_norm": 7.366875731216558, "learning_rate": 6.945220352923685e-07, "loss": 0.3016, "step": 2162 }, { "epoch": 0.3918478260869565, "grad_norm": 9.656088497588994, "learning_rate": 6.942517276987719e-07, "loss": 0.2977, "step": 2163 }, { "epoch": 0.39202898550724635, "grad_norm": 5.201203908319206, "learning_rate": 6.939813532233849e-07, "loss": 0.2704, "step": 2164 }, { "epoch": 0.39221014492753625, "grad_norm": 3.9137337075989955, "learning_rate": 6.93710911959299e-07, "loss": 0.2855, "step": 2165 }, { "epoch": 0.3923913043478261, "grad_norm": 3.540516257074366, "learning_rate": 6.934404039996283e-07, "loss": 0.2849, "step": 2166 }, { "epoch": 0.39257246376811594, "grad_norm": 4.534143848571846, "learning_rate": 6.931698294375099e-07, "loss": 0.2901, "step": 2167 }, { "epoch": 0.3927536231884058, "grad_norm": 9.226494214506207, "learning_rate": 6.928991883661039e-07, "loss": 0.3448, "step": 2168 }, { "epoch": 0.39293478260869563, "grad_norm": 5.088294842948125, "learning_rate": 6.926284808785936e-07, "loss": 0.348, "step": 2169 }, { "epoch": 0.39311594202898553, "grad_norm": 3.7941952287354437, "learning_rate": 6.923577070681845e-07, "loss": 0.2458, "step": 2170 }, { "epoch": 0.3932971014492754, "grad_norm": 4.128119822105309, "learning_rate": 6.920868670281055e-07, "loss": 0.317, "step": 2171 }, { "epoch": 0.3934782608695652, "grad_norm": 8.80245311259596, "learning_rate": 6.91815960851608e-07, "loss": 0.2327, "step": 2172 }, { "epoch": 0.39365942028985507, "grad_norm": 6.295775077737116, "learning_rate": 6.915449886319663e-07, "loss": 0.3115, "step": 2173 }, { "epoch": 0.3938405797101449, "grad_norm": 5.805682623689149, "learning_rate": 6.912739504624776e-07, "loss": 0.3759, "step": 2174 }, { "epoch": 0.39402173913043476, "grad_norm": 7.582295689095818, "learning_rate": 6.910028464364612e-07, "loss": 0.3541, "step": 2175 }, { "epoch": 0.39420289855072466, "grad_norm": 4.403919544543604, "learning_rate": 6.9073167664726e-07, "loss": 0.3095, "step": 2176 }, { "epoch": 0.3943840579710145, "grad_norm": 4.216083694991703, "learning_rate": 6.904604411882388e-07, "loss": 0.3014, "step": 2177 }, { "epoch": 0.39456521739130435, "grad_norm": 4.176160918755515, "learning_rate": 6.901891401527854e-07, "loss": 0.3115, "step": 2178 }, { "epoch": 0.3947463768115942, "grad_norm": 4.3890765587182665, "learning_rate": 6.899177736343098e-07, "loss": 0.2708, "step": 2179 }, { "epoch": 0.39492753623188404, "grad_norm": 4.103807331881485, "learning_rate": 6.896463417262448e-07, "loss": 0.2887, "step": 2180 }, { "epoch": 0.39510869565217394, "grad_norm": 4.867160521532571, "learning_rate": 6.893748445220457e-07, "loss": 0.3401, "step": 2181 }, { "epoch": 0.3952898550724638, "grad_norm": 10.806779871556616, "learning_rate": 6.891032821151907e-07, "loss": 0.314, "step": 2182 }, { "epoch": 0.3954710144927536, "grad_norm": 4.679841284115658, "learning_rate": 6.888316545991795e-07, "loss": 0.2952, "step": 2183 }, { "epoch": 0.39565217391304347, "grad_norm": 7.174450230651045, "learning_rate": 6.885599620675349e-07, "loss": 0.3337, "step": 2184 }, { "epoch": 0.3958333333333333, "grad_norm": 6.805732276950488, "learning_rate": 6.882882046138019e-07, "loss": 0.3029, "step": 2185 }, { "epoch": 0.3960144927536232, "grad_norm": 6.916376239794475, "learning_rate": 6.88016382331548e-07, "loss": 0.2755, "step": 2186 }, { "epoch": 0.39619565217391306, "grad_norm": 3.1640859016069918, "learning_rate": 6.877444953143628e-07, "loss": 0.2586, "step": 2187 }, { "epoch": 0.3963768115942029, "grad_norm": 5.337632469953887, "learning_rate": 6.874725436558583e-07, "loss": 0.3503, "step": 2188 }, { "epoch": 0.39655797101449275, "grad_norm": 3.7614417732880985, "learning_rate": 6.872005274496686e-07, "loss": 0.2927, "step": 2189 }, { "epoch": 0.3967391304347826, "grad_norm": 5.941380756939097, "learning_rate": 6.869284467894506e-07, "loss": 0.3042, "step": 2190 }, { "epoch": 0.39692028985507244, "grad_norm": 7.354207197148562, "learning_rate": 6.866563017688825e-07, "loss": 0.2971, "step": 2191 }, { "epoch": 0.39710144927536234, "grad_norm": 4.92479046090298, "learning_rate": 6.863840924816654e-07, "loss": 0.2868, "step": 2192 }, { "epoch": 0.3972826086956522, "grad_norm": 4.140477044611321, "learning_rate": 6.861118190215221e-07, "loss": 0.2681, "step": 2193 }, { "epoch": 0.39746376811594203, "grad_norm": 9.608005362186999, "learning_rate": 6.858394814821979e-07, "loss": 0.3176, "step": 2194 }, { "epoch": 0.3976449275362319, "grad_norm": 3.5331177111360743, "learning_rate": 6.855670799574593e-07, "loss": 0.3329, "step": 2195 }, { "epoch": 0.3978260869565217, "grad_norm": 11.36791909861226, "learning_rate": 6.852946145410963e-07, "loss": 0.2798, "step": 2196 }, { "epoch": 0.3980072463768116, "grad_norm": 5.905235439283369, "learning_rate": 6.850220853269192e-07, "loss": 0.2823, "step": 2197 }, { "epoch": 0.39818840579710146, "grad_norm": 6.802293435441138, "learning_rate": 6.847494924087615e-07, "loss": 0.2319, "step": 2198 }, { "epoch": 0.3983695652173913, "grad_norm": 3.4871520154116094, "learning_rate": 6.844768358804784e-07, "loss": 0.2877, "step": 2199 }, { "epoch": 0.39855072463768115, "grad_norm": 6.606776141129715, "learning_rate": 6.842041158359465e-07, "loss": 0.2839, "step": 2200 }, { "epoch": 0.39855072463768115, "eval_loss": 0.31165623664855957, "eval_runtime": 9.7812, "eval_samples_per_second": 51.119, "eval_steps_per_second": 0.102, "step": 2200 }, { "epoch": 0.398731884057971, "grad_norm": 4.004580365436502, "learning_rate": 6.839313323690647e-07, "loss": 0.3235, "step": 2201 }, { "epoch": 0.39891304347826084, "grad_norm": 9.017065743390921, "learning_rate": 6.836584855737537e-07, "loss": 0.3066, "step": 2202 }, { "epoch": 0.39909420289855074, "grad_norm": 5.334253719988122, "learning_rate": 6.833855755439561e-07, "loss": 0.2645, "step": 2203 }, { "epoch": 0.3992753623188406, "grad_norm": 8.777738221400382, "learning_rate": 6.831126023736358e-07, "loss": 0.2669, "step": 2204 }, { "epoch": 0.39945652173913043, "grad_norm": 3.8197175856336165, "learning_rate": 6.828395661567789e-07, "loss": 0.2713, "step": 2205 }, { "epoch": 0.3996376811594203, "grad_norm": 4.48874189922189, "learning_rate": 6.825664669873931e-07, "loss": 0.3084, "step": 2206 }, { "epoch": 0.3998188405797101, "grad_norm": 5.080293220585723, "learning_rate": 6.822933049595077e-07, "loss": 0.36, "step": 2207 }, { "epoch": 0.4, "grad_norm": 4.696484334787702, "learning_rate": 6.820200801671736e-07, "loss": 0.2526, "step": 2208 }, { "epoch": 0.40018115942028987, "grad_norm": 3.5370597557875625, "learning_rate": 6.817467927044635e-07, "loss": 0.2644, "step": 2209 }, { "epoch": 0.4003623188405797, "grad_norm": 4.425550127312261, "learning_rate": 6.814734426654716e-07, "loss": 0.3272, "step": 2210 }, { "epoch": 0.40054347826086956, "grad_norm": 5.760714598972042, "learning_rate": 6.812000301443135e-07, "loss": 0.2659, "step": 2211 }, { "epoch": 0.4007246376811594, "grad_norm": 7.58433814276553, "learning_rate": 6.809265552351264e-07, "loss": 0.3356, "step": 2212 }, { "epoch": 0.4009057971014493, "grad_norm": 4.332188856177377, "learning_rate": 6.806530180320693e-07, "loss": 0.2903, "step": 2213 }, { "epoch": 0.40108695652173915, "grad_norm": 7.1163274548067275, "learning_rate": 6.80379418629322e-07, "loss": 0.3761, "step": 2214 }, { "epoch": 0.401268115942029, "grad_norm": 4.49422129917352, "learning_rate": 6.801057571210862e-07, "loss": 0.3243, "step": 2215 }, { "epoch": 0.40144927536231884, "grad_norm": 4.773996975382843, "learning_rate": 6.798320336015848e-07, "loss": 0.3078, "step": 2216 }, { "epoch": 0.4016304347826087, "grad_norm": 6.104874246248235, "learning_rate": 6.795582481650623e-07, "loss": 0.3101, "step": 2217 }, { "epoch": 0.4018115942028985, "grad_norm": 3.962353351560333, "learning_rate": 6.792844009057842e-07, "loss": 0.2927, "step": 2218 }, { "epoch": 0.4019927536231884, "grad_norm": 4.43873212167092, "learning_rate": 6.790104919180373e-07, "loss": 0.2672, "step": 2219 }, { "epoch": 0.40217391304347827, "grad_norm": 8.851462258056083, "learning_rate": 6.7873652129613e-07, "loss": 0.3768, "step": 2220 }, { "epoch": 0.4023550724637681, "grad_norm": 4.13758685057565, "learning_rate": 6.784624891343914e-07, "loss": 0.331, "step": 2221 }, { "epoch": 0.40253623188405796, "grad_norm": 4.07969786302646, "learning_rate": 6.781883955271722e-07, "loss": 0.3206, "step": 2222 }, { "epoch": 0.4027173913043478, "grad_norm": 3.4425626225475225, "learning_rate": 6.779142405688443e-07, "loss": 0.3099, "step": 2223 }, { "epoch": 0.4028985507246377, "grad_norm": 4.909572420801204, "learning_rate": 6.776400243538003e-07, "loss": 0.2478, "step": 2224 }, { "epoch": 0.40307971014492755, "grad_norm": 5.291819558569912, "learning_rate": 6.773657469764542e-07, "loss": 0.3564, "step": 2225 }, { "epoch": 0.4032608695652174, "grad_norm": 4.373416755523693, "learning_rate": 6.770914085312412e-07, "loss": 0.293, "step": 2226 }, { "epoch": 0.40344202898550724, "grad_norm": 4.09980830065468, "learning_rate": 6.76817009112617e-07, "loss": 0.2836, "step": 2227 }, { "epoch": 0.4036231884057971, "grad_norm": 6.344705788752141, "learning_rate": 6.765425488150589e-07, "loss": 0.3275, "step": 2228 }, { "epoch": 0.40380434782608693, "grad_norm": 4.770346520763357, "learning_rate": 6.762680277330648e-07, "loss": 0.3109, "step": 2229 }, { "epoch": 0.40398550724637683, "grad_norm": 8.881697922793602, "learning_rate": 6.759934459611534e-07, "loss": 0.3075, "step": 2230 }, { "epoch": 0.4041666666666667, "grad_norm": 4.1903094693543785, "learning_rate": 6.757188035938648e-07, "loss": 0.2944, "step": 2231 }, { "epoch": 0.4043478260869565, "grad_norm": 4.356995256903603, "learning_rate": 6.754441007257594e-07, "loss": 0.3127, "step": 2232 }, { "epoch": 0.40452898550724636, "grad_norm": 6.098313538479057, "learning_rate": 6.751693374514192e-07, "loss": 0.3044, "step": 2233 }, { "epoch": 0.4047101449275362, "grad_norm": 4.91282076148066, "learning_rate": 6.748945138654458e-07, "loss": 0.3127, "step": 2234 }, { "epoch": 0.4048913043478261, "grad_norm": 8.409449416572015, "learning_rate": 6.746196300624627e-07, "loss": 0.3011, "step": 2235 }, { "epoch": 0.40507246376811595, "grad_norm": 3.5367485660500972, "learning_rate": 6.743446861371137e-07, "loss": 0.3378, "step": 2236 }, { "epoch": 0.4052536231884058, "grad_norm": 3.5167666854013504, "learning_rate": 6.74069682184063e-07, "loss": 0.2645, "step": 2237 }, { "epoch": 0.40543478260869564, "grad_norm": 4.661991601268904, "learning_rate": 6.737946182979961e-07, "loss": 0.403, "step": 2238 }, { "epoch": 0.4056159420289855, "grad_norm": 4.491830335469685, "learning_rate": 6.735194945736186e-07, "loss": 0.3082, "step": 2239 }, { "epoch": 0.4057971014492754, "grad_norm": 4.013795874324127, "learning_rate": 6.732443111056572e-07, "loss": 0.2737, "step": 2240 }, { "epoch": 0.40597826086956523, "grad_norm": 7.642900286388456, "learning_rate": 6.729690679888584e-07, "loss": 0.3472, "step": 2241 }, { "epoch": 0.4061594202898551, "grad_norm": 4.434087916359773, "learning_rate": 6.7269376531799e-07, "loss": 0.2744, "step": 2242 }, { "epoch": 0.4063405797101449, "grad_norm": 5.641102829094799, "learning_rate": 6.724184031878399e-07, "loss": 0.3148, "step": 2243 }, { "epoch": 0.40652173913043477, "grad_norm": 3.441437848483509, "learning_rate": 6.721429816932169e-07, "loss": 0.2956, "step": 2244 }, { "epoch": 0.4067028985507246, "grad_norm": 8.894075312929761, "learning_rate": 6.718675009289494e-07, "loss": 0.3626, "step": 2245 }, { "epoch": 0.4068840579710145, "grad_norm": 5.939315500932609, "learning_rate": 6.71591960989887e-07, "loss": 0.3075, "step": 2246 }, { "epoch": 0.40706521739130436, "grad_norm": 6.991336955854468, "learning_rate": 6.713163619708995e-07, "loss": 0.3081, "step": 2247 }, { "epoch": 0.4072463768115942, "grad_norm": 6.329886474887556, "learning_rate": 6.71040703966877e-07, "loss": 0.2896, "step": 2248 }, { "epoch": 0.40742753623188405, "grad_norm": 5.583220967094046, "learning_rate": 6.707649870727296e-07, "loss": 0.3637, "step": 2249 }, { "epoch": 0.4076086956521739, "grad_norm": 7.386632320792021, "learning_rate": 6.70489211383388e-07, "loss": 0.2961, "step": 2250 }, { "epoch": 0.4077898550724638, "grad_norm": 3.375989649154391, "learning_rate": 6.702133769938031e-07, "loss": 0.3148, "step": 2251 }, { "epoch": 0.40797101449275364, "grad_norm": 8.362479882284925, "learning_rate": 6.699374839989462e-07, "loss": 0.2897, "step": 2252 }, { "epoch": 0.4081521739130435, "grad_norm": 5.169666061134393, "learning_rate": 6.696615324938082e-07, "loss": 0.321, "step": 2253 }, { "epoch": 0.4083333333333333, "grad_norm": 5.569690016903751, "learning_rate": 6.693855225734006e-07, "loss": 0.3489, "step": 2254 }, { "epoch": 0.40851449275362317, "grad_norm": 4.426887493077836, "learning_rate": 6.691094543327553e-07, "loss": 0.3335, "step": 2255 }, { "epoch": 0.40869565217391307, "grad_norm": 7.528814173632632, "learning_rate": 6.688333278669233e-07, "loss": 0.2668, "step": 2256 }, { "epoch": 0.4088768115942029, "grad_norm": 8.447861692222983, "learning_rate": 6.685571432709768e-07, "loss": 0.3514, "step": 2257 }, { "epoch": 0.40905797101449276, "grad_norm": 5.551127199752848, "learning_rate": 6.682809006400073e-07, "loss": 0.2867, "step": 2258 }, { "epoch": 0.4092391304347826, "grad_norm": 6.078553925398269, "learning_rate": 6.680046000691262e-07, "loss": 0.2964, "step": 2259 }, { "epoch": 0.40942028985507245, "grad_norm": 4.200727215066275, "learning_rate": 6.677282416534653e-07, "loss": 0.2908, "step": 2260 }, { "epoch": 0.4096014492753623, "grad_norm": 5.614888855289529, "learning_rate": 6.67451825488176e-07, "loss": 0.2991, "step": 2261 }, { "epoch": 0.4097826086956522, "grad_norm": 3.685666096999919, "learning_rate": 6.6717535166843e-07, "loss": 0.3232, "step": 2262 }, { "epoch": 0.40996376811594204, "grad_norm": 3.8855236972948353, "learning_rate": 6.668988202894181e-07, "loss": 0.3747, "step": 2263 }, { "epoch": 0.4101449275362319, "grad_norm": 9.250525232590523, "learning_rate": 6.666222314463518e-07, "loss": 0.3065, "step": 2264 }, { "epoch": 0.41032608695652173, "grad_norm": 7.635126726567272, "learning_rate": 6.663455852344615e-07, "loss": 0.2876, "step": 2265 }, { "epoch": 0.4105072463768116, "grad_norm": 5.114922133890042, "learning_rate": 6.660688817489984e-07, "loss": 0.3087, "step": 2266 }, { "epoch": 0.4106884057971015, "grad_norm": 8.285470410124132, "learning_rate": 6.657921210852321e-07, "loss": 0.2542, "step": 2267 }, { "epoch": 0.4108695652173913, "grad_norm": 4.170803308744904, "learning_rate": 6.655153033384531e-07, "loss": 0.366, "step": 2268 }, { "epoch": 0.41105072463768116, "grad_norm": 4.8761931590322565, "learning_rate": 6.65238428603971e-07, "loss": 0.3199, "step": 2269 }, { "epoch": 0.411231884057971, "grad_norm": 4.143135332840739, "learning_rate": 6.64961496977115e-07, "loss": 0.3059, "step": 2270 }, { "epoch": 0.41141304347826085, "grad_norm": 3.237703349654009, "learning_rate": 6.646845085532339e-07, "loss": 0.2816, "step": 2271 }, { "epoch": 0.4115942028985507, "grad_norm": 8.299850469933856, "learning_rate": 6.644074634276963e-07, "loss": 0.3042, "step": 2272 }, { "epoch": 0.4117753623188406, "grad_norm": 3.2974559364063927, "learning_rate": 6.6413036169589e-07, "loss": 0.3103, "step": 2273 }, { "epoch": 0.41195652173913044, "grad_norm": 3.76438256733284, "learning_rate": 6.638532034532225e-07, "loss": 0.3385, "step": 2274 }, { "epoch": 0.4121376811594203, "grad_norm": 5.685197955516119, "learning_rate": 6.635759887951208e-07, "loss": 0.3176, "step": 2275 }, { "epoch": 0.41231884057971013, "grad_norm": 5.367368509701561, "learning_rate": 6.63298717817031e-07, "loss": 0.3161, "step": 2276 }, { "epoch": 0.4125, "grad_norm": 8.23828116247501, "learning_rate": 6.630213906144191e-07, "loss": 0.2477, "step": 2277 }, { "epoch": 0.4126811594202899, "grad_norm": 4.245331880732722, "learning_rate": 6.627440072827697e-07, "loss": 0.2964, "step": 2278 }, { "epoch": 0.4128623188405797, "grad_norm": 5.027613811697914, "learning_rate": 6.624665679175878e-07, "loss": 0.3002, "step": 2279 }, { "epoch": 0.41304347826086957, "grad_norm": 5.254985319598645, "learning_rate": 6.621890726143966e-07, "loss": 0.3574, "step": 2280 }, { "epoch": 0.4132246376811594, "grad_norm": 6.872207546488963, "learning_rate": 6.619115214687393e-07, "loss": 0.2712, "step": 2281 }, { "epoch": 0.41340579710144926, "grad_norm": 3.2641773382908377, "learning_rate": 6.61633914576178e-07, "loss": 0.2719, "step": 2282 }, { "epoch": 0.41358695652173916, "grad_norm": 6.1828766544893785, "learning_rate": 6.61356252032294e-07, "loss": 0.304, "step": 2283 }, { "epoch": 0.413768115942029, "grad_norm": 4.321040540507069, "learning_rate": 6.610785339326881e-07, "loss": 0.2975, "step": 2284 }, { "epoch": 0.41394927536231885, "grad_norm": 7.069091130381335, "learning_rate": 6.608007603729796e-07, "loss": 0.3124, "step": 2285 }, { "epoch": 0.4141304347826087, "grad_norm": 5.060448386586392, "learning_rate": 6.605229314488073e-07, "loss": 0.2866, "step": 2286 }, { "epoch": 0.41431159420289854, "grad_norm": 6.203290330828651, "learning_rate": 6.602450472558294e-07, "loss": 0.2976, "step": 2287 }, { "epoch": 0.4144927536231884, "grad_norm": 3.50982587550599, "learning_rate": 6.599671078897223e-07, "loss": 0.3137, "step": 2288 }, { "epoch": 0.4146739130434783, "grad_norm": 3.5430731875103385, "learning_rate": 6.59689113446182e-07, "loss": 0.2619, "step": 2289 }, { "epoch": 0.4148550724637681, "grad_norm": 4.771418239074733, "learning_rate": 6.594110640209235e-07, "loss": 0.3391, "step": 2290 }, { "epoch": 0.41503623188405797, "grad_norm": 5.825738070816933, "learning_rate": 6.591329597096802e-07, "loss": 0.364, "step": 2291 }, { "epoch": 0.4152173913043478, "grad_norm": 6.74872906751537, "learning_rate": 6.588548006082049e-07, "loss": 0.3171, "step": 2292 }, { "epoch": 0.41539855072463766, "grad_norm": 4.878259741149326, "learning_rate": 6.585765868122691e-07, "loss": 0.3227, "step": 2293 }, { "epoch": 0.41557971014492756, "grad_norm": 6.212747632634399, "learning_rate": 6.58298318417663e-07, "loss": 0.2941, "step": 2294 }, { "epoch": 0.4157608695652174, "grad_norm": 14.253826190308022, "learning_rate": 6.580199955201961e-07, "loss": 0.3592, "step": 2295 }, { "epoch": 0.41594202898550725, "grad_norm": 5.10730921894822, "learning_rate": 6.577416182156958e-07, "loss": 0.2499, "step": 2296 }, { "epoch": 0.4161231884057971, "grad_norm": 16.19235589648736, "learning_rate": 6.574631866000089e-07, "loss": 0.382, "step": 2297 }, { "epoch": 0.41630434782608694, "grad_norm": 9.27566753712969, "learning_rate": 6.57184700769001e-07, "loss": 0.2907, "step": 2298 }, { "epoch": 0.4164855072463768, "grad_norm": 3.6538757269226343, "learning_rate": 6.569061608185557e-07, "loss": 0.2971, "step": 2299 }, { "epoch": 0.4166666666666667, "grad_norm": 4.554385376545483, "learning_rate": 6.566275668445758e-07, "loss": 0.283, "step": 2300 }, { "epoch": 0.4166666666666667, "eval_loss": 0.3081875145435333, "eval_runtime": 9.7833, "eval_samples_per_second": 51.108, "eval_steps_per_second": 0.102, "step": 2300 }, { "epoch": 0.41684782608695653, "grad_norm": 5.252070927566908, "learning_rate": 6.563489189429828e-07, "loss": 0.3335, "step": 2301 }, { "epoch": 0.4170289855072464, "grad_norm": 6.5788475189753886, "learning_rate": 6.560702172097158e-07, "loss": 0.2588, "step": 2302 }, { "epoch": 0.4172101449275362, "grad_norm": 4.44138324373938, "learning_rate": 6.557914617407339e-07, "loss": 0.3036, "step": 2303 }, { "epoch": 0.41739130434782606, "grad_norm": 8.79844273409732, "learning_rate": 6.555126526320134e-07, "loss": 0.3117, "step": 2304 }, { "epoch": 0.41757246376811596, "grad_norm": 9.321045294902802, "learning_rate": 6.552337899795497e-07, "loss": 0.293, "step": 2305 }, { "epoch": 0.4177536231884058, "grad_norm": 6.777456768486296, "learning_rate": 6.549548738793566e-07, "loss": 0.3115, "step": 2306 }, { "epoch": 0.41793478260869565, "grad_norm": 5.490405975247144, "learning_rate": 6.546759044274663e-07, "loss": 0.2958, "step": 2307 }, { "epoch": 0.4181159420289855, "grad_norm": 6.1963416416821735, "learning_rate": 6.543968817199292e-07, "loss": 0.3203, "step": 2308 }, { "epoch": 0.41829710144927534, "grad_norm": 8.25763006972259, "learning_rate": 6.541178058528143e-07, "loss": 0.2866, "step": 2309 }, { "epoch": 0.41847826086956524, "grad_norm": 7.2752239577071895, "learning_rate": 6.538386769222085e-07, "loss": 0.3369, "step": 2310 }, { "epoch": 0.4186594202898551, "grad_norm": 6.863002245502321, "learning_rate": 6.535594950242174e-07, "loss": 0.3251, "step": 2311 }, { "epoch": 0.41884057971014493, "grad_norm": 4.411816407905571, "learning_rate": 6.532802602549646e-07, "loss": 0.3568, "step": 2312 }, { "epoch": 0.4190217391304348, "grad_norm": 5.093038580306019, "learning_rate": 6.530009727105916e-07, "loss": 0.3051, "step": 2313 }, { "epoch": 0.4192028985507246, "grad_norm": 11.348804991938719, "learning_rate": 6.527216324872592e-07, "loss": 0.3459, "step": 2314 }, { "epoch": 0.41938405797101447, "grad_norm": 10.843852583075352, "learning_rate": 6.524422396811448e-07, "loss": 0.2824, "step": 2315 }, { "epoch": 0.41956521739130437, "grad_norm": 9.116085155890177, "learning_rate": 6.521627943884452e-07, "loss": 0.3583, "step": 2316 }, { "epoch": 0.4197463768115942, "grad_norm": 8.427956674746454, "learning_rate": 6.518832967053746e-07, "loss": 0.2868, "step": 2317 }, { "epoch": 0.41992753623188406, "grad_norm": 5.639861262365254, "learning_rate": 6.516037467281652e-07, "loss": 0.2633, "step": 2318 }, { "epoch": 0.4201086956521739, "grad_norm": 5.14891423125128, "learning_rate": 6.513241445530676e-07, "loss": 0.271, "step": 2319 }, { "epoch": 0.42028985507246375, "grad_norm": 4.579124761321766, "learning_rate": 6.510444902763498e-07, "loss": 0.2664, "step": 2320 }, { "epoch": 0.42047101449275365, "grad_norm": 4.534131870437726, "learning_rate": 6.507647839942983e-07, "loss": 0.2937, "step": 2321 }, { "epoch": 0.4206521739130435, "grad_norm": 3.40644285543595, "learning_rate": 6.504850258032176e-07, "loss": 0.2928, "step": 2322 }, { "epoch": 0.42083333333333334, "grad_norm": 3.1704861579416894, "learning_rate": 6.502052157994294e-07, "loss": 0.2768, "step": 2323 }, { "epoch": 0.4210144927536232, "grad_norm": 7.494768894456705, "learning_rate": 6.499253540792736e-07, "loss": 0.2935, "step": 2324 }, { "epoch": 0.421195652173913, "grad_norm": 6.9556993549617445, "learning_rate": 6.496454407391082e-07, "loss": 0.2941, "step": 2325 }, { "epoch": 0.42137681159420287, "grad_norm": 4.136855338022511, "learning_rate": 6.493654758753084e-07, "loss": 0.3093, "step": 2326 }, { "epoch": 0.42155797101449277, "grad_norm": 9.212047863217336, "learning_rate": 6.490854595842675e-07, "loss": 0.33, "step": 2327 }, { "epoch": 0.4217391304347826, "grad_norm": 7.667652790684813, "learning_rate": 6.488053919623968e-07, "loss": 0.3428, "step": 2328 }, { "epoch": 0.42192028985507246, "grad_norm": 3.5609158476656435, "learning_rate": 6.485252731061242e-07, "loss": 0.3044, "step": 2329 }, { "epoch": 0.4221014492753623, "grad_norm": 5.096558513035923, "learning_rate": 6.482451031118965e-07, "loss": 0.2994, "step": 2330 }, { "epoch": 0.42228260869565215, "grad_norm": 9.849021584451869, "learning_rate": 6.479648820761776e-07, "loss": 0.2762, "step": 2331 }, { "epoch": 0.42246376811594205, "grad_norm": 4.593447913335792, "learning_rate": 6.476846100954484e-07, "loss": 0.3464, "step": 2332 }, { "epoch": 0.4226449275362319, "grad_norm": 3.4143861457939044, "learning_rate": 6.474042872662084e-07, "loss": 0.2854, "step": 2333 }, { "epoch": 0.42282608695652174, "grad_norm": 5.95878172037489, "learning_rate": 6.471239136849738e-07, "loss": 0.2922, "step": 2334 }, { "epoch": 0.4230072463768116, "grad_norm": 6.95448046727009, "learning_rate": 6.468434894482786e-07, "loss": 0.3218, "step": 2335 }, { "epoch": 0.42318840579710143, "grad_norm": 5.737354145804355, "learning_rate": 6.465630146526744e-07, "loss": 0.3313, "step": 2336 }, { "epoch": 0.42336956521739133, "grad_norm": 3.2418882401072415, "learning_rate": 6.462824893947296e-07, "loss": 0.2754, "step": 2337 }, { "epoch": 0.4235507246376812, "grad_norm": 6.638313730445387, "learning_rate": 6.460019137710307e-07, "loss": 0.3213, "step": 2338 }, { "epoch": 0.423731884057971, "grad_norm": 4.075967396282302, "learning_rate": 6.457212878781812e-07, "loss": 0.2939, "step": 2339 }, { "epoch": 0.42391304347826086, "grad_norm": 6.545210924283952, "learning_rate": 6.454406118128017e-07, "loss": 0.2839, "step": 2340 }, { "epoch": 0.4240942028985507, "grad_norm": 4.525793572017759, "learning_rate": 6.451598856715304e-07, "loss": 0.283, "step": 2341 }, { "epoch": 0.42427536231884055, "grad_norm": 6.156340652832883, "learning_rate": 6.448791095510229e-07, "loss": 0.2988, "step": 2342 }, { "epoch": 0.42445652173913045, "grad_norm": 6.568764572585965, "learning_rate": 6.445982835479513e-07, "loss": 0.3907, "step": 2343 }, { "epoch": 0.4246376811594203, "grad_norm": 5.951589934703833, "learning_rate": 6.443174077590056e-07, "loss": 0.2634, "step": 2344 }, { "epoch": 0.42481884057971014, "grad_norm": 7.108996064150201, "learning_rate": 6.440364822808928e-07, "loss": 0.3592, "step": 2345 }, { "epoch": 0.425, "grad_norm": 8.727790706486658, "learning_rate": 6.437555072103365e-07, "loss": 0.274, "step": 2346 }, { "epoch": 0.42518115942028983, "grad_norm": 4.802534038712147, "learning_rate": 6.434744826440781e-07, "loss": 0.3236, "step": 2347 }, { "epoch": 0.42536231884057973, "grad_norm": 3.187665598838089, "learning_rate": 6.431934086788753e-07, "loss": 0.3055, "step": 2348 }, { "epoch": 0.4255434782608696, "grad_norm": 4.149065953139308, "learning_rate": 6.429122854115036e-07, "loss": 0.3238, "step": 2349 }, { "epoch": 0.4257246376811594, "grad_norm": 7.3621213118183135, "learning_rate": 6.42631112938755e-07, "loss": 0.2961, "step": 2350 }, { "epoch": 0.42590579710144927, "grad_norm": 3.2486055671494616, "learning_rate": 6.423498913574383e-07, "loss": 0.2145, "step": 2351 }, { "epoch": 0.4260869565217391, "grad_norm": 4.526894388022784, "learning_rate": 6.420686207643794e-07, "loss": 0.3409, "step": 2352 }, { "epoch": 0.42626811594202896, "grad_norm": 4.392681220850965, "learning_rate": 6.417873012564215e-07, "loss": 0.3176, "step": 2353 }, { "epoch": 0.42644927536231886, "grad_norm": 6.110794389623651, "learning_rate": 6.415059329304238e-07, "loss": 0.2667, "step": 2354 }, { "epoch": 0.4266304347826087, "grad_norm": 7.266934307401165, "learning_rate": 6.412245158832629e-07, "loss": 0.3011, "step": 2355 }, { "epoch": 0.42681159420289855, "grad_norm": 4.446239473597966, "learning_rate": 6.40943050211832e-07, "loss": 0.3144, "step": 2356 }, { "epoch": 0.4269927536231884, "grad_norm": 4.243355332455165, "learning_rate": 6.406615360130414e-07, "loss": 0.2814, "step": 2357 }, { "epoch": 0.42717391304347824, "grad_norm": 7.42819065267284, "learning_rate": 6.403799733838171e-07, "loss": 0.3367, "step": 2358 }, { "epoch": 0.42735507246376814, "grad_norm": 3.117263154175289, "learning_rate": 6.400983624211031e-07, "loss": 0.2473, "step": 2359 }, { "epoch": 0.427536231884058, "grad_norm": 5.654648596519151, "learning_rate": 6.398167032218591e-07, "loss": 0.3391, "step": 2360 }, { "epoch": 0.4277173913043478, "grad_norm": 5.712715697561089, "learning_rate": 6.395349958830616e-07, "loss": 0.3131, "step": 2361 }, { "epoch": 0.42789855072463767, "grad_norm": 6.014980975626511, "learning_rate": 6.392532405017039e-07, "loss": 0.2879, "step": 2362 }, { "epoch": 0.4280797101449275, "grad_norm": 5.2166066553112245, "learning_rate": 6.389714371747958e-07, "loss": 0.271, "step": 2363 }, { "epoch": 0.4282608695652174, "grad_norm": 3.5281287010252895, "learning_rate": 6.386895859993633e-07, "loss": 0.2781, "step": 2364 }, { "epoch": 0.42844202898550726, "grad_norm": 6.075133336028407, "learning_rate": 6.384076870724493e-07, "loss": 0.3413, "step": 2365 }, { "epoch": 0.4286231884057971, "grad_norm": 3.311171515576892, "learning_rate": 6.38125740491113e-07, "loss": 0.2614, "step": 2366 }, { "epoch": 0.42880434782608695, "grad_norm": 4.038095763657474, "learning_rate": 6.378437463524295e-07, "loss": 0.2852, "step": 2367 }, { "epoch": 0.4289855072463768, "grad_norm": 5.270364823459333, "learning_rate": 6.375617047534911e-07, "loss": 0.3007, "step": 2368 }, { "epoch": 0.42916666666666664, "grad_norm": 7.889787936372453, "learning_rate": 6.372796157914059e-07, "loss": 0.2674, "step": 2369 }, { "epoch": 0.42934782608695654, "grad_norm": 10.987924033374092, "learning_rate": 6.369974795632988e-07, "loss": 0.3345, "step": 2370 }, { "epoch": 0.4295289855072464, "grad_norm": 7.329870667955026, "learning_rate": 6.367152961663102e-07, "loss": 0.3263, "step": 2371 }, { "epoch": 0.42971014492753623, "grad_norm": 4.220140572988842, "learning_rate": 6.364330656975973e-07, "loss": 0.3106, "step": 2372 }, { "epoch": 0.4298913043478261, "grad_norm": 4.163053856611312, "learning_rate": 6.361507882543335e-07, "loss": 0.3107, "step": 2373 }, { "epoch": 0.4300724637681159, "grad_norm": 4.999521026328111, "learning_rate": 6.358684639337084e-07, "loss": 0.3176, "step": 2374 }, { "epoch": 0.4302536231884058, "grad_norm": 5.589645442204374, "learning_rate": 6.355860928329271e-07, "loss": 0.3328, "step": 2375 }, { "epoch": 0.43043478260869567, "grad_norm": 5.463617192861189, "learning_rate": 6.35303675049212e-07, "loss": 0.2833, "step": 2376 }, { "epoch": 0.4306159420289855, "grad_norm": 4.594530499190523, "learning_rate": 6.350212106798002e-07, "loss": 0.3701, "step": 2377 }, { "epoch": 0.43079710144927535, "grad_norm": 5.351475130390271, "learning_rate": 6.347386998219458e-07, "loss": 0.2662, "step": 2378 }, { "epoch": 0.4309782608695652, "grad_norm": 3.443919946262373, "learning_rate": 6.344561425729186e-07, "loss": 0.2779, "step": 2379 }, { "epoch": 0.4311594202898551, "grad_norm": 3.7981303231880696, "learning_rate": 6.341735390300047e-07, "loss": 0.2863, "step": 2380 }, { "epoch": 0.43134057971014494, "grad_norm": 4.804880059405027, "learning_rate": 6.338908892905055e-07, "loss": 0.3002, "step": 2381 }, { "epoch": 0.4315217391304348, "grad_norm": 6.016134838775121, "learning_rate": 6.336081934517388e-07, "loss": 0.2338, "step": 2382 }, { "epoch": 0.43170289855072463, "grad_norm": 7.220698799639642, "learning_rate": 6.333254516110377e-07, "loss": 0.2667, "step": 2383 }, { "epoch": 0.4318840579710145, "grad_norm": 3.783084763616544, "learning_rate": 6.330426638657524e-07, "loss": 0.2923, "step": 2384 }, { "epoch": 0.4320652173913043, "grad_norm": 5.821360737081035, "learning_rate": 6.32759830313247e-07, "loss": 0.302, "step": 2385 }, { "epoch": 0.4322463768115942, "grad_norm": 5.911176517191124, "learning_rate": 6.324769510509034e-07, "loss": 0.3166, "step": 2386 }, { "epoch": 0.43242753623188407, "grad_norm": 10.33246031124787, "learning_rate": 6.321940261761178e-07, "loss": 0.3128, "step": 2387 }, { "epoch": 0.4326086956521739, "grad_norm": 5.866156582196534, "learning_rate": 6.319110557863025e-07, "loss": 0.2822, "step": 2388 }, { "epoch": 0.43278985507246376, "grad_norm": 4.568627246411997, "learning_rate": 6.316280399788859e-07, "loss": 0.4158, "step": 2389 }, { "epoch": 0.4329710144927536, "grad_norm": 3.9881536953107584, "learning_rate": 6.313449788513114e-07, "loss": 0.2872, "step": 2390 }, { "epoch": 0.4331521739130435, "grad_norm": 7.191773234053636, "learning_rate": 6.310618725010381e-07, "loss": 0.3354, "step": 2391 }, { "epoch": 0.43333333333333335, "grad_norm": 3.3151817463531836, "learning_rate": 6.307787210255414e-07, "loss": 0.2838, "step": 2392 }, { "epoch": 0.4335144927536232, "grad_norm": 3.1034187918020972, "learning_rate": 6.304955245223113e-07, "loss": 0.2724, "step": 2393 }, { "epoch": 0.43369565217391304, "grad_norm": 3.30721134824641, "learning_rate": 6.302122830888539e-07, "loss": 0.2397, "step": 2394 }, { "epoch": 0.4338768115942029, "grad_norm": 3.5254479200838014, "learning_rate": 6.299289968226904e-07, "loss": 0.2768, "step": 2395 }, { "epoch": 0.4340579710144927, "grad_norm": 11.329931826072082, "learning_rate": 6.296456658213577e-07, "loss": 0.3801, "step": 2396 }, { "epoch": 0.4342391304347826, "grad_norm": 4.763436588760162, "learning_rate": 6.29362290182408e-07, "loss": 0.3279, "step": 2397 }, { "epoch": 0.4344202898550725, "grad_norm": 5.514605367037099, "learning_rate": 6.290788700034088e-07, "loss": 0.2995, "step": 2398 }, { "epoch": 0.4346014492753623, "grad_norm": 4.884851347962377, "learning_rate": 6.287954053819431e-07, "loss": 0.2985, "step": 2399 }, { "epoch": 0.43478260869565216, "grad_norm": 5.8128705795798306, "learning_rate": 6.285118964156091e-07, "loss": 0.3162, "step": 2400 }, { "epoch": 0.43478260869565216, "eval_loss": 0.3111875057220459, "eval_runtime": 9.7334, "eval_samples_per_second": 51.369, "eval_steps_per_second": 0.103, "step": 2400 }, { "epoch": 0.434963768115942, "grad_norm": 6.032919533461052, "learning_rate": 6.282283432020202e-07, "loss": 0.3033, "step": 2401 }, { "epoch": 0.4351449275362319, "grad_norm": 9.8388685666013, "learning_rate": 6.279447458388051e-07, "loss": 0.2837, "step": 2402 }, { "epoch": 0.43532608695652175, "grad_norm": 8.627197461456078, "learning_rate": 6.27661104423608e-07, "loss": 0.3109, "step": 2403 }, { "epoch": 0.4355072463768116, "grad_norm": 4.6613897030136995, "learning_rate": 6.273774190540878e-07, "loss": 0.3176, "step": 2404 }, { "epoch": 0.43568840579710144, "grad_norm": 5.168306745523437, "learning_rate": 6.270936898279185e-07, "loss": 0.3404, "step": 2405 }, { "epoch": 0.4358695652173913, "grad_norm": 5.646349520336148, "learning_rate": 6.268099168427898e-07, "loss": 0.3678, "step": 2406 }, { "epoch": 0.4360507246376812, "grad_norm": 3.575771991391126, "learning_rate": 6.265261001964057e-07, "loss": 0.3228, "step": 2407 }, { "epoch": 0.43623188405797103, "grad_norm": 4.803310136645171, "learning_rate": 6.262422399864859e-07, "loss": 0.285, "step": 2408 }, { "epoch": 0.4364130434782609, "grad_norm": 4.033472385962535, "learning_rate": 6.259583363107648e-07, "loss": 0.2501, "step": 2409 }, { "epoch": 0.4365942028985507, "grad_norm": 4.6880235105969215, "learning_rate": 6.256743892669916e-07, "loss": 0.277, "step": 2410 }, { "epoch": 0.43677536231884057, "grad_norm": 7.729944298397247, "learning_rate": 6.253903989529307e-07, "loss": 0.3259, "step": 2411 }, { "epoch": 0.4369565217391304, "grad_norm": 5.974530213953839, "learning_rate": 6.251063654663614e-07, "loss": 0.2841, "step": 2412 }, { "epoch": 0.4371376811594203, "grad_norm": 4.166502550490707, "learning_rate": 6.248222889050776e-07, "loss": 0.3132, "step": 2413 }, { "epoch": 0.43731884057971016, "grad_norm": 3.3637687861488783, "learning_rate": 6.245381693668885e-07, "loss": 0.2901, "step": 2414 }, { "epoch": 0.4375, "grad_norm": 5.353592628541287, "learning_rate": 6.242540069496173e-07, "loss": 0.3437, "step": 2415 }, { "epoch": 0.43768115942028984, "grad_norm": 3.8199879977948545, "learning_rate": 6.23969801751103e-07, "loss": 0.2813, "step": 2416 }, { "epoch": 0.4378623188405797, "grad_norm": 4.033829725970013, "learning_rate": 6.236855538691986e-07, "loss": 0.3056, "step": 2417 }, { "epoch": 0.4380434782608696, "grad_norm": 3.8833235801396766, "learning_rate": 6.234012634017718e-07, "loss": 0.2981, "step": 2418 }, { "epoch": 0.43822463768115943, "grad_norm": 3.2405427516316188, "learning_rate": 6.231169304467056e-07, "loss": 0.2632, "step": 2419 }, { "epoch": 0.4384057971014493, "grad_norm": 3.652979531153594, "learning_rate": 6.228325551018967e-07, "loss": 0.2841, "step": 2420 }, { "epoch": 0.4385869565217391, "grad_norm": 3.8590877440846967, "learning_rate": 6.225481374652572e-07, "loss": 0.2858, "step": 2421 }, { "epoch": 0.43876811594202897, "grad_norm": 6.556473631494257, "learning_rate": 6.222636776347132e-07, "loss": 0.3133, "step": 2422 }, { "epoch": 0.4389492753623188, "grad_norm": 3.7270987312082293, "learning_rate": 6.219791757082058e-07, "loss": 0.2819, "step": 2423 }, { "epoch": 0.4391304347826087, "grad_norm": 4.533409871199603, "learning_rate": 6.2169463178369e-07, "loss": 0.299, "step": 2424 }, { "epoch": 0.43931159420289856, "grad_norm": 3.258767091980891, "learning_rate": 6.214100459591363e-07, "loss": 0.2942, "step": 2425 }, { "epoch": 0.4394927536231884, "grad_norm": 4.322071867009134, "learning_rate": 6.211254183325281e-07, "loss": 0.3331, "step": 2426 }, { "epoch": 0.43967391304347825, "grad_norm": 4.694868290410619, "learning_rate": 6.208407490018648e-07, "loss": 0.3338, "step": 2427 }, { "epoch": 0.4398550724637681, "grad_norm": 8.944845316507115, "learning_rate": 6.205560380651589e-07, "loss": 0.2864, "step": 2428 }, { "epoch": 0.440036231884058, "grad_norm": 4.489572713454659, "learning_rate": 6.202712856204379e-07, "loss": 0.3502, "step": 2429 }, { "epoch": 0.44021739130434784, "grad_norm": 3.2292276395752584, "learning_rate": 6.199864917657434e-07, "loss": 0.2912, "step": 2430 }, { "epoch": 0.4403985507246377, "grad_norm": 6.383876568796987, "learning_rate": 6.197016565991314e-07, "loss": 0.3878, "step": 2431 }, { "epoch": 0.4405797101449275, "grad_norm": 4.309347998381022, "learning_rate": 6.194167802186718e-07, "loss": 0.3552, "step": 2432 }, { "epoch": 0.4407608695652174, "grad_norm": 4.349599489988224, "learning_rate": 6.191318627224489e-07, "loss": 0.29, "step": 2433 }, { "epoch": 0.4409420289855073, "grad_norm": 10.140908604499579, "learning_rate": 6.188469042085612e-07, "loss": 0.3605, "step": 2434 }, { "epoch": 0.4411231884057971, "grad_norm": 4.177803360097879, "learning_rate": 6.185619047751214e-07, "loss": 0.2773, "step": 2435 }, { "epoch": 0.44130434782608696, "grad_norm": 4.767036299494666, "learning_rate": 6.182768645202558e-07, "loss": 0.2799, "step": 2436 }, { "epoch": 0.4414855072463768, "grad_norm": 3.766462633636971, "learning_rate": 6.179917835421055e-07, "loss": 0.2733, "step": 2437 }, { "epoch": 0.44166666666666665, "grad_norm": 3.818063907046883, "learning_rate": 6.177066619388251e-07, "loss": 0.3255, "step": 2438 }, { "epoch": 0.4418478260869565, "grad_norm": 3.5420938549509158, "learning_rate": 6.174214998085832e-07, "loss": 0.3177, "step": 2439 }, { "epoch": 0.4420289855072464, "grad_norm": 4.705024431833968, "learning_rate": 6.171362972495626e-07, "loss": 0.3486, "step": 2440 }, { "epoch": 0.44221014492753624, "grad_norm": 3.713483054306353, "learning_rate": 6.1685105435996e-07, "loss": 0.3105, "step": 2441 }, { "epoch": 0.4423913043478261, "grad_norm": 6.860656832518109, "learning_rate": 6.165657712379854e-07, "loss": 0.3324, "step": 2442 }, { "epoch": 0.44257246376811593, "grad_norm": 5.170289831277415, "learning_rate": 6.162804479818637e-07, "loss": 0.3116, "step": 2443 }, { "epoch": 0.4427536231884058, "grad_norm": 6.865661144465947, "learning_rate": 6.159950846898328e-07, "loss": 0.286, "step": 2444 }, { "epoch": 0.4429347826086957, "grad_norm": 3.6141622065675576, "learning_rate": 6.157096814601447e-07, "loss": 0.3246, "step": 2445 }, { "epoch": 0.4431159420289855, "grad_norm": 4.711299127220832, "learning_rate": 6.154242383910649e-07, "loss": 0.2776, "step": 2446 }, { "epoch": 0.44329710144927537, "grad_norm": 4.800757471753301, "learning_rate": 6.151387555808729e-07, "loss": 0.3115, "step": 2447 }, { "epoch": 0.4434782608695652, "grad_norm": 7.2176308367961, "learning_rate": 6.148532331278619e-07, "loss": 0.2622, "step": 2448 }, { "epoch": 0.44365942028985506, "grad_norm": 4.580417800697855, "learning_rate": 6.145676711303386e-07, "loss": 0.2483, "step": 2449 }, { "epoch": 0.4438405797101449, "grad_norm": 7.090839506145054, "learning_rate": 6.142820696866231e-07, "loss": 0.3002, "step": 2450 }, { "epoch": 0.4440217391304348, "grad_norm": 4.170397544873171, "learning_rate": 6.139964288950497e-07, "loss": 0.3181, "step": 2451 }, { "epoch": 0.44420289855072465, "grad_norm": 8.087550735356297, "learning_rate": 6.137107488539657e-07, "loss": 0.3585, "step": 2452 }, { "epoch": 0.4443840579710145, "grad_norm": 5.713435797652091, "learning_rate": 6.13425029661732e-07, "loss": 0.3129, "step": 2453 }, { "epoch": 0.44456521739130433, "grad_norm": 3.844583853942515, "learning_rate": 6.131392714167233e-07, "loss": 0.3107, "step": 2454 }, { "epoch": 0.4447463768115942, "grad_norm": 3.0519902543496387, "learning_rate": 6.128534742173273e-07, "loss": 0.2734, "step": 2455 }, { "epoch": 0.4449275362318841, "grad_norm": 4.734232598715959, "learning_rate": 6.125676381619454e-07, "loss": 0.288, "step": 2456 }, { "epoch": 0.4451086956521739, "grad_norm": 7.043243332262786, "learning_rate": 6.122817633489923e-07, "loss": 0.3175, "step": 2457 }, { "epoch": 0.44528985507246377, "grad_norm": 8.200428475635034, "learning_rate": 6.119958498768962e-07, "loss": 0.3144, "step": 2458 }, { "epoch": 0.4454710144927536, "grad_norm": 4.47371989403032, "learning_rate": 6.117098978440981e-07, "loss": 0.3111, "step": 2459 }, { "epoch": 0.44565217391304346, "grad_norm": 3.2942870807097124, "learning_rate": 6.114239073490533e-07, "loss": 0.2837, "step": 2460 }, { "epoch": 0.44583333333333336, "grad_norm": 3.2136242867192517, "learning_rate": 6.111378784902288e-07, "loss": 0.3165, "step": 2461 }, { "epoch": 0.4460144927536232, "grad_norm": 3.566430781317973, "learning_rate": 6.108518113661064e-07, "loss": 0.3129, "step": 2462 }, { "epoch": 0.44619565217391305, "grad_norm": 4.43212040990493, "learning_rate": 6.105657060751801e-07, "loss": 0.3618, "step": 2463 }, { "epoch": 0.4463768115942029, "grad_norm": 7.100989076740194, "learning_rate": 6.102795627159572e-07, "loss": 0.3349, "step": 2464 }, { "epoch": 0.44655797101449274, "grad_norm": 4.798284331614351, "learning_rate": 6.099933813869585e-07, "loss": 0.2486, "step": 2465 }, { "epoch": 0.4467391304347826, "grad_norm": 4.114344426352763, "learning_rate": 6.097071621867175e-07, "loss": 0.3625, "step": 2466 }, { "epoch": 0.4469202898550725, "grad_norm": 5.591657104607873, "learning_rate": 6.094209052137805e-07, "loss": 0.3869, "step": 2467 }, { "epoch": 0.44710144927536233, "grad_norm": 4.5381586589400715, "learning_rate": 6.091346105667077e-07, "loss": 0.3456, "step": 2468 }, { "epoch": 0.4472826086956522, "grad_norm": 6.732564754578744, "learning_rate": 6.08848278344071e-07, "loss": 0.256, "step": 2469 }, { "epoch": 0.447463768115942, "grad_norm": 5.522757347127664, "learning_rate": 6.085619086444566e-07, "loss": 0.2551, "step": 2470 }, { "epoch": 0.44764492753623186, "grad_norm": 3.7862866477335535, "learning_rate": 6.082755015664626e-07, "loss": 0.3159, "step": 2471 }, { "epoch": 0.44782608695652176, "grad_norm": 5.773950369934294, "learning_rate": 6.079890572087005e-07, "loss": 0.2975, "step": 2472 }, { "epoch": 0.4480072463768116, "grad_norm": 4.245890011685441, "learning_rate": 6.077025756697942e-07, "loss": 0.2986, "step": 2473 }, { "epoch": 0.44818840579710145, "grad_norm": 6.326689728606212, "learning_rate": 6.074160570483809e-07, "loss": 0.3101, "step": 2474 }, { "epoch": 0.4483695652173913, "grad_norm": 8.499362818011361, "learning_rate": 6.0712950144311e-07, "loss": 0.3405, "step": 2475 }, { "epoch": 0.44855072463768114, "grad_norm": 3.7230462980396717, "learning_rate": 6.068429089526446e-07, "loss": 0.2809, "step": 2476 }, { "epoch": 0.44873188405797104, "grad_norm": 11.311724816719947, "learning_rate": 6.06556279675659e-07, "loss": 0.3452, "step": 2477 }, { "epoch": 0.4489130434782609, "grad_norm": 9.75145005318127, "learning_rate": 6.062696137108415e-07, "loss": 0.2696, "step": 2478 }, { "epoch": 0.44909420289855073, "grad_norm": 7.14584372660994, "learning_rate": 6.059829111568926e-07, "loss": 0.3522, "step": 2479 }, { "epoch": 0.4492753623188406, "grad_norm": 10.218413451985699, "learning_rate": 6.056961721125252e-07, "loss": 0.3456, "step": 2480 }, { "epoch": 0.4494565217391304, "grad_norm": 3.3258361918376496, "learning_rate": 6.054093966764649e-07, "loss": 0.2685, "step": 2481 }, { "epoch": 0.44963768115942027, "grad_norm": 6.63057783981375, "learning_rate": 6.0512258494745e-07, "loss": 0.2621, "step": 2482 }, { "epoch": 0.44981884057971017, "grad_norm": 3.5014204089310494, "learning_rate": 6.048357370242308e-07, "loss": 0.3149, "step": 2483 }, { "epoch": 0.45, "grad_norm": 4.371632426519727, "learning_rate": 6.045488530055709e-07, "loss": 0.2606, "step": 2484 }, { "epoch": 0.45018115942028986, "grad_norm": 4.303240966821157, "learning_rate": 6.042619329902453e-07, "loss": 0.3337, "step": 2485 }, { "epoch": 0.4503623188405797, "grad_norm": 3.529163527135913, "learning_rate": 6.039749770770422e-07, "loss": 0.3257, "step": 2486 }, { "epoch": 0.45054347826086955, "grad_norm": 3.300415660889749, "learning_rate": 6.03687985364762e-07, "loss": 0.2918, "step": 2487 }, { "epoch": 0.45072463768115945, "grad_norm": 7.841796250440804, "learning_rate": 6.03400957952217e-07, "loss": 0.2842, "step": 2488 }, { "epoch": 0.4509057971014493, "grad_norm": 3.76475456465956, "learning_rate": 6.031138949382323e-07, "loss": 0.3086, "step": 2489 }, { "epoch": 0.45108695652173914, "grad_norm": 3.4730194512582906, "learning_rate": 6.02826796421645e-07, "loss": 0.3077, "step": 2490 }, { "epoch": 0.451268115942029, "grad_norm": 3.689315498803631, "learning_rate": 6.025396625013046e-07, "loss": 0.2664, "step": 2491 }, { "epoch": 0.4514492753623188, "grad_norm": 6.188874922621899, "learning_rate": 6.022524932760724e-07, "loss": 0.3626, "step": 2492 }, { "epoch": 0.45163043478260867, "grad_norm": 4.635190863041126, "learning_rate": 6.019652888448225e-07, "loss": 0.2876, "step": 2493 }, { "epoch": 0.45181159420289857, "grad_norm": 3.311049094059225, "learning_rate": 6.016780493064403e-07, "loss": 0.3193, "step": 2494 }, { "epoch": 0.4519927536231884, "grad_norm": 9.044556119574287, "learning_rate": 6.013907747598241e-07, "loss": 0.3398, "step": 2495 }, { "epoch": 0.45217391304347826, "grad_norm": 3.354433975091886, "learning_rate": 6.011034653038837e-07, "loss": 0.2773, "step": 2496 }, { "epoch": 0.4523550724637681, "grad_norm": 5.041896221104646, "learning_rate": 6.008161210375411e-07, "loss": 0.4211, "step": 2497 }, { "epoch": 0.45253623188405795, "grad_norm": 5.482637424516314, "learning_rate": 6.005287420597305e-07, "loss": 0.3891, "step": 2498 }, { "epoch": 0.45271739130434785, "grad_norm": 6.77500890492817, "learning_rate": 6.002413284693976e-07, "loss": 0.2743, "step": 2499 }, { "epoch": 0.4528985507246377, "grad_norm": 8.011139241455309, "learning_rate": 5.999538803655003e-07, "loss": 0.3303, "step": 2500 }, { "epoch": 0.4528985507246377, "eval_loss": 0.30396875739097595, "eval_runtime": 9.7609, "eval_samples_per_second": 51.225, "eval_steps_per_second": 0.102, "step": 2500 }, { "epoch": 0.45307971014492754, "grad_norm": 6.9092246240073525, "learning_rate": 5.996663978470084e-07, "loss": 0.2473, "step": 2501 }, { "epoch": 0.4532608695652174, "grad_norm": 3.8900354175610254, "learning_rate": 5.993788810129036e-07, "loss": 0.2997, "step": 2502 }, { "epoch": 0.45344202898550723, "grad_norm": 3.993330457172059, "learning_rate": 5.990913299621792e-07, "loss": 0.3003, "step": 2503 }, { "epoch": 0.45362318840579713, "grad_norm": 6.952833218191655, "learning_rate": 5.988037447938402e-07, "loss": 0.3267, "step": 2504 }, { "epoch": 0.453804347826087, "grad_norm": 7.3550149817202515, "learning_rate": 5.985161256069039e-07, "loss": 0.2859, "step": 2505 }, { "epoch": 0.4539855072463768, "grad_norm": 7.708170270294903, "learning_rate": 5.982284725003988e-07, "loss": 0.2684, "step": 2506 }, { "epoch": 0.45416666666666666, "grad_norm": 3.6160747934822424, "learning_rate": 5.979407855733651e-07, "loss": 0.2888, "step": 2507 }, { "epoch": 0.4543478260869565, "grad_norm": 3.7935639790069198, "learning_rate": 5.976530649248551e-07, "loss": 0.3668, "step": 2508 }, { "epoch": 0.45452898550724635, "grad_norm": 5.813067893712035, "learning_rate": 5.973653106539318e-07, "loss": 0.3615, "step": 2509 }, { "epoch": 0.45471014492753625, "grad_norm": 3.457242965645761, "learning_rate": 5.970775228596708e-07, "loss": 0.2869, "step": 2510 }, { "epoch": 0.4548913043478261, "grad_norm": 3.3273154708971755, "learning_rate": 5.967897016411589e-07, "loss": 0.2926, "step": 2511 }, { "epoch": 0.45507246376811594, "grad_norm": 4.769305794513921, "learning_rate": 5.965018470974941e-07, "loss": 0.3344, "step": 2512 }, { "epoch": 0.4552536231884058, "grad_norm": 3.723894121100625, "learning_rate": 5.96213959327786e-07, "loss": 0.2911, "step": 2513 }, { "epoch": 0.45543478260869563, "grad_norm": 4.444093860444105, "learning_rate": 5.959260384311559e-07, "loss": 0.3074, "step": 2514 }, { "epoch": 0.45561594202898553, "grad_norm": 3.7273155036014654, "learning_rate": 5.956380845067361e-07, "loss": 0.2979, "step": 2515 }, { "epoch": 0.4557971014492754, "grad_norm": 9.428175187359235, "learning_rate": 5.95350097653671e-07, "loss": 0.2889, "step": 2516 }, { "epoch": 0.4559782608695652, "grad_norm": 5.184841240186356, "learning_rate": 5.950620779711152e-07, "loss": 0.3802, "step": 2517 }, { "epoch": 0.45615942028985507, "grad_norm": 6.6449863043115505, "learning_rate": 5.947740255582355e-07, "loss": 0.3157, "step": 2518 }, { "epoch": 0.4563405797101449, "grad_norm": 6.853326562915106, "learning_rate": 5.944859405142101e-07, "loss": 0.3018, "step": 2519 }, { "epoch": 0.45652173913043476, "grad_norm": 4.965577971452306, "learning_rate": 5.941978229382274e-07, "loss": 0.2693, "step": 2520 }, { "epoch": 0.45670289855072466, "grad_norm": 3.708603031043961, "learning_rate": 5.93909672929488e-07, "loss": 0.2856, "step": 2521 }, { "epoch": 0.4568840579710145, "grad_norm": 3.220568144515752, "learning_rate": 5.936214905872032e-07, "loss": 0.3139, "step": 2522 }, { "epoch": 0.45706521739130435, "grad_norm": 4.092866477931799, "learning_rate": 5.933332760105956e-07, "loss": 0.3007, "step": 2523 }, { "epoch": 0.4572463768115942, "grad_norm": 4.225827040897612, "learning_rate": 5.930450292988991e-07, "loss": 0.301, "step": 2524 }, { "epoch": 0.45742753623188404, "grad_norm": 3.3557582860384136, "learning_rate": 5.92756750551358e-07, "loss": 0.366, "step": 2525 }, { "epoch": 0.45760869565217394, "grad_norm": 5.518169821067675, "learning_rate": 5.924684398672281e-07, "loss": 0.3443, "step": 2526 }, { "epoch": 0.4577898550724638, "grad_norm": 3.785659288877862, "learning_rate": 5.921800973457764e-07, "loss": 0.2654, "step": 2527 }, { "epoch": 0.4579710144927536, "grad_norm": 4.009671498013421, "learning_rate": 5.918917230862803e-07, "loss": 0.307, "step": 2528 }, { "epoch": 0.45815217391304347, "grad_norm": 4.9690540897524, "learning_rate": 5.916033171880284e-07, "loss": 0.3379, "step": 2529 }, { "epoch": 0.4583333333333333, "grad_norm": 9.41697339851124, "learning_rate": 5.913148797503208e-07, "loss": 0.3092, "step": 2530 }, { "epoch": 0.4585144927536232, "grad_norm": 9.47462224952637, "learning_rate": 5.91026410872467e-07, "loss": 0.3121, "step": 2531 }, { "epoch": 0.45869565217391306, "grad_norm": 5.690768341547824, "learning_rate": 5.907379106537889e-07, "loss": 0.2885, "step": 2532 }, { "epoch": 0.4588768115942029, "grad_norm": 4.628047691863953, "learning_rate": 5.904493791936183e-07, "loss": 0.354, "step": 2533 }, { "epoch": 0.45905797101449275, "grad_norm": 4.848851639431173, "learning_rate": 5.901608165912976e-07, "loss": 0.2851, "step": 2534 }, { "epoch": 0.4592391304347826, "grad_norm": 5.1550511103204135, "learning_rate": 5.898722229461809e-07, "loss": 0.3049, "step": 2535 }, { "epoch": 0.45942028985507244, "grad_norm": 4.297388301767464, "learning_rate": 5.895835983576319e-07, "loss": 0.2299, "step": 2536 }, { "epoch": 0.45960144927536234, "grad_norm": 5.199239870062037, "learning_rate": 5.892949429250253e-07, "loss": 0.3433, "step": 2537 }, { "epoch": 0.4597826086956522, "grad_norm": 7.774860038661607, "learning_rate": 5.89006256747747e-07, "loss": 0.3519, "step": 2538 }, { "epoch": 0.45996376811594203, "grad_norm": 4.235904750735014, "learning_rate": 5.887175399251927e-07, "loss": 0.3299, "step": 2539 }, { "epoch": 0.4601449275362319, "grad_norm": 4.407687118864259, "learning_rate": 5.88428792556769e-07, "loss": 0.2474, "step": 2540 }, { "epoch": 0.4603260869565217, "grad_norm": 9.510498571741651, "learning_rate": 5.881400147418931e-07, "loss": 0.3548, "step": 2541 }, { "epoch": 0.4605072463768116, "grad_norm": 5.929562229013172, "learning_rate": 5.878512065799925e-07, "loss": 0.2903, "step": 2542 }, { "epoch": 0.46068840579710146, "grad_norm": 5.254605174575904, "learning_rate": 5.875623681705053e-07, "loss": 0.2956, "step": 2543 }, { "epoch": 0.4608695652173913, "grad_norm": 4.496975943263152, "learning_rate": 5.872734996128798e-07, "loss": 0.3078, "step": 2544 }, { "epoch": 0.46105072463768115, "grad_norm": 3.2626975670614464, "learning_rate": 5.869846010065748e-07, "loss": 0.2976, "step": 2545 }, { "epoch": 0.461231884057971, "grad_norm": 4.7579484396135285, "learning_rate": 5.866956724510597e-07, "loss": 0.318, "step": 2546 }, { "epoch": 0.46141304347826084, "grad_norm": 4.759287212212859, "learning_rate": 5.864067140458136e-07, "loss": 0.2473, "step": 2547 }, { "epoch": 0.46159420289855074, "grad_norm": 3.8593186038718255, "learning_rate": 5.861177258903266e-07, "loss": 0.2975, "step": 2548 }, { "epoch": 0.4617753623188406, "grad_norm": 4.504714612409603, "learning_rate": 5.858287080840984e-07, "loss": 0.3153, "step": 2549 }, { "epoch": 0.46195652173913043, "grad_norm": 5.811938849443222, "learning_rate": 5.855396607266395e-07, "loss": 0.282, "step": 2550 }, { "epoch": 0.4621376811594203, "grad_norm": 3.82343735772419, "learning_rate": 5.852505839174701e-07, "loss": 0.2822, "step": 2551 }, { "epoch": 0.4623188405797101, "grad_norm": 6.042649419091215, "learning_rate": 5.849614777561207e-07, "loss": 0.2831, "step": 2552 }, { "epoch": 0.4625, "grad_norm": 3.3205333473392162, "learning_rate": 5.846723423421318e-07, "loss": 0.2746, "step": 2553 }, { "epoch": 0.46268115942028987, "grad_norm": 3.068165194143659, "learning_rate": 5.843831777750546e-07, "loss": 0.2639, "step": 2554 }, { "epoch": 0.4628623188405797, "grad_norm": 3.077256674128147, "learning_rate": 5.840939841544491e-07, "loss": 0.2482, "step": 2555 }, { "epoch": 0.46304347826086956, "grad_norm": 4.5863200275723495, "learning_rate": 5.838047615798865e-07, "loss": 0.354, "step": 2556 }, { "epoch": 0.4632246376811594, "grad_norm": 5.246053590489951, "learning_rate": 5.835155101509476e-07, "loss": 0.2958, "step": 2557 }, { "epoch": 0.4634057971014493, "grad_norm": 3.927843424714349, "learning_rate": 5.832262299672226e-07, "loss": 0.33, "step": 2558 }, { "epoch": 0.46358695652173915, "grad_norm": 6.396070289297395, "learning_rate": 5.829369211283125e-07, "loss": 0.2932, "step": 2559 }, { "epoch": 0.463768115942029, "grad_norm": 4.140828888321739, "learning_rate": 5.826475837338274e-07, "loss": 0.2795, "step": 2560 }, { "epoch": 0.46394927536231884, "grad_norm": 6.938652882119639, "learning_rate": 5.823582178833876e-07, "loss": 0.2945, "step": 2561 }, { "epoch": 0.4641304347826087, "grad_norm": 11.615666670262163, "learning_rate": 5.820688236766232e-07, "loss": 0.2376, "step": 2562 }, { "epoch": 0.4643115942028985, "grad_norm": 4.953730084152967, "learning_rate": 5.817794012131741e-07, "loss": 0.3398, "step": 2563 }, { "epoch": 0.4644927536231884, "grad_norm": 3.6463775335092397, "learning_rate": 5.814899505926893e-07, "loss": 0.2933, "step": 2564 }, { "epoch": 0.46467391304347827, "grad_norm": 5.865073224835817, "learning_rate": 5.812004719148288e-07, "loss": 0.3032, "step": 2565 }, { "epoch": 0.4648550724637681, "grad_norm": 3.495056803371781, "learning_rate": 5.809109652792608e-07, "loss": 0.2777, "step": 2566 }, { "epoch": 0.46503623188405796, "grad_norm": 3.8229105722497416, "learning_rate": 5.806214307856643e-07, "loss": 0.3315, "step": 2567 }, { "epoch": 0.4652173913043478, "grad_norm": 4.098903819095269, "learning_rate": 5.803318685337271e-07, "loss": 0.2664, "step": 2568 }, { "epoch": 0.4653985507246377, "grad_norm": 3.461546788856099, "learning_rate": 5.800422786231469e-07, "loss": 0.3271, "step": 2569 }, { "epoch": 0.46557971014492755, "grad_norm": 3.8846026275711814, "learning_rate": 5.797526611536311e-07, "loss": 0.3003, "step": 2570 }, { "epoch": 0.4657608695652174, "grad_norm": 3.4201127324348732, "learning_rate": 5.79463016224896e-07, "loss": 0.2542, "step": 2571 }, { "epoch": 0.46594202898550724, "grad_norm": 4.100625254758706, "learning_rate": 5.79173343936668e-07, "loss": 0.3068, "step": 2572 }, { "epoch": 0.4661231884057971, "grad_norm": 7.772267251134122, "learning_rate": 5.788836443886825e-07, "loss": 0.2599, "step": 2573 }, { "epoch": 0.46630434782608693, "grad_norm": 4.882826856368822, "learning_rate": 5.785939176806845e-07, "loss": 0.3243, "step": 2574 }, { "epoch": 0.46648550724637683, "grad_norm": 3.2698182786469556, "learning_rate": 5.783041639124282e-07, "loss": 0.2491, "step": 2575 }, { "epoch": 0.4666666666666667, "grad_norm": 5.122200296028673, "learning_rate": 5.780143831836774e-07, "loss": 0.2838, "step": 2576 }, { "epoch": 0.4668478260869565, "grad_norm": 4.605204533068787, "learning_rate": 5.777245755942047e-07, "loss": 0.2925, "step": 2577 }, { "epoch": 0.46702898550724636, "grad_norm": 3.225029378647811, "learning_rate": 5.774347412437924e-07, "loss": 0.2727, "step": 2578 }, { "epoch": 0.4672101449275362, "grad_norm": 5.430634093165232, "learning_rate": 5.771448802322319e-07, "loss": 0.3149, "step": 2579 }, { "epoch": 0.4673913043478261, "grad_norm": 4.743721177857558, "learning_rate": 5.768549926593234e-07, "loss": 0.3106, "step": 2580 }, { "epoch": 0.46757246376811595, "grad_norm": 4.010257606889404, "learning_rate": 5.765650786248769e-07, "loss": 0.323, "step": 2581 }, { "epoch": 0.4677536231884058, "grad_norm": 3.925544492714965, "learning_rate": 5.76275138228711e-07, "loss": 0.299, "step": 2582 }, { "epoch": 0.46793478260869564, "grad_norm": 3.6602035527498558, "learning_rate": 5.759851715706536e-07, "loss": 0.2757, "step": 2583 }, { "epoch": 0.4681159420289855, "grad_norm": 5.012508221820364, "learning_rate": 5.756951787505417e-07, "loss": 0.3045, "step": 2584 }, { "epoch": 0.4682971014492754, "grad_norm": 6.155301065533482, "learning_rate": 5.754051598682212e-07, "loss": 0.4303, "step": 2585 }, { "epoch": 0.46847826086956523, "grad_norm": 3.053830558166979, "learning_rate": 5.751151150235467e-07, "loss": 0.2546, "step": 2586 }, { "epoch": 0.4686594202898551, "grad_norm": 4.508932676087459, "learning_rate": 5.748250443163823e-07, "loss": 0.3448, "step": 2587 }, { "epoch": 0.4688405797101449, "grad_norm": 3.57702351859665, "learning_rate": 5.745349478466004e-07, "loss": 0.2932, "step": 2588 }, { "epoch": 0.46902173913043477, "grad_norm": 4.067593840822912, "learning_rate": 5.742448257140831e-07, "loss": 0.3339, "step": 2589 }, { "epoch": 0.4692028985507246, "grad_norm": 8.400370210807065, "learning_rate": 5.739546780187202e-07, "loss": 0.3749, "step": 2590 }, { "epoch": 0.4693840579710145, "grad_norm": 5.184628643464647, "learning_rate": 5.736645048604115e-07, "loss": 0.3138, "step": 2591 }, { "epoch": 0.46956521739130436, "grad_norm": 4.549921201725895, "learning_rate": 5.733743063390647e-07, "loss": 0.2975, "step": 2592 }, { "epoch": 0.4697463768115942, "grad_norm": 6.4597389096253, "learning_rate": 5.730840825545965e-07, "loss": 0.3349, "step": 2593 }, { "epoch": 0.46992753623188405, "grad_norm": 5.414429841894115, "learning_rate": 5.727938336069325e-07, "loss": 0.3463, "step": 2594 }, { "epoch": 0.4701086956521739, "grad_norm": 6.182784881613846, "learning_rate": 5.725035595960066e-07, "loss": 0.3101, "step": 2595 }, { "epoch": 0.4702898550724638, "grad_norm": 7.2311545179767425, "learning_rate": 5.722132606217616e-07, "loss": 0.3177, "step": 2596 }, { "epoch": 0.47047101449275364, "grad_norm": 10.54098617581661, "learning_rate": 5.719229367841489e-07, "loss": 0.3, "step": 2597 }, { "epoch": 0.4706521739130435, "grad_norm": 6.62541623108199, "learning_rate": 5.716325881831282e-07, "loss": 0.3197, "step": 2598 }, { "epoch": 0.4708333333333333, "grad_norm": 4.690670094677277, "learning_rate": 5.71342214918668e-07, "loss": 0.2814, "step": 2599 }, { "epoch": 0.47101449275362317, "grad_norm": 3.5949207240586953, "learning_rate": 5.710518170907452e-07, "loss": 0.2644, "step": 2600 }, { "epoch": 0.47101449275362317, "eval_loss": 0.310484379529953, "eval_runtime": 9.849, "eval_samples_per_second": 50.767, "eval_steps_per_second": 0.102, "step": 2600 }, { "epoch": 0.47119565217391307, "grad_norm": 3.595624114374465, "learning_rate": 5.707613947993451e-07, "loss": 0.2267, "step": 2601 }, { "epoch": 0.4713768115942029, "grad_norm": 5.846735160575425, "learning_rate": 5.704709481444615e-07, "loss": 0.2739, "step": 2602 }, { "epoch": 0.47155797101449276, "grad_norm": 4.312101455072667, "learning_rate": 5.701804772260967e-07, "loss": 0.2902, "step": 2603 }, { "epoch": 0.4717391304347826, "grad_norm": 5.677494926181608, "learning_rate": 5.698899821442608e-07, "loss": 0.2375, "step": 2604 }, { "epoch": 0.47192028985507245, "grad_norm": 4.710343177338405, "learning_rate": 5.695994629989731e-07, "loss": 0.3088, "step": 2605 }, { "epoch": 0.4721014492753623, "grad_norm": 6.474847578010311, "learning_rate": 5.693089198902605e-07, "loss": 0.2696, "step": 2606 }, { "epoch": 0.4722826086956522, "grad_norm": 7.754153874184792, "learning_rate": 5.690183529181583e-07, "loss": 0.324, "step": 2607 }, { "epoch": 0.47246376811594204, "grad_norm": 5.39865584055263, "learning_rate": 5.687277621827104e-07, "loss": 0.2482, "step": 2608 }, { "epoch": 0.4726449275362319, "grad_norm": 7.302495389596663, "learning_rate": 5.68437147783968e-07, "loss": 0.2247, "step": 2609 }, { "epoch": 0.47282608695652173, "grad_norm": 5.040916731038546, "learning_rate": 5.681465098219915e-07, "loss": 0.2831, "step": 2610 }, { "epoch": 0.4730072463768116, "grad_norm": 6.064501599866838, "learning_rate": 5.678558483968489e-07, "loss": 0.2481, "step": 2611 }, { "epoch": 0.4731884057971015, "grad_norm": 3.573436779516617, "learning_rate": 5.675651636086162e-07, "loss": 0.2868, "step": 2612 }, { "epoch": 0.4733695652173913, "grad_norm": 9.570513209943377, "learning_rate": 5.672744555573774e-07, "loss": 0.3184, "step": 2613 }, { "epoch": 0.47355072463768116, "grad_norm": 8.493863799379548, "learning_rate": 5.669837243432248e-07, "loss": 0.2867, "step": 2614 }, { "epoch": 0.473731884057971, "grad_norm": 5.503225370995322, "learning_rate": 5.666929700662584e-07, "loss": 0.295, "step": 2615 }, { "epoch": 0.47391304347826085, "grad_norm": 11.850025654481273, "learning_rate": 5.664021928265868e-07, "loss": 0.4279, "step": 2616 }, { "epoch": 0.4740942028985507, "grad_norm": 4.6253460668145925, "learning_rate": 5.661113927243253e-07, "loss": 0.2358, "step": 2617 }, { "epoch": 0.4742753623188406, "grad_norm": 6.264961553023649, "learning_rate": 5.658205698595982e-07, "loss": 0.2819, "step": 2618 }, { "epoch": 0.47445652173913044, "grad_norm": 4.075884242549915, "learning_rate": 5.655297243325368e-07, "loss": 0.2966, "step": 2619 }, { "epoch": 0.4746376811594203, "grad_norm": 6.867910903811688, "learning_rate": 5.65238856243281e-07, "loss": 0.381, "step": 2620 }, { "epoch": 0.47481884057971013, "grad_norm": 3.883815606795304, "learning_rate": 5.64947965691978e-07, "loss": 0.2253, "step": 2621 }, { "epoch": 0.475, "grad_norm": 4.537925629994537, "learning_rate": 5.646570527787826e-07, "loss": 0.3035, "step": 2622 }, { "epoch": 0.4751811594202899, "grad_norm": 6.288214030219309, "learning_rate": 5.643661176038574e-07, "loss": 0.3158, "step": 2623 }, { "epoch": 0.4753623188405797, "grad_norm": 5.479975975318324, "learning_rate": 5.640751602673731e-07, "loss": 0.2769, "step": 2624 }, { "epoch": 0.47554347826086957, "grad_norm": 5.212645691924527, "learning_rate": 5.637841808695074e-07, "loss": 0.3199, "step": 2625 }, { "epoch": 0.4757246376811594, "grad_norm": 4.169349248093498, "learning_rate": 5.634931795104461e-07, "loss": 0.2965, "step": 2626 }, { "epoch": 0.47590579710144926, "grad_norm": 4.996569224274965, "learning_rate": 5.632021562903822e-07, "loss": 0.2849, "step": 2627 }, { "epoch": 0.47608695652173916, "grad_norm": 7.0275331389235305, "learning_rate": 5.629111113095166e-07, "loss": 0.291, "step": 2628 }, { "epoch": 0.476268115942029, "grad_norm": 4.8035532703178, "learning_rate": 5.626200446680571e-07, "loss": 0.3569, "step": 2629 }, { "epoch": 0.47644927536231885, "grad_norm": 13.915672788863972, "learning_rate": 5.623289564662195e-07, "loss": 0.3025, "step": 2630 }, { "epoch": 0.4766304347826087, "grad_norm": 14.618706013861567, "learning_rate": 5.62037846804227e-07, "loss": 0.3708, "step": 2631 }, { "epoch": 0.47681159420289854, "grad_norm": 15.877577208065015, "learning_rate": 5.617467157823099e-07, "loss": 0.382, "step": 2632 }, { "epoch": 0.4769927536231884, "grad_norm": 10.108425075493251, "learning_rate": 5.61455563500706e-07, "loss": 0.2957, "step": 2633 }, { "epoch": 0.4771739130434783, "grad_norm": 3.492586410796111, "learning_rate": 5.611643900596604e-07, "loss": 0.3059, "step": 2634 }, { "epoch": 0.4773550724637681, "grad_norm": 4.880506232723462, "learning_rate": 5.608731955594255e-07, "loss": 0.2765, "step": 2635 }, { "epoch": 0.47753623188405797, "grad_norm": 2.938274308863043, "learning_rate": 5.605819801002608e-07, "loss": 0.2414, "step": 2636 }, { "epoch": 0.4777173913043478, "grad_norm": 5.748849209465617, "learning_rate": 5.602907437824335e-07, "loss": 0.3214, "step": 2637 }, { "epoch": 0.47789855072463766, "grad_norm": 7.629622573469101, "learning_rate": 5.599994867062173e-07, "loss": 0.31, "step": 2638 }, { "epoch": 0.47807971014492756, "grad_norm": 4.807859565732472, "learning_rate": 5.597082089718937e-07, "loss": 0.3092, "step": 2639 }, { "epoch": 0.4782608695652174, "grad_norm": 5.942727586030775, "learning_rate": 5.594169106797507e-07, "loss": 0.3688, "step": 2640 }, { "epoch": 0.47844202898550725, "grad_norm": 12.050696095031252, "learning_rate": 5.591255919300839e-07, "loss": 0.3134, "step": 2641 }, { "epoch": 0.4786231884057971, "grad_norm": 5.415935043959021, "learning_rate": 5.588342528231956e-07, "loss": 0.3516, "step": 2642 }, { "epoch": 0.47880434782608694, "grad_norm": 3.415654005211583, "learning_rate": 5.585428934593954e-07, "loss": 0.2853, "step": 2643 }, { "epoch": 0.4789855072463768, "grad_norm": 9.16039898259828, "learning_rate": 5.582515139389995e-07, "loss": 0.3593, "step": 2644 }, { "epoch": 0.4791666666666667, "grad_norm": 6.434642097986956, "learning_rate": 5.579601143623314e-07, "loss": 0.3035, "step": 2645 }, { "epoch": 0.47934782608695653, "grad_norm": 13.360594188126193, "learning_rate": 5.576686948297213e-07, "loss": 0.3365, "step": 2646 }, { "epoch": 0.4795289855072464, "grad_norm": 9.596711866869823, "learning_rate": 5.573772554415064e-07, "loss": 0.2995, "step": 2647 }, { "epoch": 0.4797101449275362, "grad_norm": 9.047410781720341, "learning_rate": 5.570857962980305e-07, "loss": 0.2821, "step": 2648 }, { "epoch": 0.47989130434782606, "grad_norm": 7.026168666018671, "learning_rate": 5.567943174996444e-07, "loss": 0.2991, "step": 2649 }, { "epoch": 0.48007246376811596, "grad_norm": 3.7208179823506784, "learning_rate": 5.565028191467057e-07, "loss": 0.3339, "step": 2650 }, { "epoch": 0.4802536231884058, "grad_norm": 10.732073571208474, "learning_rate": 5.562113013395789e-07, "loss": 0.3353, "step": 2651 }, { "epoch": 0.48043478260869565, "grad_norm": 3.7077811782930774, "learning_rate": 5.559197641786344e-07, "loss": 0.3303, "step": 2652 }, { "epoch": 0.4806159420289855, "grad_norm": 4.754087645243929, "learning_rate": 5.556282077642504e-07, "loss": 0.2982, "step": 2653 }, { "epoch": 0.48079710144927534, "grad_norm": 7.1514453187911, "learning_rate": 5.553366321968107e-07, "loss": 0.2945, "step": 2654 }, { "epoch": 0.48097826086956524, "grad_norm": 6.644085111264358, "learning_rate": 5.550450375767064e-07, "loss": 0.2902, "step": 2655 }, { "epoch": 0.4811594202898551, "grad_norm": 7.535968497184428, "learning_rate": 5.547534240043349e-07, "loss": 0.3392, "step": 2656 }, { "epoch": 0.48134057971014493, "grad_norm": 3.5069523855932556, "learning_rate": 5.544617915801e-07, "loss": 0.2854, "step": 2657 }, { "epoch": 0.4815217391304348, "grad_norm": 6.992167105541255, "learning_rate": 5.541701404044122e-07, "loss": 0.3161, "step": 2658 }, { "epoch": 0.4817028985507246, "grad_norm": 4.306855989262285, "learning_rate": 5.538784705776886e-07, "loss": 0.2746, "step": 2659 }, { "epoch": 0.48188405797101447, "grad_norm": 3.849789750717037, "learning_rate": 5.535867822003521e-07, "loss": 0.3391, "step": 2660 }, { "epoch": 0.48206521739130437, "grad_norm": 5.490840859226864, "learning_rate": 5.532950753728325e-07, "loss": 0.2808, "step": 2661 }, { "epoch": 0.4822463768115942, "grad_norm": 8.980019169542674, "learning_rate": 5.530033501955662e-07, "loss": 0.2883, "step": 2662 }, { "epoch": 0.48242753623188406, "grad_norm": 2.9911500066400216, "learning_rate": 5.527116067689951e-07, "loss": 0.2689, "step": 2663 }, { "epoch": 0.4826086956521739, "grad_norm": 4.509846198833672, "learning_rate": 5.524198451935682e-07, "loss": 0.2955, "step": 2664 }, { "epoch": 0.48278985507246375, "grad_norm": 5.2834549244991305, "learning_rate": 5.5212806556974e-07, "loss": 0.338, "step": 2665 }, { "epoch": 0.48297101449275365, "grad_norm": 3.817141454906933, "learning_rate": 5.51836267997972e-07, "loss": 0.3278, "step": 2666 }, { "epoch": 0.4831521739130435, "grad_norm": 7.783635165410847, "learning_rate": 5.515444525787313e-07, "loss": 0.3436, "step": 2667 }, { "epoch": 0.48333333333333334, "grad_norm": 8.629770650834422, "learning_rate": 5.512526194124914e-07, "loss": 0.3251, "step": 2668 }, { "epoch": 0.4835144927536232, "grad_norm": 4.83390122724991, "learning_rate": 5.509607685997316e-07, "loss": 0.3123, "step": 2669 }, { "epoch": 0.483695652173913, "grad_norm": 4.894794140233111, "learning_rate": 5.50668900240938e-07, "loss": 0.2605, "step": 2670 }, { "epoch": 0.48387681159420287, "grad_norm": 3.4730321680561484, "learning_rate": 5.503770144366018e-07, "loss": 0.2723, "step": 2671 }, { "epoch": 0.48405797101449277, "grad_norm": 3.0638806487271766, "learning_rate": 5.500851112872207e-07, "loss": 0.2955, "step": 2672 }, { "epoch": 0.4842391304347826, "grad_norm": 3.7593535571090975, "learning_rate": 5.497931908932988e-07, "loss": 0.3083, "step": 2673 }, { "epoch": 0.48442028985507246, "grad_norm": 11.062331823031327, "learning_rate": 5.495012533553452e-07, "loss": 0.3174, "step": 2674 }, { "epoch": 0.4846014492753623, "grad_norm": 5.575073713322108, "learning_rate": 5.492092987738756e-07, "loss": 0.2735, "step": 2675 }, { "epoch": 0.48478260869565215, "grad_norm": 6.716159322376339, "learning_rate": 5.489173272494112e-07, "loss": 0.3295, "step": 2676 }, { "epoch": 0.48496376811594205, "grad_norm": 3.8029670925725387, "learning_rate": 5.486253388824791e-07, "loss": 0.3206, "step": 2677 }, { "epoch": 0.4851449275362319, "grad_norm": 7.328418064770815, "learning_rate": 5.483333337736127e-07, "loss": 0.2246, "step": 2678 }, { "epoch": 0.48532608695652174, "grad_norm": 4.1349575904903295, "learning_rate": 5.480413120233503e-07, "loss": 0.3096, "step": 2679 }, { "epoch": 0.4855072463768116, "grad_norm": 7.1317055532675155, "learning_rate": 5.477492737322366e-07, "loss": 0.3323, "step": 2680 }, { "epoch": 0.48568840579710143, "grad_norm": 5.435014203979758, "learning_rate": 5.474572190008217e-07, "loss": 0.2565, "step": 2681 }, { "epoch": 0.48586956521739133, "grad_norm": 4.490403980577365, "learning_rate": 5.471651479296616e-07, "loss": 0.2511, "step": 2682 }, { "epoch": 0.4860507246376812, "grad_norm": 3.496916476105908, "learning_rate": 5.468730606193174e-07, "loss": 0.297, "step": 2683 }, { "epoch": 0.486231884057971, "grad_norm": 4.435615242357741, "learning_rate": 5.465809571703564e-07, "loss": 0.3972, "step": 2684 }, { "epoch": 0.48641304347826086, "grad_norm": 4.05157110261667, "learning_rate": 5.462888376833509e-07, "loss": 0.2606, "step": 2685 }, { "epoch": 0.4865942028985507, "grad_norm": 8.997093826612808, "learning_rate": 5.459967022588797e-07, "loss": 0.3108, "step": 2686 }, { "epoch": 0.48677536231884055, "grad_norm": 6.653451282453933, "learning_rate": 5.457045509975256e-07, "loss": 0.3045, "step": 2687 }, { "epoch": 0.48695652173913045, "grad_norm": 4.380012813074611, "learning_rate": 5.454123839998784e-07, "loss": 0.3346, "step": 2688 }, { "epoch": 0.4871376811594203, "grad_norm": 5.08012002303374, "learning_rate": 5.451202013665319e-07, "loss": 0.2602, "step": 2689 }, { "epoch": 0.48731884057971014, "grad_norm": 4.677514818883332, "learning_rate": 5.448280031980865e-07, "loss": 0.297, "step": 2690 }, { "epoch": 0.4875, "grad_norm": 3.939079055787215, "learning_rate": 5.445357895951471e-07, "loss": 0.2489, "step": 2691 }, { "epoch": 0.48768115942028983, "grad_norm": 3.6384946991565017, "learning_rate": 5.442435606583243e-07, "loss": 0.2826, "step": 2692 }, { "epoch": 0.48786231884057973, "grad_norm": 3.4208001965138912, "learning_rate": 5.439513164882339e-07, "loss": 0.3057, "step": 2693 }, { "epoch": 0.4880434782608696, "grad_norm": 4.241873975138517, "learning_rate": 5.436590571854973e-07, "loss": 0.3235, "step": 2694 }, { "epoch": 0.4882246376811594, "grad_norm": 3.523296992085853, "learning_rate": 5.4336678285074e-07, "loss": 0.2848, "step": 2695 }, { "epoch": 0.48840579710144927, "grad_norm": 3.9136402428348176, "learning_rate": 5.430744935845941e-07, "loss": 0.3137, "step": 2696 }, { "epoch": 0.4885869565217391, "grad_norm": 4.002356665702272, "learning_rate": 5.42782189487696e-07, "loss": 0.2593, "step": 2697 }, { "epoch": 0.48876811594202896, "grad_norm": 4.317955103919563, "learning_rate": 5.424898706606874e-07, "loss": 0.3594, "step": 2698 }, { "epoch": 0.48894927536231886, "grad_norm": 3.321739899539395, "learning_rate": 5.421975372042149e-07, "loss": 0.2896, "step": 2699 }, { "epoch": 0.4891304347826087, "grad_norm": 5.904949321834864, "learning_rate": 5.419051892189305e-07, "loss": 0.367, "step": 2700 }, { "epoch": 0.4891304347826087, "eval_loss": 0.3039218783378601, "eval_runtime": 9.7551, "eval_samples_per_second": 51.255, "eval_steps_per_second": 0.103, "step": 2700 }, { "epoch": 0.48931159420289855, "grad_norm": 4.45579202305939, "learning_rate": 5.416128268054907e-07, "loss": 0.2626, "step": 2701 }, { "epoch": 0.4894927536231884, "grad_norm": 6.308609420590981, "learning_rate": 5.413204500645576e-07, "loss": 0.2723, "step": 2702 }, { "epoch": 0.48967391304347824, "grad_norm": 3.5778652091022485, "learning_rate": 5.410280590967978e-07, "loss": 0.3099, "step": 2703 }, { "epoch": 0.48985507246376814, "grad_norm": 8.508044392263377, "learning_rate": 5.407356540028828e-07, "loss": 0.3034, "step": 2704 }, { "epoch": 0.490036231884058, "grad_norm": 5.734873014248094, "learning_rate": 5.404432348834892e-07, "loss": 0.2523, "step": 2705 }, { "epoch": 0.4902173913043478, "grad_norm": 4.525846647957305, "learning_rate": 5.40150801839298e-07, "loss": 0.2657, "step": 2706 }, { "epoch": 0.49039855072463767, "grad_norm": 5.521035150051582, "learning_rate": 5.398583549709957e-07, "loss": 0.2695, "step": 2707 }, { "epoch": 0.4905797101449275, "grad_norm": 4.882680052252439, "learning_rate": 5.395658943792729e-07, "loss": 0.2752, "step": 2708 }, { "epoch": 0.4907608695652174, "grad_norm": 6.149246453969823, "learning_rate": 5.392734201648251e-07, "loss": 0.2829, "step": 2709 }, { "epoch": 0.49094202898550726, "grad_norm": 4.624701767646452, "learning_rate": 5.389809324283528e-07, "loss": 0.3101, "step": 2710 }, { "epoch": 0.4911231884057971, "grad_norm": 3.2727847471012192, "learning_rate": 5.386884312705607e-07, "loss": 0.29, "step": 2711 }, { "epoch": 0.49130434782608695, "grad_norm": 4.455545016255246, "learning_rate": 5.383959167921584e-07, "loss": 0.4018, "step": 2712 }, { "epoch": 0.4914855072463768, "grad_norm": 6.838452853084961, "learning_rate": 5.3810338909386e-07, "loss": 0.3067, "step": 2713 }, { "epoch": 0.49166666666666664, "grad_norm": 7.775671180613394, "learning_rate": 5.378108482763841e-07, "loss": 0.3201, "step": 2714 }, { "epoch": 0.49184782608695654, "grad_norm": 4.462140202625592, "learning_rate": 5.375182944404542e-07, "loss": 0.3137, "step": 2715 }, { "epoch": 0.4920289855072464, "grad_norm": 4.687554638196434, "learning_rate": 5.372257276867977e-07, "loss": 0.2755, "step": 2716 }, { "epoch": 0.49221014492753623, "grad_norm": 7.577957605634497, "learning_rate": 5.369331481161468e-07, "loss": 0.2793, "step": 2717 }, { "epoch": 0.4923913043478261, "grad_norm": 4.467344434683192, "learning_rate": 5.366405558292379e-07, "loss": 0.266, "step": 2718 }, { "epoch": 0.4925724637681159, "grad_norm": 4.072289834852524, "learning_rate": 5.36347950926812e-07, "loss": 0.3357, "step": 2719 }, { "epoch": 0.4927536231884058, "grad_norm": 4.842289431907543, "learning_rate": 5.360553335096144e-07, "loss": 0.3132, "step": 2720 }, { "epoch": 0.49293478260869567, "grad_norm": 4.192826672463043, "learning_rate": 5.357627036783949e-07, "loss": 0.2427, "step": 2721 }, { "epoch": 0.4931159420289855, "grad_norm": 5.315459785000078, "learning_rate": 5.354700615339067e-07, "loss": 0.3047, "step": 2722 }, { "epoch": 0.49329710144927535, "grad_norm": 7.950173376612212, "learning_rate": 5.351774071769084e-07, "loss": 0.2732, "step": 2723 }, { "epoch": 0.4934782608695652, "grad_norm": 4.555633720750855, "learning_rate": 5.348847407081622e-07, "loss": 0.2816, "step": 2724 }, { "epoch": 0.4936594202898551, "grad_norm": 6.59457212048181, "learning_rate": 5.345920622284344e-07, "loss": 0.2854, "step": 2725 }, { "epoch": 0.49384057971014494, "grad_norm": 4.483224534729175, "learning_rate": 5.342993718384958e-07, "loss": 0.2434, "step": 2726 }, { "epoch": 0.4940217391304348, "grad_norm": 8.111939583998177, "learning_rate": 5.340066696391209e-07, "loss": 0.3217, "step": 2727 }, { "epoch": 0.49420289855072463, "grad_norm": 4.168379596516225, "learning_rate": 5.337139557310883e-07, "loss": 0.2947, "step": 2728 }, { "epoch": 0.4943840579710145, "grad_norm": 3.7397990657870896, "learning_rate": 5.334212302151814e-07, "loss": 0.2913, "step": 2729 }, { "epoch": 0.4945652173913043, "grad_norm": 3.537244043476381, "learning_rate": 5.331284931921862e-07, "loss": 0.2637, "step": 2730 }, { "epoch": 0.4947463768115942, "grad_norm": 10.28278894777417, "learning_rate": 5.328357447628941e-07, "loss": 0.2918, "step": 2731 }, { "epoch": 0.49492753623188407, "grad_norm": 4.893977234583997, "learning_rate": 5.325429850280993e-07, "loss": 0.3189, "step": 2732 }, { "epoch": 0.4951086956521739, "grad_norm": 7.677152587780667, "learning_rate": 5.322502140886007e-07, "loss": 0.3112, "step": 2733 }, { "epoch": 0.49528985507246376, "grad_norm": 7.244766671748888, "learning_rate": 5.319574320452007e-07, "loss": 0.2805, "step": 2734 }, { "epoch": 0.4954710144927536, "grad_norm": 7.375741734044889, "learning_rate": 5.316646389987052e-07, "loss": 0.2301, "step": 2735 }, { "epoch": 0.4956521739130435, "grad_norm": 14.714999307588787, "learning_rate": 5.313718350499245e-07, "loss": 0.3243, "step": 2736 }, { "epoch": 0.49583333333333335, "grad_norm": 8.895261043189805, "learning_rate": 5.310790202996723e-07, "loss": 0.3119, "step": 2737 }, { "epoch": 0.4960144927536232, "grad_norm": 4.718628149017444, "learning_rate": 5.307861948487663e-07, "loss": 0.3616, "step": 2738 }, { "epoch": 0.49619565217391304, "grad_norm": 5.052879601517405, "learning_rate": 5.304933587980274e-07, "loss": 0.3123, "step": 2739 }, { "epoch": 0.4963768115942029, "grad_norm": 5.537096583386094, "learning_rate": 5.302005122482808e-07, "loss": 0.34, "step": 2740 }, { "epoch": 0.4965579710144927, "grad_norm": 5.390605984417862, "learning_rate": 5.299076553003545e-07, "loss": 0.2796, "step": 2741 }, { "epoch": 0.4967391304347826, "grad_norm": 11.504103419548, "learning_rate": 5.29614788055081e-07, "loss": 0.2965, "step": 2742 }, { "epoch": 0.4969202898550725, "grad_norm": 4.423570427125164, "learning_rate": 5.293219106132956e-07, "loss": 0.3015, "step": 2743 }, { "epoch": 0.4971014492753623, "grad_norm": 3.7265284481310266, "learning_rate": 5.290290230758373e-07, "loss": 0.3188, "step": 2744 }, { "epoch": 0.49728260869565216, "grad_norm": 3.7518614117350824, "learning_rate": 5.28736125543549e-07, "loss": 0.2914, "step": 2745 }, { "epoch": 0.497463768115942, "grad_norm": 3.5353363783121328, "learning_rate": 5.284432181172763e-07, "loss": 0.2986, "step": 2746 }, { "epoch": 0.4976449275362319, "grad_norm": 8.074210758954466, "learning_rate": 5.281503008978689e-07, "loss": 0.285, "step": 2747 }, { "epoch": 0.49782608695652175, "grad_norm": 4.265055425624807, "learning_rate": 5.278573739861798e-07, "loss": 0.2608, "step": 2748 }, { "epoch": 0.4980072463768116, "grad_norm": 4.28218710607402, "learning_rate": 5.275644374830645e-07, "loss": 0.3041, "step": 2749 }, { "epoch": 0.49818840579710144, "grad_norm": 8.163648954334318, "learning_rate": 5.272714914893829e-07, "loss": 0.301, "step": 2750 }, { "epoch": 0.4983695652173913, "grad_norm": 4.2666614306728885, "learning_rate": 5.269785361059976e-07, "loss": 0.3238, "step": 2751 }, { "epoch": 0.4985507246376812, "grad_norm": 9.276078803223166, "learning_rate": 5.266855714337745e-07, "loss": 0.3673, "step": 2752 }, { "epoch": 0.49873188405797103, "grad_norm": 4.20801313210396, "learning_rate": 5.263925975735826e-07, "loss": 0.2893, "step": 2753 }, { "epoch": 0.4989130434782609, "grad_norm": 4.800150450879598, "learning_rate": 5.260996146262944e-07, "loss": 0.311, "step": 2754 }, { "epoch": 0.4990942028985507, "grad_norm": 6.266777583724696, "learning_rate": 5.258066226927851e-07, "loss": 0.2688, "step": 2755 }, { "epoch": 0.49927536231884057, "grad_norm": 6.216070769103604, "learning_rate": 5.255136218739336e-07, "loss": 0.3215, "step": 2756 }, { "epoch": 0.4994565217391304, "grad_norm": 3.6440029964968574, "learning_rate": 5.25220612270621e-07, "loss": 0.284, "step": 2757 }, { "epoch": 0.4996376811594203, "grad_norm": 7.325266596129861, "learning_rate": 5.249275939837321e-07, "loss": 0.2799, "step": 2758 }, { "epoch": 0.49981884057971016, "grad_norm": 8.400060048053327, "learning_rate": 5.246345671141546e-07, "loss": 0.3279, "step": 2759 }, { "epoch": 0.5, "grad_norm": 6.1671206027410035, "learning_rate": 5.243415317627788e-07, "loss": 0.3204, "step": 2760 }, { "epoch": 0.5001811594202898, "grad_norm": 8.764162152695576, "learning_rate": 5.240484880304983e-07, "loss": 0.3597, "step": 2761 }, { "epoch": 0.5003623188405797, "grad_norm": 3.8247283686088114, "learning_rate": 5.237554360182095e-07, "loss": 0.2886, "step": 2762 }, { "epoch": 0.5005434782608695, "grad_norm": 4.47621729177752, "learning_rate": 5.234623758268113e-07, "loss": 0.3073, "step": 2763 }, { "epoch": 0.5007246376811594, "grad_norm": 3.5599895353965145, "learning_rate": 5.231693075572062e-07, "loss": 0.2818, "step": 2764 }, { "epoch": 0.5009057971014492, "grad_norm": 5.229752367628947, "learning_rate": 5.228762313102985e-07, "loss": 0.3219, "step": 2765 }, { "epoch": 0.5010869565217392, "grad_norm": 7.554121663949298, "learning_rate": 5.225831471869961e-07, "loss": 0.3141, "step": 2766 }, { "epoch": 0.501268115942029, "grad_norm": 6.040960590596064, "learning_rate": 5.222900552882092e-07, "loss": 0.3264, "step": 2767 }, { "epoch": 0.5014492753623189, "grad_norm": 4.87642454573085, "learning_rate": 5.219969557148506e-07, "loss": 0.2887, "step": 2768 }, { "epoch": 0.5016304347826087, "grad_norm": 3.7610477736395014, "learning_rate": 5.217038485678359e-07, "loss": 0.287, "step": 2769 }, { "epoch": 0.5018115942028986, "grad_norm": 3.477214852855607, "learning_rate": 5.214107339480833e-07, "loss": 0.3, "step": 2770 }, { "epoch": 0.5019927536231884, "grad_norm": 3.4792708443862193, "learning_rate": 5.211176119565135e-07, "loss": 0.2627, "step": 2771 }, { "epoch": 0.5021739130434782, "grad_norm": 5.198722844529025, "learning_rate": 5.2082448269405e-07, "loss": 0.3126, "step": 2772 }, { "epoch": 0.5023550724637681, "grad_norm": 7.767051608977658, "learning_rate": 5.205313462616183e-07, "loss": 0.3113, "step": 2773 }, { "epoch": 0.5025362318840579, "grad_norm": 5.481460204094765, "learning_rate": 5.202382027601467e-07, "loss": 0.321, "step": 2774 }, { "epoch": 0.5027173913043478, "grad_norm": 5.298026483632371, "learning_rate": 5.199450522905663e-07, "loss": 0.296, "step": 2775 }, { "epoch": 0.5028985507246376, "grad_norm": 6.528779965297598, "learning_rate": 5.196518949538097e-07, "loss": 0.2764, "step": 2776 }, { "epoch": 0.5030797101449276, "grad_norm": 7.486347283018093, "learning_rate": 5.193587308508126e-07, "loss": 0.2903, "step": 2777 }, { "epoch": 0.5032608695652174, "grad_norm": 7.771786703073877, "learning_rate": 5.190655600825128e-07, "loss": 0.2868, "step": 2778 }, { "epoch": 0.5034420289855073, "grad_norm": 3.5659050589334194, "learning_rate": 5.187723827498502e-07, "loss": 0.1864, "step": 2779 }, { "epoch": 0.5036231884057971, "grad_norm": 6.008823588538369, "learning_rate": 5.184791989537674e-07, "loss": 0.2789, "step": 2780 }, { "epoch": 0.503804347826087, "grad_norm": 7.613597510972579, "learning_rate": 5.181860087952088e-07, "loss": 0.2598, "step": 2781 }, { "epoch": 0.5039855072463768, "grad_norm": 3.3576839466132835, "learning_rate": 5.178928123751211e-07, "loss": 0.2599, "step": 2782 }, { "epoch": 0.5041666666666667, "grad_norm": 7.31152567780526, "learning_rate": 5.175996097944535e-07, "loss": 0.2998, "step": 2783 }, { "epoch": 0.5043478260869565, "grad_norm": 4.578935790793429, "learning_rate": 5.173064011541566e-07, "loss": 0.2965, "step": 2784 }, { "epoch": 0.5045289855072463, "grad_norm": 4.169471430311392, "learning_rate": 5.170131865551841e-07, "loss": 0.3135, "step": 2785 }, { "epoch": 0.5047101449275362, "grad_norm": 7.580570026181968, "learning_rate": 5.167199660984906e-07, "loss": 0.2832, "step": 2786 }, { "epoch": 0.5048913043478261, "grad_norm": 6.040073670942348, "learning_rate": 5.164267398850339e-07, "loss": 0.2553, "step": 2787 }, { "epoch": 0.505072463768116, "grad_norm": 6.67835437416186, "learning_rate": 5.161335080157725e-07, "loss": 0.3483, "step": 2788 }, { "epoch": 0.5052536231884058, "grad_norm": 6.219598029669925, "learning_rate": 5.158402705916679e-07, "loss": 0.3111, "step": 2789 }, { "epoch": 0.5054347826086957, "grad_norm": 6.481701755817264, "learning_rate": 5.155470277136831e-07, "loss": 0.2825, "step": 2790 }, { "epoch": 0.5056159420289855, "grad_norm": 5.70283411828738, "learning_rate": 5.152537794827832e-07, "loss": 0.3415, "step": 2791 }, { "epoch": 0.5057971014492754, "grad_norm": 3.405505719167638, "learning_rate": 5.149605259999344e-07, "loss": 0.2909, "step": 2792 }, { "epoch": 0.5059782608695652, "grad_norm": 4.18580993332098, "learning_rate": 5.146672673661058e-07, "loss": 0.3708, "step": 2793 }, { "epoch": 0.506159420289855, "grad_norm": 7.709719581792903, "learning_rate": 5.143740036822676e-07, "loss": 0.2595, "step": 2794 }, { "epoch": 0.5063405797101449, "grad_norm": 4.388332583582546, "learning_rate": 5.140807350493916e-07, "loss": 0.3013, "step": 2795 }, { "epoch": 0.5065217391304347, "grad_norm": 6.201275091791021, "learning_rate": 5.137874615684521e-07, "loss": 0.333, "step": 2796 }, { "epoch": 0.5067028985507246, "grad_norm": 3.939595196659426, "learning_rate": 5.134941833404241e-07, "loss": 0.287, "step": 2797 }, { "epoch": 0.5068840579710145, "grad_norm": 9.636058615048986, "learning_rate": 5.132009004662848e-07, "loss": 0.316, "step": 2798 }, { "epoch": 0.5070652173913044, "grad_norm": 8.779208618325898, "learning_rate": 5.129076130470132e-07, "loss": 0.276, "step": 2799 }, { "epoch": 0.5072463768115942, "grad_norm": 4.381185710594406, "learning_rate": 5.126143211835888e-07, "loss": 0.2783, "step": 2800 }, { "epoch": 0.5072463768115942, "eval_loss": 0.30992186069488525, "eval_runtime": 9.8138, "eval_samples_per_second": 50.948, "eval_steps_per_second": 0.102, "step": 2800 }, { "epoch": 0.5074275362318841, "grad_norm": 4.444935383307834, "learning_rate": 5.123210249769941e-07, "loss": 0.2879, "step": 2801 }, { "epoch": 0.5076086956521739, "grad_norm": 7.0702367080838044, "learning_rate": 5.120277245282122e-07, "loss": 0.3459, "step": 2802 }, { "epoch": 0.5077898550724638, "grad_norm": 9.285804903120146, "learning_rate": 5.117344199382277e-07, "loss": 0.3477, "step": 2803 }, { "epoch": 0.5079710144927536, "grad_norm": 4.512259842940229, "learning_rate": 5.114411113080269e-07, "loss": 0.2756, "step": 2804 }, { "epoch": 0.5081521739130435, "grad_norm": 3.2881963983268223, "learning_rate": 5.111477987385971e-07, "loss": 0.2763, "step": 2805 }, { "epoch": 0.5083333333333333, "grad_norm": 7.287858779094714, "learning_rate": 5.108544823309273e-07, "loss": 0.2863, "step": 2806 }, { "epoch": 0.5085144927536231, "grad_norm": 6.12396532048185, "learning_rate": 5.105611621860081e-07, "loss": 0.2384, "step": 2807 }, { "epoch": 0.508695652173913, "grad_norm": 9.270361915799064, "learning_rate": 5.102678384048304e-07, "loss": 0.2773, "step": 2808 }, { "epoch": 0.508876811594203, "grad_norm": 3.9602652508454104, "learning_rate": 5.099745110883874e-07, "loss": 0.2974, "step": 2809 }, { "epoch": 0.5090579710144928, "grad_norm": 4.000995661088271, "learning_rate": 5.096811803376732e-07, "loss": 0.342, "step": 2810 }, { "epoch": 0.5092391304347826, "grad_norm": 8.50366055980769, "learning_rate": 5.093878462536825e-07, "loss": 0.2723, "step": 2811 }, { "epoch": 0.5094202898550725, "grad_norm": 3.44748965070376, "learning_rate": 5.090945089374119e-07, "loss": 0.296, "step": 2812 }, { "epoch": 0.5096014492753623, "grad_norm": 4.992219720206116, "learning_rate": 5.088011684898587e-07, "loss": 0.2431, "step": 2813 }, { "epoch": 0.5097826086956522, "grad_norm": 5.490216340891855, "learning_rate": 5.085078250120217e-07, "loss": 0.2874, "step": 2814 }, { "epoch": 0.509963768115942, "grad_norm": 9.770944949097256, "learning_rate": 5.082144786049001e-07, "loss": 0.3555, "step": 2815 }, { "epoch": 0.5101449275362319, "grad_norm": 5.034012903524801, "learning_rate": 5.079211293694946e-07, "loss": 0.2713, "step": 2816 }, { "epoch": 0.5103260869565217, "grad_norm": 6.360553422122106, "learning_rate": 5.076277774068067e-07, "loss": 0.29, "step": 2817 }, { "epoch": 0.5105072463768116, "grad_norm": 3.674034260627281, "learning_rate": 5.073344228178391e-07, "loss": 0.3521, "step": 2818 }, { "epoch": 0.5106884057971014, "grad_norm": 4.268537459452379, "learning_rate": 5.070410657035948e-07, "loss": 0.2903, "step": 2819 }, { "epoch": 0.5108695652173914, "grad_norm": 3.52560398046829, "learning_rate": 5.067477061650781e-07, "loss": 0.3023, "step": 2820 }, { "epoch": 0.5110507246376812, "grad_norm": 6.909141459924591, "learning_rate": 5.064543443032944e-07, "loss": 0.2351, "step": 2821 }, { "epoch": 0.511231884057971, "grad_norm": 3.3244077410951816, "learning_rate": 5.061609802192492e-07, "loss": 0.2926, "step": 2822 }, { "epoch": 0.5114130434782609, "grad_norm": 3.3025160531589783, "learning_rate": 5.058676140139495e-07, "loss": 0.2822, "step": 2823 }, { "epoch": 0.5115942028985507, "grad_norm": 3.846351024786474, "learning_rate": 5.055742457884024e-07, "loss": 0.2941, "step": 2824 }, { "epoch": 0.5117753623188406, "grad_norm": 4.159220113731831, "learning_rate": 5.05280875643616e-07, "loss": 0.2748, "step": 2825 }, { "epoch": 0.5119565217391304, "grad_norm": 4.263021700051534, "learning_rate": 5.049875036805994e-07, "loss": 0.3434, "step": 2826 }, { "epoch": 0.5121376811594203, "grad_norm": 3.427479635258739, "learning_rate": 5.046941300003614e-07, "loss": 0.2494, "step": 2827 }, { "epoch": 0.5123188405797101, "grad_norm": 6.250932982838778, "learning_rate": 5.044007547039121e-07, "loss": 0.3705, "step": 2828 }, { "epoch": 0.5125, "grad_norm": 6.1214318923029625, "learning_rate": 5.041073778922622e-07, "loss": 0.2844, "step": 2829 }, { "epoch": 0.5126811594202898, "grad_norm": 4.41172827489127, "learning_rate": 5.038139996664227e-07, "loss": 0.3126, "step": 2830 }, { "epoch": 0.5128623188405798, "grad_norm": 4.791392182353196, "learning_rate": 5.035206201274051e-07, "loss": 0.2914, "step": 2831 }, { "epoch": 0.5130434782608696, "grad_norm": 3.847998232525494, "learning_rate": 5.032272393762211e-07, "loss": 0.3172, "step": 2832 }, { "epoch": 0.5132246376811594, "grad_norm": 4.6133980823835605, "learning_rate": 5.029338575138834e-07, "loss": 0.2607, "step": 2833 }, { "epoch": 0.5134057971014493, "grad_norm": 3.991569445253998, "learning_rate": 5.026404746414048e-07, "loss": 0.3232, "step": 2834 }, { "epoch": 0.5135869565217391, "grad_norm": 6.100076834062837, "learning_rate": 5.02347090859798e-07, "loss": 0.2892, "step": 2835 }, { "epoch": 0.513768115942029, "grad_norm": 5.290232373801101, "learning_rate": 5.020537062700768e-07, "loss": 0.277, "step": 2836 }, { "epoch": 0.5139492753623188, "grad_norm": 3.793760745375536, "learning_rate": 5.017603209732549e-07, "loss": 0.3287, "step": 2837 }, { "epoch": 0.5141304347826087, "grad_norm": 4.965000473966615, "learning_rate": 5.014669350703461e-07, "loss": 0.2634, "step": 2838 }, { "epoch": 0.5143115942028985, "grad_norm": 7.688074758279249, "learning_rate": 5.011735486623646e-07, "loss": 0.2652, "step": 2839 }, { "epoch": 0.5144927536231884, "grad_norm": 5.197366458962598, "learning_rate": 5.008801618503248e-07, "loss": 0.2589, "step": 2840 }, { "epoch": 0.5146739130434783, "grad_norm": 4.162405717601701, "learning_rate": 5.005867747352408e-07, "loss": 0.2465, "step": 2841 }, { "epoch": 0.5148550724637682, "grad_norm": 7.387248082860555, "learning_rate": 5.002933874181279e-07, "loss": 0.3106, "step": 2842 }, { "epoch": 0.515036231884058, "grad_norm": 3.8299961115052894, "learning_rate": 5e-07, "loss": 0.2891, "step": 2843 }, { "epoch": 0.5152173913043478, "grad_norm": 7.082791106352679, "learning_rate": 4.997066125818723e-07, "loss": 0.3214, "step": 2844 }, { "epoch": 0.5153985507246377, "grad_norm": 4.721037829850546, "learning_rate": 4.99413225264759e-07, "loss": 0.3017, "step": 2845 }, { "epoch": 0.5155797101449275, "grad_norm": 4.907335920373676, "learning_rate": 4.991198381496753e-07, "loss": 0.2975, "step": 2846 }, { "epoch": 0.5157608695652174, "grad_norm": 5.846007352970598, "learning_rate": 4.988264513376354e-07, "loss": 0.2214, "step": 2847 }, { "epoch": 0.5159420289855072, "grad_norm": 6.454427299303954, "learning_rate": 4.98533064929654e-07, "loss": 0.3337, "step": 2848 }, { "epoch": 0.5161231884057971, "grad_norm": 6.5084427526921695, "learning_rate": 4.982396790267451e-07, "loss": 0.2596, "step": 2849 }, { "epoch": 0.5163043478260869, "grad_norm": 4.860629882289485, "learning_rate": 4.979462937299232e-07, "loss": 0.3135, "step": 2850 }, { "epoch": 0.5164855072463768, "grad_norm": 9.222953468180826, "learning_rate": 4.976529091402019e-07, "loss": 0.3661, "step": 2851 }, { "epoch": 0.5166666666666667, "grad_norm": 4.652435896338946, "learning_rate": 4.973595253585953e-07, "loss": 0.3105, "step": 2852 }, { "epoch": 0.5168478260869566, "grad_norm": 3.357892891036832, "learning_rate": 4.970661424861165e-07, "loss": 0.2791, "step": 2853 }, { "epoch": 0.5170289855072464, "grad_norm": 3.7458053853401903, "learning_rate": 4.967727606237788e-07, "loss": 0.276, "step": 2854 }, { "epoch": 0.5172101449275363, "grad_norm": 3.730847214688724, "learning_rate": 4.96479379872595e-07, "loss": 0.2246, "step": 2855 }, { "epoch": 0.5173913043478261, "grad_norm": 7.150438128393571, "learning_rate": 4.961860003335774e-07, "loss": 0.3039, "step": 2856 }, { "epoch": 0.5175724637681159, "grad_norm": 3.611701708090848, "learning_rate": 4.958926221077376e-07, "loss": 0.305, "step": 2857 }, { "epoch": 0.5177536231884058, "grad_norm": 4.26438471115421, "learning_rate": 4.955992452960879e-07, "loss": 0.3383, "step": 2858 }, { "epoch": 0.5179347826086956, "grad_norm": 3.5519538361101097, "learning_rate": 4.953058699996388e-07, "loss": 0.3162, "step": 2859 }, { "epoch": 0.5181159420289855, "grad_norm": 5.21780459865734, "learning_rate": 4.950124963194008e-07, "loss": 0.3235, "step": 2860 }, { "epoch": 0.5182971014492753, "grad_norm": 4.694891216443285, "learning_rate": 4.947191243563838e-07, "loss": 0.223, "step": 2861 }, { "epoch": 0.5184782608695652, "grad_norm": 3.4368356884789777, "learning_rate": 4.944257542115975e-07, "loss": 0.2693, "step": 2862 }, { "epoch": 0.5186594202898551, "grad_norm": 4.490206472701619, "learning_rate": 4.941323859860505e-07, "loss": 0.3028, "step": 2863 }, { "epoch": 0.518840579710145, "grad_norm": 10.41683550303921, "learning_rate": 4.938390197807508e-07, "loss": 0.2414, "step": 2864 }, { "epoch": 0.5190217391304348, "grad_norm": 5.534953077913409, "learning_rate": 4.935456556967055e-07, "loss": 0.3218, "step": 2865 }, { "epoch": 0.5192028985507247, "grad_norm": 5.4908036609999735, "learning_rate": 4.932522938349219e-07, "loss": 0.2693, "step": 2866 }, { "epoch": 0.5193840579710145, "grad_norm": 7.792525436579251, "learning_rate": 4.929589342964053e-07, "loss": 0.2638, "step": 2867 }, { "epoch": 0.5195652173913043, "grad_norm": 4.582075008658208, "learning_rate": 4.92665577182161e-07, "loss": 0.286, "step": 2868 }, { "epoch": 0.5197463768115942, "grad_norm": 3.7679544064073807, "learning_rate": 4.923722225931932e-07, "loss": 0.2639, "step": 2869 }, { "epoch": 0.519927536231884, "grad_norm": 7.423896020144309, "learning_rate": 4.920788706305053e-07, "loss": 0.3192, "step": 2870 }, { "epoch": 0.5201086956521739, "grad_norm": 3.5172256413182192, "learning_rate": 4.917855213950999e-07, "loss": 0.233, "step": 2871 }, { "epoch": 0.5202898550724637, "grad_norm": 5.646715112284041, "learning_rate": 4.914921749879784e-07, "loss": 0.2809, "step": 2872 }, { "epoch": 0.5204710144927536, "grad_norm": 3.960817594061969, "learning_rate": 4.911988315101411e-07, "loss": 0.3662, "step": 2873 }, { "epoch": 0.5206521739130435, "grad_norm": 10.285735948656155, "learning_rate": 4.909054910625882e-07, "loss": 0.3069, "step": 2874 }, { "epoch": 0.5208333333333334, "grad_norm": 5.050756744713645, "learning_rate": 4.906121537463176e-07, "loss": 0.3036, "step": 2875 }, { "epoch": 0.5210144927536232, "grad_norm": 5.570303055947805, "learning_rate": 4.90318819662327e-07, "loss": 0.3015, "step": 2876 }, { "epoch": 0.5211956521739131, "grad_norm": 6.726832132065173, "learning_rate": 4.900254889116125e-07, "loss": 0.3845, "step": 2877 }, { "epoch": 0.5213768115942029, "grad_norm": 7.751050382260064, "learning_rate": 4.897321615951695e-07, "loss": 0.2859, "step": 2878 }, { "epoch": 0.5215579710144927, "grad_norm": 4.4863994835212875, "learning_rate": 4.894388378139921e-07, "loss": 0.3872, "step": 2879 }, { "epoch": 0.5217391304347826, "grad_norm": 8.630366966978777, "learning_rate": 4.891455176690725e-07, "loss": 0.296, "step": 2880 }, { "epoch": 0.5219202898550724, "grad_norm": 10.85056205126875, "learning_rate": 4.888522012614029e-07, "loss": 0.2988, "step": 2881 }, { "epoch": 0.5221014492753623, "grad_norm": 5.694373442398479, "learning_rate": 4.885588886919732e-07, "loss": 0.2819, "step": 2882 }, { "epoch": 0.5222826086956521, "grad_norm": 5.260728137507764, "learning_rate": 4.882655800617724e-07, "loss": 0.2535, "step": 2883 }, { "epoch": 0.5224637681159421, "grad_norm": 4.626474486270258, "learning_rate": 4.879722754717878e-07, "loss": 0.2746, "step": 2884 }, { "epoch": 0.5226449275362319, "grad_norm": 6.315266110750342, "learning_rate": 4.876789750230059e-07, "loss": 0.2878, "step": 2885 }, { "epoch": 0.5228260869565218, "grad_norm": 3.415097083621458, "learning_rate": 4.873856788164111e-07, "loss": 0.2577, "step": 2886 }, { "epoch": 0.5230072463768116, "grad_norm": 4.426905421836901, "learning_rate": 4.87092386952987e-07, "loss": 0.3109, "step": 2887 }, { "epoch": 0.5231884057971015, "grad_norm": 4.83881863171699, "learning_rate": 4.867990995337151e-07, "loss": 0.324, "step": 2888 }, { "epoch": 0.5233695652173913, "grad_norm": 3.3212060803866437, "learning_rate": 4.865058166595759e-07, "loss": 0.2217, "step": 2889 }, { "epoch": 0.5235507246376812, "grad_norm": 3.593184184582708, "learning_rate": 4.862125384315479e-07, "loss": 0.2856, "step": 2890 }, { "epoch": 0.523731884057971, "grad_norm": 4.872786154993526, "learning_rate": 4.859192649506084e-07, "loss": 0.3417, "step": 2891 }, { "epoch": 0.5239130434782608, "grad_norm": 3.3607488085082746, "learning_rate": 4.856259963177324e-07, "loss": 0.2466, "step": 2892 }, { "epoch": 0.5240942028985507, "grad_norm": 5.05014734722463, "learning_rate": 4.853327326338942e-07, "loss": 0.2898, "step": 2893 }, { "epoch": 0.5242753623188405, "grad_norm": 4.841080865206514, "learning_rate": 4.850394740000656e-07, "loss": 0.2924, "step": 2894 }, { "epoch": 0.5244565217391305, "grad_norm": 4.7657042041267985, "learning_rate": 4.84746220517217e-07, "loss": 0.3357, "step": 2895 }, { "epoch": 0.5246376811594203, "grad_norm": 9.302743520382093, "learning_rate": 4.844529722863168e-07, "loss": 0.2906, "step": 2896 }, { "epoch": 0.5248188405797102, "grad_norm": 4.685193581166995, "learning_rate": 4.84159729408332e-07, "loss": 0.2639, "step": 2897 }, { "epoch": 0.525, "grad_norm": 10.081003204801586, "learning_rate": 4.838664919842275e-07, "loss": 0.3782, "step": 2898 }, { "epoch": 0.5251811594202899, "grad_norm": 3.865197310034983, "learning_rate": 4.835732601149663e-07, "loss": 0.3392, "step": 2899 }, { "epoch": 0.5253623188405797, "grad_norm": 3.8486182918014973, "learning_rate": 4.832800339015092e-07, "loss": 0.2517, "step": 2900 }, { "epoch": 0.5253623188405797, "eval_loss": 0.30259373784065247, "eval_runtime": 9.848, "eval_samples_per_second": 50.772, "eval_steps_per_second": 0.102, "step": 2900 }, { "epoch": 0.5255434782608696, "grad_norm": 4.706585507984769, "learning_rate": 4.82986813444816e-07, "loss": 0.2884, "step": 2901 }, { "epoch": 0.5257246376811594, "grad_norm": 4.369058975566071, "learning_rate": 4.826935988458433e-07, "loss": 0.291, "step": 2902 }, { "epoch": 0.5259057971014492, "grad_norm": 6.651098391671914, "learning_rate": 4.824003902055466e-07, "loss": 0.3423, "step": 2903 }, { "epoch": 0.5260869565217391, "grad_norm": 3.4080392294999386, "learning_rate": 4.821071876248788e-07, "loss": 0.2741, "step": 2904 }, { "epoch": 0.5262681159420289, "grad_norm": 3.8671285957757413, "learning_rate": 4.818139912047912e-07, "loss": 0.29, "step": 2905 }, { "epoch": 0.5264492753623189, "grad_norm": 6.825199707495563, "learning_rate": 4.815208010462326e-07, "loss": 0.3319, "step": 2906 }, { "epoch": 0.5266304347826087, "grad_norm": 4.097937362276843, "learning_rate": 4.8122761725015e-07, "loss": 0.352, "step": 2907 }, { "epoch": 0.5268115942028986, "grad_norm": 7.949303661327839, "learning_rate": 4.809344399174872e-07, "loss": 0.3299, "step": 2908 }, { "epoch": 0.5269927536231884, "grad_norm": 8.239374923375, "learning_rate": 4.806412691491875e-07, "loss": 0.2705, "step": 2909 }, { "epoch": 0.5271739130434783, "grad_norm": 4.134447589834429, "learning_rate": 4.803481050461903e-07, "loss": 0.2774, "step": 2910 }, { "epoch": 0.5273550724637681, "grad_norm": 3.4807326975889135, "learning_rate": 4.800549477094337e-07, "loss": 0.2727, "step": 2911 }, { "epoch": 0.527536231884058, "grad_norm": 9.416456224930261, "learning_rate": 4.797617972398531e-07, "loss": 0.2947, "step": 2912 }, { "epoch": 0.5277173913043478, "grad_norm": 7.037133986338564, "learning_rate": 4.794686537383817e-07, "loss": 0.285, "step": 2913 }, { "epoch": 0.5278985507246376, "grad_norm": 4.595600062062957, "learning_rate": 4.791755173059501e-07, "loss": 0.2542, "step": 2914 }, { "epoch": 0.5280797101449275, "grad_norm": 3.3442524936434714, "learning_rate": 4.788823880434864e-07, "loss": 0.2851, "step": 2915 }, { "epoch": 0.5282608695652173, "grad_norm": 4.7399865498045015, "learning_rate": 4.785892660519166e-07, "loss": 0.3278, "step": 2916 }, { "epoch": 0.5284420289855073, "grad_norm": 8.944313544806079, "learning_rate": 4.782961514321641e-07, "loss": 0.2914, "step": 2917 }, { "epoch": 0.5286231884057971, "grad_norm": 9.43697220477717, "learning_rate": 4.780030442851495e-07, "loss": 0.2931, "step": 2918 }, { "epoch": 0.528804347826087, "grad_norm": 10.456610248000333, "learning_rate": 4.777099447117907e-07, "loss": 0.3064, "step": 2919 }, { "epoch": 0.5289855072463768, "grad_norm": 5.106965218301173, "learning_rate": 4.774168528130038e-07, "loss": 0.3267, "step": 2920 }, { "epoch": 0.5291666666666667, "grad_norm": 3.435840283644493, "learning_rate": 4.771237686897014e-07, "loss": 0.3195, "step": 2921 }, { "epoch": 0.5293478260869565, "grad_norm": 6.460835795089601, "learning_rate": 4.7683069244279387e-07, "loss": 0.2497, "step": 2922 }, { "epoch": 0.5295289855072464, "grad_norm": 4.364890048962088, "learning_rate": 4.765376241731886e-07, "loss": 0.2616, "step": 2923 }, { "epoch": 0.5297101449275362, "grad_norm": 3.6488425646980183, "learning_rate": 4.7624456398179056e-07, "loss": 0.2552, "step": 2924 }, { "epoch": 0.529891304347826, "grad_norm": 8.556067994898077, "learning_rate": 4.7595151196950173e-07, "loss": 0.3494, "step": 2925 }, { "epoch": 0.5300724637681159, "grad_norm": 8.903265099327312, "learning_rate": 4.756584682372214e-07, "loss": 0.3016, "step": 2926 }, { "epoch": 0.5302536231884057, "grad_norm": 3.3351674195301415, "learning_rate": 4.7536543288584543e-07, "loss": 0.2672, "step": 2927 }, { "epoch": 0.5304347826086957, "grad_norm": 6.522383373178969, "learning_rate": 4.75072406016268e-07, "loss": 0.32, "step": 2928 }, { "epoch": 0.5306159420289855, "grad_norm": 3.912768172411499, "learning_rate": 4.747793877293791e-07, "loss": 0.3214, "step": 2929 }, { "epoch": 0.5307971014492754, "grad_norm": 3.9593986244641233, "learning_rate": 4.7448637812606656e-07, "loss": 0.3168, "step": 2930 }, { "epoch": 0.5309782608695652, "grad_norm": 4.013660105745786, "learning_rate": 4.741933773072148e-07, "loss": 0.2811, "step": 2931 }, { "epoch": 0.5311594202898551, "grad_norm": 6.328394689392958, "learning_rate": 4.739003853737056e-07, "loss": 0.3249, "step": 2932 }, { "epoch": 0.5313405797101449, "grad_norm": 4.128936545453159, "learning_rate": 4.7360740242641737e-07, "loss": 0.2949, "step": 2933 }, { "epoch": 0.5315217391304348, "grad_norm": 7.178584594619719, "learning_rate": 4.7331442856622566e-07, "loss": 0.2908, "step": 2934 }, { "epoch": 0.5317028985507246, "grad_norm": 3.294286165671993, "learning_rate": 4.7302146389400235e-07, "loss": 0.2663, "step": 2935 }, { "epoch": 0.5318840579710145, "grad_norm": 3.842315395751036, "learning_rate": 4.7272850851061715e-07, "loss": 0.2726, "step": 2936 }, { "epoch": 0.5320652173913043, "grad_norm": 4.841177790154437, "learning_rate": 4.7243556251693553e-07, "loss": 0.2632, "step": 2937 }, { "epoch": 0.5322463768115943, "grad_norm": 4.827651120836476, "learning_rate": 4.721426260138204e-07, "loss": 0.3129, "step": 2938 }, { "epoch": 0.5324275362318841, "grad_norm": 4.382323862941937, "learning_rate": 4.7184969910213094e-07, "loss": 0.2488, "step": 2939 }, { "epoch": 0.532608695652174, "grad_norm": 5.221230383386048, "learning_rate": 4.715567818827236e-07, "loss": 0.3083, "step": 2940 }, { "epoch": 0.5327898550724638, "grad_norm": 10.395773574737106, "learning_rate": 4.712638744564511e-07, "loss": 0.2769, "step": 2941 }, { "epoch": 0.5329710144927536, "grad_norm": 5.955584361676708, "learning_rate": 4.709709769241628e-07, "loss": 0.3167, "step": 2942 }, { "epoch": 0.5331521739130435, "grad_norm": 4.997364715528131, "learning_rate": 4.706780893867044e-07, "loss": 0.3279, "step": 2943 }, { "epoch": 0.5333333333333333, "grad_norm": 3.3899341303151282, "learning_rate": 4.703852119449191e-07, "loss": 0.2329, "step": 2944 }, { "epoch": 0.5335144927536232, "grad_norm": 3.837514775848834, "learning_rate": 4.700923446996455e-07, "loss": 0.2893, "step": 2945 }, { "epoch": 0.533695652173913, "grad_norm": 3.3874379575637934, "learning_rate": 4.6979948775171927e-07, "loss": 0.2746, "step": 2946 }, { "epoch": 0.5338768115942029, "grad_norm": 7.888294680935468, "learning_rate": 4.695066412019725e-07, "loss": 0.2999, "step": 2947 }, { "epoch": 0.5340579710144927, "grad_norm": 3.9539066164778367, "learning_rate": 4.692138051512337e-07, "loss": 0.2564, "step": 2948 }, { "epoch": 0.5342391304347827, "grad_norm": 15.137564404946195, "learning_rate": 4.689209797003277e-07, "loss": 0.3544, "step": 2949 }, { "epoch": 0.5344202898550725, "grad_norm": 3.699822810136642, "learning_rate": 4.6862816495007566e-07, "loss": 0.2782, "step": 2950 }, { "epoch": 0.5346014492753624, "grad_norm": 7.664313700060354, "learning_rate": 4.6833536100129477e-07, "loss": 0.2747, "step": 2951 }, { "epoch": 0.5347826086956522, "grad_norm": 4.057222464136469, "learning_rate": 4.680425679547994e-07, "loss": 0.2543, "step": 2952 }, { "epoch": 0.534963768115942, "grad_norm": 8.476141520893258, "learning_rate": 4.6774978591139936e-07, "loss": 0.31, "step": 2953 }, { "epoch": 0.5351449275362319, "grad_norm": 5.740854409432257, "learning_rate": 4.6745701497190053e-07, "loss": 0.3149, "step": 2954 }, { "epoch": 0.5353260869565217, "grad_norm": 6.889381913163191, "learning_rate": 4.6716425523710595e-07, "loss": 0.2866, "step": 2955 }, { "epoch": 0.5355072463768116, "grad_norm": 4.4349145120754185, "learning_rate": 4.668715068078138e-07, "loss": 0.3289, "step": 2956 }, { "epoch": 0.5356884057971014, "grad_norm": 4.993115450699712, "learning_rate": 4.6657876978481877e-07, "loss": 0.4318, "step": 2957 }, { "epoch": 0.5358695652173913, "grad_norm": 5.732098456324761, "learning_rate": 4.6628604426891154e-07, "loss": 0.3144, "step": 2958 }, { "epoch": 0.5360507246376811, "grad_norm": 6.32482046003519, "learning_rate": 4.659933303608792e-07, "loss": 0.2964, "step": 2959 }, { "epoch": 0.5362318840579711, "grad_norm": 5.784427743641433, "learning_rate": 4.6570062816150424e-07, "loss": 0.2787, "step": 2960 }, { "epoch": 0.5364130434782609, "grad_norm": 3.1823569216573877, "learning_rate": 4.6540793777156566e-07, "loss": 0.2928, "step": 2961 }, { "epoch": 0.5365942028985508, "grad_norm": 4.198969777840339, "learning_rate": 4.651152592918377e-07, "loss": 0.3306, "step": 2962 }, { "epoch": 0.5367753623188406, "grad_norm": 4.743275689858857, "learning_rate": 4.648225928230916e-07, "loss": 0.2926, "step": 2963 }, { "epoch": 0.5369565217391304, "grad_norm": 3.315411269492071, "learning_rate": 4.6452993846609336e-07, "loss": 0.219, "step": 2964 }, { "epoch": 0.5371376811594203, "grad_norm": 3.9691781573552305, "learning_rate": 4.642372963216053e-07, "loss": 0.3098, "step": 2965 }, { "epoch": 0.5373188405797101, "grad_norm": 5.76370586122385, "learning_rate": 4.639446664903855e-07, "loss": 0.2178, "step": 2966 }, { "epoch": 0.5375, "grad_norm": 5.660178666093768, "learning_rate": 4.6365204907318796e-07, "loss": 0.3028, "step": 2967 }, { "epoch": 0.5376811594202898, "grad_norm": 3.278949890067449, "learning_rate": 4.633594441707621e-07, "loss": 0.2673, "step": 2968 }, { "epoch": 0.5378623188405797, "grad_norm": 6.324043358926048, "learning_rate": 4.630668518838534e-07, "loss": 0.269, "step": 2969 }, { "epoch": 0.5380434782608695, "grad_norm": 4.201414182476533, "learning_rate": 4.627742723132023e-07, "loss": 0.3378, "step": 2970 }, { "epoch": 0.5382246376811595, "grad_norm": 4.2934541855468, "learning_rate": 4.624817055595458e-07, "loss": 0.2958, "step": 2971 }, { "epoch": 0.5384057971014493, "grad_norm": 12.725856977736433, "learning_rate": 4.6218915172361583e-07, "loss": 0.2684, "step": 2972 }, { "epoch": 0.5385869565217392, "grad_norm": 9.13372364217117, "learning_rate": 4.618966109061401e-07, "loss": 0.3318, "step": 2973 }, { "epoch": 0.538768115942029, "grad_norm": 5.194467951361341, "learning_rate": 4.616040832078416e-07, "loss": 0.2922, "step": 2974 }, { "epoch": 0.5389492753623188, "grad_norm": 3.48682987762208, "learning_rate": 4.6131156872943933e-07, "loss": 0.3163, "step": 2975 }, { "epoch": 0.5391304347826087, "grad_norm": 3.6981277256158083, "learning_rate": 4.610190675716472e-07, "loss": 0.2395, "step": 2976 }, { "epoch": 0.5393115942028985, "grad_norm": 5.070913587930658, "learning_rate": 4.607265798351749e-07, "loss": 0.3273, "step": 2977 }, { "epoch": 0.5394927536231884, "grad_norm": 7.719395187329499, "learning_rate": 4.60434105620727e-07, "loss": 0.3058, "step": 2978 }, { "epoch": 0.5396739130434782, "grad_norm": 13.266257504656563, "learning_rate": 4.6014164502900434e-07, "loss": 0.3113, "step": 2979 }, { "epoch": 0.5398550724637681, "grad_norm": 5.977656331543663, "learning_rate": 4.5984919816070193e-07, "loss": 0.2737, "step": 2980 }, { "epoch": 0.5400362318840579, "grad_norm": 4.61219466513511, "learning_rate": 4.595567651165109e-07, "loss": 0.2808, "step": 2981 }, { "epoch": 0.5402173913043479, "grad_norm": 3.9956499930743594, "learning_rate": 4.5926434599711716e-07, "loss": 0.3135, "step": 2982 }, { "epoch": 0.5403985507246377, "grad_norm": 4.042843240138853, "learning_rate": 4.5897194090320217e-07, "loss": 0.2914, "step": 2983 }, { "epoch": 0.5405797101449276, "grad_norm": 3.949287588623843, "learning_rate": 4.586795499354424e-07, "loss": 0.2986, "step": 2984 }, { "epoch": 0.5407608695652174, "grad_norm": 3.456017304253609, "learning_rate": 4.5838717319450945e-07, "loss": 0.3401, "step": 2985 }, { "epoch": 0.5409420289855073, "grad_norm": 3.1839016554398185, "learning_rate": 4.5809481078106954e-07, "loss": 0.2473, "step": 2986 }, { "epoch": 0.5411231884057971, "grad_norm": 8.448942119415877, "learning_rate": 4.578024627957851e-07, "loss": 0.2843, "step": 2987 }, { "epoch": 0.5413043478260869, "grad_norm": 10.884188441341491, "learning_rate": 4.575101293393128e-07, "loss": 0.2931, "step": 2988 }, { "epoch": 0.5414855072463768, "grad_norm": 5.436363730614316, "learning_rate": 4.5721781051230385e-07, "loss": 0.3016, "step": 2989 }, { "epoch": 0.5416666666666666, "grad_norm": 3.903275862626155, "learning_rate": 4.569255064154058e-07, "loss": 0.3121, "step": 2990 }, { "epoch": 0.5418478260869565, "grad_norm": 3.568864628172722, "learning_rate": 4.5663321714925997e-07, "loss": 0.2786, "step": 2991 }, { "epoch": 0.5420289855072464, "grad_norm": 5.482816712868959, "learning_rate": 4.563409428145029e-07, "loss": 0.3961, "step": 2992 }, { "epoch": 0.5422101449275363, "grad_norm": 7.425900947266244, "learning_rate": 4.56048683511766e-07, "loss": 0.2738, "step": 2993 }, { "epoch": 0.5423913043478261, "grad_norm": 4.399882730754699, "learning_rate": 4.5575643934167567e-07, "loss": 0.2886, "step": 2994 }, { "epoch": 0.542572463768116, "grad_norm": 3.722852451943934, "learning_rate": 4.5546421040485295e-07, "loss": 0.327, "step": 2995 }, { "epoch": 0.5427536231884058, "grad_norm": 4.917695344786367, "learning_rate": 4.551719968019136e-07, "loss": 0.2651, "step": 2996 }, { "epoch": 0.5429347826086957, "grad_norm": 7.009359764797192, "learning_rate": 4.54879798633468e-07, "loss": 0.3207, "step": 2997 }, { "epoch": 0.5431159420289855, "grad_norm": 9.242907991912164, "learning_rate": 4.5458761600012174e-07, "loss": 0.3321, "step": 2998 }, { "epoch": 0.5432971014492753, "grad_norm": 3.51820867373752, "learning_rate": 4.5429544900247434e-07, "loss": 0.32, "step": 2999 }, { "epoch": 0.5434782608695652, "grad_norm": 4.33423858985368, "learning_rate": 4.5400329774112043e-07, "loss": 0.3243, "step": 3000 }, { "epoch": 0.5434782608695652, "eval_loss": 0.2858281135559082, "eval_runtime": 9.7799, "eval_samples_per_second": 51.125, "eval_steps_per_second": 0.102, "step": 3000 }, { "epoch": 0.543659420289855, "grad_norm": 3.1264835143066825, "learning_rate": 4.537111623166489e-07, "loss": 0.2181, "step": 3001 }, { "epoch": 0.5438405797101449, "grad_norm": 3.5655919537808867, "learning_rate": 4.5341904282964364e-07, "loss": 0.2283, "step": 3002 }, { "epoch": 0.5440217391304348, "grad_norm": 7.914218311548956, "learning_rate": 4.531269393806827e-07, "loss": 0.3013, "step": 3003 }, { "epoch": 0.5442028985507247, "grad_norm": 3.9144643785650532, "learning_rate": 4.5283485207033866e-07, "loss": 0.3562, "step": 3004 }, { "epoch": 0.5443840579710145, "grad_norm": 4.100928593273769, "learning_rate": 4.525427809991782e-07, "loss": 0.3179, "step": 3005 }, { "epoch": 0.5445652173913044, "grad_norm": 4.668630208425053, "learning_rate": 4.5225072626776345e-07, "loss": 0.3104, "step": 3006 }, { "epoch": 0.5447463768115942, "grad_norm": 4.179568114727412, "learning_rate": 4.519586879766498e-07, "loss": 0.2754, "step": 3007 }, { "epoch": 0.5449275362318841, "grad_norm": 4.029226018148599, "learning_rate": 4.516666662263874e-07, "loss": 0.3084, "step": 3008 }, { "epoch": 0.5451086956521739, "grad_norm": 6.272860036723506, "learning_rate": 4.513746611175208e-07, "loss": 0.2531, "step": 3009 }, { "epoch": 0.5452898550724637, "grad_norm": 3.4541112540288053, "learning_rate": 4.5108267275058887e-07, "loss": 0.2635, "step": 3010 }, { "epoch": 0.5454710144927536, "grad_norm": 3.3428132709717384, "learning_rate": 4.507907012261244e-07, "loss": 0.2941, "step": 3011 }, { "epoch": 0.5456521739130434, "grad_norm": 4.753350387936348, "learning_rate": 4.5049874664465493e-07, "loss": 0.3719, "step": 3012 }, { "epoch": 0.5458333333333333, "grad_norm": 3.514733852595906, "learning_rate": 4.5020680910670114e-07, "loss": 0.2769, "step": 3013 }, { "epoch": 0.5460144927536232, "grad_norm": 7.147529483288572, "learning_rate": 4.4991488871277916e-07, "loss": 0.2406, "step": 3014 }, { "epoch": 0.5461956521739131, "grad_norm": 8.423306894007123, "learning_rate": 4.496229855633983e-07, "loss": 0.3859, "step": 3015 }, { "epoch": 0.5463768115942029, "grad_norm": 7.700930285396802, "learning_rate": 4.4933109975906215e-07, "loss": 0.3139, "step": 3016 }, { "epoch": 0.5465579710144928, "grad_norm": 9.1331181380497, "learning_rate": 4.490392314002683e-07, "loss": 0.3053, "step": 3017 }, { "epoch": 0.5467391304347826, "grad_norm": 4.895184243789536, "learning_rate": 4.487473805875086e-07, "loss": 0.3299, "step": 3018 }, { "epoch": 0.5469202898550725, "grad_norm": 3.6798840734975613, "learning_rate": 4.484555474212687e-07, "loss": 0.333, "step": 3019 }, { "epoch": 0.5471014492753623, "grad_norm": 3.5009996865176602, "learning_rate": 4.481637320020281e-07, "loss": 0.2326, "step": 3020 }, { "epoch": 0.5472826086956522, "grad_norm": 5.293161848635642, "learning_rate": 4.478719344302599e-07, "loss": 0.3563, "step": 3021 }, { "epoch": 0.547463768115942, "grad_norm": 8.159729543469046, "learning_rate": 4.4758015480643187e-07, "loss": 0.3024, "step": 3022 }, { "epoch": 0.5476449275362318, "grad_norm": 6.678209885498957, "learning_rate": 4.4728839323100497e-07, "loss": 0.3936, "step": 3023 }, { "epoch": 0.5478260869565217, "grad_norm": 4.914223685488828, "learning_rate": 4.4699664980443374e-07, "loss": 0.3192, "step": 3024 }, { "epoch": 0.5480072463768116, "grad_norm": 4.741715684114004, "learning_rate": 4.467049246271674e-07, "loss": 0.2931, "step": 3025 }, { "epoch": 0.5481884057971015, "grad_norm": 4.64421152835889, "learning_rate": 4.46413217799648e-07, "loss": 0.2894, "step": 3026 }, { "epoch": 0.5483695652173913, "grad_norm": 4.778274076162452, "learning_rate": 4.4612152942231153e-07, "loss": 0.3031, "step": 3027 }, { "epoch": 0.5485507246376812, "grad_norm": 3.8725403096508915, "learning_rate": 4.458298595955877e-07, "loss": 0.2834, "step": 3028 }, { "epoch": 0.548731884057971, "grad_norm": 6.526856152642174, "learning_rate": 4.455382084199e-07, "loss": 0.339, "step": 3029 }, { "epoch": 0.5489130434782609, "grad_norm": 7.448732740194804, "learning_rate": 4.452465759956651e-07, "loss": 0.3245, "step": 3030 }, { "epoch": 0.5490942028985507, "grad_norm": 4.325473892031156, "learning_rate": 4.4495496242329373e-07, "loss": 0.3671, "step": 3031 }, { "epoch": 0.5492753623188406, "grad_norm": 3.313219598344092, "learning_rate": 4.4466336780318925e-07, "loss": 0.2812, "step": 3032 }, { "epoch": 0.5494565217391304, "grad_norm": 4.170847141322727, "learning_rate": 4.4437179223574975e-07, "loss": 0.2792, "step": 3033 }, { "epoch": 0.5496376811594202, "grad_norm": 3.83450749289122, "learning_rate": 4.440802358213656e-07, "loss": 0.2888, "step": 3034 }, { "epoch": 0.5498188405797102, "grad_norm": 11.602192690816686, "learning_rate": 4.4378869866042123e-07, "loss": 0.3203, "step": 3035 }, { "epoch": 0.55, "grad_norm": 4.080561661575524, "learning_rate": 4.434971808532941e-07, "loss": 0.2663, "step": 3036 }, { "epoch": 0.5501811594202899, "grad_norm": 3.714301638662152, "learning_rate": 4.432056825003555e-07, "loss": 0.2521, "step": 3037 }, { "epoch": 0.5503623188405797, "grad_norm": 4.083970659729972, "learning_rate": 4.4291420370196954e-07, "loss": 0.3152, "step": 3038 }, { "epoch": 0.5505434782608696, "grad_norm": 8.1406301605279, "learning_rate": 4.4262274455849374e-07, "loss": 0.3028, "step": 3039 }, { "epoch": 0.5507246376811594, "grad_norm": 3.6107795952953894, "learning_rate": 4.4233130517027854e-07, "loss": 0.2984, "step": 3040 }, { "epoch": 0.5509057971014493, "grad_norm": 4.940447745353018, "learning_rate": 4.420398856376686e-07, "loss": 0.2968, "step": 3041 }, { "epoch": 0.5510869565217391, "grad_norm": 3.2577852404309553, "learning_rate": 4.417484860610005e-07, "loss": 0.2059, "step": 3042 }, { "epoch": 0.551268115942029, "grad_norm": 3.449398679342599, "learning_rate": 4.4145710654060466e-07, "loss": 0.2029, "step": 3043 }, { "epoch": 0.5514492753623188, "grad_norm": 5.680281589249258, "learning_rate": 4.411657471768043e-07, "loss": 0.3632, "step": 3044 }, { "epoch": 0.5516304347826086, "grad_norm": 5.594986464110723, "learning_rate": 4.4087440806991606e-07, "loss": 0.2917, "step": 3045 }, { "epoch": 0.5518115942028986, "grad_norm": 4.151506151568021, "learning_rate": 4.405830893202493e-07, "loss": 0.2303, "step": 3046 }, { "epoch": 0.5519927536231884, "grad_norm": 4.81399760256334, "learning_rate": 4.402917910281065e-07, "loss": 0.3193, "step": 3047 }, { "epoch": 0.5521739130434783, "grad_norm": 9.003512088687303, "learning_rate": 4.4000051329378256e-07, "loss": 0.2758, "step": 3048 }, { "epoch": 0.5523550724637681, "grad_norm": 10.180209402771974, "learning_rate": 4.397092562175666e-07, "loss": 0.222, "step": 3049 }, { "epoch": 0.552536231884058, "grad_norm": 8.975281306131343, "learning_rate": 4.394180198997392e-07, "loss": 0.2719, "step": 3050 }, { "epoch": 0.5527173913043478, "grad_norm": 9.774435292322675, "learning_rate": 4.3912680444057465e-07, "loss": 0.2654, "step": 3051 }, { "epoch": 0.5528985507246377, "grad_norm": 5.981068924724844, "learning_rate": 4.3883560994033965e-07, "loss": 0.2547, "step": 3052 }, { "epoch": 0.5530797101449275, "grad_norm": 4.057776427574125, "learning_rate": 4.38544436499294e-07, "loss": 0.2628, "step": 3053 }, { "epoch": 0.5532608695652174, "grad_norm": 7.213901341063897, "learning_rate": 4.382532842176901e-07, "loss": 0.3431, "step": 3054 }, { "epoch": 0.5534420289855072, "grad_norm": 5.568143023767482, "learning_rate": 4.3796215319577304e-07, "loss": 0.3236, "step": 3055 }, { "epoch": 0.553623188405797, "grad_norm": 9.110566168045363, "learning_rate": 4.376710435337803e-07, "loss": 0.3753, "step": 3056 }, { "epoch": 0.553804347826087, "grad_norm": 7.904957367726735, "learning_rate": 4.3737995533194285e-07, "loss": 0.2792, "step": 3057 }, { "epoch": 0.5539855072463769, "grad_norm": 9.819636699779448, "learning_rate": 4.3708888869048357e-07, "loss": 0.3073, "step": 3058 }, { "epoch": 0.5541666666666667, "grad_norm": 3.5955363064912467, "learning_rate": 4.3679784370961763e-07, "loss": 0.2136, "step": 3059 }, { "epoch": 0.5543478260869565, "grad_norm": 4.364748613602733, "learning_rate": 4.365068204895539e-07, "loss": 0.2978, "step": 3060 }, { "epoch": 0.5545289855072464, "grad_norm": 4.952137117510628, "learning_rate": 4.362158191304926e-07, "loss": 0.286, "step": 3061 }, { "epoch": 0.5547101449275362, "grad_norm": 4.250720821427454, "learning_rate": 4.35924839732627e-07, "loss": 0.2574, "step": 3062 }, { "epoch": 0.5548913043478261, "grad_norm": 4.096297202278202, "learning_rate": 4.356338823961426e-07, "loss": 0.2784, "step": 3063 }, { "epoch": 0.5550724637681159, "grad_norm": 7.13673607899764, "learning_rate": 4.353429472212175e-07, "loss": 0.2777, "step": 3064 }, { "epoch": 0.5552536231884058, "grad_norm": 4.546895754919055, "learning_rate": 4.350520343080221e-07, "loss": 0.2878, "step": 3065 }, { "epoch": 0.5554347826086956, "grad_norm": 4.661956689875289, "learning_rate": 4.3476114375671904e-07, "loss": 0.2853, "step": 3066 }, { "epoch": 0.5556159420289855, "grad_norm": 3.245026121304084, "learning_rate": 4.3447027566746296e-07, "loss": 0.2377, "step": 3067 }, { "epoch": 0.5557971014492754, "grad_norm": 3.816033137196711, "learning_rate": 4.341794301404019e-07, "loss": 0.3168, "step": 3068 }, { "epoch": 0.5559782608695653, "grad_norm": 5.170592258852042, "learning_rate": 4.338886072756747e-07, "loss": 0.2686, "step": 3069 }, { "epoch": 0.5561594202898551, "grad_norm": 4.193429030165531, "learning_rate": 4.335978071734133e-07, "loss": 0.2838, "step": 3070 }, { "epoch": 0.556340579710145, "grad_norm": 4.252985585484878, "learning_rate": 4.3330702993374136e-07, "loss": 0.3035, "step": 3071 }, { "epoch": 0.5565217391304348, "grad_norm": 4.415642859341208, "learning_rate": 4.330162756567752e-07, "loss": 0.3315, "step": 3072 }, { "epoch": 0.5567028985507246, "grad_norm": 3.373147831717073, "learning_rate": 4.3272554444262265e-07, "loss": 0.3083, "step": 3073 }, { "epoch": 0.5568840579710145, "grad_norm": 3.861033259888325, "learning_rate": 4.32434836391384e-07, "loss": 0.3135, "step": 3074 }, { "epoch": 0.5570652173913043, "grad_norm": 4.139382322849648, "learning_rate": 4.3214415160315096e-07, "loss": 0.2689, "step": 3075 }, { "epoch": 0.5572463768115942, "grad_norm": 3.9494392571429415, "learning_rate": 4.318534901780084e-07, "loss": 0.2547, "step": 3076 }, { "epoch": 0.557427536231884, "grad_norm": 5.836651239042293, "learning_rate": 4.3156285221603195e-07, "loss": 0.3206, "step": 3077 }, { "epoch": 0.5576086956521739, "grad_norm": 5.241781109855, "learning_rate": 4.312722378172898e-07, "loss": 0.316, "step": 3078 }, { "epoch": 0.5577898550724638, "grad_norm": 5.5655640008768055, "learning_rate": 4.309816470818416e-07, "loss": 0.3806, "step": 3079 }, { "epoch": 0.5579710144927537, "grad_norm": 5.1015152634940755, "learning_rate": 4.3069108010973947e-07, "loss": 0.3045, "step": 3080 }, { "epoch": 0.5581521739130435, "grad_norm": 9.555409191887346, "learning_rate": 4.3040053700102685e-07, "loss": 0.3007, "step": 3081 }, { "epoch": 0.5583333333333333, "grad_norm": 4.0169729126319735, "learning_rate": 4.3011001785573927e-07, "loss": 0.3313, "step": 3082 }, { "epoch": 0.5585144927536232, "grad_norm": 4.061341405939489, "learning_rate": 4.298195227739033e-07, "loss": 0.351, "step": 3083 }, { "epoch": 0.558695652173913, "grad_norm": 3.748759139505214, "learning_rate": 4.2952905185553844e-07, "loss": 0.2306, "step": 3084 }, { "epoch": 0.5588768115942029, "grad_norm": 5.1452429073871695, "learning_rate": 4.292386052006549e-07, "loss": 0.248, "step": 3085 }, { "epoch": 0.5590579710144927, "grad_norm": 8.889189688543281, "learning_rate": 4.2894818290925483e-07, "loss": 0.2637, "step": 3086 }, { "epoch": 0.5592391304347826, "grad_norm": 5.078518650418005, "learning_rate": 4.28657785081332e-07, "loss": 0.358, "step": 3087 }, { "epoch": 0.5594202898550724, "grad_norm": 8.79729138579761, "learning_rate": 4.283674118168718e-07, "loss": 0.2508, "step": 3088 }, { "epoch": 0.5596014492753624, "grad_norm": 3.944535166883078, "learning_rate": 4.2807706321585115e-07, "loss": 0.3259, "step": 3089 }, { "epoch": 0.5597826086956522, "grad_norm": 3.9694116321997117, "learning_rate": 4.277867393782385e-07, "loss": 0.2987, "step": 3090 }, { "epoch": 0.5599637681159421, "grad_norm": 9.300774582156349, "learning_rate": 4.2749644040399336e-07, "loss": 0.304, "step": 3091 }, { "epoch": 0.5601449275362319, "grad_norm": 4.456884646682889, "learning_rate": 4.272061663930675e-07, "loss": 0.3054, "step": 3092 }, { "epoch": 0.5603260869565218, "grad_norm": 3.6323857273192024, "learning_rate": 4.2691591744540357e-07, "loss": 0.2633, "step": 3093 }, { "epoch": 0.5605072463768116, "grad_norm": 4.924186217588339, "learning_rate": 4.2662569366093525e-07, "loss": 0.2698, "step": 3094 }, { "epoch": 0.5606884057971014, "grad_norm": 4.035031136056453, "learning_rate": 4.2633549513958855e-07, "loss": 0.3078, "step": 3095 }, { "epoch": 0.5608695652173913, "grad_norm": 6.817369011577607, "learning_rate": 4.260453219812798e-07, "loss": 0.2662, "step": 3096 }, { "epoch": 0.5610507246376811, "grad_norm": 9.477026212375621, "learning_rate": 4.2575517428591707e-07, "loss": 0.2849, "step": 3097 }, { "epoch": 0.561231884057971, "grad_norm": 4.0656429152612, "learning_rate": 4.254650521533996e-07, "loss": 0.271, "step": 3098 }, { "epoch": 0.5614130434782608, "grad_norm": 6.312032905329363, "learning_rate": 4.2517495568361776e-07, "loss": 0.3003, "step": 3099 }, { "epoch": 0.5615942028985508, "grad_norm": 4.185878879962339, "learning_rate": 4.2488488497645335e-07, "loss": 0.2839, "step": 3100 }, { "epoch": 0.5615942028985508, "eval_loss": 0.2846718728542328, "eval_runtime": 9.777, "eval_samples_per_second": 51.14, "eval_steps_per_second": 0.102, "step": 3100 }, { "epoch": 0.5617753623188406, "grad_norm": 6.949084863457091, "learning_rate": 4.2459484013177906e-07, "loss": 0.3165, "step": 3101 }, { "epoch": 0.5619565217391305, "grad_norm": 4.592299253828998, "learning_rate": 4.2430482124945816e-07, "loss": 0.2896, "step": 3102 }, { "epoch": 0.5621376811594203, "grad_norm": 3.806262327518691, "learning_rate": 4.240148284293463e-07, "loss": 0.3167, "step": 3103 }, { "epoch": 0.5623188405797102, "grad_norm": 5.257043951174633, "learning_rate": 4.2372486177128903e-07, "loss": 0.3397, "step": 3104 }, { "epoch": 0.5625, "grad_norm": 6.609651408730121, "learning_rate": 4.2343492137512314e-07, "loss": 0.3698, "step": 3105 }, { "epoch": 0.5626811594202898, "grad_norm": 6.137139695731701, "learning_rate": 4.231450073406766e-07, "loss": 0.3414, "step": 3106 }, { "epoch": 0.5628623188405797, "grad_norm": 3.6846949971123313, "learning_rate": 4.2285511976776823e-07, "loss": 0.2878, "step": 3107 }, { "epoch": 0.5630434782608695, "grad_norm": 4.623271243801174, "learning_rate": 4.225652587562076e-07, "loss": 0.2772, "step": 3108 }, { "epoch": 0.5632246376811594, "grad_norm": 5.648686512173944, "learning_rate": 4.2227542440579545e-07, "loss": 0.2966, "step": 3109 }, { "epoch": 0.5634057971014492, "grad_norm": 3.9955644324813777, "learning_rate": 4.2198561681632256e-07, "loss": 0.2767, "step": 3110 }, { "epoch": 0.5635869565217392, "grad_norm": 3.635889960840437, "learning_rate": 4.2169583608757183e-07, "loss": 0.2879, "step": 3111 }, { "epoch": 0.563768115942029, "grad_norm": 3.7507716297553766, "learning_rate": 4.214060823193156e-07, "loss": 0.3352, "step": 3112 }, { "epoch": 0.5639492753623189, "grad_norm": 5.607021049512822, "learning_rate": 4.2111635561131756e-07, "loss": 0.3185, "step": 3113 }, { "epoch": 0.5641304347826087, "grad_norm": 3.431461168725288, "learning_rate": 4.20826656063332e-07, "loss": 0.2626, "step": 3114 }, { "epoch": 0.5643115942028986, "grad_norm": 4.102314089864365, "learning_rate": 4.20536983775104e-07, "loss": 0.3115, "step": 3115 }, { "epoch": 0.5644927536231884, "grad_norm": 5.666883550192323, "learning_rate": 4.20247338846369e-07, "loss": 0.2644, "step": 3116 }, { "epoch": 0.5646739130434782, "grad_norm": 4.139260839997685, "learning_rate": 4.1995772137685317e-07, "loss": 0.2957, "step": 3117 }, { "epoch": 0.5648550724637681, "grad_norm": 4.124176552616789, "learning_rate": 4.196681314662728e-07, "loss": 0.3167, "step": 3118 }, { "epoch": 0.5650362318840579, "grad_norm": 3.479069189641907, "learning_rate": 4.1937856921433574e-07, "loss": 0.2617, "step": 3119 }, { "epoch": 0.5652173913043478, "grad_norm": 4.679231527721626, "learning_rate": 4.190890347207392e-07, "loss": 0.3134, "step": 3120 }, { "epoch": 0.5653985507246376, "grad_norm": 3.2858164223322244, "learning_rate": 4.1879952808517133e-07, "loss": 0.2576, "step": 3121 }, { "epoch": 0.5655797101449276, "grad_norm": 5.965198139648401, "learning_rate": 4.1851004940731054e-07, "loss": 0.219, "step": 3122 }, { "epoch": 0.5657608695652174, "grad_norm": 7.409424627993025, "learning_rate": 4.1822059878682605e-07, "loss": 0.3167, "step": 3123 }, { "epoch": 0.5659420289855073, "grad_norm": 4.864908478326171, "learning_rate": 4.179311763233768e-07, "loss": 0.2312, "step": 3124 }, { "epoch": 0.5661231884057971, "grad_norm": 5.406560086172158, "learning_rate": 4.176417821166125e-07, "loss": 0.3193, "step": 3125 }, { "epoch": 0.566304347826087, "grad_norm": 5.54760518102799, "learning_rate": 4.173524162661726e-07, "loss": 0.2976, "step": 3126 }, { "epoch": 0.5664855072463768, "grad_norm": 12.752484752986526, "learning_rate": 4.170630788716875e-07, "loss": 0.3015, "step": 3127 }, { "epoch": 0.5666666666666667, "grad_norm": 4.5778004716882235, "learning_rate": 4.1677377003277743e-07, "loss": 0.3035, "step": 3128 }, { "epoch": 0.5668478260869565, "grad_norm": 3.919913317470607, "learning_rate": 4.1648448984905234e-07, "loss": 0.2591, "step": 3129 }, { "epoch": 0.5670289855072463, "grad_norm": 3.2804008284376933, "learning_rate": 4.1619523842011343e-07, "loss": 0.2717, "step": 3130 }, { "epoch": 0.5672101449275362, "grad_norm": 5.0904713266390145, "learning_rate": 4.15906015845551e-07, "loss": 0.2885, "step": 3131 }, { "epoch": 0.5673913043478261, "grad_norm": 6.787433047325534, "learning_rate": 4.156168222249456e-07, "loss": 0.289, "step": 3132 }, { "epoch": 0.567572463768116, "grad_norm": 3.135895726086536, "learning_rate": 4.1532765765786807e-07, "loss": 0.2587, "step": 3133 }, { "epoch": 0.5677536231884058, "grad_norm": 5.3886413582760655, "learning_rate": 4.1503852224387934e-07, "loss": 0.2865, "step": 3134 }, { "epoch": 0.5679347826086957, "grad_norm": 6.891292318722531, "learning_rate": 4.1474941608252994e-07, "loss": 0.3004, "step": 3135 }, { "epoch": 0.5681159420289855, "grad_norm": 4.0631171289460575, "learning_rate": 4.144603392733606e-07, "loss": 0.2924, "step": 3136 }, { "epoch": 0.5682971014492754, "grad_norm": 3.971983785321238, "learning_rate": 4.141712919159014e-07, "loss": 0.3422, "step": 3137 }, { "epoch": 0.5684782608695652, "grad_norm": 4.421868693202735, "learning_rate": 4.1388227410967344e-07, "loss": 0.2748, "step": 3138 }, { "epoch": 0.568659420289855, "grad_norm": 3.78304635392667, "learning_rate": 4.135932859541864e-07, "loss": 0.2799, "step": 3139 }, { "epoch": 0.5688405797101449, "grad_norm": 9.240160227832156, "learning_rate": 4.133043275489404e-07, "loss": 0.3167, "step": 3140 }, { "epoch": 0.5690217391304347, "grad_norm": 4.299978197339649, "learning_rate": 4.1301539899342513e-07, "loss": 0.306, "step": 3141 }, { "epoch": 0.5692028985507246, "grad_norm": 6.495885024418353, "learning_rate": 4.127265003871202e-07, "loss": 0.3121, "step": 3142 }, { "epoch": 0.5693840579710145, "grad_norm": 5.930882769486175, "learning_rate": 4.1243763182949475e-07, "loss": 0.2737, "step": 3143 }, { "epoch": 0.5695652173913044, "grad_norm": 3.5745530243430226, "learning_rate": 4.121487934200076e-07, "loss": 0.2837, "step": 3144 }, { "epoch": 0.5697463768115942, "grad_norm": 5.313532692298856, "learning_rate": 4.118599852581068e-07, "loss": 0.3122, "step": 3145 }, { "epoch": 0.5699275362318841, "grad_norm": 3.4633376632436788, "learning_rate": 4.11571207443231e-07, "loss": 0.2672, "step": 3146 }, { "epoch": 0.5701086956521739, "grad_norm": 6.337617029583747, "learning_rate": 4.112824600748074e-07, "loss": 0.2706, "step": 3147 }, { "epoch": 0.5702898550724638, "grad_norm": 3.9319648502173057, "learning_rate": 4.109937432522531e-07, "loss": 0.3105, "step": 3148 }, { "epoch": 0.5704710144927536, "grad_norm": 3.7605660949503164, "learning_rate": 4.1070505707497466e-07, "loss": 0.2592, "step": 3149 }, { "epoch": 0.5706521739130435, "grad_norm": 4.973335691821945, "learning_rate": 4.1041640164236825e-07, "loss": 0.3108, "step": 3150 }, { "epoch": 0.5708333333333333, "grad_norm": 3.956145442108406, "learning_rate": 4.1012777705381917e-07, "loss": 0.2471, "step": 3151 }, { "epoch": 0.5710144927536231, "grad_norm": 4.520781047221492, "learning_rate": 4.098391834087024e-07, "loss": 0.3473, "step": 3152 }, { "epoch": 0.571195652173913, "grad_norm": 5.322092816443009, "learning_rate": 4.095506208063817e-07, "loss": 0.358, "step": 3153 }, { "epoch": 0.571376811594203, "grad_norm": 3.8052133702527535, "learning_rate": 4.0926208934621107e-07, "loss": 0.2829, "step": 3154 }, { "epoch": 0.5715579710144928, "grad_norm": 7.80563711340646, "learning_rate": 4.089735891275329e-07, "loss": 0.2911, "step": 3155 }, { "epoch": 0.5717391304347826, "grad_norm": 4.352110278183092, "learning_rate": 4.086851202496794e-07, "loss": 0.2012, "step": 3156 }, { "epoch": 0.5719202898550725, "grad_norm": 3.5658827225666867, "learning_rate": 4.083966828119714e-07, "loss": 0.2701, "step": 3157 }, { "epoch": 0.5721014492753623, "grad_norm": 4.411114675989049, "learning_rate": 4.0810827691371976e-07, "loss": 0.2751, "step": 3158 }, { "epoch": 0.5722826086956522, "grad_norm": 4.123892333270912, "learning_rate": 4.0781990265422365e-07, "loss": 0.2978, "step": 3159 }, { "epoch": 0.572463768115942, "grad_norm": 3.813536335277768, "learning_rate": 4.0753156013277204e-07, "loss": 0.2974, "step": 3160 }, { "epoch": 0.5726449275362319, "grad_norm": 3.7620965430692532, "learning_rate": 4.07243249448642e-07, "loss": 0.2796, "step": 3161 }, { "epoch": 0.5728260869565217, "grad_norm": 3.565521747323397, "learning_rate": 4.069549707011009e-07, "loss": 0.2118, "step": 3162 }, { "epoch": 0.5730072463768116, "grad_norm": 4.857204202840505, "learning_rate": 4.066667239894043e-07, "loss": 0.3456, "step": 3163 }, { "epoch": 0.5731884057971014, "grad_norm": 3.5528962088365987, "learning_rate": 4.0637850941279686e-07, "loss": 0.2639, "step": 3164 }, { "epoch": 0.5733695652173914, "grad_norm": 4.776114143366503, "learning_rate": 4.06090327070512e-07, "loss": 0.3894, "step": 3165 }, { "epoch": 0.5735507246376812, "grad_norm": 4.018969363195631, "learning_rate": 4.058021770617727e-07, "loss": 0.2462, "step": 3166 }, { "epoch": 0.573731884057971, "grad_norm": 3.288804497553654, "learning_rate": 4.055140594857901e-07, "loss": 0.273, "step": 3167 }, { "epoch": 0.5739130434782609, "grad_norm": 3.9788793824218405, "learning_rate": 4.052259744417643e-07, "loss": 0.2915, "step": 3168 }, { "epoch": 0.5740942028985507, "grad_norm": 8.572059514244884, "learning_rate": 4.049379220288848e-07, "loss": 0.2836, "step": 3169 }, { "epoch": 0.5742753623188406, "grad_norm": 5.01916765722816, "learning_rate": 4.0464990234632914e-07, "loss": 0.3712, "step": 3170 }, { "epoch": 0.5744565217391304, "grad_norm": 3.7980771704558327, "learning_rate": 4.0436191549326393e-07, "loss": 0.2504, "step": 3171 }, { "epoch": 0.5746376811594203, "grad_norm": 8.559109239460847, "learning_rate": 4.0407396156884405e-07, "loss": 0.321, "step": 3172 }, { "epoch": 0.5748188405797101, "grad_norm": 4.741865725066212, "learning_rate": 4.0378604067221406e-07, "loss": 0.2537, "step": 3173 }, { "epoch": 0.575, "grad_norm": 7.071076993744153, "learning_rate": 4.03498152902506e-07, "loss": 0.3358, "step": 3174 }, { "epoch": 0.5751811594202898, "grad_norm": 6.473387412681447, "learning_rate": 4.032102983588411e-07, "loss": 0.2787, "step": 3175 }, { "epoch": 0.5753623188405798, "grad_norm": 6.521075587914448, "learning_rate": 4.0292247714032906e-07, "loss": 0.3716, "step": 3176 }, { "epoch": 0.5755434782608696, "grad_norm": 4.5883114766243045, "learning_rate": 4.0263468934606814e-07, "loss": 0.2586, "step": 3177 }, { "epoch": 0.5757246376811594, "grad_norm": 6.354345728235465, "learning_rate": 4.0234693507514506e-07, "loss": 0.327, "step": 3178 }, { "epoch": 0.5759057971014493, "grad_norm": 7.045053993053714, "learning_rate": 4.02059214426635e-07, "loss": 0.3047, "step": 3179 }, { "epoch": 0.5760869565217391, "grad_norm": 4.349362756969652, "learning_rate": 4.0177152749960106e-07, "loss": 0.3257, "step": 3180 }, { "epoch": 0.576268115942029, "grad_norm": 3.7961314109792093, "learning_rate": 4.0148387439309607e-07, "loss": 0.2623, "step": 3181 }, { "epoch": 0.5764492753623188, "grad_norm": 6.670507682263265, "learning_rate": 4.0119625520615976e-07, "loss": 0.2667, "step": 3182 }, { "epoch": 0.5766304347826087, "grad_norm": 6.710426180940357, "learning_rate": 4.009086700378209e-07, "loss": 0.2349, "step": 3183 }, { "epoch": 0.5768115942028985, "grad_norm": 7.508029539245563, "learning_rate": 4.006211189870964e-07, "loss": 0.3401, "step": 3184 }, { "epoch": 0.5769927536231884, "grad_norm": 7.076363750359712, "learning_rate": 4.003336021529915e-07, "loss": 0.2695, "step": 3185 }, { "epoch": 0.5771739130434783, "grad_norm": 3.7809178451605123, "learning_rate": 4.0004611963449966e-07, "loss": 0.3062, "step": 3186 }, { "epoch": 0.5773550724637682, "grad_norm": 3.5979546910937352, "learning_rate": 3.997586715306026e-07, "loss": 0.2785, "step": 3187 }, { "epoch": 0.577536231884058, "grad_norm": 5.830730515411123, "learning_rate": 3.994712579402695e-07, "loss": 0.2812, "step": 3188 }, { "epoch": 0.5777173913043478, "grad_norm": 7.476645754214058, "learning_rate": 3.991838789624589e-07, "loss": 0.3339, "step": 3189 }, { "epoch": 0.5778985507246377, "grad_norm": 5.417749254434929, "learning_rate": 3.988965346961164e-07, "loss": 0.2789, "step": 3190 }, { "epoch": 0.5780797101449275, "grad_norm": 3.657765745586712, "learning_rate": 3.98609225240176e-07, "loss": 0.2615, "step": 3191 }, { "epoch": 0.5782608695652174, "grad_norm": 5.675044204409349, "learning_rate": 3.983219506935597e-07, "loss": 0.2599, "step": 3192 }, { "epoch": 0.5784420289855072, "grad_norm": 8.459184805088174, "learning_rate": 3.9803471115517756e-07, "loss": 0.2801, "step": 3193 }, { "epoch": 0.5786231884057971, "grad_norm": 4.157611675203298, "learning_rate": 3.9774750672392754e-07, "loss": 0.2852, "step": 3194 }, { "epoch": 0.5788043478260869, "grad_norm": 8.947731097209754, "learning_rate": 3.974603374986956e-07, "loss": 0.3367, "step": 3195 }, { "epoch": 0.5789855072463768, "grad_norm": 8.917631421416088, "learning_rate": 3.9717320357835486e-07, "loss": 0.2678, "step": 3196 }, { "epoch": 0.5791666666666667, "grad_norm": 3.5654023616724633, "learning_rate": 3.968861050617676e-07, "loss": 0.2662, "step": 3197 }, { "epoch": 0.5793478260869566, "grad_norm": 4.001834116139316, "learning_rate": 3.9659904204778304e-07, "loss": 0.3232, "step": 3198 }, { "epoch": 0.5795289855072464, "grad_norm": 3.2428858988458837, "learning_rate": 3.963120146352381e-07, "loss": 0.2375, "step": 3199 }, { "epoch": 0.5797101449275363, "grad_norm": 5.317994623227954, "learning_rate": 3.960250229229577e-07, "loss": 0.2979, "step": 3200 }, { "epoch": 0.5797101449275363, "eval_loss": 0.2931874990463257, "eval_runtime": 9.7575, "eval_samples_per_second": 51.243, "eval_steps_per_second": 0.102, "step": 3200 }, { "epoch": 0.5798913043478261, "grad_norm": 10.632587403830119, "learning_rate": 3.9573806700975475e-07, "loss": 0.363, "step": 3201 }, { "epoch": 0.5800724637681159, "grad_norm": 8.201436610766304, "learning_rate": 3.9545114699442927e-07, "loss": 0.2855, "step": 3202 }, { "epoch": 0.5802536231884058, "grad_norm": 4.847513761989736, "learning_rate": 3.951642629757691e-07, "loss": 0.311, "step": 3203 }, { "epoch": 0.5804347826086956, "grad_norm": 5.402295087246476, "learning_rate": 3.9487741505255e-07, "loss": 0.2665, "step": 3204 }, { "epoch": 0.5806159420289855, "grad_norm": 5.398358882498308, "learning_rate": 3.9459060332353504e-07, "loss": 0.2705, "step": 3205 }, { "epoch": 0.5807971014492753, "grad_norm": 6.826953785054319, "learning_rate": 3.943038278874749e-07, "loss": 0.3216, "step": 3206 }, { "epoch": 0.5809782608695652, "grad_norm": 4.312309292767122, "learning_rate": 3.940170888431073e-07, "loss": 0.2814, "step": 3207 }, { "epoch": 0.5811594202898551, "grad_norm": 4.571435328470545, "learning_rate": 3.9373038628915846e-07, "loss": 0.3012, "step": 3208 }, { "epoch": 0.581340579710145, "grad_norm": 4.6779913648386975, "learning_rate": 3.9344372032434104e-07, "loss": 0.3028, "step": 3209 }, { "epoch": 0.5815217391304348, "grad_norm": 4.379491450121226, "learning_rate": 3.931570910473556e-07, "loss": 0.3233, "step": 3210 }, { "epoch": 0.5817028985507247, "grad_norm": 8.267627570898787, "learning_rate": 3.928704985568898e-07, "loss": 0.3013, "step": 3211 }, { "epoch": 0.5818840579710145, "grad_norm": 4.332113991406994, "learning_rate": 3.925839429516191e-07, "loss": 0.2283, "step": 3212 }, { "epoch": 0.5820652173913043, "grad_norm": 4.032941534913446, "learning_rate": 3.9229742433020575e-07, "loss": 0.2905, "step": 3213 }, { "epoch": 0.5822463768115942, "grad_norm": 3.821066720226336, "learning_rate": 3.9201094279129967e-07, "loss": 0.2689, "step": 3214 }, { "epoch": 0.582427536231884, "grad_norm": 7.352123234074928, "learning_rate": 3.917244984335372e-07, "loss": 0.2378, "step": 3215 }, { "epoch": 0.5826086956521739, "grad_norm": 3.591410503349207, "learning_rate": 3.914380913555434e-07, "loss": 0.3025, "step": 3216 }, { "epoch": 0.5827898550724637, "grad_norm": 4.678012854640022, "learning_rate": 3.911517216559289e-07, "loss": 0.2707, "step": 3217 }, { "epoch": 0.5829710144927536, "grad_norm": 5.782072710550837, "learning_rate": 3.908653894332925e-07, "loss": 0.2901, "step": 3218 }, { "epoch": 0.5831521739130435, "grad_norm": 4.0049765729710085, "learning_rate": 3.905790947862194e-07, "loss": 0.2679, "step": 3219 }, { "epoch": 0.5833333333333334, "grad_norm": 5.717771141483069, "learning_rate": 3.9029283781328255e-07, "loss": 0.3343, "step": 3220 }, { "epoch": 0.5835144927536232, "grad_norm": 4.610237268426266, "learning_rate": 3.9000661861304147e-07, "loss": 0.4015, "step": 3221 }, { "epoch": 0.5836956521739131, "grad_norm": 3.7547520008586686, "learning_rate": 3.897204372840428e-07, "loss": 0.2813, "step": 3222 }, { "epoch": 0.5838768115942029, "grad_norm": 5.66949126602403, "learning_rate": 3.8943429392481987e-07, "loss": 0.3884, "step": 3223 }, { "epoch": 0.5840579710144927, "grad_norm": 5.677565741953224, "learning_rate": 3.8914818863389363e-07, "loss": 0.3382, "step": 3224 }, { "epoch": 0.5842391304347826, "grad_norm": 4.440918277352746, "learning_rate": 3.8886212150977124e-07, "loss": 0.2991, "step": 3225 }, { "epoch": 0.5844202898550724, "grad_norm": 3.7331164079927954, "learning_rate": 3.8857609265094695e-07, "loss": 0.3155, "step": 3226 }, { "epoch": 0.5846014492753623, "grad_norm": 4.1338565102813885, "learning_rate": 3.882901021559018e-07, "loss": 0.3377, "step": 3227 }, { "epoch": 0.5847826086956521, "grad_norm": 3.49638612370747, "learning_rate": 3.8800415012310385e-07, "loss": 0.3029, "step": 3228 }, { "epoch": 0.5849637681159421, "grad_norm": 4.659768814843623, "learning_rate": 3.8771823665100765e-07, "loss": 0.3031, "step": 3229 }, { "epoch": 0.5851449275362319, "grad_norm": 4.260517216937475, "learning_rate": 3.8743236183805467e-07, "loss": 0.2606, "step": 3230 }, { "epoch": 0.5853260869565218, "grad_norm": 5.120662909136054, "learning_rate": 3.871465257826727e-07, "loss": 0.3626, "step": 3231 }, { "epoch": 0.5855072463768116, "grad_norm": 3.3822814763757405, "learning_rate": 3.8686072858327674e-07, "loss": 0.2247, "step": 3232 }, { "epoch": 0.5856884057971015, "grad_norm": 6.219499803703084, "learning_rate": 3.865749703382681e-07, "loss": 0.3559, "step": 3233 }, { "epoch": 0.5858695652173913, "grad_norm": 3.901230265365265, "learning_rate": 3.8628925114603445e-07, "loss": 0.3047, "step": 3234 }, { "epoch": 0.5860507246376812, "grad_norm": 5.789195524685199, "learning_rate": 3.860035711049503e-07, "loss": 0.2838, "step": 3235 }, { "epoch": 0.586231884057971, "grad_norm": 5.212309378126797, "learning_rate": 3.8571793031337683e-07, "loss": 0.2679, "step": 3236 }, { "epoch": 0.5864130434782608, "grad_norm": 6.4838493888046, "learning_rate": 3.854323288696615e-07, "loss": 0.3177, "step": 3237 }, { "epoch": 0.5865942028985507, "grad_norm": 4.238989543633254, "learning_rate": 3.8514676687213805e-07, "loss": 0.2897, "step": 3238 }, { "epoch": 0.5867753623188405, "grad_norm": 4.0237297775096605, "learning_rate": 3.84861244419127e-07, "loss": 0.3091, "step": 3239 }, { "epoch": 0.5869565217391305, "grad_norm": 11.250363171204638, "learning_rate": 3.845757616089351e-07, "loss": 0.3068, "step": 3240 }, { "epoch": 0.5871376811594203, "grad_norm": 4.271114958381544, "learning_rate": 3.842903185398555e-07, "loss": 0.2757, "step": 3241 }, { "epoch": 0.5873188405797102, "grad_norm": 3.363792699061715, "learning_rate": 3.840049153101671e-07, "loss": 0.2961, "step": 3242 }, { "epoch": 0.5875, "grad_norm": 3.4954472309060542, "learning_rate": 3.8371955201813626e-07, "loss": 0.2579, "step": 3243 }, { "epoch": 0.5876811594202899, "grad_norm": 3.586913567068485, "learning_rate": 3.8343422876201453e-07, "loss": 0.2827, "step": 3244 }, { "epoch": 0.5878623188405797, "grad_norm": 5.0360083602088626, "learning_rate": 3.8314894564004014e-07, "loss": 0.2249, "step": 3245 }, { "epoch": 0.5880434782608696, "grad_norm": 9.006627240002917, "learning_rate": 3.8286370275043727e-07, "loss": 0.3461, "step": 3246 }, { "epoch": 0.5882246376811594, "grad_norm": 6.204042962760713, "learning_rate": 3.825785001914167e-07, "loss": 0.341, "step": 3247 }, { "epoch": 0.5884057971014492, "grad_norm": 4.098854608581227, "learning_rate": 3.8229333806117484e-07, "loss": 0.2946, "step": 3248 }, { "epoch": 0.5885869565217391, "grad_norm": 4.243210178727484, "learning_rate": 3.8200821645789453e-07, "loss": 0.2633, "step": 3249 }, { "epoch": 0.5887681159420289, "grad_norm": 5.574360299927119, "learning_rate": 3.81723135479744e-07, "loss": 0.2849, "step": 3250 }, { "epoch": 0.5889492753623189, "grad_norm": 5.278660862481296, "learning_rate": 3.814380952248787e-07, "loss": 0.2842, "step": 3251 }, { "epoch": 0.5891304347826087, "grad_norm": 6.588640102490535, "learning_rate": 3.8115309579143884e-07, "loss": 0.2468, "step": 3252 }, { "epoch": 0.5893115942028986, "grad_norm": 3.50813118412626, "learning_rate": 3.8086813727755116e-07, "loss": 0.2671, "step": 3253 }, { "epoch": 0.5894927536231884, "grad_norm": 3.6360011048964713, "learning_rate": 3.8058321978132825e-07, "loss": 0.2677, "step": 3254 }, { "epoch": 0.5896739130434783, "grad_norm": 3.4689192795021886, "learning_rate": 3.802983434008686e-07, "loss": 0.3187, "step": 3255 }, { "epoch": 0.5898550724637681, "grad_norm": 14.163157871751594, "learning_rate": 3.8001350823425654e-07, "loss": 0.3575, "step": 3256 }, { "epoch": 0.590036231884058, "grad_norm": 4.752879779040167, "learning_rate": 3.7972871437956215e-07, "loss": 0.2869, "step": 3257 }, { "epoch": 0.5902173913043478, "grad_norm": 4.166900134651269, "learning_rate": 3.7944396193484097e-07, "loss": 0.372, "step": 3258 }, { "epoch": 0.5903985507246376, "grad_norm": 3.3734948533573212, "learning_rate": 3.791592509981353e-07, "loss": 0.2589, "step": 3259 }, { "epoch": 0.5905797101449275, "grad_norm": 4.183294737934078, "learning_rate": 3.788745816674718e-07, "loss": 0.3133, "step": 3260 }, { "epoch": 0.5907608695652173, "grad_norm": 5.080898492769002, "learning_rate": 3.785899540408639e-07, "loss": 0.2875, "step": 3261 }, { "epoch": 0.5909420289855073, "grad_norm": 4.528968696106944, "learning_rate": 3.7830536821630986e-07, "loss": 0.3152, "step": 3262 }, { "epoch": 0.5911231884057971, "grad_norm": 4.029363255046118, "learning_rate": 3.780208242917943e-07, "loss": 0.2794, "step": 3263 }, { "epoch": 0.591304347826087, "grad_norm": 3.6929004444166282, "learning_rate": 3.7773632236528687e-07, "loss": 0.258, "step": 3264 }, { "epoch": 0.5914855072463768, "grad_norm": 5.120903405267128, "learning_rate": 3.774518625347429e-07, "loss": 0.3018, "step": 3265 }, { "epoch": 0.5916666666666667, "grad_norm": 6.583937887940884, "learning_rate": 3.7716744489810324e-07, "loss": 0.2676, "step": 3266 }, { "epoch": 0.5918478260869565, "grad_norm": 3.8061779805510025, "learning_rate": 3.768830695532944e-07, "loss": 0.2682, "step": 3267 }, { "epoch": 0.5920289855072464, "grad_norm": 6.667825440545987, "learning_rate": 3.765987365982282e-07, "loss": 0.2851, "step": 3268 }, { "epoch": 0.5922101449275362, "grad_norm": 6.5544563472066955, "learning_rate": 3.7631444613080147e-07, "loss": 0.2962, "step": 3269 }, { "epoch": 0.592391304347826, "grad_norm": 7.032115641658924, "learning_rate": 3.760301982488969e-07, "loss": 0.2695, "step": 3270 }, { "epoch": 0.5925724637681159, "grad_norm": 6.59219998684683, "learning_rate": 3.757459930503826e-07, "loss": 0.2904, "step": 3271 }, { "epoch": 0.5927536231884057, "grad_norm": 4.694711776980366, "learning_rate": 3.754618306331117e-07, "loss": 0.2942, "step": 3272 }, { "epoch": 0.5929347826086957, "grad_norm": 5.809791734644619, "learning_rate": 3.7517771109492233e-07, "loss": 0.2739, "step": 3273 }, { "epoch": 0.5931159420289855, "grad_norm": 6.521801647853962, "learning_rate": 3.7489363453363864e-07, "loss": 0.2362, "step": 3274 }, { "epoch": 0.5932971014492754, "grad_norm": 4.334760103721602, "learning_rate": 3.746096010470693e-07, "loss": 0.3292, "step": 3275 }, { "epoch": 0.5934782608695652, "grad_norm": 4.233674351733492, "learning_rate": 3.743256107330086e-07, "loss": 0.3026, "step": 3276 }, { "epoch": 0.5936594202898551, "grad_norm": 3.6110189239777957, "learning_rate": 3.740416636892352e-07, "loss": 0.2613, "step": 3277 }, { "epoch": 0.5938405797101449, "grad_norm": 6.528239288578401, "learning_rate": 3.737577600135141e-07, "loss": 0.3016, "step": 3278 }, { "epoch": 0.5940217391304348, "grad_norm": 6.194466142336035, "learning_rate": 3.734738998035943e-07, "loss": 0.315, "step": 3279 }, { "epoch": 0.5942028985507246, "grad_norm": 5.2522975418429425, "learning_rate": 3.731900831572103e-07, "loss": 0.3537, "step": 3280 }, { "epoch": 0.5943840579710145, "grad_norm": 5.053300408112575, "learning_rate": 3.729063101720814e-07, "loss": 0.3765, "step": 3281 }, { "epoch": 0.5945652173913043, "grad_norm": 4.08815108337814, "learning_rate": 3.7262258094591224e-07, "loss": 0.2597, "step": 3282 }, { "epoch": 0.5947463768115943, "grad_norm": 6.285926564167949, "learning_rate": 3.723388955763919e-07, "loss": 0.2876, "step": 3283 }, { "epoch": 0.5949275362318841, "grad_norm": 5.681103637436708, "learning_rate": 3.7205525416119487e-07, "loss": 0.3243, "step": 3284 }, { "epoch": 0.595108695652174, "grad_norm": 4.778912246133184, "learning_rate": 3.7177165679797965e-07, "loss": 0.3038, "step": 3285 }, { "epoch": 0.5952898550724638, "grad_norm": 4.792956483475528, "learning_rate": 3.7148810358439095e-07, "loss": 0.2946, "step": 3286 }, { "epoch": 0.5954710144927536, "grad_norm": 3.72172770604181, "learning_rate": 3.71204594618057e-07, "loss": 0.2953, "step": 3287 }, { "epoch": 0.5956521739130435, "grad_norm": 4.1948615320251905, "learning_rate": 3.7092112999659123e-07, "loss": 0.2538, "step": 3288 }, { "epoch": 0.5958333333333333, "grad_norm": 5.343797727190462, "learning_rate": 3.7063770981759203e-07, "loss": 0.2542, "step": 3289 }, { "epoch": 0.5960144927536232, "grad_norm": 3.9454674952969877, "learning_rate": 3.7035433417864224e-07, "loss": 0.2883, "step": 3290 }, { "epoch": 0.596195652173913, "grad_norm": 3.184036996587978, "learning_rate": 3.7007100317730953e-07, "loss": 0.2418, "step": 3291 }, { "epoch": 0.5963768115942029, "grad_norm": 4.508573683342913, "learning_rate": 3.697877169111462e-07, "loss": 0.2541, "step": 3292 }, { "epoch": 0.5965579710144927, "grad_norm": 14.260662825127268, "learning_rate": 3.695044754776885e-07, "loss": 0.3101, "step": 3293 }, { "epoch": 0.5967391304347827, "grad_norm": 5.394422598058214, "learning_rate": 3.6922127897445857e-07, "loss": 0.2617, "step": 3294 }, { "epoch": 0.5969202898550725, "grad_norm": 4.25312159106331, "learning_rate": 3.689381274989618e-07, "loss": 0.261, "step": 3295 }, { "epoch": 0.5971014492753624, "grad_norm": 6.529957747875226, "learning_rate": 3.6865502114868876e-07, "loss": 0.241, "step": 3296 }, { "epoch": 0.5972826086956522, "grad_norm": 8.190660379147307, "learning_rate": 3.683719600211141e-07, "loss": 0.3252, "step": 3297 }, { "epoch": 0.597463768115942, "grad_norm": 3.1977277274683624, "learning_rate": 3.680889442136974e-07, "loss": 0.2196, "step": 3298 }, { "epoch": 0.5976449275362319, "grad_norm": 7.144997125269342, "learning_rate": 3.678059738238822e-07, "loss": 0.3522, "step": 3299 }, { "epoch": 0.5978260869565217, "grad_norm": 4.022176567756693, "learning_rate": 3.675230489490966e-07, "loss": 0.3139, "step": 3300 }, { "epoch": 0.5978260869565217, "eval_loss": 0.2955937385559082, "eval_runtime": 9.7532, "eval_samples_per_second": 51.265, "eval_steps_per_second": 0.103, "step": 3300 }, { "epoch": 0.5980072463768116, "grad_norm": 3.651167428835501, "learning_rate": 3.6724016968675274e-07, "loss": 0.3181, "step": 3301 }, { "epoch": 0.5981884057971014, "grad_norm": 4.020400213111246, "learning_rate": 3.669573361342477e-07, "loss": 0.2268, "step": 3302 }, { "epoch": 0.5983695652173913, "grad_norm": 3.563789639177328, "learning_rate": 3.6667454838896226e-07, "loss": 0.279, "step": 3303 }, { "epoch": 0.5985507246376811, "grad_norm": 3.679358010953408, "learning_rate": 3.663918065482614e-07, "loss": 0.2967, "step": 3304 }, { "epoch": 0.5987318840579711, "grad_norm": 5.381985935358111, "learning_rate": 3.6610911070949453e-07, "loss": 0.2859, "step": 3305 }, { "epoch": 0.5989130434782609, "grad_norm": 10.081364661882816, "learning_rate": 3.6582646096999525e-07, "loss": 0.3456, "step": 3306 }, { "epoch": 0.5990942028985508, "grad_norm": 6.863005718495925, "learning_rate": 3.6554385742708126e-07, "loss": 0.3032, "step": 3307 }, { "epoch": 0.5992753623188406, "grad_norm": 5.2518483261676945, "learning_rate": 3.6526130017805414e-07, "loss": 0.2938, "step": 3308 }, { "epoch": 0.5994565217391304, "grad_norm": 4.557840109538176, "learning_rate": 3.649787893201998e-07, "loss": 0.3504, "step": 3309 }, { "epoch": 0.5996376811594203, "grad_norm": 3.8251551994823965, "learning_rate": 3.646963249507881e-07, "loss": 0.2773, "step": 3310 }, { "epoch": 0.5998188405797101, "grad_norm": 4.748903552679971, "learning_rate": 3.6441390716707286e-07, "loss": 0.3156, "step": 3311 }, { "epoch": 0.6, "grad_norm": 8.469180151323103, "learning_rate": 3.6413153606629153e-07, "loss": 0.2674, "step": 3312 }, { "epoch": 0.6001811594202898, "grad_norm": 5.932796807598475, "learning_rate": 3.638492117456664e-07, "loss": 0.3092, "step": 3313 }, { "epoch": 0.6003623188405797, "grad_norm": 4.780136478246534, "learning_rate": 3.635669343024027e-07, "loss": 0.2219, "step": 3314 }, { "epoch": 0.6005434782608695, "grad_norm": 3.916191481404587, "learning_rate": 3.6328470383368987e-07, "loss": 0.2697, "step": 3315 }, { "epoch": 0.6007246376811595, "grad_norm": 4.0423793769398815, "learning_rate": 3.630025204367012e-07, "loss": 0.2955, "step": 3316 }, { "epoch": 0.6009057971014493, "grad_norm": 4.246249678323975, "learning_rate": 3.6272038420859396e-07, "loss": 0.3051, "step": 3317 }, { "epoch": 0.6010869565217392, "grad_norm": 4.452423326475648, "learning_rate": 3.6243829524650895e-07, "loss": 0.2737, "step": 3318 }, { "epoch": 0.601268115942029, "grad_norm": 3.6839003826844325, "learning_rate": 3.6215625364757063e-07, "loss": 0.2668, "step": 3319 }, { "epoch": 0.6014492753623188, "grad_norm": 5.179098262317639, "learning_rate": 3.6187425950888706e-07, "loss": 0.2559, "step": 3320 }, { "epoch": 0.6016304347826087, "grad_norm": 4.89579337971018, "learning_rate": 3.615923129275507e-07, "loss": 0.2967, "step": 3321 }, { "epoch": 0.6018115942028985, "grad_norm": 4.7918255542702655, "learning_rate": 3.613104140006367e-07, "loss": 0.3235, "step": 3322 }, { "epoch": 0.6019927536231884, "grad_norm": 5.507709040721174, "learning_rate": 3.6102856282520435e-07, "loss": 0.2935, "step": 3323 }, { "epoch": 0.6021739130434782, "grad_norm": 5.366217628216412, "learning_rate": 3.6074675949829603e-07, "loss": 0.3015, "step": 3324 }, { "epoch": 0.6023550724637681, "grad_norm": 7.153041854058611, "learning_rate": 3.604650041169384e-07, "loss": 0.3326, "step": 3325 }, { "epoch": 0.6025362318840579, "grad_norm": 4.36631190057768, "learning_rate": 3.60183296778141e-07, "loss": 0.2353, "step": 3326 }, { "epoch": 0.6027173913043479, "grad_norm": 3.549837351966586, "learning_rate": 3.5990163757889704e-07, "loss": 0.2206, "step": 3327 }, { "epoch": 0.6028985507246377, "grad_norm": 10.07892491028872, "learning_rate": 3.596200266161827e-07, "loss": 0.2715, "step": 3328 }, { "epoch": 0.6030797101449276, "grad_norm": 3.8983570432557104, "learning_rate": 3.593384639869587e-07, "loss": 0.3248, "step": 3329 }, { "epoch": 0.6032608695652174, "grad_norm": 4.489422865055295, "learning_rate": 3.59056949788168e-07, "loss": 0.3404, "step": 3330 }, { "epoch": 0.6034420289855073, "grad_norm": 3.4483135395784945, "learning_rate": 3.587754841167372e-07, "loss": 0.2872, "step": 3331 }, { "epoch": 0.6036231884057971, "grad_norm": 4.235392034039292, "learning_rate": 3.584940670695763e-07, "loss": 0.3454, "step": 3332 }, { "epoch": 0.6038043478260869, "grad_norm": 5.96991656199851, "learning_rate": 3.5821269874357864e-07, "loss": 0.3012, "step": 3333 }, { "epoch": 0.6039855072463768, "grad_norm": 5.049135852776992, "learning_rate": 3.5793137923562053e-07, "loss": 0.262, "step": 3334 }, { "epoch": 0.6041666666666666, "grad_norm": 6.559784801461435, "learning_rate": 3.5765010864256184e-07, "loss": 0.3138, "step": 3335 }, { "epoch": 0.6043478260869565, "grad_norm": 3.6045134388981657, "learning_rate": 3.57368887061245e-07, "loss": 0.3234, "step": 3336 }, { "epoch": 0.6045289855072464, "grad_norm": 4.19751242627348, "learning_rate": 3.570877145884963e-07, "loss": 0.244, "step": 3337 }, { "epoch": 0.6047101449275363, "grad_norm": 3.4859578467470445, "learning_rate": 3.568065913211247e-07, "loss": 0.2568, "step": 3338 }, { "epoch": 0.6048913043478261, "grad_norm": 6.3285829932738435, "learning_rate": 3.5652551735592205e-07, "loss": 0.3026, "step": 3339 }, { "epoch": 0.605072463768116, "grad_norm": 9.85991638979906, "learning_rate": 3.5624449278966347e-07, "loss": 0.3299, "step": 3340 }, { "epoch": 0.6052536231884058, "grad_norm": 7.714432474088771, "learning_rate": 3.559635177191073e-07, "loss": 0.279, "step": 3341 }, { "epoch": 0.6054347826086957, "grad_norm": 3.713120356597635, "learning_rate": 3.556825922409943e-07, "loss": 0.3166, "step": 3342 }, { "epoch": 0.6056159420289855, "grad_norm": 5.902214459588551, "learning_rate": 3.554017164520486e-07, "loss": 0.2979, "step": 3343 }, { "epoch": 0.6057971014492753, "grad_norm": 6.898580287480659, "learning_rate": 3.5512089044897714e-07, "loss": 0.3271, "step": 3344 }, { "epoch": 0.6059782608695652, "grad_norm": 5.063178511010489, "learning_rate": 3.548401143284695e-07, "loss": 0.2301, "step": 3345 }, { "epoch": 0.606159420289855, "grad_norm": 4.25370324712661, "learning_rate": 3.5455938818719843e-07, "loss": 0.3126, "step": 3346 }, { "epoch": 0.6063405797101449, "grad_norm": 5.713186328910256, "learning_rate": 3.542787121218188e-07, "loss": 0.3062, "step": 3347 }, { "epoch": 0.6065217391304348, "grad_norm": 3.3805392598126005, "learning_rate": 3.539980862289693e-07, "loss": 0.2588, "step": 3348 }, { "epoch": 0.6067028985507247, "grad_norm": 4.178821876849478, "learning_rate": 3.5371751060527046e-07, "loss": 0.3023, "step": 3349 }, { "epoch": 0.6068840579710145, "grad_norm": 4.505841378383071, "learning_rate": 3.534369853473258e-07, "loss": 0.2392, "step": 3350 }, { "epoch": 0.6070652173913044, "grad_norm": 5.967333772132723, "learning_rate": 3.5315651055172133e-07, "loss": 0.2878, "step": 3351 }, { "epoch": 0.6072463768115942, "grad_norm": 4.2825410858985515, "learning_rate": 3.528760863150262e-07, "loss": 0.2711, "step": 3352 }, { "epoch": 0.6074275362318841, "grad_norm": 3.6390844389142467, "learning_rate": 3.525957127337916e-07, "loss": 0.2304, "step": 3353 }, { "epoch": 0.6076086956521739, "grad_norm": 4.747535560337662, "learning_rate": 3.523153899045517e-07, "loss": 0.3466, "step": 3354 }, { "epoch": 0.6077898550724637, "grad_norm": 4.37513776994546, "learning_rate": 3.5203511792382246e-07, "loss": 0.2843, "step": 3355 }, { "epoch": 0.6079710144927536, "grad_norm": 4.205680129712524, "learning_rate": 3.5175489688810344e-07, "loss": 0.3276, "step": 3356 }, { "epoch": 0.6081521739130434, "grad_norm": 5.806257279452617, "learning_rate": 3.5147472689387583e-07, "loss": 0.2974, "step": 3357 }, { "epoch": 0.6083333333333333, "grad_norm": 3.9912106927406046, "learning_rate": 3.511946080376034e-07, "loss": 0.2292, "step": 3358 }, { "epoch": 0.6085144927536232, "grad_norm": 5.014533378817344, "learning_rate": 3.5091454041573236e-07, "loss": 0.3342, "step": 3359 }, { "epoch": 0.6086956521739131, "grad_norm": 5.9651598388958105, "learning_rate": 3.5063452412469157e-07, "loss": 0.3226, "step": 3360 }, { "epoch": 0.6088768115942029, "grad_norm": 8.104090091646354, "learning_rate": 3.5035455926089184e-07, "loss": 0.2911, "step": 3361 }, { "epoch": 0.6090579710144928, "grad_norm": 4.41459225803029, "learning_rate": 3.5007464592072643e-07, "loss": 0.3134, "step": 3362 }, { "epoch": 0.6092391304347826, "grad_norm": 3.3807735209487197, "learning_rate": 3.4979478420057057e-07, "loss": 0.2551, "step": 3363 }, { "epoch": 0.6094202898550725, "grad_norm": 4.782780075566693, "learning_rate": 3.495149741967824e-07, "loss": 0.2216, "step": 3364 }, { "epoch": 0.6096014492753623, "grad_norm": 5.241905344890328, "learning_rate": 3.4923521600570173e-07, "loss": 0.2244, "step": 3365 }, { "epoch": 0.6097826086956522, "grad_norm": 3.5897214172554928, "learning_rate": 3.4895550972365035e-07, "loss": 0.2721, "step": 3366 }, { "epoch": 0.609963768115942, "grad_norm": 5.61174062325696, "learning_rate": 3.4867585544693256e-07, "loss": 0.2862, "step": 3367 }, { "epoch": 0.6101449275362318, "grad_norm": 4.371578527908705, "learning_rate": 3.483962532718349e-07, "loss": 0.2991, "step": 3368 }, { "epoch": 0.6103260869565217, "grad_norm": 3.6397045144993094, "learning_rate": 3.4811670329462546e-07, "loss": 0.2785, "step": 3369 }, { "epoch": 0.6105072463768116, "grad_norm": 4.0070612839986754, "learning_rate": 3.478372056115547e-07, "loss": 0.3022, "step": 3370 }, { "epoch": 0.6106884057971015, "grad_norm": 3.41215812125486, "learning_rate": 3.4755776031885496e-07, "loss": 0.2204, "step": 3371 }, { "epoch": 0.6108695652173913, "grad_norm": 3.6570079575541383, "learning_rate": 3.4727836751274073e-07, "loss": 0.3153, "step": 3372 }, { "epoch": 0.6110507246376812, "grad_norm": 4.92218841189247, "learning_rate": 3.4699902728940833e-07, "loss": 0.3091, "step": 3373 }, { "epoch": 0.611231884057971, "grad_norm": 3.918209467061928, "learning_rate": 3.4671973974503565e-07, "loss": 0.2362, "step": 3374 }, { "epoch": 0.6114130434782609, "grad_norm": 4.02918279825564, "learning_rate": 3.464405049757827e-07, "loss": 0.2953, "step": 3375 }, { "epoch": 0.6115942028985507, "grad_norm": 4.6206769906014085, "learning_rate": 3.4616132307779156e-07, "loss": 0.3171, "step": 3376 }, { "epoch": 0.6117753623188406, "grad_norm": 4.665061766891564, "learning_rate": 3.458821941471858e-07, "loss": 0.2837, "step": 3377 }, { "epoch": 0.6119565217391304, "grad_norm": 5.39750229597707, "learning_rate": 3.456031182800708e-07, "loss": 0.302, "step": 3378 }, { "epoch": 0.6121376811594202, "grad_norm": 6.686040640006759, "learning_rate": 3.453240955725336e-07, "loss": 0.3274, "step": 3379 }, { "epoch": 0.6123188405797102, "grad_norm": 7.965186976185531, "learning_rate": 3.450451261206433e-07, "loss": 0.3138, "step": 3380 }, { "epoch": 0.6125, "grad_norm": 7.29154606731725, "learning_rate": 3.4476621002045034e-07, "loss": 0.2838, "step": 3381 }, { "epoch": 0.6126811594202899, "grad_norm": 3.627494328076497, "learning_rate": 3.444873473679866e-07, "loss": 0.2866, "step": 3382 }, { "epoch": 0.6128623188405797, "grad_norm": 4.26359221396759, "learning_rate": 3.442085382592662e-07, "loss": 0.2896, "step": 3383 }, { "epoch": 0.6130434782608696, "grad_norm": 3.2586955889005846, "learning_rate": 3.439297827902841e-07, "loss": 0.2314, "step": 3384 }, { "epoch": 0.6132246376811594, "grad_norm": 3.5385499510280893, "learning_rate": 3.436510810570173e-07, "loss": 0.2329, "step": 3385 }, { "epoch": 0.6134057971014493, "grad_norm": 5.964639570819032, "learning_rate": 3.43372433155424e-07, "loss": 0.2726, "step": 3386 }, { "epoch": 0.6135869565217391, "grad_norm": 3.8224831558381838, "learning_rate": 3.430938391814442e-07, "loss": 0.2848, "step": 3387 }, { "epoch": 0.613768115942029, "grad_norm": 4.646803586653227, "learning_rate": 3.4281529923099895e-07, "loss": 0.2561, "step": 3388 }, { "epoch": 0.6139492753623188, "grad_norm": 4.008871311678018, "learning_rate": 3.4253681339999106e-07, "loss": 0.3173, "step": 3389 }, { "epoch": 0.6141304347826086, "grad_norm": 5.398176800269643, "learning_rate": 3.422583817843041e-07, "loss": 0.2813, "step": 3390 }, { "epoch": 0.6143115942028986, "grad_norm": 4.081504817136662, "learning_rate": 3.41980004479804e-07, "loss": 0.3089, "step": 3391 }, { "epoch": 0.6144927536231884, "grad_norm": 5.69908946404749, "learning_rate": 3.417016815823369e-07, "loss": 0.2864, "step": 3392 }, { "epoch": 0.6146739130434783, "grad_norm": 3.7981437707990136, "learning_rate": 3.414234131877309e-07, "loss": 0.3279, "step": 3393 }, { "epoch": 0.6148550724637681, "grad_norm": 4.090170855014332, "learning_rate": 3.411451993917951e-07, "loss": 0.3269, "step": 3394 }, { "epoch": 0.615036231884058, "grad_norm": 7.195674493614941, "learning_rate": 3.408670402903198e-07, "loss": 0.2845, "step": 3395 }, { "epoch": 0.6152173913043478, "grad_norm": 3.713785620643085, "learning_rate": 3.4058893597907655e-07, "loss": 0.2922, "step": 3396 }, { "epoch": 0.6153985507246377, "grad_norm": 5.029608827363032, "learning_rate": 3.4031088655381803e-07, "loss": 0.2916, "step": 3397 }, { "epoch": 0.6155797101449275, "grad_norm": 3.9481893696522627, "learning_rate": 3.400328921102776e-07, "loss": 0.2956, "step": 3398 }, { "epoch": 0.6157608695652174, "grad_norm": 3.926948761101867, "learning_rate": 3.3975495274417056e-07, "loss": 0.2933, "step": 3399 }, { "epoch": 0.6159420289855072, "grad_norm": 3.448641775407215, "learning_rate": 3.3947706855119263e-07, "loss": 0.2419, "step": 3400 }, { "epoch": 0.6159420289855072, "eval_loss": 0.2915000021457672, "eval_runtime": 9.7516, "eval_samples_per_second": 51.274, "eval_steps_per_second": 0.103, "step": 3400 }, { "epoch": 0.616123188405797, "grad_norm": 5.416204808194034, "learning_rate": 3.391992396270205e-07, "loss": 0.3008, "step": 3401 }, { "epoch": 0.616304347826087, "grad_norm": 3.879247725695068, "learning_rate": 3.3892146606731195e-07, "loss": 0.2998, "step": 3402 }, { "epoch": 0.6164855072463769, "grad_norm": 4.792584029000788, "learning_rate": 3.386437479677059e-07, "loss": 0.2835, "step": 3403 }, { "epoch": 0.6166666666666667, "grad_norm": 10.50501637472616, "learning_rate": 3.3836608542382206e-07, "loss": 0.2781, "step": 3404 }, { "epoch": 0.6168478260869565, "grad_norm": 4.774879211806365, "learning_rate": 3.380884785312608e-07, "loss": 0.3511, "step": 3405 }, { "epoch": 0.6170289855072464, "grad_norm": 4.770318371124587, "learning_rate": 3.3781092738560334e-07, "loss": 0.3533, "step": 3406 }, { "epoch": 0.6172101449275362, "grad_norm": 5.767018152305727, "learning_rate": 3.375334320824122e-07, "loss": 0.227, "step": 3407 }, { "epoch": 0.6173913043478261, "grad_norm": 4.197197355550296, "learning_rate": 3.3725599271723024e-07, "loss": 0.2906, "step": 3408 }, { "epoch": 0.6175724637681159, "grad_norm": 4.090042387105566, "learning_rate": 3.3697860938558107e-07, "loss": 0.2596, "step": 3409 }, { "epoch": 0.6177536231884058, "grad_norm": 6.108399253355486, "learning_rate": 3.36701282182969e-07, "loss": 0.312, "step": 3410 }, { "epoch": 0.6179347826086956, "grad_norm": 3.949792855294822, "learning_rate": 3.3642401120487925e-07, "loss": 0.2729, "step": 3411 }, { "epoch": 0.6181159420289855, "grad_norm": 4.068161218770359, "learning_rate": 3.361467965467775e-07, "loss": 0.2581, "step": 3412 }, { "epoch": 0.6182971014492754, "grad_norm": 7.26254582304241, "learning_rate": 3.3586963830411004e-07, "loss": 0.2687, "step": 3413 }, { "epoch": 0.6184782608695653, "grad_norm": 7.939674765239618, "learning_rate": 3.355925365723037e-07, "loss": 0.2827, "step": 3414 }, { "epoch": 0.6186594202898551, "grad_norm": 4.768702238108111, "learning_rate": 3.3531549144676606e-07, "loss": 0.3147, "step": 3415 }, { "epoch": 0.618840579710145, "grad_norm": 5.081040663047155, "learning_rate": 3.3503850302288517e-07, "loss": 0.2643, "step": 3416 }, { "epoch": 0.6190217391304348, "grad_norm": 5.307761754284609, "learning_rate": 3.347615713960289e-07, "loss": 0.3767, "step": 3417 }, { "epoch": 0.6192028985507246, "grad_norm": 5.7265332134745, "learning_rate": 3.3448469666154687e-07, "loss": 0.2711, "step": 3418 }, { "epoch": 0.6193840579710145, "grad_norm": 5.98983013767756, "learning_rate": 3.3420787891476785e-07, "loss": 0.306, "step": 3419 }, { "epoch": 0.6195652173913043, "grad_norm": 10.665162947850796, "learning_rate": 3.3393111825100176e-07, "loss": 0.2927, "step": 3420 }, { "epoch": 0.6197463768115942, "grad_norm": 10.369888656993997, "learning_rate": 3.3365441476553837e-07, "loss": 0.2486, "step": 3421 }, { "epoch": 0.619927536231884, "grad_norm": 5.319507481340947, "learning_rate": 3.333777685536482e-07, "loss": 0.319, "step": 3422 }, { "epoch": 0.6201086956521739, "grad_norm": 5.341075825958292, "learning_rate": 3.3310117971058184e-07, "loss": 0.2625, "step": 3423 }, { "epoch": 0.6202898550724638, "grad_norm": 5.119010067014903, "learning_rate": 3.3282464833157016e-07, "loss": 0.3404, "step": 3424 }, { "epoch": 0.6204710144927537, "grad_norm": 5.518815813722564, "learning_rate": 3.3254817451182383e-07, "loss": 0.2431, "step": 3425 }, { "epoch": 0.6206521739130435, "grad_norm": 3.8941872483903084, "learning_rate": 3.3227175834653475e-07, "loss": 0.2717, "step": 3426 }, { "epoch": 0.6208333333333333, "grad_norm": 5.739700907654499, "learning_rate": 3.319953999308739e-07, "loss": 0.3448, "step": 3427 }, { "epoch": 0.6210144927536232, "grad_norm": 4.1154490717111045, "learning_rate": 3.317190993599929e-07, "loss": 0.2578, "step": 3428 }, { "epoch": 0.621195652173913, "grad_norm": 7.147933098720609, "learning_rate": 3.3144285672902314e-07, "loss": 0.3057, "step": 3429 }, { "epoch": 0.6213768115942029, "grad_norm": 5.835403803071247, "learning_rate": 3.3116667213307657e-07, "loss": 0.3485, "step": 3430 }, { "epoch": 0.6215579710144927, "grad_norm": 4.378248901854491, "learning_rate": 3.3089054566724474e-07, "loss": 0.2889, "step": 3431 }, { "epoch": 0.6217391304347826, "grad_norm": 3.45538849605132, "learning_rate": 3.306144774265994e-07, "loss": 0.2964, "step": 3432 }, { "epoch": 0.6219202898550724, "grad_norm": 4.898633954388818, "learning_rate": 3.303384675061918e-07, "loss": 0.2666, "step": 3433 }, { "epoch": 0.6221014492753624, "grad_norm": 4.157915349439017, "learning_rate": 3.300625160010538e-07, "loss": 0.3556, "step": 3434 }, { "epoch": 0.6222826086956522, "grad_norm": 4.15883329519004, "learning_rate": 3.297866230061969e-07, "loss": 0.3005, "step": 3435 }, { "epoch": 0.6224637681159421, "grad_norm": 4.612893971422, "learning_rate": 3.295107886166121e-07, "loss": 0.2969, "step": 3436 }, { "epoch": 0.6226449275362319, "grad_norm": 6.407804537293918, "learning_rate": 3.292350129272704e-07, "loss": 0.2571, "step": 3437 }, { "epoch": 0.6228260869565218, "grad_norm": 11.807591402410461, "learning_rate": 3.28959296033123e-07, "loss": 0.2889, "step": 3438 }, { "epoch": 0.6230072463768116, "grad_norm": 6.707847759024104, "learning_rate": 3.2868363802910036e-07, "loss": 0.2947, "step": 3439 }, { "epoch": 0.6231884057971014, "grad_norm": 5.553636979851655, "learning_rate": 3.2840803901011293e-07, "loss": 0.3348, "step": 3440 }, { "epoch": 0.6233695652173913, "grad_norm": 5.267889379745383, "learning_rate": 3.281324990710506e-07, "loss": 0.268, "step": 3441 }, { "epoch": 0.6235507246376811, "grad_norm": 3.4546733257721374, "learning_rate": 3.2785701830678317e-07, "loss": 0.2639, "step": 3442 }, { "epoch": 0.623731884057971, "grad_norm": 4.319401349102515, "learning_rate": 3.2758159681216006e-07, "loss": 0.2735, "step": 3443 }, { "epoch": 0.6239130434782608, "grad_norm": 6.245917005495754, "learning_rate": 3.2730623468201005e-07, "loss": 0.2809, "step": 3444 }, { "epoch": 0.6240942028985508, "grad_norm": 7.218059252158506, "learning_rate": 3.2703093201114164e-07, "loss": 0.3095, "step": 3445 }, { "epoch": 0.6242753623188406, "grad_norm": 6.223554817332476, "learning_rate": 3.2675568889434283e-07, "loss": 0.3175, "step": 3446 }, { "epoch": 0.6244565217391305, "grad_norm": 5.208981718445207, "learning_rate": 3.2648050542638127e-07, "loss": 0.3013, "step": 3447 }, { "epoch": 0.6246376811594203, "grad_norm": 3.643399611839414, "learning_rate": 3.2620538170200384e-07, "loss": 0.3114, "step": 3448 }, { "epoch": 0.6248188405797102, "grad_norm": 9.026695166408295, "learning_rate": 3.259303178159369e-07, "loss": 0.2425, "step": 3449 }, { "epoch": 0.625, "grad_norm": 3.8624549332886042, "learning_rate": 3.2565531386288634e-07, "loss": 0.3045, "step": 3450 }, { "epoch": 0.6251811594202898, "grad_norm": 3.357615249587839, "learning_rate": 3.253803699375374e-07, "loss": 0.2858, "step": 3451 }, { "epoch": 0.6253623188405797, "grad_norm": 4.7049179864879775, "learning_rate": 3.251054861345541e-07, "loss": 0.35, "step": 3452 }, { "epoch": 0.6255434782608695, "grad_norm": 5.497095510356356, "learning_rate": 3.2483066254858094e-07, "loss": 0.343, "step": 3453 }, { "epoch": 0.6257246376811594, "grad_norm": 5.123134396124013, "learning_rate": 3.2455589927424056e-07, "loss": 0.2871, "step": 3454 }, { "epoch": 0.6259057971014492, "grad_norm": 4.461689299945907, "learning_rate": 3.242811964061353e-07, "loss": 0.2745, "step": 3455 }, { "epoch": 0.6260869565217392, "grad_norm": 7.422599470726544, "learning_rate": 3.2400655403884657e-07, "loss": 0.2746, "step": 3456 }, { "epoch": 0.626268115942029, "grad_norm": 10.878100702500891, "learning_rate": 3.237319722669353e-07, "loss": 0.2689, "step": 3457 }, { "epoch": 0.6264492753623189, "grad_norm": 5.8444935654169905, "learning_rate": 3.2345745118494105e-07, "loss": 0.2494, "step": 3458 }, { "epoch": 0.6266304347826087, "grad_norm": 3.3332809025468886, "learning_rate": 3.2318299088738306e-07, "loss": 0.249, "step": 3459 }, { "epoch": 0.6268115942028986, "grad_norm": 11.392629740906484, "learning_rate": 3.229085914687587e-07, "loss": 0.281, "step": 3460 }, { "epoch": 0.6269927536231884, "grad_norm": 4.492616175317283, "learning_rate": 3.2263425302354576e-07, "loss": 0.3051, "step": 3461 }, { "epoch": 0.6271739130434782, "grad_norm": 8.144427767289047, "learning_rate": 3.223599756461997e-07, "loss": 0.3134, "step": 3462 }, { "epoch": 0.6273550724637681, "grad_norm": 5.289769844162745, "learning_rate": 3.2208575943115577e-07, "loss": 0.2764, "step": 3463 }, { "epoch": 0.6275362318840579, "grad_norm": 5.204015174587078, "learning_rate": 3.218116044728277e-07, "loss": 0.3087, "step": 3464 }, { "epoch": 0.6277173913043478, "grad_norm": 5.366672131118816, "learning_rate": 3.2153751086560856e-07, "loss": 0.2218, "step": 3465 }, { "epoch": 0.6278985507246376, "grad_norm": 6.319604205684525, "learning_rate": 3.2126347870387006e-07, "loss": 0.2805, "step": 3466 }, { "epoch": 0.6280797101449276, "grad_norm": 5.190052431153651, "learning_rate": 3.209895080819628e-07, "loss": 0.3307, "step": 3467 }, { "epoch": 0.6282608695652174, "grad_norm": 5.973216655062214, "learning_rate": 3.2071559909421574e-07, "loss": 0.2201, "step": 3468 }, { "epoch": 0.6284420289855073, "grad_norm": 4.256350548736856, "learning_rate": 3.204417518349376e-07, "loss": 0.2855, "step": 3469 }, { "epoch": 0.6286231884057971, "grad_norm": 4.68663346579806, "learning_rate": 3.2016796639841515e-07, "loss": 0.3165, "step": 3470 }, { "epoch": 0.628804347826087, "grad_norm": 6.720386983039846, "learning_rate": 3.1989424287891386e-07, "loss": 0.3184, "step": 3471 }, { "epoch": 0.6289855072463768, "grad_norm": 4.595726189265466, "learning_rate": 3.19620581370678e-07, "loss": 0.2801, "step": 3472 }, { "epoch": 0.6291666666666667, "grad_norm": 6.091865827108255, "learning_rate": 3.1934698196793077e-07, "loss": 0.2778, "step": 3473 }, { "epoch": 0.6293478260869565, "grad_norm": 3.576356064987536, "learning_rate": 3.190734447648735e-07, "loss": 0.2841, "step": 3474 }, { "epoch": 0.6295289855072463, "grad_norm": 3.394416153346299, "learning_rate": 3.187999698556865e-07, "loss": 0.2241, "step": 3475 }, { "epoch": 0.6297101449275362, "grad_norm": 6.186491700325118, "learning_rate": 3.185265573345284e-07, "loss": 0.3525, "step": 3476 }, { "epoch": 0.6298913043478261, "grad_norm": 5.144816350253735, "learning_rate": 3.182532072955364e-07, "loss": 0.262, "step": 3477 }, { "epoch": 0.630072463768116, "grad_norm": 3.876315392290179, "learning_rate": 3.179799198328265e-07, "loss": 0.3162, "step": 3478 }, { "epoch": 0.6302536231884058, "grad_norm": 3.1545542978646237, "learning_rate": 3.177066950404924e-07, "loss": 0.239, "step": 3479 }, { "epoch": 0.6304347826086957, "grad_norm": 3.8527715428261757, "learning_rate": 3.174335330126069e-07, "loss": 0.3144, "step": 3480 }, { "epoch": 0.6306159420289855, "grad_norm": 10.807349738611567, "learning_rate": 3.171604338432211e-07, "loss": 0.2858, "step": 3481 }, { "epoch": 0.6307971014492754, "grad_norm": 3.90341742411574, "learning_rate": 3.1688739762636425e-07, "loss": 0.2615, "step": 3482 }, { "epoch": 0.6309782608695652, "grad_norm": 5.956069805012847, "learning_rate": 3.1661442445604395e-07, "loss": 0.3372, "step": 3483 }, { "epoch": 0.631159420289855, "grad_norm": 7.589601830606784, "learning_rate": 3.163415144262461e-07, "loss": 0.2803, "step": 3484 }, { "epoch": 0.6313405797101449, "grad_norm": 6.597118343210361, "learning_rate": 3.160686676309352e-07, "loss": 0.3384, "step": 3485 }, { "epoch": 0.6315217391304347, "grad_norm": 6.012869394286747, "learning_rate": 3.157958841640536e-07, "loss": 0.3165, "step": 3486 }, { "epoch": 0.6317028985507246, "grad_norm": 6.555759175967989, "learning_rate": 3.1552316411952154e-07, "loss": 0.3617, "step": 3487 }, { "epoch": 0.6318840579710145, "grad_norm": 5.300184139611348, "learning_rate": 3.1525050759123843e-07, "loss": 0.2909, "step": 3488 }, { "epoch": 0.6320652173913044, "grad_norm": 4.005959374094459, "learning_rate": 3.149779146730809e-07, "loss": 0.299, "step": 3489 }, { "epoch": 0.6322463768115942, "grad_norm": 5.95346829250653, "learning_rate": 3.147053854589039e-07, "loss": 0.2699, "step": 3490 }, { "epoch": 0.6324275362318841, "grad_norm": 4.272464390805704, "learning_rate": 3.144329200425406e-07, "loss": 0.3195, "step": 3491 }, { "epoch": 0.6326086956521739, "grad_norm": 9.057414868259817, "learning_rate": 3.1416051851780223e-07, "loss": 0.286, "step": 3492 }, { "epoch": 0.6327898550724638, "grad_norm": 8.477771706292318, "learning_rate": 3.138881809784778e-07, "loss": 0.2807, "step": 3493 }, { "epoch": 0.6329710144927536, "grad_norm": 3.7911143503502793, "learning_rate": 3.1361590751833467e-07, "loss": 0.3135, "step": 3494 }, { "epoch": 0.6331521739130435, "grad_norm": 5.681542121834925, "learning_rate": 3.133436982311174e-07, "loss": 0.2736, "step": 3495 }, { "epoch": 0.6333333333333333, "grad_norm": 9.349125398278641, "learning_rate": 3.1307155321054947e-07, "loss": 0.3322, "step": 3496 }, { "epoch": 0.6335144927536231, "grad_norm": 7.240340442453464, "learning_rate": 3.127994725503313e-07, "loss": 0.2888, "step": 3497 }, { "epoch": 0.633695652173913, "grad_norm": 3.9439792548592645, "learning_rate": 3.125274563441418e-07, "loss": 0.3116, "step": 3498 }, { "epoch": 0.633876811594203, "grad_norm": 6.244013027880077, "learning_rate": 3.1225550468563724e-07, "loss": 0.2688, "step": 3499 }, { "epoch": 0.6340579710144928, "grad_norm": 5.060186724217018, "learning_rate": 3.1198361766845205e-07, "loss": 0.2377, "step": 3500 }, { "epoch": 0.6340579710144928, "eval_loss": 0.2820156216621399, "eval_runtime": 9.7492, "eval_samples_per_second": 51.286, "eval_steps_per_second": 0.103, "step": 3500 }, { "epoch": 0.6342391304347826, "grad_norm": 6.2382579177319295, "learning_rate": 3.117117953861981e-07, "loss": 0.3268, "step": 3501 }, { "epoch": 0.6344202898550725, "grad_norm": 8.029668359470788, "learning_rate": 3.1144003793246524e-07, "loss": 0.3033, "step": 3502 }, { "epoch": 0.6346014492753623, "grad_norm": 5.564703030577194, "learning_rate": 3.1116834540082047e-07, "loss": 0.2782, "step": 3503 }, { "epoch": 0.6347826086956522, "grad_norm": 4.421979813804192, "learning_rate": 3.108967178848093e-07, "loss": 0.3212, "step": 3504 }, { "epoch": 0.634963768115942, "grad_norm": 6.855121681408441, "learning_rate": 3.106251554779542e-07, "loss": 0.3108, "step": 3505 }, { "epoch": 0.6351449275362319, "grad_norm": 4.696215758109863, "learning_rate": 3.103536582737553e-07, "loss": 0.3031, "step": 3506 }, { "epoch": 0.6353260869565217, "grad_norm": 3.549072914840705, "learning_rate": 3.1008222636569023e-07, "loss": 0.2772, "step": 3507 }, { "epoch": 0.6355072463768116, "grad_norm": 3.7165036252945227, "learning_rate": 3.098108598472147e-07, "loss": 0.2446, "step": 3508 }, { "epoch": 0.6356884057971014, "grad_norm": 8.03291885166229, "learning_rate": 3.0953955881176116e-07, "loss": 0.2819, "step": 3509 }, { "epoch": 0.6358695652173914, "grad_norm": 5.192305453981132, "learning_rate": 3.0926832335273996e-07, "loss": 0.2822, "step": 3510 }, { "epoch": 0.6360507246376812, "grad_norm": 4.5139104730937305, "learning_rate": 3.089971535635386e-07, "loss": 0.3029, "step": 3511 }, { "epoch": 0.636231884057971, "grad_norm": 5.713649470606083, "learning_rate": 3.087260495375224e-07, "loss": 0.3518, "step": 3512 }, { "epoch": 0.6364130434782609, "grad_norm": 8.90754021622772, "learning_rate": 3.0845501136803376e-07, "loss": 0.3116, "step": 3513 }, { "epoch": 0.6365942028985507, "grad_norm": 5.7333201268731155, "learning_rate": 3.0818403914839206e-07, "loss": 0.2771, "step": 3514 }, { "epoch": 0.6367753623188406, "grad_norm": 5.417018044112498, "learning_rate": 3.0791313297189454e-07, "loss": 0.3217, "step": 3515 }, { "epoch": 0.6369565217391304, "grad_norm": 6.228573519493017, "learning_rate": 3.076422929318155e-07, "loss": 0.2775, "step": 3516 }, { "epoch": 0.6371376811594203, "grad_norm": 3.797721447614127, "learning_rate": 3.073715191214065e-07, "loss": 0.2864, "step": 3517 }, { "epoch": 0.6373188405797101, "grad_norm": 4.1832251869510175, "learning_rate": 3.07100811633896e-07, "loss": 0.291, "step": 3518 }, { "epoch": 0.6375, "grad_norm": 5.4771009462827696, "learning_rate": 3.0683017056249005e-07, "loss": 0.2808, "step": 3519 }, { "epoch": 0.6376811594202898, "grad_norm": 4.604587667290546, "learning_rate": 3.0655959600037167e-07, "loss": 0.3158, "step": 3520 }, { "epoch": 0.6378623188405798, "grad_norm": 7.029354005414273, "learning_rate": 3.062890880407011e-07, "loss": 0.3228, "step": 3521 }, { "epoch": 0.6380434782608696, "grad_norm": 6.918926416341621, "learning_rate": 3.060186467766149e-07, "loss": 0.2932, "step": 3522 }, { "epoch": 0.6382246376811594, "grad_norm": 3.674184108477483, "learning_rate": 3.057482723012282e-07, "loss": 0.2091, "step": 3523 }, { "epoch": 0.6384057971014493, "grad_norm": 3.2998305727648702, "learning_rate": 3.054779647076315e-07, "loss": 0.2574, "step": 3524 }, { "epoch": 0.6385869565217391, "grad_norm": 6.642957874434674, "learning_rate": 3.0520772408889327e-07, "loss": 0.302, "step": 3525 }, { "epoch": 0.638768115942029, "grad_norm": 3.380285563687606, "learning_rate": 3.049375505380585e-07, "loss": 0.2422, "step": 3526 }, { "epoch": 0.6389492753623188, "grad_norm": 4.673713098123802, "learning_rate": 3.046674441481494e-07, "loss": 0.306, "step": 3527 }, { "epoch": 0.6391304347826087, "grad_norm": 3.9190718489309124, "learning_rate": 3.043974050121647e-07, "loss": 0.344, "step": 3528 }, { "epoch": 0.6393115942028985, "grad_norm": 11.003515385389463, "learning_rate": 3.0412743322308044e-07, "loss": 0.3078, "step": 3529 }, { "epoch": 0.6394927536231884, "grad_norm": 4.157619027974557, "learning_rate": 3.0385752887384864e-07, "loss": 0.2899, "step": 3530 }, { "epoch": 0.6396739130434783, "grad_norm": 6.931307811011895, "learning_rate": 3.0358769205739927e-07, "loss": 0.3147, "step": 3531 }, { "epoch": 0.6398550724637682, "grad_norm": 4.0321045784591645, "learning_rate": 3.03317922866638e-07, "loss": 0.2841, "step": 3532 }, { "epoch": 0.640036231884058, "grad_norm": 5.4472268314775665, "learning_rate": 3.0304822139444795e-07, "loss": 0.3238, "step": 3533 }, { "epoch": 0.6402173913043478, "grad_norm": 6.201566239732925, "learning_rate": 3.0277858773368826e-07, "loss": 0.319, "step": 3534 }, { "epoch": 0.6403985507246377, "grad_norm": 6.427192834854141, "learning_rate": 3.025090219771953e-07, "loss": 0.3573, "step": 3535 }, { "epoch": 0.6405797101449275, "grad_norm": 4.734874658914441, "learning_rate": 3.0223952421778186e-07, "loss": 0.325, "step": 3536 }, { "epoch": 0.6407608695652174, "grad_norm": 3.9676697086942867, "learning_rate": 3.019700945482374e-07, "loss": 0.2262, "step": 3537 }, { "epoch": 0.6409420289855072, "grad_norm": 4.37865260270185, "learning_rate": 3.017007330613273e-07, "loss": 0.2751, "step": 3538 }, { "epoch": 0.6411231884057971, "grad_norm": 4.287482247035775, "learning_rate": 3.0143143984979464e-07, "loss": 0.261, "step": 3539 }, { "epoch": 0.6413043478260869, "grad_norm": 4.486192162549001, "learning_rate": 3.0116221500635806e-07, "loss": 0.2871, "step": 3540 }, { "epoch": 0.6414855072463768, "grad_norm": 7.8293707971430155, "learning_rate": 3.0089305862371294e-07, "loss": 0.308, "step": 3541 }, { "epoch": 0.6416666666666667, "grad_norm": 8.106392472119104, "learning_rate": 3.0062397079453104e-07, "loss": 0.2628, "step": 3542 }, { "epoch": 0.6418478260869566, "grad_norm": 4.069627385869626, "learning_rate": 3.003549516114607e-07, "loss": 0.2195, "step": 3543 }, { "epoch": 0.6420289855072464, "grad_norm": 4.6315966071872845, "learning_rate": 3.0008600116712657e-07, "loss": 0.2597, "step": 3544 }, { "epoch": 0.6422101449275363, "grad_norm": 8.120672814882225, "learning_rate": 2.9981711955412935e-07, "loss": 0.3131, "step": 3545 }, { "epoch": 0.6423913043478261, "grad_norm": 8.400669278395796, "learning_rate": 2.9954830686504627e-07, "loss": 0.3154, "step": 3546 }, { "epoch": 0.6425724637681159, "grad_norm": 9.726823509023543, "learning_rate": 2.9927956319243103e-07, "loss": 0.2377, "step": 3547 }, { "epoch": 0.6427536231884058, "grad_norm": 5.26665970832048, "learning_rate": 2.990108886288133e-07, "loss": 0.35, "step": 3548 }, { "epoch": 0.6429347826086956, "grad_norm": 7.755263541017699, "learning_rate": 2.987422832666988e-07, "loss": 0.3204, "step": 3549 }, { "epoch": 0.6431159420289855, "grad_norm": 7.935360862257978, "learning_rate": 2.9847374719856965e-07, "loss": 0.3047, "step": 3550 }, { "epoch": 0.6432971014492753, "grad_norm": 4.980652920524264, "learning_rate": 2.9820528051688433e-07, "loss": 0.2729, "step": 3551 }, { "epoch": 0.6434782608695652, "grad_norm": 4.958848830089449, "learning_rate": 2.9793688331407707e-07, "loss": 0.3193, "step": 3552 }, { "epoch": 0.6436594202898551, "grad_norm": 6.451077083156074, "learning_rate": 2.976685556825582e-07, "loss": 0.2958, "step": 3553 }, { "epoch": 0.643840579710145, "grad_norm": 4.456700058175346, "learning_rate": 2.974002977147142e-07, "loss": 0.3391, "step": 3554 }, { "epoch": 0.6440217391304348, "grad_norm": 4.717785655282856, "learning_rate": 2.971321095029078e-07, "loss": 0.3224, "step": 3555 }, { "epoch": 0.6442028985507247, "grad_norm": 6.604709962169939, "learning_rate": 2.9686399113947737e-07, "loss": 0.3217, "step": 3556 }, { "epoch": 0.6443840579710145, "grad_norm": 4.142883397303934, "learning_rate": 2.9659594271673693e-07, "loss": 0.2995, "step": 3557 }, { "epoch": 0.6445652173913043, "grad_norm": 4.520706467154874, "learning_rate": 2.9632796432697746e-07, "loss": 0.2965, "step": 3558 }, { "epoch": 0.6447463768115942, "grad_norm": 6.921158413824603, "learning_rate": 2.960600560624648e-07, "loss": 0.2851, "step": 3559 }, { "epoch": 0.644927536231884, "grad_norm": 5.663681867486989, "learning_rate": 2.957922180154412e-07, "loss": 0.3085, "step": 3560 }, { "epoch": 0.6451086956521739, "grad_norm": 6.446251091437465, "learning_rate": 2.955244502781243e-07, "loss": 0.2506, "step": 3561 }, { "epoch": 0.6452898550724637, "grad_norm": 4.661010702875182, "learning_rate": 2.952567529427081e-07, "loss": 0.2925, "step": 3562 }, { "epoch": 0.6454710144927536, "grad_norm": 3.5723092201246724, "learning_rate": 2.9498912610136203e-07, "loss": 0.2347, "step": 3563 }, { "epoch": 0.6456521739130435, "grad_norm": 4.453548900575511, "learning_rate": 2.9472156984623124e-07, "loss": 0.3224, "step": 3564 }, { "epoch": 0.6458333333333334, "grad_norm": 7.88515240431393, "learning_rate": 2.944540842694363e-07, "loss": 0.2769, "step": 3565 }, { "epoch": 0.6460144927536232, "grad_norm": 6.2753908255033295, "learning_rate": 2.9418666946307434e-07, "loss": 0.2767, "step": 3566 }, { "epoch": 0.6461956521739131, "grad_norm": 4.967052897771148, "learning_rate": 2.939193255192172e-07, "loss": 0.2951, "step": 3567 }, { "epoch": 0.6463768115942029, "grad_norm": 6.29695871870701, "learning_rate": 2.9365205252991267e-07, "loss": 0.3206, "step": 3568 }, { "epoch": 0.6465579710144927, "grad_norm": 5.461688541890273, "learning_rate": 2.93384850587184e-07, "loss": 0.2759, "step": 3569 }, { "epoch": 0.6467391304347826, "grad_norm": 5.176528427497427, "learning_rate": 2.9311771978303035e-07, "loss": 0.3406, "step": 3570 }, { "epoch": 0.6469202898550724, "grad_norm": 4.850940843561452, "learning_rate": 2.928506602094261e-07, "loss": 0.2776, "step": 3571 }, { "epoch": 0.6471014492753623, "grad_norm": 4.781029397025524, "learning_rate": 2.925836719583211e-07, "loss": 0.2465, "step": 3572 }, { "epoch": 0.6472826086956521, "grad_norm": 6.414965132211462, "learning_rate": 2.923167551216402e-07, "loss": 0.3738, "step": 3573 }, { "epoch": 0.6474637681159421, "grad_norm": 3.314440794934911, "learning_rate": 2.9204990979128485e-07, "loss": 0.285, "step": 3574 }, { "epoch": 0.6476449275362319, "grad_norm": 4.0654746745282475, "learning_rate": 2.917831360591309e-07, "loss": 0.2924, "step": 3575 }, { "epoch": 0.6478260869565218, "grad_norm": 4.648658864332499, "learning_rate": 2.915164340170297e-07, "loss": 0.2477, "step": 3576 }, { "epoch": 0.6480072463768116, "grad_norm": 3.813850364481976, "learning_rate": 2.9124980375680784e-07, "loss": 0.2574, "step": 3577 }, { "epoch": 0.6481884057971015, "grad_norm": 7.338703093723526, "learning_rate": 2.9098324537026785e-07, "loss": 0.2906, "step": 3578 }, { "epoch": 0.6483695652173913, "grad_norm": 7.98600963714465, "learning_rate": 2.90716758949187e-07, "loss": 0.2627, "step": 3579 }, { "epoch": 0.6485507246376812, "grad_norm": 6.683950177557102, "learning_rate": 2.904503445853175e-07, "loss": 0.3596, "step": 3580 }, { "epoch": 0.648731884057971, "grad_norm": 4.624608572412256, "learning_rate": 2.9018400237038695e-07, "loss": 0.2997, "step": 3581 }, { "epoch": 0.6489130434782608, "grad_norm": 4.100339115439525, "learning_rate": 2.8991773239609873e-07, "loss": 0.3193, "step": 3582 }, { "epoch": 0.6490942028985507, "grad_norm": 5.473573855639002, "learning_rate": 2.8965153475413065e-07, "loss": 0.3152, "step": 3583 }, { "epoch": 0.6492753623188405, "grad_norm": 5.123033095878148, "learning_rate": 2.8938540953613575e-07, "loss": 0.3493, "step": 3584 }, { "epoch": 0.6494565217391305, "grad_norm": 4.6530707252908385, "learning_rate": 2.891193568337424e-07, "loss": 0.3193, "step": 3585 }, { "epoch": 0.6496376811594203, "grad_norm": 3.804542653115263, "learning_rate": 2.888533767385536e-07, "loss": 0.2352, "step": 3586 }, { "epoch": 0.6498188405797102, "grad_norm": 5.914939404188481, "learning_rate": 2.8858746934214757e-07, "loss": 0.3182, "step": 3587 }, { "epoch": 0.65, "grad_norm": 9.73199331068314, "learning_rate": 2.883216347360776e-07, "loss": 0.3198, "step": 3588 }, { "epoch": 0.6501811594202899, "grad_norm": 4.132394173945082, "learning_rate": 2.880558730118717e-07, "loss": 0.3091, "step": 3589 }, { "epoch": 0.6503623188405797, "grad_norm": 3.989013391867055, "learning_rate": 2.8779018426103294e-07, "loss": 0.2842, "step": 3590 }, { "epoch": 0.6505434782608696, "grad_norm": 5.056871858922117, "learning_rate": 2.8752456857503926e-07, "loss": 0.2593, "step": 3591 }, { "epoch": 0.6507246376811594, "grad_norm": 4.065174857509861, "learning_rate": 2.8725902604534327e-07, "loss": 0.3019, "step": 3592 }, { "epoch": 0.6509057971014492, "grad_norm": 7.30139705704604, "learning_rate": 2.8699355676337244e-07, "loss": 0.3196, "step": 3593 }, { "epoch": 0.6510869565217391, "grad_norm": 5.435898779179223, "learning_rate": 2.8672816082052947e-07, "loss": 0.2925, "step": 3594 }, { "epoch": 0.6512681159420289, "grad_norm": 5.442550149053178, "learning_rate": 2.8646283830819147e-07, "loss": 0.2471, "step": 3595 }, { "epoch": 0.6514492753623189, "grad_norm": 4.723932616589107, "learning_rate": 2.8619758931770956e-07, "loss": 0.2673, "step": 3596 }, { "epoch": 0.6516304347826087, "grad_norm": 4.394625475998589, "learning_rate": 2.8593241394041085e-07, "loss": 0.3923, "step": 3597 }, { "epoch": 0.6518115942028986, "grad_norm": 3.718928857222592, "learning_rate": 2.856673122675963e-07, "loss": 0.3085, "step": 3598 }, { "epoch": 0.6519927536231884, "grad_norm": 9.255569009093893, "learning_rate": 2.854022843905417e-07, "loss": 0.2863, "step": 3599 }, { "epoch": 0.6521739130434783, "grad_norm": 3.488786308451071, "learning_rate": 2.851373304004973e-07, "loss": 0.265, "step": 3600 }, { "epoch": 0.6521739130434783, "eval_loss": 0.279484361410141, "eval_runtime": 9.7579, "eval_samples_per_second": 51.241, "eval_steps_per_second": 0.102, "step": 3600 }, { "epoch": 0.6523550724637681, "grad_norm": 3.588964815454868, "learning_rate": 2.8487245038868815e-07, "loss": 0.2914, "step": 3601 }, { "epoch": 0.652536231884058, "grad_norm": 6.321461042729916, "learning_rate": 2.8460764444631355e-07, "loss": 0.267, "step": 3602 }, { "epoch": 0.6527173913043478, "grad_norm": 3.5278027809007564, "learning_rate": 2.843429126645476e-07, "loss": 0.2185, "step": 3603 }, { "epoch": 0.6528985507246376, "grad_norm": 3.642436917467338, "learning_rate": 2.8407825513453843e-07, "loss": 0.2671, "step": 3604 }, { "epoch": 0.6530797101449275, "grad_norm": 4.776068447729105, "learning_rate": 2.838136719474094e-07, "loss": 0.2625, "step": 3605 }, { "epoch": 0.6532608695652173, "grad_norm": 4.181266998541342, "learning_rate": 2.8354916319425727e-07, "loss": 0.2915, "step": 3606 }, { "epoch": 0.6534420289855073, "grad_norm": 5.154781344835548, "learning_rate": 2.8328472896615387e-07, "loss": 0.2954, "step": 3607 }, { "epoch": 0.6536231884057971, "grad_norm": 4.765559229241035, "learning_rate": 2.8302036935414486e-07, "loss": 0.2863, "step": 3608 }, { "epoch": 0.653804347826087, "grad_norm": 4.0211848337348375, "learning_rate": 2.8275608444925093e-07, "loss": 0.2562, "step": 3609 }, { "epoch": 0.6539855072463768, "grad_norm": 4.335879578395847, "learning_rate": 2.8249187434246644e-07, "loss": 0.2533, "step": 3610 }, { "epoch": 0.6541666666666667, "grad_norm": 7.899859825786745, "learning_rate": 2.822277391247604e-07, "loss": 0.3308, "step": 3611 }, { "epoch": 0.6543478260869565, "grad_norm": 3.5302127760530886, "learning_rate": 2.8196367888707523e-07, "loss": 0.261, "step": 3612 }, { "epoch": 0.6545289855072464, "grad_norm": 4.249023241291757, "learning_rate": 2.816996937203287e-07, "loss": 0.2715, "step": 3613 }, { "epoch": 0.6547101449275362, "grad_norm": 5.75142314526588, "learning_rate": 2.8143578371541193e-07, "loss": 0.2981, "step": 3614 }, { "epoch": 0.654891304347826, "grad_norm": 5.824776336629638, "learning_rate": 2.8117194896319043e-07, "loss": 0.2747, "step": 3615 }, { "epoch": 0.6550724637681159, "grad_norm": 3.73281373204958, "learning_rate": 2.809081895545037e-07, "loss": 0.2938, "step": 3616 }, { "epoch": 0.6552536231884057, "grad_norm": 4.2455903901890455, "learning_rate": 2.806445055801654e-07, "loss": 0.3126, "step": 3617 }, { "epoch": 0.6554347826086957, "grad_norm": 6.411375057907996, "learning_rate": 2.8038089713096315e-07, "loss": 0.3749, "step": 3618 }, { "epoch": 0.6556159420289855, "grad_norm": 5.820503794110793, "learning_rate": 2.801173642976586e-07, "loss": 0.3274, "step": 3619 }, { "epoch": 0.6557971014492754, "grad_norm": 4.913970927173583, "learning_rate": 2.7985390717098715e-07, "loss": 0.3523, "step": 3620 }, { "epoch": 0.6559782608695652, "grad_norm": 4.619482463634723, "learning_rate": 2.7959052584165876e-07, "loss": 0.2765, "step": 3621 }, { "epoch": 0.6561594202898551, "grad_norm": 3.417401039290704, "learning_rate": 2.793272204003568e-07, "loss": 0.248, "step": 3622 }, { "epoch": 0.6563405797101449, "grad_norm": 4.011142184815913, "learning_rate": 2.7906399093773824e-07, "loss": 0.2491, "step": 3623 }, { "epoch": 0.6565217391304348, "grad_norm": 3.836371639593736, "learning_rate": 2.7880083754443424e-07, "loss": 0.2418, "step": 3624 }, { "epoch": 0.6567028985507246, "grad_norm": 3.4113225116294426, "learning_rate": 2.785377603110501e-07, "loss": 0.2578, "step": 3625 }, { "epoch": 0.6568840579710145, "grad_norm": 5.503691518951337, "learning_rate": 2.782747593281644e-07, "loss": 0.3781, "step": 3626 }, { "epoch": 0.6570652173913043, "grad_norm": 4.927070530792558, "learning_rate": 2.7801183468632964e-07, "loss": 0.2755, "step": 3627 }, { "epoch": 0.6572463768115943, "grad_norm": 8.859870219546806, "learning_rate": 2.7774898647607205e-07, "loss": 0.3488, "step": 3628 }, { "epoch": 0.6574275362318841, "grad_norm": 3.8815894478064683, "learning_rate": 2.7748621478789137e-07, "loss": 0.261, "step": 3629 }, { "epoch": 0.657608695652174, "grad_norm": 6.688821754510899, "learning_rate": 2.772235197122612e-07, "loss": 0.2667, "step": 3630 }, { "epoch": 0.6577898550724638, "grad_norm": 7.288455264644119, "learning_rate": 2.7696090133962866e-07, "loss": 0.3488, "step": 3631 }, { "epoch": 0.6579710144927536, "grad_norm": 3.909989091520067, "learning_rate": 2.766983597604149e-07, "loss": 0.2975, "step": 3632 }, { "epoch": 0.6581521739130435, "grad_norm": 6.100025961693236, "learning_rate": 2.764358950650137e-07, "loss": 0.3118, "step": 3633 }, { "epoch": 0.6583333333333333, "grad_norm": 4.105225728555756, "learning_rate": 2.761735073437931e-07, "loss": 0.3026, "step": 3634 }, { "epoch": 0.6585144927536232, "grad_norm": 4.367941485720895, "learning_rate": 2.7591119668709426e-07, "loss": 0.3368, "step": 3635 }, { "epoch": 0.658695652173913, "grad_norm": 4.747645896093567, "learning_rate": 2.7564896318523235e-07, "loss": 0.2986, "step": 3636 }, { "epoch": 0.6588768115942029, "grad_norm": 6.123179664376465, "learning_rate": 2.753868069284954e-07, "loss": 0.248, "step": 3637 }, { "epoch": 0.6590579710144927, "grad_norm": 6.171803416165604, "learning_rate": 2.7512472800714524e-07, "loss": 0.2244, "step": 3638 }, { "epoch": 0.6592391304347827, "grad_norm": 4.101033123111261, "learning_rate": 2.7486272651141626e-07, "loss": 0.3093, "step": 3639 }, { "epoch": 0.6594202898550725, "grad_norm": 4.439942192889116, "learning_rate": 2.746008025315176e-07, "loss": 0.2698, "step": 3640 }, { "epoch": 0.6596014492753624, "grad_norm": 4.242412112289066, "learning_rate": 2.743389561576305e-07, "loss": 0.2428, "step": 3641 }, { "epoch": 0.6597826086956522, "grad_norm": 5.087518885612564, "learning_rate": 2.7407718747991006e-07, "loss": 0.2472, "step": 3642 }, { "epoch": 0.659963768115942, "grad_norm": 7.055745367515722, "learning_rate": 2.7381549658848437e-07, "loss": 0.3213, "step": 3643 }, { "epoch": 0.6601449275362319, "grad_norm": 5.169329899691847, "learning_rate": 2.7355388357345487e-07, "loss": 0.2875, "step": 3644 }, { "epoch": 0.6603260869565217, "grad_norm": 6.595283603259727, "learning_rate": 2.7329234852489623e-07, "loss": 0.2546, "step": 3645 }, { "epoch": 0.6605072463768116, "grad_norm": 3.8489407852792468, "learning_rate": 2.7303089153285615e-07, "loss": 0.2702, "step": 3646 }, { "epoch": 0.6606884057971014, "grad_norm": 3.892946075740103, "learning_rate": 2.727695126873553e-07, "loss": 0.2726, "step": 3647 }, { "epoch": 0.6608695652173913, "grad_norm": 4.533870632461735, "learning_rate": 2.7250821207838805e-07, "loss": 0.2774, "step": 3648 }, { "epoch": 0.6610507246376811, "grad_norm": 3.7080499511784577, "learning_rate": 2.7224698979592155e-07, "loss": 0.257, "step": 3649 }, { "epoch": 0.6612318840579711, "grad_norm": 6.809284094972931, "learning_rate": 2.7198584592989527e-07, "loss": 0.3, "step": 3650 }, { "epoch": 0.6614130434782609, "grad_norm": 5.690296168129186, "learning_rate": 2.717247805702224e-07, "loss": 0.2307, "step": 3651 }, { "epoch": 0.6615942028985508, "grad_norm": 5.037475673626697, "learning_rate": 2.714637938067894e-07, "loss": 0.2774, "step": 3652 }, { "epoch": 0.6617753623188406, "grad_norm": 3.400094922068276, "learning_rate": 2.7120288572945506e-07, "loss": 0.257, "step": 3653 }, { "epoch": 0.6619565217391304, "grad_norm": 3.878530967935302, "learning_rate": 2.7094205642805123e-07, "loss": 0.308, "step": 3654 }, { "epoch": 0.6621376811594203, "grad_norm": 5.782688264320758, "learning_rate": 2.706813059923826e-07, "loss": 0.3242, "step": 3655 }, { "epoch": 0.6623188405797101, "grad_norm": 4.816636776378942, "learning_rate": 2.7042063451222685e-07, "loss": 0.3356, "step": 3656 }, { "epoch": 0.6625, "grad_norm": 5.812525310932098, "learning_rate": 2.7016004207733445e-07, "loss": 0.279, "step": 3657 }, { "epoch": 0.6626811594202898, "grad_norm": 3.489803123765879, "learning_rate": 2.6989952877742863e-07, "loss": 0.2511, "step": 3658 }, { "epoch": 0.6628623188405797, "grad_norm": 4.209224942182741, "learning_rate": 2.6963909470220516e-07, "loss": 0.2585, "step": 3659 }, { "epoch": 0.6630434782608695, "grad_norm": 4.781288714945626, "learning_rate": 2.6937873994133294e-07, "loss": 0.298, "step": 3660 }, { "epoch": 0.6632246376811595, "grad_norm": 4.22864320769655, "learning_rate": 2.691184645844532e-07, "loss": 0.3086, "step": 3661 }, { "epoch": 0.6634057971014493, "grad_norm": 5.6425340265076365, "learning_rate": 2.688582687211801e-07, "loss": 0.3309, "step": 3662 }, { "epoch": 0.6635869565217392, "grad_norm": 4.3399493955212405, "learning_rate": 2.6859815244110006e-07, "loss": 0.27, "step": 3663 }, { "epoch": 0.663768115942029, "grad_norm": 7.329606499573125, "learning_rate": 2.683381158337727e-07, "loss": 0.3699, "step": 3664 }, { "epoch": 0.6639492753623188, "grad_norm": 4.548447538524371, "learning_rate": 2.680781589887299e-07, "loss": 0.284, "step": 3665 }, { "epoch": 0.6641304347826087, "grad_norm": 5.257102677918078, "learning_rate": 2.6781828199547545e-07, "loss": 0.3345, "step": 3666 }, { "epoch": 0.6643115942028985, "grad_norm": 5.057863993945306, "learning_rate": 2.675584849434868e-07, "loss": 0.275, "step": 3667 }, { "epoch": 0.6644927536231884, "grad_norm": 5.083540153822678, "learning_rate": 2.672987679222131e-07, "loss": 0.3251, "step": 3668 }, { "epoch": 0.6646739130434782, "grad_norm": 5.36873637733001, "learning_rate": 2.670391310210762e-07, "loss": 0.318, "step": 3669 }, { "epoch": 0.6648550724637681, "grad_norm": 3.701325257273936, "learning_rate": 2.667795743294703e-07, "loss": 0.3444, "step": 3670 }, { "epoch": 0.6650362318840579, "grad_norm": 5.710167866377029, "learning_rate": 2.665200979367619e-07, "loss": 0.2747, "step": 3671 }, { "epoch": 0.6652173913043479, "grad_norm": 3.2670581234439897, "learning_rate": 2.662607019322901e-07, "loss": 0.2459, "step": 3672 }, { "epoch": 0.6653985507246377, "grad_norm": 4.739820353040369, "learning_rate": 2.6600138640536606e-07, "loss": 0.3047, "step": 3673 }, { "epoch": 0.6655797101449276, "grad_norm": 4.646381608458436, "learning_rate": 2.657421514452732e-07, "loss": 0.1967, "step": 3674 }, { "epoch": 0.6657608695652174, "grad_norm": 4.424479068892569, "learning_rate": 2.65482997141268e-07, "loss": 0.2993, "step": 3675 }, { "epoch": 0.6659420289855073, "grad_norm": 11.823343439874979, "learning_rate": 2.652239235825777e-07, "loss": 0.3167, "step": 3676 }, { "epoch": 0.6661231884057971, "grad_norm": 8.421222300347122, "learning_rate": 2.64964930858403e-07, "loss": 0.2632, "step": 3677 }, { "epoch": 0.6663043478260869, "grad_norm": 6.684647609976163, "learning_rate": 2.64706019057916e-07, "loss": 0.2806, "step": 3678 }, { "epoch": 0.6664855072463768, "grad_norm": 5.69973139210881, "learning_rate": 2.644471882702617e-07, "loss": 0.2846, "step": 3679 }, { "epoch": 0.6666666666666666, "grad_norm": 4.44474700518741, "learning_rate": 2.6418843858455664e-07, "loss": 0.3004, "step": 3680 }, { "epoch": 0.6668478260869565, "grad_norm": 4.37476347752842, "learning_rate": 2.6392977008988954e-07, "loss": 0.2831, "step": 3681 }, { "epoch": 0.6670289855072464, "grad_norm": 8.191837727313821, "learning_rate": 2.6367118287532075e-07, "loss": 0.3109, "step": 3682 }, { "epoch": 0.6672101449275363, "grad_norm": 4.537798478846723, "learning_rate": 2.6341267702988366e-07, "loss": 0.319, "step": 3683 }, { "epoch": 0.6673913043478261, "grad_norm": 6.0466095915037315, "learning_rate": 2.6315425264258285e-07, "loss": 0.2448, "step": 3684 }, { "epoch": 0.667572463768116, "grad_norm": 6.750126278221356, "learning_rate": 2.6289590980239504e-07, "loss": 0.3277, "step": 3685 }, { "epoch": 0.6677536231884058, "grad_norm": 4.75955449751252, "learning_rate": 2.6263764859826897e-07, "loss": 0.3595, "step": 3686 }, { "epoch": 0.6679347826086957, "grad_norm": 6.82483244022603, "learning_rate": 2.6237946911912505e-07, "loss": 0.3149, "step": 3687 }, { "epoch": 0.6681159420289855, "grad_norm": 3.5489601048998285, "learning_rate": 2.6212137145385583e-07, "loss": 0.2621, "step": 3688 }, { "epoch": 0.6682971014492753, "grad_norm": 6.608472992586457, "learning_rate": 2.618633556913255e-07, "loss": 0.3195, "step": 3689 }, { "epoch": 0.6684782608695652, "grad_norm": 9.622788416228062, "learning_rate": 2.6160542192036994e-07, "loss": 0.3537, "step": 3690 }, { "epoch": 0.668659420289855, "grad_norm": 5.998937943780035, "learning_rate": 2.613475702297973e-07, "loss": 0.2273, "step": 3691 }, { "epoch": 0.6688405797101449, "grad_norm": 5.507125327866316, "learning_rate": 2.610898007083872e-07, "loss": 0.2766, "step": 3692 }, { "epoch": 0.6690217391304348, "grad_norm": 4.998722513971953, "learning_rate": 2.6083211344489053e-07, "loss": 0.2771, "step": 3693 }, { "epoch": 0.6692028985507247, "grad_norm": 4.815733635290072, "learning_rate": 2.605745085280302e-07, "loss": 0.2745, "step": 3694 }, { "epoch": 0.6693840579710145, "grad_norm": 5.169377603311988, "learning_rate": 2.603169860465012e-07, "loss": 0.2627, "step": 3695 }, { "epoch": 0.6695652173913044, "grad_norm": 3.51605881908899, "learning_rate": 2.6005954608896954e-07, "loss": 0.2889, "step": 3696 }, { "epoch": 0.6697463768115942, "grad_norm": 3.3417235649319146, "learning_rate": 2.59802188744073e-07, "loss": 0.2629, "step": 3697 }, { "epoch": 0.6699275362318841, "grad_norm": 3.3583454007999216, "learning_rate": 2.5954491410042094e-07, "loss": 0.268, "step": 3698 }, { "epoch": 0.6701086956521739, "grad_norm": 3.482215764461388, "learning_rate": 2.5928772224659436e-07, "loss": 0.2585, "step": 3699 }, { "epoch": 0.6702898550724637, "grad_norm": 6.939062804970206, "learning_rate": 2.5903061327114537e-07, "loss": 0.3046, "step": 3700 }, { "epoch": 0.6702898550724637, "eval_loss": 0.27495312690734863, "eval_runtime": 9.8384, "eval_samples_per_second": 50.821, "eval_steps_per_second": 0.102, "step": 3700 }, { "epoch": 0.6704710144927536, "grad_norm": 3.823398836964224, "learning_rate": 2.587735872625979e-07, "loss": 0.2745, "step": 3701 }, { "epoch": 0.6706521739130434, "grad_norm": 4.892659381418445, "learning_rate": 2.585166443094476e-07, "loss": 0.29, "step": 3702 }, { "epoch": 0.6708333333333333, "grad_norm": 3.846838461143228, "learning_rate": 2.582597845001607e-07, "loss": 0.2869, "step": 3703 }, { "epoch": 0.6710144927536232, "grad_norm": 4.82868446969758, "learning_rate": 2.580030079231754e-07, "loss": 0.2871, "step": 3704 }, { "epoch": 0.6711956521739131, "grad_norm": 3.5194616192698893, "learning_rate": 2.5774631466690087e-07, "loss": 0.2928, "step": 3705 }, { "epoch": 0.6713768115942029, "grad_norm": 5.433930905167059, "learning_rate": 2.574897048197182e-07, "loss": 0.3018, "step": 3706 }, { "epoch": 0.6715579710144928, "grad_norm": 3.923716063835157, "learning_rate": 2.5723317846997916e-07, "loss": 0.2405, "step": 3707 }, { "epoch": 0.6717391304347826, "grad_norm": 4.16140983937233, "learning_rate": 2.569767357060073e-07, "loss": 0.2783, "step": 3708 }, { "epoch": 0.6719202898550725, "grad_norm": 3.7676758780958037, "learning_rate": 2.567203766160964e-07, "loss": 0.2922, "step": 3709 }, { "epoch": 0.6721014492753623, "grad_norm": 4.343592744738559, "learning_rate": 2.564641012885126e-07, "loss": 0.2432, "step": 3710 }, { "epoch": 0.6722826086956522, "grad_norm": 6.377767425993752, "learning_rate": 2.5620790981149274e-07, "loss": 0.2734, "step": 3711 }, { "epoch": 0.672463768115942, "grad_norm": 3.9623108479923697, "learning_rate": 2.559518022732446e-07, "loss": 0.3119, "step": 3712 }, { "epoch": 0.6726449275362318, "grad_norm": 7.791476134644975, "learning_rate": 2.5569577876194725e-07, "loss": 0.2958, "step": 3713 }, { "epoch": 0.6728260869565217, "grad_norm": 7.555931885836235, "learning_rate": 2.5543983936575085e-07, "loss": 0.2807, "step": 3714 }, { "epoch": 0.6730072463768116, "grad_norm": 4.441798355122445, "learning_rate": 2.551839841727765e-07, "loss": 0.321, "step": 3715 }, { "epoch": 0.6731884057971015, "grad_norm": 4.497291138388753, "learning_rate": 2.5492821327111634e-07, "loss": 0.312, "step": 3716 }, { "epoch": 0.6733695652173913, "grad_norm": 6.19309082208371, "learning_rate": 2.5467252674883334e-07, "loss": 0.2809, "step": 3717 }, { "epoch": 0.6735507246376812, "grad_norm": 3.9176046880543294, "learning_rate": 2.544169246939619e-07, "loss": 0.2253, "step": 3718 }, { "epoch": 0.673731884057971, "grad_norm": 3.8381856072852116, "learning_rate": 2.5416140719450716e-07, "loss": 0.2464, "step": 3719 }, { "epoch": 0.6739130434782609, "grad_norm": 3.9871109924332573, "learning_rate": 2.5390597433844454e-07, "loss": 0.2563, "step": 3720 }, { "epoch": 0.6740942028985507, "grad_norm": 7.1776908292991575, "learning_rate": 2.536506262137208e-07, "loss": 0.3132, "step": 3721 }, { "epoch": 0.6742753623188406, "grad_norm": 5.935203294853653, "learning_rate": 2.5339536290825383e-07, "loss": 0.2835, "step": 3722 }, { "epoch": 0.6744565217391304, "grad_norm": 7.16900488574676, "learning_rate": 2.531401845099318e-07, "loss": 0.336, "step": 3723 }, { "epoch": 0.6746376811594202, "grad_norm": 3.484536818051787, "learning_rate": 2.5288509110661394e-07, "loss": 0.2505, "step": 3724 }, { "epoch": 0.6748188405797102, "grad_norm": 4.152149688453602, "learning_rate": 2.5263008278613005e-07, "loss": 0.3403, "step": 3725 }, { "epoch": 0.675, "grad_norm": 3.923635820170825, "learning_rate": 2.523751596362808e-07, "loss": 0.2747, "step": 3726 }, { "epoch": 0.6751811594202899, "grad_norm": 3.5615440570905417, "learning_rate": 2.521203217448372e-07, "loss": 0.2621, "step": 3727 }, { "epoch": 0.6753623188405797, "grad_norm": 4.477573613988831, "learning_rate": 2.518655691995415e-07, "loss": 0.3018, "step": 3728 }, { "epoch": 0.6755434782608696, "grad_norm": 4.937307739699936, "learning_rate": 2.516109020881059e-07, "loss": 0.2702, "step": 3729 }, { "epoch": 0.6757246376811594, "grad_norm": 3.578465065191748, "learning_rate": 2.5135632049821354e-07, "loss": 0.2735, "step": 3730 }, { "epoch": 0.6759057971014493, "grad_norm": 9.165499789617822, "learning_rate": 2.5110182451751825e-07, "loss": 0.2474, "step": 3731 }, { "epoch": 0.6760869565217391, "grad_norm": 3.6574739974542902, "learning_rate": 2.5084741423364397e-07, "loss": 0.2176, "step": 3732 }, { "epoch": 0.676268115942029, "grad_norm": 3.4945780501329633, "learning_rate": 2.5059308973418535e-07, "loss": 0.2553, "step": 3733 }, { "epoch": 0.6764492753623188, "grad_norm": 4.7676407893054416, "learning_rate": 2.503388511067079e-07, "loss": 0.2938, "step": 3734 }, { "epoch": 0.6766304347826086, "grad_norm": 3.714527861368929, "learning_rate": 2.5008469843874705e-07, "loss": 0.2833, "step": 3735 }, { "epoch": 0.6768115942028986, "grad_norm": 5.2619405581981376, "learning_rate": 2.4983063181780827e-07, "loss": 0.3008, "step": 3736 }, { "epoch": 0.6769927536231884, "grad_norm": 5.806052309007467, "learning_rate": 2.495766513313685e-07, "loss": 0.3541, "step": 3737 }, { "epoch": 0.6771739130434783, "grad_norm": 8.334073080872042, "learning_rate": 2.4932275706687416e-07, "loss": 0.3122, "step": 3738 }, { "epoch": 0.6773550724637681, "grad_norm": 5.036014008178167, "learning_rate": 2.490689491117424e-07, "loss": 0.2791, "step": 3739 }, { "epoch": 0.677536231884058, "grad_norm": 4.894421293528215, "learning_rate": 2.4881522755336024e-07, "loss": 0.3398, "step": 3740 }, { "epoch": 0.6777173913043478, "grad_norm": 3.8092840533788253, "learning_rate": 2.485615924790855e-07, "loss": 0.29, "step": 3741 }, { "epoch": 0.6778985507246377, "grad_norm": 3.5658172235659524, "learning_rate": 2.483080439762458e-07, "loss": 0.2381, "step": 3742 }, { "epoch": 0.6780797101449275, "grad_norm": 5.211964956085503, "learning_rate": 2.4805458213213904e-07, "loss": 0.328, "step": 3743 }, { "epoch": 0.6782608695652174, "grad_norm": 3.8336922392325277, "learning_rate": 2.478012070340332e-07, "loss": 0.2769, "step": 3744 }, { "epoch": 0.6784420289855072, "grad_norm": 4.0706889002832485, "learning_rate": 2.475479187691672e-07, "loss": 0.3317, "step": 3745 }, { "epoch": 0.678623188405797, "grad_norm": 4.8548088776608145, "learning_rate": 2.472947174247486e-07, "loss": 0.3067, "step": 3746 }, { "epoch": 0.678804347826087, "grad_norm": 7.700384039520854, "learning_rate": 2.4704160308795623e-07, "loss": 0.2748, "step": 3747 }, { "epoch": 0.6789855072463769, "grad_norm": 4.377439947690035, "learning_rate": 2.4678857584593823e-07, "loss": 0.2952, "step": 3748 }, { "epoch": 0.6791666666666667, "grad_norm": 4.191160381355023, "learning_rate": 2.465356357858135e-07, "loss": 0.3186, "step": 3749 }, { "epoch": 0.6793478260869565, "grad_norm": 3.8901980606507256, "learning_rate": 2.4628278299467025e-07, "loss": 0.2575, "step": 3750 }, { "epoch": 0.6795289855072464, "grad_norm": 5.933138967534731, "learning_rate": 2.4603001755956706e-07, "loss": 0.3002, "step": 3751 }, { "epoch": 0.6797101449275362, "grad_norm": 5.885488032232096, "learning_rate": 2.4577733956753204e-07, "loss": 0.2572, "step": 3752 }, { "epoch": 0.6798913043478261, "grad_norm": 5.063156531520064, "learning_rate": 2.4552474910556366e-07, "loss": 0.2988, "step": 3753 }, { "epoch": 0.6800724637681159, "grad_norm": 3.6891494181801194, "learning_rate": 2.4527224626062983e-07, "loss": 0.31, "step": 3754 }, { "epoch": 0.6802536231884058, "grad_norm": 4.048423715052501, "learning_rate": 2.450198311196685e-07, "loss": 0.2791, "step": 3755 }, { "epoch": 0.6804347826086956, "grad_norm": 3.5643477289797474, "learning_rate": 2.447675037695875e-07, "loss": 0.2852, "step": 3756 }, { "epoch": 0.6806159420289855, "grad_norm": 7.374217912480681, "learning_rate": 2.4451526429726425e-07, "loss": 0.27, "step": 3757 }, { "epoch": 0.6807971014492754, "grad_norm": 4.320903535185061, "learning_rate": 2.4426311278954604e-07, "loss": 0.3285, "step": 3758 }, { "epoch": 0.6809782608695653, "grad_norm": 5.493615002209071, "learning_rate": 2.4401104933324973e-07, "loss": 0.3208, "step": 3759 }, { "epoch": 0.6811594202898551, "grad_norm": 3.348509491773087, "learning_rate": 2.437590740151619e-07, "loss": 0.2755, "step": 3760 }, { "epoch": 0.681340579710145, "grad_norm": 6.1246952309264735, "learning_rate": 2.435071869220392e-07, "loss": 0.2782, "step": 3761 }, { "epoch": 0.6815217391304348, "grad_norm": 5.360414590507508, "learning_rate": 2.432553881406075e-07, "loss": 0.2525, "step": 3762 }, { "epoch": 0.6817028985507246, "grad_norm": 3.154309192637602, "learning_rate": 2.430036777575619e-07, "loss": 0.2622, "step": 3763 }, { "epoch": 0.6818840579710145, "grad_norm": 8.186132924282555, "learning_rate": 2.4275205585956757e-07, "loss": 0.2174, "step": 3764 }, { "epoch": 0.6820652173913043, "grad_norm": 6.183140415361262, "learning_rate": 2.425005225332595e-07, "loss": 0.3044, "step": 3765 }, { "epoch": 0.6822463768115942, "grad_norm": 4.440254661755383, "learning_rate": 2.4224907786524154e-07, "loss": 0.2992, "step": 3766 }, { "epoch": 0.682427536231884, "grad_norm": 3.7596045205174278, "learning_rate": 2.4199772194208726e-07, "loss": 0.2947, "step": 3767 }, { "epoch": 0.6826086956521739, "grad_norm": 4.609703182758899, "learning_rate": 2.417464548503398e-07, "loss": 0.262, "step": 3768 }, { "epoch": 0.6827898550724638, "grad_norm": 3.9194289858445086, "learning_rate": 2.4149527667651166e-07, "loss": 0.3021, "step": 3769 }, { "epoch": 0.6829710144927537, "grad_norm": 5.309697719592353, "learning_rate": 2.412441875070845e-07, "loss": 0.2904, "step": 3770 }, { "epoch": 0.6831521739130435, "grad_norm": 5.481643647046267, "learning_rate": 2.409931874285096e-07, "loss": 0.3001, "step": 3771 }, { "epoch": 0.6833333333333333, "grad_norm": 4.742414949513573, "learning_rate": 2.4074227652720746e-07, "loss": 0.2664, "step": 3772 }, { "epoch": 0.6835144927536232, "grad_norm": 10.468248103439224, "learning_rate": 2.404914548895679e-07, "loss": 0.2708, "step": 3773 }, { "epoch": 0.683695652173913, "grad_norm": 4.70162866668075, "learning_rate": 2.4024072260195e-07, "loss": 0.2391, "step": 3774 }, { "epoch": 0.6838768115942029, "grad_norm": 7.271222561201524, "learning_rate": 2.3999007975068193e-07, "loss": 0.2966, "step": 3775 }, { "epoch": 0.6840579710144927, "grad_norm": 4.935387864568864, "learning_rate": 2.3973952642206146e-07, "loss": 0.3543, "step": 3776 }, { "epoch": 0.6842391304347826, "grad_norm": 3.9182071596776384, "learning_rate": 2.3948906270235524e-07, "loss": 0.2336, "step": 3777 }, { "epoch": 0.6844202898550724, "grad_norm": 5.98865061652781, "learning_rate": 2.3923868867779923e-07, "loss": 0.2335, "step": 3778 }, { "epoch": 0.6846014492753624, "grad_norm": 4.016137179708794, "learning_rate": 2.3898840443459787e-07, "loss": 0.3028, "step": 3779 }, { "epoch": 0.6847826086956522, "grad_norm": 3.359287852461111, "learning_rate": 2.3873821005892575e-07, "loss": 0.2885, "step": 3780 }, { "epoch": 0.6849637681159421, "grad_norm": 5.74270091761096, "learning_rate": 2.384881056369257e-07, "loss": 0.2542, "step": 3781 }, { "epoch": 0.6851449275362319, "grad_norm": 5.5518861750157535, "learning_rate": 2.3823809125471006e-07, "loss": 0.3141, "step": 3782 }, { "epoch": 0.6853260869565218, "grad_norm": 8.20629876964488, "learning_rate": 2.3798816699835982e-07, "loss": 0.2784, "step": 3783 }, { "epoch": 0.6855072463768116, "grad_norm": 4.634983024752152, "learning_rate": 2.3773833295392514e-07, "loss": 0.2207, "step": 3784 }, { "epoch": 0.6856884057971014, "grad_norm": 6.191718181565424, "learning_rate": 2.3748858920742498e-07, "loss": 0.2737, "step": 3785 }, { "epoch": 0.6858695652173913, "grad_norm": 5.284665196778069, "learning_rate": 2.3723893584484744e-07, "loss": 0.3055, "step": 3786 }, { "epoch": 0.6860507246376811, "grad_norm": 3.6539431978292742, "learning_rate": 2.3698937295214905e-07, "loss": 0.2796, "step": 3787 }, { "epoch": 0.686231884057971, "grad_norm": 5.738723518638768, "learning_rate": 2.367399006152559e-07, "loss": 0.2868, "step": 3788 }, { "epoch": 0.6864130434782608, "grad_norm": 3.93371117987035, "learning_rate": 2.364905189200625e-07, "loss": 0.2718, "step": 3789 }, { "epoch": 0.6865942028985508, "grad_norm": 5.756673818183177, "learning_rate": 2.3624122795243183e-07, "loss": 0.3469, "step": 3790 }, { "epoch": 0.6867753623188406, "grad_norm": 3.6251648800464054, "learning_rate": 2.359920277981959e-07, "loss": 0.2632, "step": 3791 }, { "epoch": 0.6869565217391305, "grad_norm": 3.729185749355193, "learning_rate": 2.3574291854315582e-07, "loss": 0.3147, "step": 3792 }, { "epoch": 0.6871376811594203, "grad_norm": 4.260088636735624, "learning_rate": 2.3549390027308103e-07, "loss": 0.3128, "step": 3793 }, { "epoch": 0.6873188405797102, "grad_norm": 5.971931144202631, "learning_rate": 2.3524497307370954e-07, "loss": 0.2944, "step": 3794 }, { "epoch": 0.6875, "grad_norm": 7.096882484653727, "learning_rate": 2.3499613703074834e-07, "loss": 0.283, "step": 3795 }, { "epoch": 0.6876811594202898, "grad_norm": 7.532401824587726, "learning_rate": 2.3474739222987277e-07, "loss": 0.2555, "step": 3796 }, { "epoch": 0.6878623188405797, "grad_norm": 4.837707478698999, "learning_rate": 2.344987387567268e-07, "loss": 0.2844, "step": 3797 }, { "epoch": 0.6880434782608695, "grad_norm": 4.5302550670805015, "learning_rate": 2.342501766969231e-07, "loss": 0.2845, "step": 3798 }, { "epoch": 0.6882246376811594, "grad_norm": 4.107654960421121, "learning_rate": 2.340017061360427e-07, "loss": 0.299, "step": 3799 }, { "epoch": 0.6884057971014492, "grad_norm": 5.1077384860866335, "learning_rate": 2.337533271596352e-07, "loss": 0.3354, "step": 3800 }, { "epoch": 0.6884057971014492, "eval_loss": 0.28162500262260437, "eval_runtime": 9.7581, "eval_samples_per_second": 51.239, "eval_steps_per_second": 0.102, "step": 3800 }, { "epoch": 0.6885869565217392, "grad_norm": 3.895599508421316, "learning_rate": 2.3350503985321863e-07, "loss": 0.2299, "step": 3801 }, { "epoch": 0.688768115942029, "grad_norm": 6.36223034603808, "learning_rate": 2.3325684430227953e-07, "loss": 0.2933, "step": 3802 }, { "epoch": 0.6889492753623189, "grad_norm": 5.793919492476561, "learning_rate": 2.3300874059227265e-07, "loss": 0.2676, "step": 3803 }, { "epoch": 0.6891304347826087, "grad_norm": 3.5537249304223066, "learning_rate": 2.3276072880862159e-07, "loss": 0.2871, "step": 3804 }, { "epoch": 0.6893115942028986, "grad_norm": 7.85421289156873, "learning_rate": 2.3251280903671793e-07, "loss": 0.2793, "step": 3805 }, { "epoch": 0.6894927536231884, "grad_norm": 5.2015059804309836, "learning_rate": 2.322649813619214e-07, "loss": 0.2999, "step": 3806 }, { "epoch": 0.6896739130434782, "grad_norm": 3.89391652983408, "learning_rate": 2.3201724586956013e-07, "loss": 0.264, "step": 3807 }, { "epoch": 0.6898550724637681, "grad_norm": 4.940018783241108, "learning_rate": 2.31769602644931e-07, "loss": 0.3566, "step": 3808 }, { "epoch": 0.6900362318840579, "grad_norm": 3.577183307372419, "learning_rate": 2.315220517732986e-07, "loss": 0.2273, "step": 3809 }, { "epoch": 0.6902173913043478, "grad_norm": 8.334791736171915, "learning_rate": 2.3127459333989578e-07, "loss": 0.2323, "step": 3810 }, { "epoch": 0.6903985507246376, "grad_norm": 5.026998604234912, "learning_rate": 2.310272274299238e-07, "loss": 0.2943, "step": 3811 }, { "epoch": 0.6905797101449276, "grad_norm": 7.631456430315683, "learning_rate": 2.3077995412855183e-07, "loss": 0.3053, "step": 3812 }, { "epoch": 0.6907608695652174, "grad_norm": 5.904942862059751, "learning_rate": 2.305327735209172e-07, "loss": 0.227, "step": 3813 }, { "epoch": 0.6909420289855073, "grad_norm": 4.466594245637513, "learning_rate": 2.3028568569212526e-07, "loss": 0.279, "step": 3814 }, { "epoch": 0.6911231884057971, "grad_norm": 4.2423410047423715, "learning_rate": 2.3003869072724986e-07, "loss": 0.2641, "step": 3815 }, { "epoch": 0.691304347826087, "grad_norm": 4.085576865913926, "learning_rate": 2.2979178871133255e-07, "loss": 0.2751, "step": 3816 }, { "epoch": 0.6914855072463768, "grad_norm": 5.300833733691943, "learning_rate": 2.295449797293824e-07, "loss": 0.3022, "step": 3817 }, { "epoch": 0.6916666666666667, "grad_norm": 4.350403258352893, "learning_rate": 2.2929826386637703e-07, "loss": 0.2694, "step": 3818 }, { "epoch": 0.6918478260869565, "grad_norm": 7.237121567159541, "learning_rate": 2.290516412072622e-07, "loss": 0.2486, "step": 3819 }, { "epoch": 0.6920289855072463, "grad_norm": 5.413961554673111, "learning_rate": 2.288051118369511e-07, "loss": 0.3938, "step": 3820 }, { "epoch": 0.6922101449275362, "grad_norm": 8.626655468906, "learning_rate": 2.2855867584032496e-07, "loss": 0.291, "step": 3821 }, { "epoch": 0.6923913043478261, "grad_norm": 4.157941098726383, "learning_rate": 2.2831233330223282e-07, "loss": 0.3305, "step": 3822 }, { "epoch": 0.692572463768116, "grad_norm": 7.99926831495356, "learning_rate": 2.2806608430749158e-07, "loss": 0.2769, "step": 3823 }, { "epoch": 0.6927536231884058, "grad_norm": 7.928550603128422, "learning_rate": 2.2781992894088599e-07, "loss": 0.3252, "step": 3824 }, { "epoch": 0.6929347826086957, "grad_norm": 4.941603073940312, "learning_rate": 2.2757386728716849e-07, "loss": 0.266, "step": 3825 }, { "epoch": 0.6931159420289855, "grad_norm": 5.025470239545978, "learning_rate": 2.2732789943105924e-07, "loss": 0.2733, "step": 3826 }, { "epoch": 0.6932971014492754, "grad_norm": 3.5713345321840717, "learning_rate": 2.270820254572462e-07, "loss": 0.2453, "step": 3827 }, { "epoch": 0.6934782608695652, "grad_norm": 3.723252805044335, "learning_rate": 2.2683624545038488e-07, "loss": 0.2809, "step": 3828 }, { "epoch": 0.693659420289855, "grad_norm": 3.98260193173211, "learning_rate": 2.2659055949509852e-07, "loss": 0.2941, "step": 3829 }, { "epoch": 0.6938405797101449, "grad_norm": 4.697418403635444, "learning_rate": 2.2634496767597784e-07, "loss": 0.3369, "step": 3830 }, { "epoch": 0.6940217391304347, "grad_norm": 5.99252702407414, "learning_rate": 2.2609947007758152e-07, "loss": 0.2697, "step": 3831 }, { "epoch": 0.6942028985507246, "grad_norm": 6.180970861163161, "learning_rate": 2.2585406678443558e-07, "loss": 0.3008, "step": 3832 }, { "epoch": 0.6943840579710145, "grad_norm": 3.9945378122709516, "learning_rate": 2.2560875788103323e-07, "loss": 0.2973, "step": 3833 }, { "epoch": 0.6945652173913044, "grad_norm": 3.142306917182887, "learning_rate": 2.2536354345183545e-07, "loss": 0.2305, "step": 3834 }, { "epoch": 0.6947463768115942, "grad_norm": 3.5138372491509857, "learning_rate": 2.2511842358127114e-07, "loss": 0.2518, "step": 3835 }, { "epoch": 0.6949275362318841, "grad_norm": 2.8847508407145437, "learning_rate": 2.2487339835373593e-07, "loss": 0.2212, "step": 3836 }, { "epoch": 0.6951086956521739, "grad_norm": 10.539855375468377, "learning_rate": 2.246284678535933e-07, "loss": 0.2988, "step": 3837 }, { "epoch": 0.6952898550724638, "grad_norm": 4.358687419164801, "learning_rate": 2.243836321651739e-07, "loss": 0.2793, "step": 3838 }, { "epoch": 0.6954710144927536, "grad_norm": 3.5529227307144238, "learning_rate": 2.2413889137277586e-07, "loss": 0.2875, "step": 3839 }, { "epoch": 0.6956521739130435, "grad_norm": 6.488059008046831, "learning_rate": 2.2389424556066455e-07, "loss": 0.2217, "step": 3840 }, { "epoch": 0.6958333333333333, "grad_norm": 3.83985159372148, "learning_rate": 2.2364969481307272e-07, "loss": 0.2709, "step": 3841 }, { "epoch": 0.6960144927536231, "grad_norm": 5.792415216894456, "learning_rate": 2.2340523921420034e-07, "loss": 0.2792, "step": 3842 }, { "epoch": 0.696195652173913, "grad_norm": 3.633719525565086, "learning_rate": 2.2316087884821467e-07, "loss": 0.2446, "step": 3843 }, { "epoch": 0.696376811594203, "grad_norm": 5.443811432586644, "learning_rate": 2.2291661379925008e-07, "loss": 0.3033, "step": 3844 }, { "epoch": 0.6965579710144928, "grad_norm": 3.793910056797811, "learning_rate": 2.22672444151408e-07, "loss": 0.3072, "step": 3845 }, { "epoch": 0.6967391304347826, "grad_norm": 4.581990500799826, "learning_rate": 2.2242836998875763e-07, "loss": 0.2587, "step": 3846 }, { "epoch": 0.6969202898550725, "grad_norm": 3.9969392833061437, "learning_rate": 2.2218439139533462e-07, "loss": 0.252, "step": 3847 }, { "epoch": 0.6971014492753623, "grad_norm": 3.813600695419182, "learning_rate": 2.2194050845514212e-07, "loss": 0.292, "step": 3848 }, { "epoch": 0.6972826086956522, "grad_norm": 8.126997778204192, "learning_rate": 2.2169672125214971e-07, "loss": 0.2962, "step": 3849 }, { "epoch": 0.697463768115942, "grad_norm": 3.817536113558924, "learning_rate": 2.2145302987029495e-07, "loss": 0.3088, "step": 3850 }, { "epoch": 0.6976449275362319, "grad_norm": 6.393869814890446, "learning_rate": 2.2120943439348184e-07, "loss": 0.2606, "step": 3851 }, { "epoch": 0.6978260869565217, "grad_norm": 4.84019102550198, "learning_rate": 2.2096593490558134e-07, "loss": 0.2768, "step": 3852 }, { "epoch": 0.6980072463768116, "grad_norm": 7.712696443643765, "learning_rate": 2.2072253149043158e-07, "loss": 0.3077, "step": 3853 }, { "epoch": 0.6981884057971014, "grad_norm": 5.247110955916728, "learning_rate": 2.2047922423183746e-07, "loss": 0.3003, "step": 3854 }, { "epoch": 0.6983695652173914, "grad_norm": 6.554265872852244, "learning_rate": 2.2023601321357082e-07, "loss": 0.235, "step": 3855 }, { "epoch": 0.6985507246376812, "grad_norm": 3.8444108134514385, "learning_rate": 2.199928985193704e-07, "loss": 0.2716, "step": 3856 }, { "epoch": 0.698731884057971, "grad_norm": 8.164588563505138, "learning_rate": 2.1974988023294156e-07, "loss": 0.3179, "step": 3857 }, { "epoch": 0.6989130434782609, "grad_norm": 6.013200076195491, "learning_rate": 2.1950695843795697e-07, "loss": 0.2845, "step": 3858 }, { "epoch": 0.6990942028985507, "grad_norm": 4.48517778665608, "learning_rate": 2.1926413321805574e-07, "loss": 0.3093, "step": 3859 }, { "epoch": 0.6992753623188406, "grad_norm": 4.817871914286493, "learning_rate": 2.190214046568435e-07, "loss": 0.2713, "step": 3860 }, { "epoch": 0.6994565217391304, "grad_norm": 5.423984674913027, "learning_rate": 2.187787728378927e-07, "loss": 0.3008, "step": 3861 }, { "epoch": 0.6996376811594203, "grad_norm": 4.036100771245654, "learning_rate": 2.18536237844743e-07, "loss": 0.2845, "step": 3862 }, { "epoch": 0.6998188405797101, "grad_norm": 4.0802816512856666, "learning_rate": 2.1829379976090028e-07, "loss": 0.2702, "step": 3863 }, { "epoch": 0.7, "grad_norm": 8.461585122702223, "learning_rate": 2.180514586698371e-07, "loss": 0.2683, "step": 3864 }, { "epoch": 0.7001811594202898, "grad_norm": 6.164450221428653, "learning_rate": 2.1780921465499252e-07, "loss": 0.3211, "step": 3865 }, { "epoch": 0.7003623188405798, "grad_norm": 3.234841078108479, "learning_rate": 2.175670677997724e-07, "loss": 0.2292, "step": 3866 }, { "epoch": 0.7005434782608696, "grad_norm": 3.893588969357438, "learning_rate": 2.1732501818754906e-07, "loss": 0.298, "step": 3867 }, { "epoch": 0.7007246376811594, "grad_norm": 8.101903589515599, "learning_rate": 2.1708306590166126e-07, "loss": 0.2877, "step": 3868 }, { "epoch": 0.7009057971014493, "grad_norm": 8.238000132321554, "learning_rate": 2.1684121102541435e-07, "loss": 0.3257, "step": 3869 }, { "epoch": 0.7010869565217391, "grad_norm": 6.974761142208887, "learning_rate": 2.1659945364208017e-07, "loss": 0.3155, "step": 3870 }, { "epoch": 0.701268115942029, "grad_norm": 4.785887971435475, "learning_rate": 2.1635779383489687e-07, "loss": 0.2602, "step": 3871 }, { "epoch": 0.7014492753623188, "grad_norm": 3.6665107106925614, "learning_rate": 2.1611623168706905e-07, "loss": 0.3032, "step": 3872 }, { "epoch": 0.7016304347826087, "grad_norm": 4.266989471594792, "learning_rate": 2.1587476728176757e-07, "loss": 0.3152, "step": 3873 }, { "epoch": 0.7018115942028985, "grad_norm": 3.477226931322972, "learning_rate": 2.156334007021301e-07, "loss": 0.2901, "step": 3874 }, { "epoch": 0.7019927536231884, "grad_norm": 5.14395421208077, "learning_rate": 2.1539213203126034e-07, "loss": 0.3104, "step": 3875 }, { "epoch": 0.7021739130434783, "grad_norm": 5.257762792478757, "learning_rate": 2.1515096135222775e-07, "loss": 0.3226, "step": 3876 }, { "epoch": 0.7023550724637682, "grad_norm": 3.9327063916925433, "learning_rate": 2.149098887480687e-07, "loss": 0.2662, "step": 3877 }, { "epoch": 0.702536231884058, "grad_norm": 4.931721664454086, "learning_rate": 2.146689143017859e-07, "loss": 0.3296, "step": 3878 }, { "epoch": 0.7027173913043478, "grad_norm": 7.7016531339644505, "learning_rate": 2.1442803809634785e-07, "loss": 0.2648, "step": 3879 }, { "epoch": 0.7028985507246377, "grad_norm": 3.651930880124436, "learning_rate": 2.1418726021468937e-07, "loss": 0.2563, "step": 3880 }, { "epoch": 0.7030797101449275, "grad_norm": 5.508046338388239, "learning_rate": 2.1394658073971135e-07, "loss": 0.3018, "step": 3881 }, { "epoch": 0.7032608695652174, "grad_norm": 3.7365230202239585, "learning_rate": 2.13705999754281e-07, "loss": 0.248, "step": 3882 }, { "epoch": 0.7034420289855072, "grad_norm": 3.683401828315749, "learning_rate": 2.1346551734123136e-07, "loss": 0.295, "step": 3883 }, { "epoch": 0.7036231884057971, "grad_norm": 3.6852567553988553, "learning_rate": 2.1322513358336158e-07, "loss": 0.2618, "step": 3884 }, { "epoch": 0.7038043478260869, "grad_norm": 5.8496857708182075, "learning_rate": 2.1298484856343724e-07, "loss": 0.3161, "step": 3885 }, { "epoch": 0.7039855072463768, "grad_norm": 4.110668816683411, "learning_rate": 2.1274466236418963e-07, "loss": 0.3302, "step": 3886 }, { "epoch": 0.7041666666666667, "grad_norm": 3.443120256525678, "learning_rate": 2.1250457506831565e-07, "loss": 0.257, "step": 3887 }, { "epoch": 0.7043478260869566, "grad_norm": 5.992851148463246, "learning_rate": 2.1226458675847847e-07, "loss": 0.322, "step": 3888 }, { "epoch": 0.7045289855072464, "grad_norm": 5.735196706510981, "learning_rate": 2.1202469751730757e-07, "loss": 0.2557, "step": 3889 }, { "epoch": 0.7047101449275363, "grad_norm": 4.543808253257551, "learning_rate": 2.1178490742739773e-07, "loss": 0.3094, "step": 3890 }, { "epoch": 0.7048913043478261, "grad_norm": 4.176058969502284, "learning_rate": 2.1154521657130985e-07, "loss": 0.2956, "step": 3891 }, { "epoch": 0.7050724637681159, "grad_norm": 5.820421759039495, "learning_rate": 2.1130562503157068e-07, "loss": 0.263, "step": 3892 }, { "epoch": 0.7052536231884058, "grad_norm": 3.864294812391932, "learning_rate": 2.1106613289067266e-07, "loss": 0.2593, "step": 3893 }, { "epoch": 0.7054347826086956, "grad_norm": 4.013625144244584, "learning_rate": 2.108267402310741e-07, "loss": 0.2783, "step": 3894 }, { "epoch": 0.7056159420289855, "grad_norm": 3.4819882548369487, "learning_rate": 2.10587447135199e-07, "loss": 0.2769, "step": 3895 }, { "epoch": 0.7057971014492753, "grad_norm": 4.494904366891253, "learning_rate": 2.1034825368543713e-07, "loss": 0.2751, "step": 3896 }, { "epoch": 0.7059782608695652, "grad_norm": 3.788228229461717, "learning_rate": 2.1010915996414387e-07, "loss": 0.2715, "step": 3897 }, { "epoch": 0.7061594202898551, "grad_norm": 4.558146418442677, "learning_rate": 2.0987016605364038e-07, "loss": 0.2485, "step": 3898 }, { "epoch": 0.706340579710145, "grad_norm": 9.011667071487581, "learning_rate": 2.096312720362134e-07, "loss": 0.3279, "step": 3899 }, { "epoch": 0.7065217391304348, "grad_norm": 4.033816335560063, "learning_rate": 2.093924779941151e-07, "loss": 0.2674, "step": 3900 }, { "epoch": 0.7065217391304348, "eval_loss": 0.2748437523841858, "eval_runtime": 9.7949, "eval_samples_per_second": 51.047, "eval_steps_per_second": 0.102, "step": 3900 }, { "epoch": 0.7067028985507247, "grad_norm": 6.570008043991807, "learning_rate": 2.091537840095637e-07, "loss": 0.3518, "step": 3901 }, { "epoch": 0.7068840579710145, "grad_norm": 8.488845064009187, "learning_rate": 2.0891519016474268e-07, "loss": 0.2664, "step": 3902 }, { "epoch": 0.7070652173913043, "grad_norm": 7.184044904634563, "learning_rate": 2.086766965418007e-07, "loss": 0.3019, "step": 3903 }, { "epoch": 0.7072463768115942, "grad_norm": 6.153126550834495, "learning_rate": 2.0843830322285227e-07, "loss": 0.3337, "step": 3904 }, { "epoch": 0.707427536231884, "grad_norm": 3.18806571497542, "learning_rate": 2.0820001028997763e-07, "loss": 0.2554, "step": 3905 }, { "epoch": 0.7076086956521739, "grad_norm": 7.231120940129814, "learning_rate": 2.079618178252221e-07, "loss": 0.292, "step": 3906 }, { "epoch": 0.7077898550724637, "grad_norm": 6.9688938353825645, "learning_rate": 2.0772372591059633e-07, "loss": 0.2545, "step": 3907 }, { "epoch": 0.7079710144927536, "grad_norm": 4.725859624784996, "learning_rate": 2.0748573462807657e-07, "loss": 0.2582, "step": 3908 }, { "epoch": 0.7081521739130435, "grad_norm": 4.493747199323723, "learning_rate": 2.0724784405960438e-07, "loss": 0.249, "step": 3909 }, { "epoch": 0.7083333333333334, "grad_norm": 4.05813342984372, "learning_rate": 2.070100542870865e-07, "loss": 0.3141, "step": 3910 }, { "epoch": 0.7085144927536232, "grad_norm": 3.646999104315953, "learning_rate": 2.0677236539239523e-07, "loss": 0.2483, "step": 3911 }, { "epoch": 0.7086956521739131, "grad_norm": 7.400528883186729, "learning_rate": 2.0653477745736786e-07, "loss": 0.2589, "step": 3912 }, { "epoch": 0.7088768115942029, "grad_norm": 3.923596584462551, "learning_rate": 2.0629729056380708e-07, "loss": 0.2462, "step": 3913 }, { "epoch": 0.7090579710144927, "grad_norm": 10.302374494290001, "learning_rate": 2.0605990479348072e-07, "loss": 0.2842, "step": 3914 }, { "epoch": 0.7092391304347826, "grad_norm": 4.363029422198087, "learning_rate": 2.058226202281217e-07, "loss": 0.2841, "step": 3915 }, { "epoch": 0.7094202898550724, "grad_norm": 3.1212162356638316, "learning_rate": 2.0558543694942853e-07, "loss": 0.1978, "step": 3916 }, { "epoch": 0.7096014492753623, "grad_norm": 4.278531571620851, "learning_rate": 2.0534835503906446e-07, "loss": 0.3357, "step": 3917 }, { "epoch": 0.7097826086956521, "grad_norm": 9.792183444454981, "learning_rate": 2.0511137457865797e-07, "loss": 0.3221, "step": 3918 }, { "epoch": 0.7099637681159421, "grad_norm": 3.365095405079111, "learning_rate": 2.0487449564980202e-07, "loss": 0.2504, "step": 3919 }, { "epoch": 0.7101449275362319, "grad_norm": 8.133772695587682, "learning_rate": 2.046377183340558e-07, "loss": 0.3301, "step": 3920 }, { "epoch": 0.7103260869565218, "grad_norm": 9.7820690195649, "learning_rate": 2.0440104271294257e-07, "loss": 0.2816, "step": 3921 }, { "epoch": 0.7105072463768116, "grad_norm": 7.165834345146065, "learning_rate": 2.041644688679509e-07, "loss": 0.286, "step": 3922 }, { "epoch": 0.7106884057971015, "grad_norm": 7.163950001306182, "learning_rate": 2.0392799688053435e-07, "loss": 0.2729, "step": 3923 }, { "epoch": 0.7108695652173913, "grad_norm": 3.328188233958173, "learning_rate": 2.0369162683211129e-07, "loss": 0.247, "step": 3924 }, { "epoch": 0.7110507246376812, "grad_norm": 3.438127332975321, "learning_rate": 2.03455358804065e-07, "loss": 0.2189, "step": 3925 }, { "epoch": 0.711231884057971, "grad_norm": 6.8672147674006, "learning_rate": 2.0321919287774374e-07, "loss": 0.2694, "step": 3926 }, { "epoch": 0.7114130434782608, "grad_norm": 4.262116419446079, "learning_rate": 2.0298312913446042e-07, "loss": 0.3133, "step": 3927 }, { "epoch": 0.7115942028985507, "grad_norm": 3.8195989168364486, "learning_rate": 2.027471676554932e-07, "loss": 0.265, "step": 3928 }, { "epoch": 0.7117753623188405, "grad_norm": 4.205182147749765, "learning_rate": 2.025113085220847e-07, "loss": 0.2808, "step": 3929 }, { "epoch": 0.7119565217391305, "grad_norm": 8.755210542150248, "learning_rate": 2.022755518154421e-07, "loss": 0.2946, "step": 3930 }, { "epoch": 0.7121376811594203, "grad_norm": 5.157590356896721, "learning_rate": 2.020398976167374e-07, "loss": 0.2964, "step": 3931 }, { "epoch": 0.7123188405797102, "grad_norm": 4.19122884140391, "learning_rate": 2.0180434600710794e-07, "loss": 0.2569, "step": 3932 }, { "epoch": 0.7125, "grad_norm": 5.382443262970783, "learning_rate": 2.0156889706765506e-07, "loss": 0.3063, "step": 3933 }, { "epoch": 0.7126811594202899, "grad_norm": 3.7882097904317646, "learning_rate": 2.0133355087944488e-07, "loss": 0.2797, "step": 3934 }, { "epoch": 0.7128623188405797, "grad_norm": 5.14816199052616, "learning_rate": 2.010983075235082e-07, "loss": 0.2382, "step": 3935 }, { "epoch": 0.7130434782608696, "grad_norm": 5.251671940574674, "learning_rate": 2.0086316708084055e-07, "loss": 0.3133, "step": 3936 }, { "epoch": 0.7132246376811594, "grad_norm": 4.655460168551347, "learning_rate": 2.0062812963240177e-07, "loss": 0.2487, "step": 3937 }, { "epoch": 0.7134057971014492, "grad_norm": 8.264429332214254, "learning_rate": 2.003931952591164e-07, "loss": 0.2998, "step": 3938 }, { "epoch": 0.7135869565217391, "grad_norm": 4.289325963492413, "learning_rate": 2.0015836404187348e-07, "loss": 0.2999, "step": 3939 }, { "epoch": 0.7137681159420289, "grad_norm": 4.135300883105886, "learning_rate": 1.999236360615265e-07, "loss": 0.334, "step": 3940 }, { "epoch": 0.7139492753623189, "grad_norm": 6.914956714424375, "learning_rate": 1.9968901139889343e-07, "loss": 0.289, "step": 3941 }, { "epoch": 0.7141304347826087, "grad_norm": 3.7373158418014905, "learning_rate": 1.9945449013475663e-07, "loss": 0.2569, "step": 3942 }, { "epoch": 0.7143115942028986, "grad_norm": 4.7759805108158355, "learning_rate": 1.992200723498627e-07, "loss": 0.2658, "step": 3943 }, { "epoch": 0.7144927536231884, "grad_norm": 3.8138379019022457, "learning_rate": 1.9898575812492317e-07, "loss": 0.2783, "step": 3944 }, { "epoch": 0.7146739130434783, "grad_norm": 4.250834228868454, "learning_rate": 1.9875154754061351e-07, "loss": 0.302, "step": 3945 }, { "epoch": 0.7148550724637681, "grad_norm": 5.5105589011747655, "learning_rate": 1.9851744067757324e-07, "loss": 0.2726, "step": 3946 }, { "epoch": 0.715036231884058, "grad_norm": 4.4198010547508675, "learning_rate": 1.9828343761640642e-07, "loss": 0.2381, "step": 3947 }, { "epoch": 0.7152173913043478, "grad_norm": 5.537535517595483, "learning_rate": 1.9804953843768174e-07, "loss": 0.3453, "step": 3948 }, { "epoch": 0.7153985507246376, "grad_norm": 7.963945946747752, "learning_rate": 1.9781574322193168e-07, "loss": 0.3154, "step": 3949 }, { "epoch": 0.7155797101449275, "grad_norm": 3.9181213151800827, "learning_rate": 1.9758205204965294e-07, "loss": 0.2582, "step": 3950 }, { "epoch": 0.7157608695652173, "grad_norm": 4.562695087166056, "learning_rate": 1.9734846500130665e-07, "loss": 0.2763, "step": 3951 }, { "epoch": 0.7159420289855073, "grad_norm": 5.2157578413710715, "learning_rate": 1.971149821573178e-07, "loss": 0.3168, "step": 3952 }, { "epoch": 0.7161231884057971, "grad_norm": 6.969112931137719, "learning_rate": 1.9688160359807571e-07, "loss": 0.3432, "step": 3953 }, { "epoch": 0.716304347826087, "grad_norm": 4.125750770221318, "learning_rate": 1.9664832940393355e-07, "loss": 0.265, "step": 3954 }, { "epoch": 0.7164855072463768, "grad_norm": 7.183623564070895, "learning_rate": 1.9641515965520905e-07, "loss": 0.2924, "step": 3955 }, { "epoch": 0.7166666666666667, "grad_norm": 4.4769166730914725, "learning_rate": 1.9618209443218363e-07, "loss": 0.305, "step": 3956 }, { "epoch": 0.7168478260869565, "grad_norm": 4.150501688576831, "learning_rate": 1.9594913381510246e-07, "loss": 0.2535, "step": 3957 }, { "epoch": 0.7170289855072464, "grad_norm": 5.819210510466341, "learning_rate": 1.95716277884175e-07, "loss": 0.2811, "step": 3958 }, { "epoch": 0.7172101449275362, "grad_norm": 7.3890368754842655, "learning_rate": 1.9548352671957496e-07, "loss": 0.2869, "step": 3959 }, { "epoch": 0.717391304347826, "grad_norm": 3.7529413710727035, "learning_rate": 1.952508804014395e-07, "loss": 0.2644, "step": 3960 }, { "epoch": 0.7175724637681159, "grad_norm": 4.702756802345117, "learning_rate": 1.950183390098698e-07, "loss": 0.3067, "step": 3961 }, { "epoch": 0.7177536231884057, "grad_norm": 6.550127858133256, "learning_rate": 1.9478590262493106e-07, "loss": 0.2847, "step": 3962 }, { "epoch": 0.7179347826086957, "grad_norm": 6.313935907248912, "learning_rate": 1.9455357132665218e-07, "loss": 0.2468, "step": 3963 }, { "epoch": 0.7181159420289855, "grad_norm": 4.057518227244305, "learning_rate": 1.943213451950259e-07, "loss": 0.313, "step": 3964 }, { "epoch": 0.7182971014492754, "grad_norm": 4.204988904175218, "learning_rate": 1.9408922431000885e-07, "loss": 0.2439, "step": 3965 }, { "epoch": 0.7184782608695652, "grad_norm": 5.061574208602043, "learning_rate": 1.9385720875152122e-07, "loss": 0.3147, "step": 3966 }, { "epoch": 0.7186594202898551, "grad_norm": 4.5808412053038445, "learning_rate": 1.9362529859944727e-07, "loss": 0.2769, "step": 3967 }, { "epoch": 0.7188405797101449, "grad_norm": 4.0086085354532965, "learning_rate": 1.9339349393363458e-07, "loss": 0.2761, "step": 3968 }, { "epoch": 0.7190217391304348, "grad_norm": 5.049972341909374, "learning_rate": 1.931617948338946e-07, "loss": 0.312, "step": 3969 }, { "epoch": 0.7192028985507246, "grad_norm": 7.540720873630649, "learning_rate": 1.9293020138000244e-07, "loss": 0.2534, "step": 3970 }, { "epoch": 0.7193840579710145, "grad_norm": 4.634285839162342, "learning_rate": 1.9269871365169692e-07, "loss": 0.3176, "step": 3971 }, { "epoch": 0.7195652173913043, "grad_norm": 4.619762711532391, "learning_rate": 1.9246733172868056e-07, "loss": 0.3036, "step": 3972 }, { "epoch": 0.7197463768115943, "grad_norm": 5.05537239517053, "learning_rate": 1.9223605569061885e-07, "loss": 0.3593, "step": 3973 }, { "epoch": 0.7199275362318841, "grad_norm": 3.7669551631783085, "learning_rate": 1.9200488561714118e-07, "loss": 0.2793, "step": 3974 }, { "epoch": 0.720108695652174, "grad_norm": 7.9264972886021035, "learning_rate": 1.9177382158784088e-07, "loss": 0.2951, "step": 3975 }, { "epoch": 0.7202898550724638, "grad_norm": 3.966875002065434, "learning_rate": 1.9154286368227423e-07, "loss": 0.2998, "step": 3976 }, { "epoch": 0.7204710144927536, "grad_norm": 7.076021952740293, "learning_rate": 1.913120119799611e-07, "loss": 0.297, "step": 3977 }, { "epoch": 0.7206521739130435, "grad_norm": 4.144593857369443, "learning_rate": 1.9108126656038482e-07, "loss": 0.2999, "step": 3978 }, { "epoch": 0.7208333333333333, "grad_norm": 4.238234717643975, "learning_rate": 1.908506275029922e-07, "loss": 0.2742, "step": 3979 }, { "epoch": 0.7210144927536232, "grad_norm": 4.055815296261728, "learning_rate": 1.9062009488719326e-07, "loss": 0.2943, "step": 3980 }, { "epoch": 0.721195652173913, "grad_norm": 3.5371083302642146, "learning_rate": 1.903896687923615e-07, "loss": 0.2992, "step": 3981 }, { "epoch": 0.7213768115942029, "grad_norm": 7.77354011345013, "learning_rate": 1.9015934929783383e-07, "loss": 0.2567, "step": 3982 }, { "epoch": 0.7215579710144927, "grad_norm": 5.88584406704857, "learning_rate": 1.899291364829102e-07, "loss": 0.3499, "step": 3983 }, { "epoch": 0.7217391304347827, "grad_norm": 5.695720052748647, "learning_rate": 1.8969903042685403e-07, "loss": 0.3163, "step": 3984 }, { "epoch": 0.7219202898550725, "grad_norm": 4.028077446196472, "learning_rate": 1.894690312088919e-07, "loss": 0.2874, "step": 3985 }, { "epoch": 0.7221014492753624, "grad_norm": 6.088330308383606, "learning_rate": 1.8923913890821352e-07, "loss": 0.3097, "step": 3986 }, { "epoch": 0.7222826086956522, "grad_norm": 3.721897113873162, "learning_rate": 1.890093536039722e-07, "loss": 0.2999, "step": 3987 }, { "epoch": 0.722463768115942, "grad_norm": 4.939991099997167, "learning_rate": 1.8877967537528405e-07, "loss": 0.2817, "step": 3988 }, { "epoch": 0.7226449275362319, "grad_norm": 5.561433227333897, "learning_rate": 1.8855010430122798e-07, "loss": 0.2955, "step": 3989 }, { "epoch": 0.7228260869565217, "grad_norm": 8.647609171184131, "learning_rate": 1.8832064046084683e-07, "loss": 0.3036, "step": 3990 }, { "epoch": 0.7230072463768116, "grad_norm": 6.130374411993332, "learning_rate": 1.8809128393314595e-07, "loss": 0.2071, "step": 3991 }, { "epoch": 0.7231884057971014, "grad_norm": 3.0488322090344013, "learning_rate": 1.8786203479709383e-07, "loss": 0.2276, "step": 3992 }, { "epoch": 0.7233695652173913, "grad_norm": 8.082891288733899, "learning_rate": 1.8763289313162212e-07, "loss": 0.2735, "step": 3993 }, { "epoch": 0.7235507246376811, "grad_norm": 3.9796336873473774, "learning_rate": 1.874038590156253e-07, "loss": 0.2992, "step": 3994 }, { "epoch": 0.7237318840579711, "grad_norm": 5.486330471952747, "learning_rate": 1.871749325279609e-07, "loss": 0.2946, "step": 3995 }, { "epoch": 0.7239130434782609, "grad_norm": 4.123125049435668, "learning_rate": 1.869461137474495e-07, "loss": 0.2979, "step": 3996 }, { "epoch": 0.7240942028985508, "grad_norm": 7.711667669591236, "learning_rate": 1.8671740275287416e-07, "loss": 0.316, "step": 3997 }, { "epoch": 0.7242753623188406, "grad_norm": 4.361146302612137, "learning_rate": 1.864887996229817e-07, "loss": 0.2792, "step": 3998 }, { "epoch": 0.7244565217391304, "grad_norm": 6.372879497140796, "learning_rate": 1.8626030443648105e-07, "loss": 0.3497, "step": 3999 }, { "epoch": 0.7246376811594203, "grad_norm": 5.290022841495765, "learning_rate": 1.8603191727204398e-07, "loss": 0.3032, "step": 4000 }, { "epoch": 0.7246376811594203, "eval_loss": 0.2775624990463257, "eval_runtime": 9.8232, "eval_samples_per_second": 50.9, "eval_steps_per_second": 0.102, "step": 4000 }, { "epoch": 0.7248188405797101, "grad_norm": 3.7837835685083743, "learning_rate": 1.8580363820830525e-07, "loss": 0.2328, "step": 4001 }, { "epoch": 0.725, "grad_norm": 11.243154586682063, "learning_rate": 1.855754673238627e-07, "loss": 0.3296, "step": 4002 }, { "epoch": 0.7251811594202898, "grad_norm": 4.283002776116116, "learning_rate": 1.8534740469727655e-07, "loss": 0.3029, "step": 4003 }, { "epoch": 0.7253623188405797, "grad_norm": 4.460776168875511, "learning_rate": 1.8511945040706984e-07, "loss": 0.3147, "step": 4004 }, { "epoch": 0.7255434782608695, "grad_norm": 7.1945108217623694, "learning_rate": 1.848916045317283e-07, "loss": 0.276, "step": 4005 }, { "epoch": 0.7257246376811595, "grad_norm": 4.454930156948343, "learning_rate": 1.8466386714970027e-07, "loss": 0.3214, "step": 4006 }, { "epoch": 0.7259057971014493, "grad_norm": 6.783122275683271, "learning_rate": 1.8443623833939693e-07, "loss": 0.2924, "step": 4007 }, { "epoch": 0.7260869565217392, "grad_norm": 4.581667747058475, "learning_rate": 1.8420871817919187e-07, "loss": 0.2937, "step": 4008 }, { "epoch": 0.726268115942029, "grad_norm": 6.188306253550094, "learning_rate": 1.839813067474214e-07, "loss": 0.2735, "step": 4009 }, { "epoch": 0.7264492753623188, "grad_norm": 5.070865441582673, "learning_rate": 1.837540041223844e-07, "loss": 0.3237, "step": 4010 }, { "epoch": 0.7266304347826087, "grad_norm": 4.001889637709018, "learning_rate": 1.8352681038234212e-07, "loss": 0.2719, "step": 4011 }, { "epoch": 0.7268115942028985, "grad_norm": 5.72533614339234, "learning_rate": 1.8329972560551854e-07, "loss": 0.2997, "step": 4012 }, { "epoch": 0.7269927536231884, "grad_norm": 4.695592503012075, "learning_rate": 1.830727498700998e-07, "loss": 0.2615, "step": 4013 }, { "epoch": 0.7271739130434782, "grad_norm": 7.1601274393712, "learning_rate": 1.8284588325423505e-07, "loss": 0.2591, "step": 4014 }, { "epoch": 0.7273550724637681, "grad_norm": 5.915948575911907, "learning_rate": 1.826191258360356e-07, "loss": 0.2983, "step": 4015 }, { "epoch": 0.7275362318840579, "grad_norm": 3.3690810921377534, "learning_rate": 1.823924776935748e-07, "loss": 0.2572, "step": 4016 }, { "epoch": 0.7277173913043479, "grad_norm": 3.6344854353821767, "learning_rate": 1.821659389048885e-07, "loss": 0.3033, "step": 4017 }, { "epoch": 0.7278985507246377, "grad_norm": 4.010645014504151, "learning_rate": 1.8193950954797565e-07, "loss": 0.3045, "step": 4018 }, { "epoch": 0.7280797101449276, "grad_norm": 3.784089514278222, "learning_rate": 1.8171318970079658e-07, "loss": 0.2926, "step": 4019 }, { "epoch": 0.7282608695652174, "grad_norm": 5.136532169035658, "learning_rate": 1.8148697944127438e-07, "loss": 0.309, "step": 4020 }, { "epoch": 0.7284420289855073, "grad_norm": 4.9724965037391735, "learning_rate": 1.8126087884729434e-07, "loss": 0.2961, "step": 4021 }, { "epoch": 0.7286231884057971, "grad_norm": 15.440337642479328, "learning_rate": 1.8103488799670395e-07, "loss": 0.3103, "step": 4022 }, { "epoch": 0.7288043478260869, "grad_norm": 4.229264400754731, "learning_rate": 1.8080900696731288e-07, "loss": 0.2177, "step": 4023 }, { "epoch": 0.7289855072463768, "grad_norm": 7.591407787228796, "learning_rate": 1.8058323583689288e-07, "loss": 0.312, "step": 4024 }, { "epoch": 0.7291666666666666, "grad_norm": 3.9192300374901547, "learning_rate": 1.8035757468317842e-07, "loss": 0.2862, "step": 4025 }, { "epoch": 0.7293478260869565, "grad_norm": 5.0747843670043835, "learning_rate": 1.8013202358386565e-07, "loss": 0.2777, "step": 4026 }, { "epoch": 0.7295289855072464, "grad_norm": 9.622895749432057, "learning_rate": 1.799065826166125e-07, "loss": 0.3371, "step": 4027 }, { "epoch": 0.7297101449275363, "grad_norm": 5.199294203354176, "learning_rate": 1.796812518590395e-07, "loss": 0.3027, "step": 4028 }, { "epoch": 0.7298913043478261, "grad_norm": 4.161505912391221, "learning_rate": 1.7945603138872933e-07, "loss": 0.29, "step": 4029 }, { "epoch": 0.730072463768116, "grad_norm": 6.124467623153883, "learning_rate": 1.792309212832263e-07, "loss": 0.2486, "step": 4030 }, { "epoch": 0.7302536231884058, "grad_norm": 3.8038601359536233, "learning_rate": 1.7900592162003692e-07, "loss": 0.2899, "step": 4031 }, { "epoch": 0.7304347826086957, "grad_norm": 4.321741200410663, "learning_rate": 1.7878103247662962e-07, "loss": 0.2809, "step": 4032 }, { "epoch": 0.7306159420289855, "grad_norm": 3.700894881311305, "learning_rate": 1.7855625393043482e-07, "loss": 0.2821, "step": 4033 }, { "epoch": 0.7307971014492753, "grad_norm": 4.112404728514255, "learning_rate": 1.7833158605884485e-07, "loss": 0.2992, "step": 4034 }, { "epoch": 0.7309782608695652, "grad_norm": 6.260070397947784, "learning_rate": 1.7810702893921387e-07, "loss": 0.3093, "step": 4035 }, { "epoch": 0.731159420289855, "grad_norm": 3.778333428132222, "learning_rate": 1.7788258264885797e-07, "loss": 0.2587, "step": 4036 }, { "epoch": 0.7313405797101449, "grad_norm": 4.904834391024922, "learning_rate": 1.7765824726505512e-07, "loss": 0.277, "step": 4037 }, { "epoch": 0.7315217391304348, "grad_norm": 5.272550025120003, "learning_rate": 1.77434022865045e-07, "loss": 0.2838, "step": 4038 }, { "epoch": 0.7317028985507247, "grad_norm": 7.441763426474568, "learning_rate": 1.7720990952602916e-07, "loss": 0.2766, "step": 4039 }, { "epoch": 0.7318840579710145, "grad_norm": 4.397405024936907, "learning_rate": 1.769859073251707e-07, "loss": 0.3033, "step": 4040 }, { "epoch": 0.7320652173913044, "grad_norm": 7.211549355034018, "learning_rate": 1.7676201633959503e-07, "loss": 0.2489, "step": 4041 }, { "epoch": 0.7322463768115942, "grad_norm": 10.135333894179256, "learning_rate": 1.7653823664638884e-07, "loss": 0.2521, "step": 4042 }, { "epoch": 0.7324275362318841, "grad_norm": 4.204932706072341, "learning_rate": 1.7631456832260017e-07, "loss": 0.2693, "step": 4043 }, { "epoch": 0.7326086956521739, "grad_norm": 5.00090250074175, "learning_rate": 1.7609101144523908e-07, "loss": 0.2872, "step": 4044 }, { "epoch": 0.7327898550724637, "grad_norm": 3.409662996576817, "learning_rate": 1.7586756609127768e-07, "loss": 0.2715, "step": 4045 }, { "epoch": 0.7329710144927536, "grad_norm": 3.3456447784627414, "learning_rate": 1.7564423233764901e-07, "loss": 0.2756, "step": 4046 }, { "epoch": 0.7331521739130434, "grad_norm": 4.044345182147048, "learning_rate": 1.75421010261248e-07, "loss": 0.2793, "step": 4047 }, { "epoch": 0.7333333333333333, "grad_norm": 6.100970187970424, "learning_rate": 1.7519789993893103e-07, "loss": 0.2995, "step": 4048 }, { "epoch": 0.7335144927536232, "grad_norm": 3.8182739620184853, "learning_rate": 1.749749014475161e-07, "loss": 0.2834, "step": 4049 }, { "epoch": 0.7336956521739131, "grad_norm": 4.831301384608513, "learning_rate": 1.7475201486378255e-07, "loss": 0.2715, "step": 4050 }, { "epoch": 0.7338768115942029, "grad_norm": 3.0211303095125146, "learning_rate": 1.7452924026447136e-07, "loss": 0.2327, "step": 4051 }, { "epoch": 0.7340579710144928, "grad_norm": 5.447840050625781, "learning_rate": 1.7430657772628488e-07, "loss": 0.2921, "step": 4052 }, { "epoch": 0.7342391304347826, "grad_norm": 7.112618612472881, "learning_rate": 1.7408402732588685e-07, "loss": 0.2825, "step": 4053 }, { "epoch": 0.7344202898550725, "grad_norm": 7.231675239000602, "learning_rate": 1.738615891399024e-07, "loss": 0.3269, "step": 4054 }, { "epoch": 0.7346014492753623, "grad_norm": 4.143875646341541, "learning_rate": 1.7363926324491813e-07, "loss": 0.2305, "step": 4055 }, { "epoch": 0.7347826086956522, "grad_norm": 10.373148091535601, "learning_rate": 1.734170497174816e-07, "loss": 0.2494, "step": 4056 }, { "epoch": 0.734963768115942, "grad_norm": 13.364313613074309, "learning_rate": 1.731949486341025e-07, "loss": 0.3307, "step": 4057 }, { "epoch": 0.7351449275362318, "grad_norm": 4.382852748350924, "learning_rate": 1.7297296007125112e-07, "loss": 0.292, "step": 4058 }, { "epoch": 0.7353260869565217, "grad_norm": 5.327589801474438, "learning_rate": 1.7275108410535877e-07, "loss": 0.2167, "step": 4059 }, { "epoch": 0.7355072463768116, "grad_norm": 6.394193193401852, "learning_rate": 1.725293208128189e-07, "loss": 0.3071, "step": 4060 }, { "epoch": 0.7356884057971015, "grad_norm": 8.234081461035954, "learning_rate": 1.7230767026998543e-07, "loss": 0.333, "step": 4061 }, { "epoch": 0.7358695652173913, "grad_norm": 5.857308893031953, "learning_rate": 1.720861325531737e-07, "loss": 0.3001, "step": 4062 }, { "epoch": 0.7360507246376812, "grad_norm": 2.933292943525002, "learning_rate": 1.7186470773866025e-07, "loss": 0.2111, "step": 4063 }, { "epoch": 0.736231884057971, "grad_norm": 4.109004963026581, "learning_rate": 1.7164339590268267e-07, "loss": 0.2748, "step": 4064 }, { "epoch": 0.7364130434782609, "grad_norm": 8.67488643719478, "learning_rate": 1.7142219712143968e-07, "loss": 0.3017, "step": 4065 }, { "epoch": 0.7365942028985507, "grad_norm": 3.9605724776002447, "learning_rate": 1.71201111471091e-07, "loss": 0.2592, "step": 4066 }, { "epoch": 0.7367753623188406, "grad_norm": 4.891730972876826, "learning_rate": 1.7098013902775736e-07, "loss": 0.2716, "step": 4067 }, { "epoch": 0.7369565217391304, "grad_norm": 10.637408114660769, "learning_rate": 1.7075927986752104e-07, "loss": 0.3638, "step": 4068 }, { "epoch": 0.7371376811594202, "grad_norm": 3.7711968309292203, "learning_rate": 1.7053853406642471e-07, "loss": 0.2878, "step": 4069 }, { "epoch": 0.7373188405797102, "grad_norm": 4.7111630448958435, "learning_rate": 1.70317901700472e-07, "loss": 0.2631, "step": 4070 }, { "epoch": 0.7375, "grad_norm": 4.766524430188335, "learning_rate": 1.700973828456276e-07, "loss": 0.3398, "step": 4071 }, { "epoch": 0.7376811594202899, "grad_norm": 4.878166475157653, "learning_rate": 1.698769775778176e-07, "loss": 0.3311, "step": 4072 }, { "epoch": 0.7378623188405797, "grad_norm": 4.5110435891535055, "learning_rate": 1.6965668597292832e-07, "loss": 0.3185, "step": 4073 }, { "epoch": 0.7380434782608696, "grad_norm": 6.105604566424603, "learning_rate": 1.694365081068073e-07, "loss": 0.3055, "step": 4074 }, { "epoch": 0.7382246376811594, "grad_norm": 9.808078236948342, "learning_rate": 1.692164440552628e-07, "loss": 0.3059, "step": 4075 }, { "epoch": 0.7384057971014493, "grad_norm": 6.478678720347312, "learning_rate": 1.6899649389406384e-07, "loss": 0.3526, "step": 4076 }, { "epoch": 0.7385869565217391, "grad_norm": 7.119297399782839, "learning_rate": 1.6877665769894038e-07, "loss": 0.3319, "step": 4077 }, { "epoch": 0.738768115942029, "grad_norm": 5.277490930196286, "learning_rate": 1.685569355455831e-07, "loss": 0.2769, "step": 4078 }, { "epoch": 0.7389492753623188, "grad_norm": 5.556273927347214, "learning_rate": 1.683373275096433e-07, "loss": 0.2494, "step": 4079 }, { "epoch": 0.7391304347826086, "grad_norm": 4.3268188693427465, "learning_rate": 1.6811783366673304e-07, "loss": 0.3031, "step": 4080 }, { "epoch": 0.7393115942028986, "grad_norm": 3.8310131069680864, "learning_rate": 1.678984540924252e-07, "loss": 0.252, "step": 4081 }, { "epoch": 0.7394927536231884, "grad_norm": 5.287240234109758, "learning_rate": 1.6767918886225307e-07, "loss": 0.2477, "step": 4082 }, { "epoch": 0.7396739130434783, "grad_norm": 3.4545222855171565, "learning_rate": 1.6746003805171067e-07, "loss": 0.2697, "step": 4083 }, { "epoch": 0.7398550724637681, "grad_norm": 3.609924065994071, "learning_rate": 1.6724100173625293e-07, "loss": 0.2983, "step": 4084 }, { "epoch": 0.740036231884058, "grad_norm": 4.6170587748580765, "learning_rate": 1.6702207999129515e-07, "loss": 0.3274, "step": 4085 }, { "epoch": 0.7402173913043478, "grad_norm": 4.300091182191505, "learning_rate": 1.6680327289221267e-07, "loss": 0.3186, "step": 4086 }, { "epoch": 0.7403985507246377, "grad_norm": 3.9430702549675587, "learning_rate": 1.6658458051434193e-07, "loss": 0.2607, "step": 4087 }, { "epoch": 0.7405797101449275, "grad_norm": 6.253163173036239, "learning_rate": 1.6636600293298003e-07, "loss": 0.284, "step": 4088 }, { "epoch": 0.7407608695652174, "grad_norm": 3.676676767813719, "learning_rate": 1.6614754022338412e-07, "loss": 0.2747, "step": 4089 }, { "epoch": 0.7409420289855072, "grad_norm": 5.784409718586341, "learning_rate": 1.659291924607719e-07, "loss": 0.2615, "step": 4090 }, { "epoch": 0.741123188405797, "grad_norm": 4.577076389372262, "learning_rate": 1.6571095972032161e-07, "loss": 0.2761, "step": 4091 }, { "epoch": 0.741304347826087, "grad_norm": 5.024957066205122, "learning_rate": 1.654928420771718e-07, "loss": 0.2974, "step": 4092 }, { "epoch": 0.7414855072463769, "grad_norm": 4.326184341671534, "learning_rate": 1.6527483960642135e-07, "loss": 0.2399, "step": 4093 }, { "epoch": 0.7416666666666667, "grad_norm": 11.610959921859596, "learning_rate": 1.6505695238312944e-07, "loss": 0.3054, "step": 4094 }, { "epoch": 0.7418478260869565, "grad_norm": 5.813119547517348, "learning_rate": 1.64839180482316e-07, "loss": 0.3284, "step": 4095 }, { "epoch": 0.7420289855072464, "grad_norm": 10.051182128455602, "learning_rate": 1.646215239789609e-07, "loss": 0.3024, "step": 4096 }, { "epoch": 0.7422101449275362, "grad_norm": 6.03298777596573, "learning_rate": 1.6440398294800394e-07, "loss": 0.3374, "step": 4097 }, { "epoch": 0.7423913043478261, "grad_norm": 4.861400252041628, "learning_rate": 1.6418655746434563e-07, "loss": 0.2641, "step": 4098 }, { "epoch": 0.7425724637681159, "grad_norm": 4.274509348863295, "learning_rate": 1.639692476028468e-07, "loss": 0.2832, "step": 4099 }, { "epoch": 0.7427536231884058, "grad_norm": 8.73539491030266, "learning_rate": 1.6375205343832825e-07, "loss": 0.2552, "step": 4100 }, { "epoch": 0.7427536231884058, "eval_loss": 0.2782343626022339, "eval_runtime": 9.7517, "eval_samples_per_second": 51.273, "eval_steps_per_second": 0.103, "step": 4100 }, { "epoch": 0.7429347826086956, "grad_norm": 6.997640893148164, "learning_rate": 1.6353497504557085e-07, "loss": 0.3903, "step": 4101 }, { "epoch": 0.7431159420289855, "grad_norm": 3.405098061964635, "learning_rate": 1.633180124993157e-07, "loss": 0.2123, "step": 4102 }, { "epoch": 0.7432971014492754, "grad_norm": 5.5333548242744435, "learning_rate": 1.6310116587426415e-07, "loss": 0.3348, "step": 4103 }, { "epoch": 0.7434782608695653, "grad_norm": 6.414892445450211, "learning_rate": 1.628844352450774e-07, "loss": 0.2959, "step": 4104 }, { "epoch": 0.7436594202898551, "grad_norm": 3.7926079170974045, "learning_rate": 1.6266782068637692e-07, "loss": 0.294, "step": 4105 }, { "epoch": 0.743840579710145, "grad_norm": 7.616807850241395, "learning_rate": 1.6245132227274406e-07, "loss": 0.3031, "step": 4106 }, { "epoch": 0.7440217391304348, "grad_norm": 9.835890341953727, "learning_rate": 1.622349400787203e-07, "loss": 0.3392, "step": 4107 }, { "epoch": 0.7442028985507246, "grad_norm": 8.789924256009957, "learning_rate": 1.6201867417880699e-07, "loss": 0.2862, "step": 4108 }, { "epoch": 0.7443840579710145, "grad_norm": 7.889504668661902, "learning_rate": 1.6180252464746558e-07, "loss": 0.2928, "step": 4109 }, { "epoch": 0.7445652173913043, "grad_norm": 8.359257683253054, "learning_rate": 1.615864915591172e-07, "loss": 0.2426, "step": 4110 }, { "epoch": 0.7447463768115942, "grad_norm": 7.616949080245882, "learning_rate": 1.6137057498814338e-07, "loss": 0.3115, "step": 4111 }, { "epoch": 0.744927536231884, "grad_norm": 3.9817073099023292, "learning_rate": 1.6115477500888518e-07, "loss": 0.3411, "step": 4112 }, { "epoch": 0.7451086956521739, "grad_norm": 6.522618790958368, "learning_rate": 1.609390916956433e-07, "loss": 0.343, "step": 4113 }, { "epoch": 0.7452898550724638, "grad_norm": 5.04808828468488, "learning_rate": 1.6072352512267846e-07, "loss": 0.3392, "step": 4114 }, { "epoch": 0.7454710144927537, "grad_norm": 5.154141129836924, "learning_rate": 1.6050807536421163e-07, "loss": 0.2626, "step": 4115 }, { "epoch": 0.7456521739130435, "grad_norm": 5.605569529132893, "learning_rate": 1.6029274249442299e-07, "loss": 0.2807, "step": 4116 }, { "epoch": 0.7458333333333333, "grad_norm": 4.878872963968999, "learning_rate": 1.6007752658745267e-07, "loss": 0.3379, "step": 4117 }, { "epoch": 0.7460144927536232, "grad_norm": 4.022436621642571, "learning_rate": 1.5986242771740056e-07, "loss": 0.2585, "step": 4118 }, { "epoch": 0.746195652173913, "grad_norm": 4.520911370530145, "learning_rate": 1.5964744595832614e-07, "loss": 0.2227, "step": 4119 }, { "epoch": 0.7463768115942029, "grad_norm": 5.066824652835237, "learning_rate": 1.5943258138424875e-07, "loss": 0.2846, "step": 4120 }, { "epoch": 0.7465579710144927, "grad_norm": 6.506074663360499, "learning_rate": 1.5921783406914724e-07, "loss": 0.2517, "step": 4121 }, { "epoch": 0.7467391304347826, "grad_norm": 4.407061276506215, "learning_rate": 1.5900320408696007e-07, "loss": 0.2575, "step": 4122 }, { "epoch": 0.7469202898550724, "grad_norm": 4.093206575359533, "learning_rate": 1.5878869151158542e-07, "loss": 0.2934, "step": 4123 }, { "epoch": 0.7471014492753624, "grad_norm": 3.5818064242370404, "learning_rate": 1.5857429641688097e-07, "loss": 0.2489, "step": 4124 }, { "epoch": 0.7472826086956522, "grad_norm": 8.332450278959207, "learning_rate": 1.58360018876664e-07, "loss": 0.2742, "step": 4125 }, { "epoch": 0.7474637681159421, "grad_norm": 4.117337045630649, "learning_rate": 1.581458589647111e-07, "loss": 0.3022, "step": 4126 }, { "epoch": 0.7476449275362319, "grad_norm": 7.547400067842371, "learning_rate": 1.5793181675475885e-07, "loss": 0.3691, "step": 4127 }, { "epoch": 0.7478260869565218, "grad_norm": 3.9307806662900324, "learning_rate": 1.57717892320503e-07, "loss": 0.2103, "step": 4128 }, { "epoch": 0.7480072463768116, "grad_norm": 4.483912691908728, "learning_rate": 1.5750408573559827e-07, "loss": 0.3202, "step": 4129 }, { "epoch": 0.7481884057971014, "grad_norm": 3.648510635186452, "learning_rate": 1.5729039707365977e-07, "loss": 0.2698, "step": 4130 }, { "epoch": 0.7483695652173913, "grad_norm": 7.938871952174254, "learning_rate": 1.570768264082613e-07, "loss": 0.3275, "step": 4131 }, { "epoch": 0.7485507246376811, "grad_norm": 8.708073803465622, "learning_rate": 1.5686337381293635e-07, "loss": 0.3507, "step": 4132 }, { "epoch": 0.748731884057971, "grad_norm": 3.471292359775845, "learning_rate": 1.566500393611776e-07, "loss": 0.2842, "step": 4133 }, { "epoch": 0.7489130434782608, "grad_norm": 4.23335422597505, "learning_rate": 1.5643682312643714e-07, "loss": 0.2565, "step": 4134 }, { "epoch": 0.7490942028985508, "grad_norm": 4.407386327956635, "learning_rate": 1.562237251821263e-07, "loss": 0.2333, "step": 4135 }, { "epoch": 0.7492753623188406, "grad_norm": 4.846907412454899, "learning_rate": 1.560107456016157e-07, "loss": 0.261, "step": 4136 }, { "epoch": 0.7494565217391305, "grad_norm": 3.6805621141324236, "learning_rate": 1.5579788445823512e-07, "loss": 0.2783, "step": 4137 }, { "epoch": 0.7496376811594203, "grad_norm": 4.3644613723940076, "learning_rate": 1.5558514182527392e-07, "loss": 0.2745, "step": 4138 }, { "epoch": 0.7498188405797102, "grad_norm": 4.645181489364351, "learning_rate": 1.5537251777598043e-07, "loss": 0.2913, "step": 4139 }, { "epoch": 0.75, "grad_norm": 4.069215011046552, "learning_rate": 1.551600123835618e-07, "loss": 0.2611, "step": 4140 }, { "epoch": 0.7501811594202898, "grad_norm": 8.70871320591032, "learning_rate": 1.5494762572118464e-07, "loss": 0.2652, "step": 4141 }, { "epoch": 0.7503623188405797, "grad_norm": 3.677680153020506, "learning_rate": 1.54735357861975e-07, "loss": 0.324, "step": 4142 }, { "epoch": 0.7505434782608695, "grad_norm": 4.291613794269187, "learning_rate": 1.5452320887901755e-07, "loss": 0.2715, "step": 4143 }, { "epoch": 0.7507246376811594, "grad_norm": 8.488776567402915, "learning_rate": 1.543111788453561e-07, "loss": 0.2785, "step": 4144 }, { "epoch": 0.7509057971014492, "grad_norm": 4.370941696431717, "learning_rate": 1.540992678339938e-07, "loss": 0.3212, "step": 4145 }, { "epoch": 0.7510869565217392, "grad_norm": 4.909632496990874, "learning_rate": 1.5388747591789242e-07, "loss": 0.2714, "step": 4146 }, { "epoch": 0.751268115942029, "grad_norm": 3.9703608273347317, "learning_rate": 1.5367580316997302e-07, "loss": 0.3184, "step": 4147 }, { "epoch": 0.7514492753623189, "grad_norm": 4.362969990390846, "learning_rate": 1.534642496631155e-07, "loss": 0.3372, "step": 4148 }, { "epoch": 0.7516304347826087, "grad_norm": 5.1601994634830595, "learning_rate": 1.5325281547015879e-07, "loss": 0.264, "step": 4149 }, { "epoch": 0.7518115942028986, "grad_norm": 5.3739980521431905, "learning_rate": 1.530415006639006e-07, "loss": 0.2723, "step": 4150 }, { "epoch": 0.7519927536231884, "grad_norm": 4.5117810317927285, "learning_rate": 1.5283030531709763e-07, "loss": 0.3054, "step": 4151 }, { "epoch": 0.7521739130434782, "grad_norm": 4.2660004626055255, "learning_rate": 1.5261922950246548e-07, "loss": 0.2832, "step": 4152 }, { "epoch": 0.7523550724637681, "grad_norm": 5.480095079200267, "learning_rate": 1.5240827329267835e-07, "loss": 0.2521, "step": 4153 }, { "epoch": 0.7525362318840579, "grad_norm": 7.851072070959068, "learning_rate": 1.521974367603699e-07, "loss": 0.2841, "step": 4154 }, { "epoch": 0.7527173913043478, "grad_norm": 3.8152780757210905, "learning_rate": 1.5198671997813195e-07, "loss": 0.236, "step": 4155 }, { "epoch": 0.7528985507246376, "grad_norm": 5.010142250718976, "learning_rate": 1.517761230185151e-07, "loss": 0.3008, "step": 4156 }, { "epoch": 0.7530797101449276, "grad_norm": 3.626894823200263, "learning_rate": 1.5156564595402894e-07, "loss": 0.2288, "step": 4157 }, { "epoch": 0.7532608695652174, "grad_norm": 5.786197465607927, "learning_rate": 1.513552888571419e-07, "loss": 0.3024, "step": 4158 }, { "epoch": 0.7534420289855073, "grad_norm": 4.253387001983209, "learning_rate": 1.5114505180028075e-07, "loss": 0.2839, "step": 4159 }, { "epoch": 0.7536231884057971, "grad_norm": 4.481154963364705, "learning_rate": 1.5093493485583126e-07, "loss": 0.2959, "step": 4160 }, { "epoch": 0.753804347826087, "grad_norm": 7.810375492123668, "learning_rate": 1.5072493809613756e-07, "loss": 0.3178, "step": 4161 }, { "epoch": 0.7539855072463768, "grad_norm": 6.590657181605559, "learning_rate": 1.5051506159350257e-07, "loss": 0.3199, "step": 4162 }, { "epoch": 0.7541666666666667, "grad_norm": 5.370073227049868, "learning_rate": 1.5030530542018784e-07, "loss": 0.2837, "step": 4163 }, { "epoch": 0.7543478260869565, "grad_norm": 3.219210745592578, "learning_rate": 1.5009566964841313e-07, "loss": 0.181, "step": 4164 }, { "epoch": 0.7545289855072463, "grad_norm": 4.216171026106692, "learning_rate": 1.498861543503574e-07, "loss": 0.2444, "step": 4165 }, { "epoch": 0.7547101449275362, "grad_norm": 4.0956290985620685, "learning_rate": 1.4967675959815772e-07, "loss": 0.2893, "step": 4166 }, { "epoch": 0.7548913043478261, "grad_norm": 3.579762785926032, "learning_rate": 1.4946748546390947e-07, "loss": 0.2187, "step": 4167 }, { "epoch": 0.755072463768116, "grad_norm": 4.42191594003051, "learning_rate": 1.492583320196667e-07, "loss": 0.2939, "step": 4168 }, { "epoch": 0.7552536231884058, "grad_norm": 6.8527628806133905, "learning_rate": 1.4904929933744215e-07, "loss": 0.2354, "step": 4169 }, { "epoch": 0.7554347826086957, "grad_norm": 8.513229368530343, "learning_rate": 1.4884038748920674e-07, "loss": 0.2917, "step": 4170 }, { "epoch": 0.7556159420289855, "grad_norm": 3.474828642165609, "learning_rate": 1.4863159654688973e-07, "loss": 0.252, "step": 4171 }, { "epoch": 0.7557971014492754, "grad_norm": 4.808820151260308, "learning_rate": 1.4842292658237883e-07, "loss": 0.2598, "step": 4172 }, { "epoch": 0.7559782608695652, "grad_norm": 5.058108048547361, "learning_rate": 1.482143776675201e-07, "loss": 0.2723, "step": 4173 }, { "epoch": 0.756159420289855, "grad_norm": 7.044222693881663, "learning_rate": 1.4800594987411797e-07, "loss": 0.2658, "step": 4174 }, { "epoch": 0.7563405797101449, "grad_norm": 4.213877280511019, "learning_rate": 1.4779764327393507e-07, "loss": 0.2941, "step": 4175 }, { "epoch": 0.7565217391304347, "grad_norm": 4.361878436395466, "learning_rate": 1.4758945793869237e-07, "loss": 0.2592, "step": 4176 }, { "epoch": 0.7567028985507246, "grad_norm": 6.568963758929252, "learning_rate": 1.4738139394006905e-07, "loss": 0.317, "step": 4177 }, { "epoch": 0.7568840579710145, "grad_norm": 3.803413229713382, "learning_rate": 1.471734513497025e-07, "loss": 0.2423, "step": 4178 }, { "epoch": 0.7570652173913044, "grad_norm": 3.8079128979911303, "learning_rate": 1.469656302391884e-07, "loss": 0.2382, "step": 4179 }, { "epoch": 0.7572463768115942, "grad_norm": 3.8940522193023743, "learning_rate": 1.467579306800804e-07, "loss": 0.2844, "step": 4180 }, { "epoch": 0.7574275362318841, "grad_norm": 6.788700170594187, "learning_rate": 1.465503527438907e-07, "loss": 0.2817, "step": 4181 }, { "epoch": 0.7576086956521739, "grad_norm": 6.452652737045557, "learning_rate": 1.4634289650208936e-07, "loss": 0.3367, "step": 4182 }, { "epoch": 0.7577898550724638, "grad_norm": 3.9382792088577037, "learning_rate": 1.4613556202610426e-07, "loss": 0.2291, "step": 4183 }, { "epoch": 0.7579710144927536, "grad_norm": 3.5800344995715885, "learning_rate": 1.4592834938732167e-07, "loss": 0.2698, "step": 4184 }, { "epoch": 0.7581521739130435, "grad_norm": 4.390570948316386, "learning_rate": 1.4572125865708617e-07, "loss": 0.2936, "step": 4185 }, { "epoch": 0.7583333333333333, "grad_norm": 6.913478110394954, "learning_rate": 1.4551428990669994e-07, "loss": 0.2905, "step": 4186 }, { "epoch": 0.7585144927536231, "grad_norm": 7.941113618589702, "learning_rate": 1.4530744320742327e-07, "loss": 0.4074, "step": 4187 }, { "epoch": 0.758695652173913, "grad_norm": 4.638788236343474, "learning_rate": 1.4510071863047445e-07, "loss": 0.2895, "step": 4188 }, { "epoch": 0.758876811594203, "grad_norm": 3.8351442733602714, "learning_rate": 1.4489411624702975e-07, "loss": 0.2842, "step": 4189 }, { "epoch": 0.7590579710144928, "grad_norm": 8.03330761938926, "learning_rate": 1.4468763612822338e-07, "loss": 0.3365, "step": 4190 }, { "epoch": 0.7592391304347826, "grad_norm": 3.6270601815734165, "learning_rate": 1.4448127834514738e-07, "loss": 0.2734, "step": 4191 }, { "epoch": 0.7594202898550725, "grad_norm": 6.530119672654416, "learning_rate": 1.4427504296885172e-07, "loss": 0.3629, "step": 4192 }, { "epoch": 0.7596014492753623, "grad_norm": 4.8994945308352, "learning_rate": 1.4406893007034426e-07, "loss": 0.2542, "step": 4193 }, { "epoch": 0.7597826086956522, "grad_norm": 5.781770686188912, "learning_rate": 1.438629397205906e-07, "loss": 0.239, "step": 4194 }, { "epoch": 0.759963768115942, "grad_norm": 6.9974053713430155, "learning_rate": 1.4365707199051418e-07, "loss": 0.3015, "step": 4195 }, { "epoch": 0.7601449275362319, "grad_norm": 3.685187860278038, "learning_rate": 1.4345132695099615e-07, "loss": 0.2604, "step": 4196 }, { "epoch": 0.7603260869565217, "grad_norm": 4.616105592637974, "learning_rate": 1.4324570467287572e-07, "loss": 0.2524, "step": 4197 }, { "epoch": 0.7605072463768116, "grad_norm": 3.754075937797778, "learning_rate": 1.430402052269497e-07, "loss": 0.2279, "step": 4198 }, { "epoch": 0.7606884057971014, "grad_norm": 4.67240179142241, "learning_rate": 1.4283482868397218e-07, "loss": 0.3127, "step": 4199 }, { "epoch": 0.7608695652173914, "grad_norm": 4.336504431516033, "learning_rate": 1.4262957511465522e-07, "loss": 0.2823, "step": 4200 }, { "epoch": 0.7608695652173914, "eval_loss": 0.27192187309265137, "eval_runtime": 9.8168, "eval_samples_per_second": 50.933, "eval_steps_per_second": 0.102, "step": 4200 }, { "epoch": 0.7610507246376812, "grad_norm": 4.7301263936488525, "learning_rate": 1.42424444589669e-07, "loss": 0.3142, "step": 4201 }, { "epoch": 0.761231884057971, "grad_norm": 7.178449701857179, "learning_rate": 1.4221943717964074e-07, "loss": 0.2501, "step": 4202 }, { "epoch": 0.7614130434782609, "grad_norm": 4.371466132811722, "learning_rate": 1.4201455295515547e-07, "loss": 0.2727, "step": 4203 }, { "epoch": 0.7615942028985507, "grad_norm": 6.684248286835398, "learning_rate": 1.4180979198675575e-07, "loss": 0.229, "step": 4204 }, { "epoch": 0.7617753623188406, "grad_norm": 5.705066448837432, "learning_rate": 1.416051543449418e-07, "loss": 0.3169, "step": 4205 }, { "epoch": 0.7619565217391304, "grad_norm": 3.8661632846651193, "learning_rate": 1.4140064010017134e-07, "loss": 0.3196, "step": 4206 }, { "epoch": 0.7621376811594203, "grad_norm": 4.329415578700389, "learning_rate": 1.4119624932285939e-07, "loss": 0.2486, "step": 4207 }, { "epoch": 0.7623188405797101, "grad_norm": 5.556767223400084, "learning_rate": 1.4099198208337905e-07, "loss": 0.306, "step": 4208 }, { "epoch": 0.7625, "grad_norm": 3.652407110653765, "learning_rate": 1.4078783845206045e-07, "loss": 0.1817, "step": 4209 }, { "epoch": 0.7626811594202898, "grad_norm": 4.486455444170379, "learning_rate": 1.4058381849919083e-07, "loss": 0.3275, "step": 4210 }, { "epoch": 0.7628623188405798, "grad_norm": 8.492892926974811, "learning_rate": 1.4037992229501533e-07, "loss": 0.3127, "step": 4211 }, { "epoch": 0.7630434782608696, "grad_norm": 4.980890710303177, "learning_rate": 1.4017614990973663e-07, "loss": 0.2785, "step": 4212 }, { "epoch": 0.7632246376811594, "grad_norm": 4.2119146247255355, "learning_rate": 1.3997250141351447e-07, "loss": 0.2526, "step": 4213 }, { "epoch": 0.7634057971014493, "grad_norm": 10.343217546623658, "learning_rate": 1.3976897687646584e-07, "loss": 0.2785, "step": 4214 }, { "epoch": 0.7635869565217391, "grad_norm": 4.545859529884203, "learning_rate": 1.3956557636866534e-07, "loss": 0.256, "step": 4215 }, { "epoch": 0.763768115942029, "grad_norm": 3.624383069864175, "learning_rate": 1.3936229996014464e-07, "loss": 0.2658, "step": 4216 }, { "epoch": 0.7639492753623188, "grad_norm": 4.112704435755177, "learning_rate": 1.3915914772089281e-07, "loss": 0.2633, "step": 4217 }, { "epoch": 0.7641304347826087, "grad_norm": 5.52640967342109, "learning_rate": 1.3895611972085609e-07, "loss": 0.2656, "step": 4218 }, { "epoch": 0.7643115942028985, "grad_norm": 6.179450699028498, "learning_rate": 1.3875321602993805e-07, "loss": 0.31, "step": 4219 }, { "epoch": 0.7644927536231884, "grad_norm": 8.567682621644742, "learning_rate": 1.385504367179993e-07, "loss": 0.2313, "step": 4220 }, { "epoch": 0.7646739130434783, "grad_norm": 5.199140701796199, "learning_rate": 1.3834778185485785e-07, "loss": 0.2846, "step": 4221 }, { "epoch": 0.7648550724637682, "grad_norm": 4.19093577587622, "learning_rate": 1.381452515102886e-07, "loss": 0.3342, "step": 4222 }, { "epoch": 0.765036231884058, "grad_norm": 6.177618278834126, "learning_rate": 1.3794284575402365e-07, "loss": 0.2689, "step": 4223 }, { "epoch": 0.7652173913043478, "grad_norm": 4.5386623077575665, "learning_rate": 1.3774056465575247e-07, "loss": 0.2402, "step": 4224 }, { "epoch": 0.7653985507246377, "grad_norm": 6.323743985541725, "learning_rate": 1.3753840828512148e-07, "loss": 0.2587, "step": 4225 }, { "epoch": 0.7655797101449275, "grad_norm": 6.098581096792941, "learning_rate": 1.3733637671173375e-07, "loss": 0.2495, "step": 4226 }, { "epoch": 0.7657608695652174, "grad_norm": 5.466505544348531, "learning_rate": 1.3713447000514967e-07, "loss": 0.2788, "step": 4227 }, { "epoch": 0.7659420289855072, "grad_norm": 8.223041813063077, "learning_rate": 1.36932688234887e-07, "loss": 0.3181, "step": 4228 }, { "epoch": 0.7661231884057971, "grad_norm": 6.39944566226906, "learning_rate": 1.367310314704201e-07, "loss": 0.2869, "step": 4229 }, { "epoch": 0.7663043478260869, "grad_norm": 3.844702541479501, "learning_rate": 1.3652949978118021e-07, "loss": 0.2424, "step": 4230 }, { "epoch": 0.7664855072463768, "grad_norm": 4.2510147854212565, "learning_rate": 1.363280932365557e-07, "loss": 0.2825, "step": 4231 }, { "epoch": 0.7666666666666667, "grad_norm": 6.29174491356392, "learning_rate": 1.3612681190589183e-07, "loss": 0.2799, "step": 4232 }, { "epoch": 0.7668478260869566, "grad_norm": 9.719189928359546, "learning_rate": 1.359256558584907e-07, "loss": 0.2976, "step": 4233 }, { "epoch": 0.7670289855072464, "grad_norm": 4.6877384723755995, "learning_rate": 1.357246251636112e-07, "loss": 0.2967, "step": 4234 }, { "epoch": 0.7672101449275363, "grad_norm": 5.024022407328368, "learning_rate": 1.3552371989046917e-07, "loss": 0.3116, "step": 4235 }, { "epoch": 0.7673913043478261, "grad_norm": 7.6677217400183215, "learning_rate": 1.3532294010823757e-07, "loss": 0.2378, "step": 4236 }, { "epoch": 0.7675724637681159, "grad_norm": 6.981662881687111, "learning_rate": 1.3512228588604542e-07, "loss": 0.2849, "step": 4237 }, { "epoch": 0.7677536231884058, "grad_norm": 4.880698556120381, "learning_rate": 1.349217572929789e-07, "loss": 0.2538, "step": 4238 }, { "epoch": 0.7679347826086956, "grad_norm": 5.136854827223748, "learning_rate": 1.347213543980813e-07, "loss": 0.3288, "step": 4239 }, { "epoch": 0.7681159420289855, "grad_norm": 3.766387750092391, "learning_rate": 1.3452107727035212e-07, "loss": 0.2618, "step": 4240 }, { "epoch": 0.7682971014492753, "grad_norm": 4.636147779595781, "learning_rate": 1.3432092597874768e-07, "loss": 0.2407, "step": 4241 }, { "epoch": 0.7684782608695652, "grad_norm": 4.165515611198434, "learning_rate": 1.34120900592181e-07, "loss": 0.246, "step": 4242 }, { "epoch": 0.7686594202898551, "grad_norm": 4.966108961627089, "learning_rate": 1.3392100117952189e-07, "loss": 0.2666, "step": 4243 }, { "epoch": 0.768840579710145, "grad_norm": 3.782537784382739, "learning_rate": 1.3372122780959654e-07, "loss": 0.2381, "step": 4244 }, { "epoch": 0.7690217391304348, "grad_norm": 6.500482548869603, "learning_rate": 1.33521580551188e-07, "loss": 0.2378, "step": 4245 }, { "epoch": 0.7692028985507247, "grad_norm": 5.397272885702307, "learning_rate": 1.3332205947303564e-07, "loss": 0.2235, "step": 4246 }, { "epoch": 0.7693840579710145, "grad_norm": 4.6291956763175035, "learning_rate": 1.3312266464383553e-07, "loss": 0.3353, "step": 4247 }, { "epoch": 0.7695652173913043, "grad_norm": 5.04456172886357, "learning_rate": 1.3292339613224036e-07, "loss": 0.2473, "step": 4248 }, { "epoch": 0.7697463768115942, "grad_norm": 4.610438720089957, "learning_rate": 1.3272425400685905e-07, "loss": 0.229, "step": 4249 }, { "epoch": 0.769927536231884, "grad_norm": 3.5357954968155334, "learning_rate": 1.3252523833625717e-07, "loss": 0.26, "step": 4250 }, { "epoch": 0.7701086956521739, "grad_norm": 5.518110853842271, "learning_rate": 1.3232634918895702e-07, "loss": 0.286, "step": 4251 }, { "epoch": 0.7702898550724637, "grad_norm": 5.276677152342091, "learning_rate": 1.3212758663343708e-07, "loss": 0.2625, "step": 4252 }, { "epoch": 0.7704710144927536, "grad_norm": 5.4594377001446555, "learning_rate": 1.3192895073813193e-07, "loss": 0.2956, "step": 4253 }, { "epoch": 0.7706521739130435, "grad_norm": 5.219013116171993, "learning_rate": 1.3173044157143282e-07, "loss": 0.2343, "step": 4254 }, { "epoch": 0.7708333333333334, "grad_norm": 8.742759598778866, "learning_rate": 1.3153205920168775e-07, "loss": 0.3239, "step": 4255 }, { "epoch": 0.7710144927536232, "grad_norm": 3.616740810665911, "learning_rate": 1.3133380369720055e-07, "loss": 0.2605, "step": 4256 }, { "epoch": 0.7711956521739131, "grad_norm": 3.917254595375453, "learning_rate": 1.3113567512623147e-07, "loss": 0.2941, "step": 4257 }, { "epoch": 0.7713768115942029, "grad_norm": 4.03465354171425, "learning_rate": 1.3093767355699715e-07, "loss": 0.2528, "step": 4258 }, { "epoch": 0.7715579710144927, "grad_norm": 4.118308765400935, "learning_rate": 1.307397990576705e-07, "loss": 0.2932, "step": 4259 }, { "epoch": 0.7717391304347826, "grad_norm": 5.919875582433207, "learning_rate": 1.3054205169638065e-07, "loss": 0.2896, "step": 4260 }, { "epoch": 0.7719202898550724, "grad_norm": 3.9386329365764245, "learning_rate": 1.303444315412129e-07, "loss": 0.2594, "step": 4261 }, { "epoch": 0.7721014492753623, "grad_norm": 8.446124708946124, "learning_rate": 1.3014693866020881e-07, "loss": 0.2993, "step": 4262 }, { "epoch": 0.7722826086956521, "grad_norm": 3.7518634564241, "learning_rate": 1.2994957312136622e-07, "loss": 0.2457, "step": 4263 }, { "epoch": 0.7724637681159421, "grad_norm": 5.277053300889531, "learning_rate": 1.2975233499263887e-07, "loss": 0.3564, "step": 4264 }, { "epoch": 0.7726449275362319, "grad_norm": 3.4737252317609815, "learning_rate": 1.295552243419369e-07, "loss": 0.2466, "step": 4265 }, { "epoch": 0.7728260869565218, "grad_norm": 4.955076209571368, "learning_rate": 1.293582412371262e-07, "loss": 0.2694, "step": 4266 }, { "epoch": 0.7730072463768116, "grad_norm": 3.829758219547843, "learning_rate": 1.2916138574602937e-07, "loss": 0.2077, "step": 4267 }, { "epoch": 0.7731884057971015, "grad_norm": 7.940474059004807, "learning_rate": 1.2896465793642459e-07, "loss": 0.3445, "step": 4268 }, { "epoch": 0.7733695652173913, "grad_norm": 3.771841398545769, "learning_rate": 1.28768057876046e-07, "loss": 0.3044, "step": 4269 }, { "epoch": 0.7735507246376812, "grad_norm": 7.247392127189758, "learning_rate": 1.285715856325838e-07, "loss": 0.2571, "step": 4270 }, { "epoch": 0.773731884057971, "grad_norm": 3.9908414542369544, "learning_rate": 1.2837524127368477e-07, "loss": 0.2574, "step": 4271 }, { "epoch": 0.7739130434782608, "grad_norm": 6.57559591721725, "learning_rate": 1.2817902486695088e-07, "loss": 0.2859, "step": 4272 }, { "epoch": 0.7740942028985507, "grad_norm": 4.360375468246992, "learning_rate": 1.279829364799405e-07, "loss": 0.2895, "step": 4273 }, { "epoch": 0.7742753623188405, "grad_norm": 12.665574219810782, "learning_rate": 1.2778697618016772e-07, "loss": 0.2782, "step": 4274 }, { "epoch": 0.7744565217391305, "grad_norm": 4.2619959146898445, "learning_rate": 1.2759114403510262e-07, "loss": 0.2907, "step": 4275 }, { "epoch": 0.7746376811594203, "grad_norm": 4.312343028709694, "learning_rate": 1.273954401121712e-07, "loss": 0.2841, "step": 4276 }, { "epoch": 0.7748188405797102, "grad_norm": 9.039239924794582, "learning_rate": 1.2719986447875497e-07, "loss": 0.3053, "step": 4277 }, { "epoch": 0.775, "grad_norm": 4.791390608295223, "learning_rate": 1.270044172021919e-07, "loss": 0.2412, "step": 4278 }, { "epoch": 0.7751811594202899, "grad_norm": 3.637247407813648, "learning_rate": 1.268090983497755e-07, "loss": 0.262, "step": 4279 }, { "epoch": 0.7753623188405797, "grad_norm": 3.7610351208892916, "learning_rate": 1.2661390798875453e-07, "loss": 0.2452, "step": 4280 }, { "epoch": 0.7755434782608696, "grad_norm": 5.234613384236787, "learning_rate": 1.2641884618633408e-07, "loss": 0.2431, "step": 4281 }, { "epoch": 0.7757246376811594, "grad_norm": 6.967885374647666, "learning_rate": 1.262239130096751e-07, "loss": 0.2687, "step": 4282 }, { "epoch": 0.7759057971014492, "grad_norm": 4.134985338581212, "learning_rate": 1.2602910852589382e-07, "loss": 0.2769, "step": 4283 }, { "epoch": 0.7760869565217391, "grad_norm": 5.074025185329541, "learning_rate": 1.258344328020624e-07, "loss": 0.2731, "step": 4284 }, { "epoch": 0.7762681159420289, "grad_norm": 7.451443342150492, "learning_rate": 1.2563988590520864e-07, "loss": 0.3038, "step": 4285 }, { "epoch": 0.7764492753623189, "grad_norm": 3.5768365120156003, "learning_rate": 1.2544546790231587e-07, "loss": 0.2547, "step": 4286 }, { "epoch": 0.7766304347826087, "grad_norm": 4.183702271956826, "learning_rate": 1.252511788603232e-07, "loss": 0.2826, "step": 4287 }, { "epoch": 0.7768115942028986, "grad_norm": 5.8009154904728595, "learning_rate": 1.2505701884612524e-07, "loss": 0.226, "step": 4288 }, { "epoch": 0.7769927536231884, "grad_norm": 4.7817198402104815, "learning_rate": 1.2486298792657223e-07, "loss": 0.333, "step": 4289 }, { "epoch": 0.7771739130434783, "grad_norm": 5.260721823023794, "learning_rate": 1.246690861684699e-07, "loss": 0.2512, "step": 4290 }, { "epoch": 0.7773550724637681, "grad_norm": 5.547021387034899, "learning_rate": 1.2447531363857955e-07, "loss": 0.2807, "step": 4291 }, { "epoch": 0.777536231884058, "grad_norm": 3.609036974458032, "learning_rate": 1.2428167040361797e-07, "loss": 0.2164, "step": 4292 }, { "epoch": 0.7777173913043478, "grad_norm": 6.372754101043129, "learning_rate": 1.2408815653025734e-07, "loss": 0.2881, "step": 4293 }, { "epoch": 0.7778985507246376, "grad_norm": 6.880907363032734, "learning_rate": 1.2389477208512565e-07, "loss": 0.3198, "step": 4294 }, { "epoch": 0.7780797101449275, "grad_norm": 5.1045788428220895, "learning_rate": 1.2370151713480614e-07, "loss": 0.3468, "step": 4295 }, { "epoch": 0.7782608695652173, "grad_norm": 6.342279152312474, "learning_rate": 1.2350839174583706e-07, "loss": 0.2995, "step": 4296 }, { "epoch": 0.7784420289855073, "grad_norm": 5.960982001109151, "learning_rate": 1.2331539598471235e-07, "loss": 0.404, "step": 4297 }, { "epoch": 0.7786231884057971, "grad_norm": 6.714653132956193, "learning_rate": 1.231225299178818e-07, "loss": 0.3202, "step": 4298 }, { "epoch": 0.778804347826087, "grad_norm": 8.308904574850047, "learning_rate": 1.2292979361174982e-07, "loss": 0.2458, "step": 4299 }, { "epoch": 0.7789855072463768, "grad_norm": 6.034433055648776, "learning_rate": 1.2273718713267655e-07, "loss": 0.2302, "step": 4300 }, { "epoch": 0.7789855072463768, "eval_loss": 0.2721562385559082, "eval_runtime": 9.7999, "eval_samples_per_second": 51.021, "eval_steps_per_second": 0.102, "step": 4300 }, { "epoch": 0.7791666666666667, "grad_norm": 3.9641529430639197, "learning_rate": 1.2254471054697724e-07, "loss": 0.2313, "step": 4301 }, { "epoch": 0.7793478260869565, "grad_norm": 4.2347563972297255, "learning_rate": 1.2235236392092247e-07, "loss": 0.2908, "step": 4302 }, { "epoch": 0.7795289855072464, "grad_norm": 5.560438683980006, "learning_rate": 1.2216014732073822e-07, "loss": 0.2839, "step": 4303 }, { "epoch": 0.7797101449275362, "grad_norm": 5.047560533143979, "learning_rate": 1.219680608126054e-07, "loss": 0.2195, "step": 4304 }, { "epoch": 0.779891304347826, "grad_norm": 4.251025729695162, "learning_rate": 1.217761044626603e-07, "loss": 0.2908, "step": 4305 }, { "epoch": 0.7800724637681159, "grad_norm": 4.081770458735122, "learning_rate": 1.2158427833699475e-07, "loss": 0.3004, "step": 4306 }, { "epoch": 0.7802536231884057, "grad_norm": 7.901593041828544, "learning_rate": 1.2139258250165502e-07, "loss": 0.3129, "step": 4307 }, { "epoch": 0.7804347826086957, "grad_norm": 4.896613534680306, "learning_rate": 1.2120101702264284e-07, "loss": 0.2588, "step": 4308 }, { "epoch": 0.7806159420289855, "grad_norm": 8.82432248622185, "learning_rate": 1.210095819659155e-07, "loss": 0.3273, "step": 4309 }, { "epoch": 0.7807971014492754, "grad_norm": 4.169852595347561, "learning_rate": 1.208182773973847e-07, "loss": 0.2874, "step": 4310 }, { "epoch": 0.7809782608695652, "grad_norm": 6.537371092731516, "learning_rate": 1.2062710338291764e-07, "loss": 0.2168, "step": 4311 }, { "epoch": 0.7811594202898551, "grad_norm": 4.538257206402436, "learning_rate": 1.204360599883364e-07, "loss": 0.2805, "step": 4312 }, { "epoch": 0.7813405797101449, "grad_norm": 5.3269107160191, "learning_rate": 1.202451472794181e-07, "loss": 0.2737, "step": 4313 }, { "epoch": 0.7815217391304348, "grad_norm": 7.211691377268884, "learning_rate": 1.2005436532189494e-07, "loss": 0.316, "step": 4314 }, { "epoch": 0.7817028985507246, "grad_norm": 6.396467829044089, "learning_rate": 1.1986371418145398e-07, "loss": 0.386, "step": 4315 }, { "epoch": 0.7818840579710145, "grad_norm": 11.572665625743616, "learning_rate": 1.1967319392373737e-07, "loss": 0.2558, "step": 4316 }, { "epoch": 0.7820652173913043, "grad_norm": 6.864690424074091, "learning_rate": 1.1948280461434208e-07, "loss": 0.2794, "step": 4317 }, { "epoch": 0.7822463768115943, "grad_norm": 6.459327350400014, "learning_rate": 1.1929254631882013e-07, "loss": 0.2517, "step": 4318 }, { "epoch": 0.7824275362318841, "grad_norm": 4.002285778979529, "learning_rate": 1.1910241910267831e-07, "loss": 0.3039, "step": 4319 }, { "epoch": 0.782608695652174, "grad_norm": 3.3428263625272843, "learning_rate": 1.1891242303137816e-07, "loss": 0.1966, "step": 4320 }, { "epoch": 0.7827898550724638, "grad_norm": 4.269009188343704, "learning_rate": 1.1872255817033655e-07, "loss": 0.2655, "step": 4321 }, { "epoch": 0.7829710144927536, "grad_norm": 3.6777823157651395, "learning_rate": 1.1853282458492481e-07, "loss": 0.2174, "step": 4322 }, { "epoch": 0.7831521739130435, "grad_norm": 5.215493351557534, "learning_rate": 1.1834322234046889e-07, "loss": 0.3128, "step": 4323 }, { "epoch": 0.7833333333333333, "grad_norm": 3.3498642280783435, "learning_rate": 1.181537515022497e-07, "loss": 0.2614, "step": 4324 }, { "epoch": 0.7835144927536232, "grad_norm": 6.5430837453714545, "learning_rate": 1.1796441213550323e-07, "loss": 0.2397, "step": 4325 }, { "epoch": 0.783695652173913, "grad_norm": 6.09958769098372, "learning_rate": 1.1777520430541976e-07, "loss": 0.2855, "step": 4326 }, { "epoch": 0.7838768115942029, "grad_norm": 7.624891119846719, "learning_rate": 1.1758612807714446e-07, "loss": 0.274, "step": 4327 }, { "epoch": 0.7840579710144927, "grad_norm": 4.417815433473692, "learning_rate": 1.1739718351577721e-07, "loss": 0.319, "step": 4328 }, { "epoch": 0.7842391304347827, "grad_norm": 5.344259716517517, "learning_rate": 1.1720837068637245e-07, "loss": 0.2053, "step": 4329 }, { "epoch": 0.7844202898550725, "grad_norm": 6.780602735357904, "learning_rate": 1.1701968965393943e-07, "loss": 0.2965, "step": 4330 }, { "epoch": 0.7846014492753624, "grad_norm": 7.368422017759983, "learning_rate": 1.1683114048344183e-07, "loss": 0.269, "step": 4331 }, { "epoch": 0.7847826086956522, "grad_norm": 5.265692290999796, "learning_rate": 1.1664272323979801e-07, "loss": 0.3048, "step": 4332 }, { "epoch": 0.784963768115942, "grad_norm": 5.773739533904738, "learning_rate": 1.1645443798788102e-07, "loss": 0.2999, "step": 4333 }, { "epoch": 0.7851449275362319, "grad_norm": 5.997367717396313, "learning_rate": 1.1626628479251827e-07, "loss": 0.3083, "step": 4334 }, { "epoch": 0.7853260869565217, "grad_norm": 5.242432066263883, "learning_rate": 1.1607826371849189e-07, "loss": 0.2283, "step": 4335 }, { "epoch": 0.7855072463768116, "grad_norm": 3.772756701272312, "learning_rate": 1.1589037483053815e-07, "loss": 0.2817, "step": 4336 }, { "epoch": 0.7856884057971014, "grad_norm": 4.221995209084304, "learning_rate": 1.1570261819334854e-07, "loss": 0.2899, "step": 4337 }, { "epoch": 0.7858695652173913, "grad_norm": 4.289148856890564, "learning_rate": 1.1551499387156838e-07, "loss": 0.2249, "step": 4338 }, { "epoch": 0.7860507246376811, "grad_norm": 3.9027639821573583, "learning_rate": 1.1532750192979745e-07, "loss": 0.28, "step": 4339 }, { "epoch": 0.7862318840579711, "grad_norm": 10.274929003346609, "learning_rate": 1.1514014243259007e-07, "loss": 0.3168, "step": 4340 }, { "epoch": 0.7864130434782609, "grad_norm": 3.723581129625457, "learning_rate": 1.1495291544445535e-07, "loss": 0.2489, "step": 4341 }, { "epoch": 0.7865942028985508, "grad_norm": 4.9455984601475595, "learning_rate": 1.1476582102985616e-07, "loss": 0.3217, "step": 4342 }, { "epoch": 0.7867753623188406, "grad_norm": 4.8970597242938405, "learning_rate": 1.145788592532101e-07, "loss": 0.2731, "step": 4343 }, { "epoch": 0.7869565217391304, "grad_norm": 9.31684561437788, "learning_rate": 1.1439203017888899e-07, "loss": 0.2532, "step": 4344 }, { "epoch": 0.7871376811594203, "grad_norm": 5.143484648090279, "learning_rate": 1.1420533387121889e-07, "loss": 0.3276, "step": 4345 }, { "epoch": 0.7873188405797101, "grad_norm": 6.421568313001237, "learning_rate": 1.1401877039448033e-07, "loss": 0.2794, "step": 4346 }, { "epoch": 0.7875, "grad_norm": 4.376016615786184, "learning_rate": 1.1383233981290775e-07, "loss": 0.2724, "step": 4347 }, { "epoch": 0.7876811594202898, "grad_norm": 7.5639722310396555, "learning_rate": 1.136460421906904e-07, "loss": 0.298, "step": 4348 }, { "epoch": 0.7878623188405797, "grad_norm": 3.7635990191806683, "learning_rate": 1.134598775919715e-07, "loss": 0.2614, "step": 4349 }, { "epoch": 0.7880434782608695, "grad_norm": 4.867646163487617, "learning_rate": 1.1327384608084801e-07, "loss": 0.2857, "step": 4350 }, { "epoch": 0.7882246376811595, "grad_norm": 4.345151585157378, "learning_rate": 1.1308794772137159e-07, "loss": 0.3154, "step": 4351 }, { "epoch": 0.7884057971014493, "grad_norm": 4.5041696021371145, "learning_rate": 1.1290218257754808e-07, "loss": 0.2754, "step": 4352 }, { "epoch": 0.7885869565217392, "grad_norm": 4.721472794674815, "learning_rate": 1.1271655071333724e-07, "loss": 0.2739, "step": 4353 }, { "epoch": 0.788768115942029, "grad_norm": 6.334531397734939, "learning_rate": 1.1253105219265297e-07, "loss": 0.2805, "step": 4354 }, { "epoch": 0.7889492753623188, "grad_norm": 5.409477526323355, "learning_rate": 1.1234568707936332e-07, "loss": 0.2499, "step": 4355 }, { "epoch": 0.7891304347826087, "grad_norm": 5.742348091227902, "learning_rate": 1.121604554372903e-07, "loss": 0.267, "step": 4356 }, { "epoch": 0.7893115942028985, "grad_norm": 3.717893113109431, "learning_rate": 1.1197535733021012e-07, "loss": 0.2898, "step": 4357 }, { "epoch": 0.7894927536231884, "grad_norm": 4.930278931744944, "learning_rate": 1.1179039282185292e-07, "loss": 0.263, "step": 4358 }, { "epoch": 0.7896739130434782, "grad_norm": 5.787542121585203, "learning_rate": 1.1160556197590288e-07, "loss": 0.271, "step": 4359 }, { "epoch": 0.7898550724637681, "grad_norm": 7.03213955596468, "learning_rate": 1.1142086485599805e-07, "loss": 0.2353, "step": 4360 }, { "epoch": 0.7900362318840579, "grad_norm": 7.639632036770238, "learning_rate": 1.112363015257306e-07, "loss": 0.247, "step": 4361 }, { "epoch": 0.7902173913043479, "grad_norm": 4.991909473610004, "learning_rate": 1.1105187204864652e-07, "loss": 0.3652, "step": 4362 }, { "epoch": 0.7903985507246377, "grad_norm": 5.0503056546086045, "learning_rate": 1.1086757648824568e-07, "loss": 0.2707, "step": 4363 }, { "epoch": 0.7905797101449276, "grad_norm": 4.87105653339573, "learning_rate": 1.1068341490798211e-07, "loss": 0.2922, "step": 4364 }, { "epoch": 0.7907608695652174, "grad_norm": 5.611936823840022, "learning_rate": 1.1049938737126352e-07, "loss": 0.2849, "step": 4365 }, { "epoch": 0.7909420289855073, "grad_norm": 6.987743600677044, "learning_rate": 1.1031549394145123e-07, "loss": 0.2533, "step": 4366 }, { "epoch": 0.7911231884057971, "grad_norm": 4.936783841785507, "learning_rate": 1.101317346818606e-07, "loss": 0.287, "step": 4367 }, { "epoch": 0.7913043478260869, "grad_norm": 6.360574917374179, "learning_rate": 1.0994810965576113e-07, "loss": 0.2127, "step": 4368 }, { "epoch": 0.7914855072463768, "grad_norm": 3.7163582835044195, "learning_rate": 1.0976461892637556e-07, "loss": 0.2467, "step": 4369 }, { "epoch": 0.7916666666666666, "grad_norm": 5.275052750252274, "learning_rate": 1.0958126255688066e-07, "loss": 0.2879, "step": 4370 }, { "epoch": 0.7918478260869565, "grad_norm": 4.897464727012649, "learning_rate": 1.0939804061040692e-07, "loss": 0.2687, "step": 4371 }, { "epoch": 0.7920289855072464, "grad_norm": 5.8651730444391506, "learning_rate": 1.0921495315003854e-07, "loss": 0.3102, "step": 4372 }, { "epoch": 0.7922101449275363, "grad_norm": 6.034431392103668, "learning_rate": 1.0903200023881331e-07, "loss": 0.226, "step": 4373 }, { "epoch": 0.7923913043478261, "grad_norm": 6.093578570390563, "learning_rate": 1.0884918193972292e-07, "loss": 0.2888, "step": 4374 }, { "epoch": 0.792572463768116, "grad_norm": 5.301588109567178, "learning_rate": 1.0866649831571228e-07, "loss": 0.2769, "step": 4375 }, { "epoch": 0.7927536231884058, "grad_norm": 3.841455526864192, "learning_rate": 1.0848394942968082e-07, "loss": 0.2262, "step": 4376 }, { "epoch": 0.7929347826086957, "grad_norm": 5.244813046494392, "learning_rate": 1.0830153534448039e-07, "loss": 0.2725, "step": 4377 }, { "epoch": 0.7931159420289855, "grad_norm": 4.855975519598137, "learning_rate": 1.0811925612291711e-07, "loss": 0.2708, "step": 4378 }, { "epoch": 0.7932971014492753, "grad_norm": 6.4299517809467055, "learning_rate": 1.0793711182775089e-07, "loss": 0.2932, "step": 4379 }, { "epoch": 0.7934782608695652, "grad_norm": 9.412166418445324, "learning_rate": 1.0775510252169473e-07, "loss": 0.295, "step": 4380 }, { "epoch": 0.793659420289855, "grad_norm": 4.66989288450747, "learning_rate": 1.0757322826741522e-07, "loss": 0.3268, "step": 4381 }, { "epoch": 0.7938405797101449, "grad_norm": 7.045510131562466, "learning_rate": 1.0739148912753254e-07, "loss": 0.2671, "step": 4382 }, { "epoch": 0.7940217391304348, "grad_norm": 4.359004547017069, "learning_rate": 1.0720988516462043e-07, "loss": 0.2749, "step": 4383 }, { "epoch": 0.7942028985507247, "grad_norm": 9.067450917670207, "learning_rate": 1.0702841644120592e-07, "loss": 0.2484, "step": 4384 }, { "epoch": 0.7943840579710145, "grad_norm": 7.562776868430158, "learning_rate": 1.0684708301976958e-07, "loss": 0.338, "step": 4385 }, { "epoch": 0.7945652173913044, "grad_norm": 6.365375507040643, "learning_rate": 1.0666588496274536e-07, "loss": 0.3289, "step": 4386 }, { "epoch": 0.7947463768115942, "grad_norm": 8.467988587426978, "learning_rate": 1.0648482233252059e-07, "loss": 0.3213, "step": 4387 }, { "epoch": 0.7949275362318841, "grad_norm": 5.051029289686665, "learning_rate": 1.0630389519143596e-07, "loss": 0.3028, "step": 4388 }, { "epoch": 0.7951086956521739, "grad_norm": 3.927823722496376, "learning_rate": 1.0612310360178567e-07, "loss": 0.2188, "step": 4389 }, { "epoch": 0.7952898550724637, "grad_norm": 5.931690158092268, "learning_rate": 1.0594244762581684e-07, "loss": 0.3128, "step": 4390 }, { "epoch": 0.7954710144927536, "grad_norm": 4.1543030349305345, "learning_rate": 1.0576192732573052e-07, "loss": 0.2016, "step": 4391 }, { "epoch": 0.7956521739130434, "grad_norm": 4.82124142941218, "learning_rate": 1.055815427636807e-07, "loss": 0.3058, "step": 4392 }, { "epoch": 0.7958333333333333, "grad_norm": 4.102743339663563, "learning_rate": 1.0540129400177439e-07, "loss": 0.3086, "step": 4393 }, { "epoch": 0.7960144927536232, "grad_norm": 10.320209982991518, "learning_rate": 1.0522118110207207e-07, "loss": 0.2979, "step": 4394 }, { "epoch": 0.7961956521739131, "grad_norm": 4.30696671470187, "learning_rate": 1.0504120412658768e-07, "loss": 0.2893, "step": 4395 }, { "epoch": 0.7963768115942029, "grad_norm": 3.82243937732715, "learning_rate": 1.048613631372881e-07, "loss": 0.3101, "step": 4396 }, { "epoch": 0.7965579710144928, "grad_norm": 4.704501285624889, "learning_rate": 1.0468165819609342e-07, "loss": 0.2772, "step": 4397 }, { "epoch": 0.7967391304347826, "grad_norm": 3.7848537954282704, "learning_rate": 1.0450208936487682e-07, "loss": 0.226, "step": 4398 }, { "epoch": 0.7969202898550725, "grad_norm": 4.452202943100958, "learning_rate": 1.0432265670546481e-07, "loss": 0.3208, "step": 4399 }, { "epoch": 0.7971014492753623, "grad_norm": 4.701155658158093, "learning_rate": 1.0414336027963683e-07, "loss": 0.3186, "step": 4400 }, { "epoch": 0.7971014492753623, "eval_loss": 0.26926562190055847, "eval_runtime": 9.7489, "eval_samples_per_second": 51.288, "eval_steps_per_second": 0.103, "step": 4400 }, { "epoch": 0.7972826086956522, "grad_norm": 4.483976895035428, "learning_rate": 1.0396420014912555e-07, "loss": 0.2769, "step": 4401 }, { "epoch": 0.797463768115942, "grad_norm": 3.9281427091930574, "learning_rate": 1.0378517637561656e-07, "loss": 0.2526, "step": 4402 }, { "epoch": 0.7976449275362318, "grad_norm": 8.835697258547217, "learning_rate": 1.0360628902074869e-07, "loss": 0.232, "step": 4403 }, { "epoch": 0.7978260869565217, "grad_norm": 4.840737002780634, "learning_rate": 1.0342753814611366e-07, "loss": 0.2697, "step": 4404 }, { "epoch": 0.7980072463768116, "grad_norm": 3.882108337877771, "learning_rate": 1.0324892381325622e-07, "loss": 0.2794, "step": 4405 }, { "epoch": 0.7981884057971015, "grad_norm": 3.6751993075289087, "learning_rate": 1.0307044608367404e-07, "loss": 0.2745, "step": 4406 }, { "epoch": 0.7983695652173913, "grad_norm": 9.78329844589078, "learning_rate": 1.0289210501881812e-07, "loss": 0.2809, "step": 4407 }, { "epoch": 0.7985507246376812, "grad_norm": 7.940037743288601, "learning_rate": 1.0271390068009205e-07, "loss": 0.2606, "step": 4408 }, { "epoch": 0.798731884057971, "grad_norm": 4.335585900794439, "learning_rate": 1.0253583312885228e-07, "loss": 0.2849, "step": 4409 }, { "epoch": 0.7989130434782609, "grad_norm": 4.800262715348598, "learning_rate": 1.0235790242640824e-07, "loss": 0.3176, "step": 4410 }, { "epoch": 0.7990942028985507, "grad_norm": 3.3199641010341474, "learning_rate": 1.021801086340226e-07, "loss": 0.2716, "step": 4411 }, { "epoch": 0.7992753623188406, "grad_norm": 5.394266166423357, "learning_rate": 1.0200245181291045e-07, "loss": 0.3093, "step": 4412 }, { "epoch": 0.7994565217391304, "grad_norm": 3.4182170503118177, "learning_rate": 1.0182493202423992e-07, "loss": 0.2184, "step": 4413 }, { "epoch": 0.7996376811594202, "grad_norm": 5.90931191839386, "learning_rate": 1.0164754932913189e-07, "loss": 0.2704, "step": 4414 }, { "epoch": 0.7998188405797102, "grad_norm": 5.174370325166519, "learning_rate": 1.0147030378866001e-07, "loss": 0.3286, "step": 4415 }, { "epoch": 0.8, "grad_norm": 4.065339598191638, "learning_rate": 1.0129319546385084e-07, "loss": 0.2577, "step": 4416 }, { "epoch": 0.8001811594202899, "grad_norm": 5.253527984832621, "learning_rate": 1.0111622441568347e-07, "loss": 0.2521, "step": 4417 }, { "epoch": 0.8003623188405797, "grad_norm": 4.030342323593917, "learning_rate": 1.009393907050901e-07, "loss": 0.2462, "step": 4418 }, { "epoch": 0.8005434782608696, "grad_norm": 8.545226693843421, "learning_rate": 1.0076269439295542e-07, "loss": 0.3408, "step": 4419 }, { "epoch": 0.8007246376811594, "grad_norm": 9.194804076339683, "learning_rate": 1.0058613554011658e-07, "loss": 0.3095, "step": 4420 }, { "epoch": 0.8009057971014493, "grad_norm": 5.415262708466341, "learning_rate": 1.0040971420736354e-07, "loss": 0.2368, "step": 4421 }, { "epoch": 0.8010869565217391, "grad_norm": 7.295542387729058, "learning_rate": 1.0023343045543942e-07, "loss": 0.2711, "step": 4422 }, { "epoch": 0.801268115942029, "grad_norm": 5.972930590199982, "learning_rate": 1.000572843450393e-07, "loss": 0.3103, "step": 4423 }, { "epoch": 0.8014492753623188, "grad_norm": 3.5939071024882954, "learning_rate": 9.988127593681112e-08, "loss": 0.2411, "step": 4424 }, { "epoch": 0.8016304347826086, "grad_norm": 3.971706738585476, "learning_rate": 9.970540529135546e-08, "loss": 0.2788, "step": 4425 }, { "epoch": 0.8018115942028986, "grad_norm": 6.617494185028553, "learning_rate": 9.952967246922533e-08, "loss": 0.2982, "step": 4426 }, { "epoch": 0.8019927536231884, "grad_norm": 3.823629960705914, "learning_rate": 9.93540775309265e-08, "loss": 0.2871, "step": 4427 }, { "epoch": 0.8021739130434783, "grad_norm": 4.443748685179726, "learning_rate": 9.917862053691711e-08, "loss": 0.3027, "step": 4428 }, { "epoch": 0.8023550724637681, "grad_norm": 3.691182788639689, "learning_rate": 9.900330154760772e-08, "loss": 0.2379, "step": 4429 }, { "epoch": 0.802536231884058, "grad_norm": 3.765337784084508, "learning_rate": 9.88281206233616e-08, "loss": 0.3134, "step": 4430 }, { "epoch": 0.8027173913043478, "grad_norm": 7.517591554219634, "learning_rate": 9.865307782449438e-08, "loss": 0.2959, "step": 4431 }, { "epoch": 0.8028985507246377, "grad_norm": 5.206635904310933, "learning_rate": 9.847817321127405e-08, "loss": 0.2965, "step": 4432 }, { "epoch": 0.8030797101449275, "grad_norm": 4.315714455320872, "learning_rate": 9.830340684392102e-08, "loss": 0.308, "step": 4433 }, { "epoch": 0.8032608695652174, "grad_norm": 4.573085504573862, "learning_rate": 9.81287787826084e-08, "loss": 0.3131, "step": 4434 }, { "epoch": 0.8034420289855072, "grad_norm": 6.0129171159902395, "learning_rate": 9.79542890874615e-08, "loss": 0.2848, "step": 4435 }, { "epoch": 0.803623188405797, "grad_norm": 4.446642005512727, "learning_rate": 9.777993781855765e-08, "loss": 0.2756, "step": 4436 }, { "epoch": 0.803804347826087, "grad_norm": 6.6918027311394646, "learning_rate": 9.760572503592684e-08, "loss": 0.3088, "step": 4437 }, { "epoch": 0.8039855072463769, "grad_norm": 3.775716611132495, "learning_rate": 9.743165079955162e-08, "loss": 0.2742, "step": 4438 }, { "epoch": 0.8041666666666667, "grad_norm": 3.8840903495932158, "learning_rate": 9.725771516936643e-08, "loss": 0.3095, "step": 4439 }, { "epoch": 0.8043478260869565, "grad_norm": 5.679148174501449, "learning_rate": 9.708391820525819e-08, "loss": 0.3871, "step": 4440 }, { "epoch": 0.8045289855072464, "grad_norm": 5.146405362566784, "learning_rate": 9.691025996706592e-08, "loss": 0.2687, "step": 4441 }, { "epoch": 0.8047101449275362, "grad_norm": 3.8738716557708237, "learning_rate": 9.673674051458102e-08, "loss": 0.2836, "step": 4442 }, { "epoch": 0.8048913043478261, "grad_norm": 4.892873410120914, "learning_rate": 9.656335990754705e-08, "loss": 0.2916, "step": 4443 }, { "epoch": 0.8050724637681159, "grad_norm": 4.20176314788988, "learning_rate": 9.639011820565985e-08, "loss": 0.2564, "step": 4444 }, { "epoch": 0.8052536231884058, "grad_norm": 3.844048241526155, "learning_rate": 9.621701546856708e-08, "loss": 0.287, "step": 4445 }, { "epoch": 0.8054347826086956, "grad_norm": 4.26916196506891, "learning_rate": 9.604405175586938e-08, "loss": 0.321, "step": 4446 }, { "epoch": 0.8056159420289855, "grad_norm": 3.755143878960769, "learning_rate": 9.587122712711848e-08, "loss": 0.2868, "step": 4447 }, { "epoch": 0.8057971014492754, "grad_norm": 4.368650120206068, "learning_rate": 9.569854164181889e-08, "loss": 0.2901, "step": 4448 }, { "epoch": 0.8059782608695653, "grad_norm": 10.91322396602523, "learning_rate": 9.55259953594269e-08, "loss": 0.2773, "step": 4449 }, { "epoch": 0.8061594202898551, "grad_norm": 6.516672511642825, "learning_rate": 9.53535883393513e-08, "loss": 0.3225, "step": 4450 }, { "epoch": 0.806340579710145, "grad_norm": 5.427710697957984, "learning_rate": 9.518132064095247e-08, "loss": 0.2355, "step": 4451 }, { "epoch": 0.8065217391304348, "grad_norm": 7.559695412580726, "learning_rate": 9.500919232354298e-08, "loss": 0.2981, "step": 4452 }, { "epoch": 0.8067028985507246, "grad_norm": 6.955825651938101, "learning_rate": 9.483720344638751e-08, "loss": 0.2448, "step": 4453 }, { "epoch": 0.8068840579710145, "grad_norm": 3.713207678670163, "learning_rate": 9.466535406870257e-08, "loss": 0.3129, "step": 4454 }, { "epoch": 0.8070652173913043, "grad_norm": 4.157397748525312, "learning_rate": 9.449364424965679e-08, "loss": 0.2848, "step": 4455 }, { "epoch": 0.8072463768115942, "grad_norm": 3.5746065286249262, "learning_rate": 9.432207404837056e-08, "loss": 0.2544, "step": 4456 }, { "epoch": 0.807427536231884, "grad_norm": 4.6564568125611485, "learning_rate": 9.415064352391638e-08, "loss": 0.271, "step": 4457 }, { "epoch": 0.8076086956521739, "grad_norm": 10.2790428533922, "learning_rate": 9.39793527353186e-08, "loss": 0.2902, "step": 4458 }, { "epoch": 0.8077898550724638, "grad_norm": 4.998771791064756, "learning_rate": 9.380820174155346e-08, "loss": 0.3238, "step": 4459 }, { "epoch": 0.8079710144927537, "grad_norm": 6.616505634224275, "learning_rate": 9.36371906015489e-08, "loss": 0.2136, "step": 4460 }, { "epoch": 0.8081521739130435, "grad_norm": 7.35628141094853, "learning_rate": 9.346631937418515e-08, "loss": 0.2473, "step": 4461 }, { "epoch": 0.8083333333333333, "grad_norm": 3.4954802861571497, "learning_rate": 9.329558811829391e-08, "loss": 0.2547, "step": 4462 }, { "epoch": 0.8085144927536232, "grad_norm": 4.182033931861492, "learning_rate": 9.312499689265862e-08, "loss": 0.3068, "step": 4463 }, { "epoch": 0.808695652173913, "grad_norm": 4.230426910270004, "learning_rate": 9.295454575601452e-08, "loss": 0.3209, "step": 4464 }, { "epoch": 0.8088768115942029, "grad_norm": 7.807013171989984, "learning_rate": 9.27842347670491e-08, "loss": 0.2917, "step": 4465 }, { "epoch": 0.8090579710144927, "grad_norm": 3.970589920750272, "learning_rate": 9.261406398440109e-08, "loss": 0.3092, "step": 4466 }, { "epoch": 0.8092391304347826, "grad_norm": 8.422901506396375, "learning_rate": 9.244403346666108e-08, "loss": 0.2765, "step": 4467 }, { "epoch": 0.8094202898550724, "grad_norm": 5.601125621354369, "learning_rate": 9.227414327237137e-08, "loss": 0.2999, "step": 4468 }, { "epoch": 0.8096014492753624, "grad_norm": 9.239891652940003, "learning_rate": 9.210439346002601e-08, "loss": 0.2802, "step": 4469 }, { "epoch": 0.8097826086956522, "grad_norm": 3.928911000826045, "learning_rate": 9.193478408807059e-08, "loss": 0.3155, "step": 4470 }, { "epoch": 0.8099637681159421, "grad_norm": 5.046923233095308, "learning_rate": 9.17653152149025e-08, "loss": 0.2582, "step": 4471 }, { "epoch": 0.8101449275362319, "grad_norm": 9.222433672154175, "learning_rate": 9.159598689887066e-08, "loss": 0.3009, "step": 4472 }, { "epoch": 0.8103260869565218, "grad_norm": 5.154128135295395, "learning_rate": 9.142679919827551e-08, "loss": 0.2422, "step": 4473 }, { "epoch": 0.8105072463768116, "grad_norm": 3.9716287840363274, "learning_rate": 9.125775217136933e-08, "loss": 0.3397, "step": 4474 }, { "epoch": 0.8106884057971014, "grad_norm": 11.52519174148552, "learning_rate": 9.108884587635568e-08, "loss": 0.4065, "step": 4475 }, { "epoch": 0.8108695652173913, "grad_norm": 4.832587139333683, "learning_rate": 9.092008037138976e-08, "loss": 0.2925, "step": 4476 }, { "epoch": 0.8110507246376811, "grad_norm": 3.738426851167183, "learning_rate": 9.075145571457854e-08, "loss": 0.2787, "step": 4477 }, { "epoch": 0.811231884057971, "grad_norm": 5.813514122224783, "learning_rate": 9.058297196398035e-08, "loss": 0.2907, "step": 4478 }, { "epoch": 0.8114130434782608, "grad_norm": 4.928152994891378, "learning_rate": 9.041462917760462e-08, "loss": 0.344, "step": 4479 }, { "epoch": 0.8115942028985508, "grad_norm": 4.7269464698545995, "learning_rate": 9.024642741341254e-08, "loss": 0.2448, "step": 4480 }, { "epoch": 0.8117753623188406, "grad_norm": 4.609460154469483, "learning_rate": 9.007836672931718e-08, "loss": 0.287, "step": 4481 }, { "epoch": 0.8119565217391305, "grad_norm": 5.455383492329131, "learning_rate": 8.99104471831823e-08, "loss": 0.3176, "step": 4482 }, { "epoch": 0.8121376811594203, "grad_norm": 5.620968383856849, "learning_rate": 8.974266883282356e-08, "loss": 0.3005, "step": 4483 }, { "epoch": 0.8123188405797102, "grad_norm": 4.85414070159053, "learning_rate": 8.957503173600777e-08, "loss": 0.3463, "step": 4484 }, { "epoch": 0.8125, "grad_norm": 4.365315467978919, "learning_rate": 8.940753595045319e-08, "loss": 0.2781, "step": 4485 }, { "epoch": 0.8126811594202898, "grad_norm": 5.696896947712273, "learning_rate": 8.924018153382944e-08, "loss": 0.2685, "step": 4486 }, { "epoch": 0.8128623188405797, "grad_norm": 3.813575653169163, "learning_rate": 8.907296854375723e-08, "loss": 0.2249, "step": 4487 }, { "epoch": 0.8130434782608695, "grad_norm": 9.336856657595598, "learning_rate": 8.89058970378091e-08, "loss": 0.304, "step": 4488 }, { "epoch": 0.8132246376811594, "grad_norm": 4.801517466990611, "learning_rate": 8.873896707350859e-08, "loss": 0.3719, "step": 4489 }, { "epoch": 0.8134057971014492, "grad_norm": 3.8749576849356604, "learning_rate": 8.857217870833017e-08, "loss": 0.225, "step": 4490 }, { "epoch": 0.8135869565217392, "grad_norm": 4.339821124136354, "learning_rate": 8.840553199969986e-08, "loss": 0.265, "step": 4491 }, { "epoch": 0.813768115942029, "grad_norm": 6.635825756875962, "learning_rate": 8.82390270049952e-08, "loss": 0.3177, "step": 4492 }, { "epoch": 0.8139492753623189, "grad_norm": 3.889016573529078, "learning_rate": 8.807266378154448e-08, "loss": 0.3056, "step": 4493 }, { "epoch": 0.8141304347826087, "grad_norm": 4.436326612962354, "learning_rate": 8.790644238662736e-08, "loss": 0.3503, "step": 4494 }, { "epoch": 0.8143115942028986, "grad_norm": 4.943557798377172, "learning_rate": 8.774036287747472e-08, "loss": 0.2885, "step": 4495 }, { "epoch": 0.8144927536231884, "grad_norm": 4.076109356131406, "learning_rate": 8.757442531126847e-08, "loss": 0.3089, "step": 4496 }, { "epoch": 0.8146739130434782, "grad_norm": 4.510012709219583, "learning_rate": 8.740862974514168e-08, "loss": 0.2659, "step": 4497 }, { "epoch": 0.8148550724637681, "grad_norm": 4.791410314064984, "learning_rate": 8.724297623617854e-08, "loss": 0.2907, "step": 4498 }, { "epoch": 0.8150362318840579, "grad_norm": 8.087254925325885, "learning_rate": 8.707746484141437e-08, "loss": 0.3569, "step": 4499 }, { "epoch": 0.8152173913043478, "grad_norm": 4.752153835035208, "learning_rate": 8.691209561783558e-08, "loss": 0.3065, "step": 4500 }, { "epoch": 0.8152173913043478, "eval_loss": 0.2705000042915344, "eval_runtime": 9.9574, "eval_samples_per_second": 50.214, "eval_steps_per_second": 0.1, "step": 4500 }, { "epoch": 0.8153985507246376, "grad_norm": 3.7412207252349905, "learning_rate": 8.674686862237945e-08, "loss": 0.2535, "step": 4501 }, { "epoch": 0.8155797101449276, "grad_norm": 4.239432844108309, "learning_rate": 8.658178391193455e-08, "loss": 0.2935, "step": 4502 }, { "epoch": 0.8157608695652174, "grad_norm": 4.807420571566828, "learning_rate": 8.641684154334011e-08, "loss": 0.2274, "step": 4503 }, { "epoch": 0.8159420289855073, "grad_norm": 3.4584932735012015, "learning_rate": 8.625204157338683e-08, "loss": 0.2878, "step": 4504 }, { "epoch": 0.8161231884057971, "grad_norm": 3.274783382714847, "learning_rate": 8.608738405881615e-08, "loss": 0.2406, "step": 4505 }, { "epoch": 0.816304347826087, "grad_norm": 3.8227126282928836, "learning_rate": 8.592286905632018e-08, "loss": 0.2698, "step": 4506 }, { "epoch": 0.8164855072463768, "grad_norm": 5.4628156362565665, "learning_rate": 8.57584966225422e-08, "loss": 0.2831, "step": 4507 }, { "epoch": 0.8166666666666667, "grad_norm": 3.9440184370132405, "learning_rate": 8.559426681407673e-08, "loss": 0.2774, "step": 4508 }, { "epoch": 0.8168478260869565, "grad_norm": 3.2194696444855793, "learning_rate": 8.543017968746863e-08, "loss": 0.2469, "step": 4509 }, { "epoch": 0.8170289855072463, "grad_norm": 8.468175360267214, "learning_rate": 8.526623529921395e-08, "loss": 0.3587, "step": 4510 }, { "epoch": 0.8172101449275362, "grad_norm": 5.483904636675657, "learning_rate": 8.510243370575948e-08, "loss": 0.3037, "step": 4511 }, { "epoch": 0.8173913043478261, "grad_norm": 3.6498687641484002, "learning_rate": 8.493877496350293e-08, "loss": 0.2844, "step": 4512 }, { "epoch": 0.817572463768116, "grad_norm": 5.538497802151831, "learning_rate": 8.477525912879279e-08, "loss": 0.314, "step": 4513 }, { "epoch": 0.8177536231884058, "grad_norm": 4.2583938931388685, "learning_rate": 8.461188625792831e-08, "loss": 0.295, "step": 4514 }, { "epoch": 0.8179347826086957, "grad_norm": 4.856622093857845, "learning_rate": 8.444865640715937e-08, "loss": 0.3069, "step": 4515 }, { "epoch": 0.8181159420289855, "grad_norm": 3.1976385386297332, "learning_rate": 8.428556963268724e-08, "loss": 0.2473, "step": 4516 }, { "epoch": 0.8182971014492754, "grad_norm": 3.5014865815986282, "learning_rate": 8.412262599066305e-08, "loss": 0.2619, "step": 4517 }, { "epoch": 0.8184782608695652, "grad_norm": 4.042521641949122, "learning_rate": 8.395982553718916e-08, "loss": 0.2372, "step": 4518 }, { "epoch": 0.818659420289855, "grad_norm": 7.272152537808116, "learning_rate": 8.379716832831851e-08, "loss": 0.3098, "step": 4519 }, { "epoch": 0.8188405797101449, "grad_norm": 4.400055328962682, "learning_rate": 8.363465442005491e-08, "loss": 0.2678, "step": 4520 }, { "epoch": 0.8190217391304347, "grad_norm": 5.800760413180088, "learning_rate": 8.347228386835258e-08, "loss": 0.2325, "step": 4521 }, { "epoch": 0.8192028985507246, "grad_norm": 7.017407354096239, "learning_rate": 8.331005672911645e-08, "loss": 0.2915, "step": 4522 }, { "epoch": 0.8193840579710145, "grad_norm": 4.879020770616914, "learning_rate": 8.314797305820215e-08, "loss": 0.2935, "step": 4523 }, { "epoch": 0.8195652173913044, "grad_norm": 4.8002026579078025, "learning_rate": 8.298603291141576e-08, "loss": 0.2384, "step": 4524 }, { "epoch": 0.8197463768115942, "grad_norm": 5.626022687975708, "learning_rate": 8.282423634451413e-08, "loss": 0.297, "step": 4525 }, { "epoch": 0.8199275362318841, "grad_norm": 3.9870916742008027, "learning_rate": 8.266258341320454e-08, "loss": 0.2549, "step": 4526 }, { "epoch": 0.8201086956521739, "grad_norm": 4.516704214771563, "learning_rate": 8.250107417314483e-08, "loss": 0.2635, "step": 4527 }, { "epoch": 0.8202898550724638, "grad_norm": 4.036721398397296, "learning_rate": 8.23397086799435e-08, "loss": 0.2828, "step": 4528 }, { "epoch": 0.8204710144927536, "grad_norm": 4.146590494641051, "learning_rate": 8.21784869891593e-08, "loss": 0.2511, "step": 4529 }, { "epoch": 0.8206521739130435, "grad_norm": 4.186689228368776, "learning_rate": 8.201740915630168e-08, "loss": 0.2846, "step": 4530 }, { "epoch": 0.8208333333333333, "grad_norm": 6.510141393049845, "learning_rate": 8.185647523683059e-08, "loss": 0.269, "step": 4531 }, { "epoch": 0.8210144927536231, "grad_norm": 3.8618111825172337, "learning_rate": 8.169568528615645e-08, "loss": 0.28, "step": 4532 }, { "epoch": 0.821195652173913, "grad_norm": 4.9055214640138, "learning_rate": 8.153503935963969e-08, "loss": 0.2966, "step": 4533 }, { "epoch": 0.821376811594203, "grad_norm": 6.939194869769065, "learning_rate": 8.137453751259144e-08, "loss": 0.2386, "step": 4534 }, { "epoch": 0.8215579710144928, "grad_norm": 7.356681094644599, "learning_rate": 8.121417980027357e-08, "loss": 0.2611, "step": 4535 }, { "epoch": 0.8217391304347826, "grad_norm": 8.240959360183206, "learning_rate": 8.105396627789784e-08, "loss": 0.2979, "step": 4536 }, { "epoch": 0.8219202898550725, "grad_norm": 3.7187400594726188, "learning_rate": 8.089389700062655e-08, "loss": 0.2864, "step": 4537 }, { "epoch": 0.8221014492753623, "grad_norm": 3.5286679365410216, "learning_rate": 8.07339720235723e-08, "loss": 0.2773, "step": 4538 }, { "epoch": 0.8222826086956522, "grad_norm": 7.8396160310582195, "learning_rate": 8.057419140179794e-08, "loss": 0.3129, "step": 4539 }, { "epoch": 0.822463768115942, "grad_norm": 5.448547614255271, "learning_rate": 8.041455519031681e-08, "loss": 0.3088, "step": 4540 }, { "epoch": 0.8226449275362319, "grad_norm": 5.906614222823553, "learning_rate": 8.025506344409239e-08, "loss": 0.2906, "step": 4541 }, { "epoch": 0.8228260869565217, "grad_norm": 4.421292587900288, "learning_rate": 8.009571621803834e-08, "loss": 0.2994, "step": 4542 }, { "epoch": 0.8230072463768116, "grad_norm": 5.653466883853886, "learning_rate": 7.99365135670188e-08, "loss": 0.256, "step": 4543 }, { "epoch": 0.8231884057971014, "grad_norm": 4.033538522947047, "learning_rate": 7.977745554584792e-08, "loss": 0.223, "step": 4544 }, { "epoch": 0.8233695652173914, "grad_norm": 5.321304918514272, "learning_rate": 7.961854220929021e-08, "loss": 0.296, "step": 4545 }, { "epoch": 0.8235507246376812, "grad_norm": 3.392112387466615, "learning_rate": 7.945977361206002e-08, "loss": 0.2178, "step": 4546 }, { "epoch": 0.823731884057971, "grad_norm": 7.444897929950792, "learning_rate": 7.930114980882252e-08, "loss": 0.3275, "step": 4547 }, { "epoch": 0.8239130434782609, "grad_norm": 3.9537372422156927, "learning_rate": 7.914267085419252e-08, "loss": 0.254, "step": 4548 }, { "epoch": 0.8240942028985507, "grad_norm": 4.06748188882281, "learning_rate": 7.898433680273491e-08, "loss": 0.3094, "step": 4549 }, { "epoch": 0.8242753623188406, "grad_norm": 4.033843170165374, "learning_rate": 7.88261477089649e-08, "loss": 0.338, "step": 4550 }, { "epoch": 0.8244565217391304, "grad_norm": 3.7284617171729724, "learning_rate": 7.866810362734782e-08, "loss": 0.2702, "step": 4551 }, { "epoch": 0.8246376811594203, "grad_norm": 4.892225452540291, "learning_rate": 7.851020461229901e-08, "loss": 0.2605, "step": 4552 }, { "epoch": 0.8248188405797101, "grad_norm": 8.963708769452413, "learning_rate": 7.835245071818381e-08, "loss": 0.2347, "step": 4553 }, { "epoch": 0.825, "grad_norm": 3.762664293270453, "learning_rate": 7.819484199931764e-08, "loss": 0.2467, "step": 4554 }, { "epoch": 0.8251811594202898, "grad_norm": 4.478347655001307, "learning_rate": 7.803737850996595e-08, "loss": 0.3044, "step": 4555 }, { "epoch": 0.8253623188405798, "grad_norm": 4.657945259550352, "learning_rate": 7.788006030434413e-08, "loss": 0.2756, "step": 4556 }, { "epoch": 0.8255434782608696, "grad_norm": 5.996282107904425, "learning_rate": 7.772288743661743e-08, "loss": 0.3226, "step": 4557 }, { "epoch": 0.8257246376811594, "grad_norm": 5.08425677589519, "learning_rate": 7.756585996090153e-08, "loss": 0.2513, "step": 4558 }, { "epoch": 0.8259057971014493, "grad_norm": 4.074022136387155, "learning_rate": 7.740897793126172e-08, "loss": 0.303, "step": 4559 }, { "epoch": 0.8260869565217391, "grad_norm": 4.575120505710971, "learning_rate": 7.725224140171294e-08, "loss": 0.269, "step": 4560 }, { "epoch": 0.826268115942029, "grad_norm": 5.009299759414409, "learning_rate": 7.709565042622035e-08, "loss": 0.267, "step": 4561 }, { "epoch": 0.8264492753623188, "grad_norm": 8.676478870797638, "learning_rate": 7.693920505869916e-08, "loss": 0.2768, "step": 4562 }, { "epoch": 0.8266304347826087, "grad_norm": 3.7248881268132514, "learning_rate": 7.678290535301424e-08, "loss": 0.2294, "step": 4563 }, { "epoch": 0.8268115942028985, "grad_norm": 4.337018339811962, "learning_rate": 7.662675136298019e-08, "loss": 0.2517, "step": 4564 }, { "epoch": 0.8269927536231884, "grad_norm": 5.732066420693558, "learning_rate": 7.647074314236168e-08, "loss": 0.3009, "step": 4565 }, { "epoch": 0.8271739130434783, "grad_norm": 3.115914620329363, "learning_rate": 7.631488074487303e-08, "loss": 0.2008, "step": 4566 }, { "epoch": 0.8273550724637682, "grad_norm": 4.1053472125710195, "learning_rate": 7.615916422417835e-08, "loss": 0.2404, "step": 4567 }, { "epoch": 0.827536231884058, "grad_norm": 3.516506100886446, "learning_rate": 7.60035936338917e-08, "loss": 0.2543, "step": 4568 }, { "epoch": 0.8277173913043478, "grad_norm": 5.764692271925027, "learning_rate": 7.584816902757662e-08, "loss": 0.2623, "step": 4569 }, { "epoch": 0.8278985507246377, "grad_norm": 4.99289911434981, "learning_rate": 7.569289045874666e-08, "loss": 0.2876, "step": 4570 }, { "epoch": 0.8280797101449275, "grad_norm": 4.147310360008969, "learning_rate": 7.553775798086492e-08, "loss": 0.3152, "step": 4571 }, { "epoch": 0.8282608695652174, "grad_norm": 5.805804651292067, "learning_rate": 7.538277164734424e-08, "loss": 0.3445, "step": 4572 }, { "epoch": 0.8284420289855072, "grad_norm": 3.787622481562338, "learning_rate": 7.52279315115471e-08, "loss": 0.2699, "step": 4573 }, { "epoch": 0.8286231884057971, "grad_norm": 5.060317865165723, "learning_rate": 7.50732376267858e-08, "loss": 0.2759, "step": 4574 }, { "epoch": 0.8288043478260869, "grad_norm": 3.8006891740553925, "learning_rate": 7.491869004632228e-08, "loss": 0.2354, "step": 4575 }, { "epoch": 0.8289855072463768, "grad_norm": 6.383636251670721, "learning_rate": 7.476428882336771e-08, "loss": 0.3014, "step": 4576 }, { "epoch": 0.8291666666666667, "grad_norm": 4.504065411112429, "learning_rate": 7.461003401108324e-08, "loss": 0.3342, "step": 4577 }, { "epoch": 0.8293478260869566, "grad_norm": 7.821486454771593, "learning_rate": 7.445592566257969e-08, "loss": 0.3291, "step": 4578 }, { "epoch": 0.8295289855072464, "grad_norm": 6.540642944379602, "learning_rate": 7.430196383091719e-08, "loss": 0.2833, "step": 4579 }, { "epoch": 0.8297101449275363, "grad_norm": 4.801700366893205, "learning_rate": 7.414814856910556e-08, "loss": 0.2848, "step": 4580 }, { "epoch": 0.8298913043478261, "grad_norm": 8.770380136268663, "learning_rate": 7.39944799301041e-08, "loss": 0.2853, "step": 4581 }, { "epoch": 0.8300724637681159, "grad_norm": 4.488806416950078, "learning_rate": 7.384095796682167e-08, "loss": 0.2527, "step": 4582 }, { "epoch": 0.8302536231884058, "grad_norm": 4.8756132096210845, "learning_rate": 7.368758273211657e-08, "loss": 0.3311, "step": 4583 }, { "epoch": 0.8304347826086956, "grad_norm": 9.874067423348928, "learning_rate": 7.353435427879667e-08, "loss": 0.2852, "step": 4584 }, { "epoch": 0.8306159420289855, "grad_norm": 3.949042840801302, "learning_rate": 7.338127265961908e-08, "loss": 0.2869, "step": 4585 }, { "epoch": 0.8307971014492753, "grad_norm": 5.683093956796709, "learning_rate": 7.322833792729094e-08, "loss": 0.3038, "step": 4586 }, { "epoch": 0.8309782608695652, "grad_norm": 6.472196198784984, "learning_rate": 7.307555013446804e-08, "loss": 0.3348, "step": 4587 }, { "epoch": 0.8311594202898551, "grad_norm": 5.910602016287594, "learning_rate": 7.292290933375599e-08, "loss": 0.2895, "step": 4588 }, { "epoch": 0.831340579710145, "grad_norm": 8.730732847824768, "learning_rate": 7.277041557770968e-08, "loss": 0.2997, "step": 4589 }, { "epoch": 0.8315217391304348, "grad_norm": 5.319609501163069, "learning_rate": 7.261806891883366e-08, "loss": 0.2154, "step": 4590 }, { "epoch": 0.8317028985507247, "grad_norm": 3.8090097531533353, "learning_rate": 7.24658694095815e-08, "loss": 0.2503, "step": 4591 }, { "epoch": 0.8318840579710145, "grad_norm": 3.4929689277882083, "learning_rate": 7.231381710235624e-08, "loss": 0.2675, "step": 4592 }, { "epoch": 0.8320652173913043, "grad_norm": 4.505221488581019, "learning_rate": 7.216191204951011e-08, "loss": 0.3536, "step": 4593 }, { "epoch": 0.8322463768115942, "grad_norm": 6.8389613716428865, "learning_rate": 7.201015430334484e-08, "loss": 0.3765, "step": 4594 }, { "epoch": 0.832427536231884, "grad_norm": 7.644920094477968, "learning_rate": 7.18585439161113e-08, "loss": 0.3227, "step": 4595 }, { "epoch": 0.8326086956521739, "grad_norm": 7.017516603300654, "learning_rate": 7.17070809400096e-08, "loss": 0.2935, "step": 4596 }, { "epoch": 0.8327898550724637, "grad_norm": 3.9358900637729213, "learning_rate": 7.155576542718933e-08, "loss": 0.2968, "step": 4597 }, { "epoch": 0.8329710144927536, "grad_norm": 5.354095877087518, "learning_rate": 7.140459742974897e-08, "loss": 0.2567, "step": 4598 }, { "epoch": 0.8331521739130435, "grad_norm": 8.183165741916955, "learning_rate": 7.125357699973644e-08, "loss": 0.2357, "step": 4599 }, { "epoch": 0.8333333333333334, "grad_norm": 3.440220246863262, "learning_rate": 7.110270418914871e-08, "loss": 0.2675, "step": 4600 }, { "epoch": 0.8333333333333334, "eval_loss": 0.2700468897819519, "eval_runtime": 9.7193, "eval_samples_per_second": 51.444, "eval_steps_per_second": 0.103, "step": 4600 }, { "epoch": 0.8335144927536232, "grad_norm": 3.566197792932106, "learning_rate": 7.095197904993217e-08, "loss": 0.2826, "step": 4601 }, { "epoch": 0.8336956521739131, "grad_norm": 3.9282213710588105, "learning_rate": 7.080140163398224e-08, "loss": 0.2444, "step": 4602 }, { "epoch": 0.8338768115942029, "grad_norm": 4.1879570600795955, "learning_rate": 7.065097199314318e-08, "loss": 0.2952, "step": 4603 }, { "epoch": 0.8340579710144927, "grad_norm": 3.5402998322927712, "learning_rate": 7.050069017920867e-08, "loss": 0.2438, "step": 4604 }, { "epoch": 0.8342391304347826, "grad_norm": 4.80708840985529, "learning_rate": 7.035055624392166e-08, "loss": 0.3354, "step": 4605 }, { "epoch": 0.8344202898550724, "grad_norm": 4.252489328602519, "learning_rate": 7.020057023897385e-08, "loss": 0.3183, "step": 4606 }, { "epoch": 0.8346014492753623, "grad_norm": 4.024121953989593, "learning_rate": 7.005073221600616e-08, "loss": 0.3135, "step": 4607 }, { "epoch": 0.8347826086956521, "grad_norm": 9.099100334525772, "learning_rate": 6.990104222660848e-08, "loss": 0.3462, "step": 4608 }, { "epoch": 0.8349637681159421, "grad_norm": 4.463826660760659, "learning_rate": 6.975150032231986e-08, "loss": 0.3124, "step": 4609 }, { "epoch": 0.8351449275362319, "grad_norm": 4.364691720656996, "learning_rate": 6.960210655462817e-08, "loss": 0.2889, "step": 4610 }, { "epoch": 0.8353260869565218, "grad_norm": 11.89761114431437, "learning_rate": 6.94528609749705e-08, "loss": 0.34, "step": 4611 }, { "epoch": 0.8355072463768116, "grad_norm": 6.221055677129767, "learning_rate": 6.930376363473278e-08, "loss": 0.2609, "step": 4612 }, { "epoch": 0.8356884057971015, "grad_norm": 4.844396894405488, "learning_rate": 6.915481458524991e-08, "loss": 0.3019, "step": 4613 }, { "epoch": 0.8358695652173913, "grad_norm": 4.223033504683856, "learning_rate": 6.900601387780574e-08, "loss": 0.2987, "step": 4614 }, { "epoch": 0.8360507246376812, "grad_norm": 6.979081728781871, "learning_rate": 6.885736156363315e-08, "loss": 0.2936, "step": 4615 }, { "epoch": 0.836231884057971, "grad_norm": 5.105494733356094, "learning_rate": 6.87088576939136e-08, "loss": 0.3668, "step": 4616 }, { "epoch": 0.8364130434782608, "grad_norm": 4.168987919542068, "learning_rate": 6.856050231977795e-08, "loss": 0.2586, "step": 4617 }, { "epoch": 0.8365942028985507, "grad_norm": 5.130383147489327, "learning_rate": 6.84122954923056e-08, "loss": 0.287, "step": 4618 }, { "epoch": 0.8367753623188405, "grad_norm": 5.891940624600407, "learning_rate": 6.82642372625249e-08, "loss": 0.2816, "step": 4619 }, { "epoch": 0.8369565217391305, "grad_norm": 4.997358400898634, "learning_rate": 6.811632768141268e-08, "loss": 0.2674, "step": 4620 }, { "epoch": 0.8371376811594203, "grad_norm": 3.629416400595771, "learning_rate": 6.796856679989526e-08, "loss": 0.275, "step": 4621 }, { "epoch": 0.8373188405797102, "grad_norm": 4.240732717298964, "learning_rate": 6.782095466884735e-08, "loss": 0.2987, "step": 4622 }, { "epoch": 0.8375, "grad_norm": 4.893261770793716, "learning_rate": 6.767349133909244e-08, "loss": 0.2912, "step": 4623 }, { "epoch": 0.8376811594202899, "grad_norm": 4.127331232718756, "learning_rate": 6.752617686140283e-08, "loss": 0.3145, "step": 4624 }, { "epoch": 0.8378623188405797, "grad_norm": 3.641417688451911, "learning_rate": 6.737901128649964e-08, "loss": 0.2688, "step": 4625 }, { "epoch": 0.8380434782608696, "grad_norm": 4.664216618294672, "learning_rate": 6.723199466505269e-08, "loss": 0.2458, "step": 4626 }, { "epoch": 0.8382246376811594, "grad_norm": 5.015554223029727, "learning_rate": 6.708512704768043e-08, "loss": 0.3441, "step": 4627 }, { "epoch": 0.8384057971014492, "grad_norm": 4.105985011543899, "learning_rate": 6.693840848495008e-08, "loss": 0.2369, "step": 4628 }, { "epoch": 0.8385869565217391, "grad_norm": 3.625188684504257, "learning_rate": 6.679183902737772e-08, "loss": 0.2729, "step": 4629 }, { "epoch": 0.8387681159420289, "grad_norm": 3.3713080053839373, "learning_rate": 6.664541872542773e-08, "loss": 0.2684, "step": 4630 }, { "epoch": 0.8389492753623189, "grad_norm": 5.198538967945334, "learning_rate": 6.64991476295132e-08, "loss": 0.2855, "step": 4631 }, { "epoch": 0.8391304347826087, "grad_norm": 3.9928089077361526, "learning_rate": 6.635302578999625e-08, "loss": 0.2829, "step": 4632 }, { "epoch": 0.8393115942028986, "grad_norm": 5.963949663512671, "learning_rate": 6.620705325718711e-08, "loss": 0.2856, "step": 4633 }, { "epoch": 0.8394927536231884, "grad_norm": 4.4067839508621365, "learning_rate": 6.606123008134495e-08, "loss": 0.3211, "step": 4634 }, { "epoch": 0.8396739130434783, "grad_norm": 4.336194818451433, "learning_rate": 6.591555631267731e-08, "loss": 0.3331, "step": 4635 }, { "epoch": 0.8398550724637681, "grad_norm": 4.232302513571592, "learning_rate": 6.577003200134035e-08, "loss": 0.2977, "step": 4636 }, { "epoch": 0.840036231884058, "grad_norm": 5.002681776950905, "learning_rate": 6.562465719743882e-08, "loss": 0.3017, "step": 4637 }, { "epoch": 0.8402173913043478, "grad_norm": 9.41011178129296, "learning_rate": 6.54794319510259e-08, "loss": 0.3365, "step": 4638 }, { "epoch": 0.8403985507246376, "grad_norm": 4.729062019404165, "learning_rate": 6.533435631210337e-08, "loss": 0.3012, "step": 4639 }, { "epoch": 0.8405797101449275, "grad_norm": 4.294091482165877, "learning_rate": 6.518943033062152e-08, "loss": 0.2812, "step": 4640 }, { "epoch": 0.8407608695652173, "grad_norm": 4.722192168190877, "learning_rate": 6.504465405647891e-08, "loss": 0.2344, "step": 4641 }, { "epoch": 0.8409420289855073, "grad_norm": 3.7628341864837873, "learning_rate": 6.490002753952278e-08, "loss": 0.2495, "step": 4642 }, { "epoch": 0.8411231884057971, "grad_norm": 8.745142473889473, "learning_rate": 6.475555082954858e-08, "loss": 0.3187, "step": 4643 }, { "epoch": 0.841304347826087, "grad_norm": 6.64893913796071, "learning_rate": 6.461122397630059e-08, "loss": 0.2415, "step": 4644 }, { "epoch": 0.8414855072463768, "grad_norm": 4.266242376017027, "learning_rate": 6.446704702947126e-08, "loss": 0.2647, "step": 4645 }, { "epoch": 0.8416666666666667, "grad_norm": 8.999463129996617, "learning_rate": 6.432302003870105e-08, "loss": 0.2688, "step": 4646 }, { "epoch": 0.8418478260869565, "grad_norm": 3.860153262136091, "learning_rate": 6.417914305357924e-08, "loss": 0.2763, "step": 4647 }, { "epoch": 0.8420289855072464, "grad_norm": 4.7412639784227855, "learning_rate": 6.40354161236436e-08, "loss": 0.2893, "step": 4648 }, { "epoch": 0.8422101449275362, "grad_norm": 4.367860154244886, "learning_rate": 6.389183929837977e-08, "loss": 0.3115, "step": 4649 }, { "epoch": 0.842391304347826, "grad_norm": 5.3354416006066225, "learning_rate": 6.374841262722203e-08, "loss": 0.3115, "step": 4650 }, { "epoch": 0.8425724637681159, "grad_norm": 4.6763182073943135, "learning_rate": 6.360513615955276e-08, "loss": 0.3606, "step": 4651 }, { "epoch": 0.8427536231884057, "grad_norm": 4.463426545162006, "learning_rate": 6.346200994470286e-08, "loss": 0.2901, "step": 4652 }, { "epoch": 0.8429347826086957, "grad_norm": 3.829588436633688, "learning_rate": 6.331903403195126e-08, "loss": 0.2979, "step": 4653 }, { "epoch": 0.8431159420289855, "grad_norm": 6.322096952904681, "learning_rate": 6.317620847052529e-08, "loss": 0.3029, "step": 4654 }, { "epoch": 0.8432971014492754, "grad_norm": 6.606225194438366, "learning_rate": 6.303353330960037e-08, "loss": 0.3021, "step": 4655 }, { "epoch": 0.8434782608695652, "grad_norm": 5.308279302902277, "learning_rate": 6.289100859830054e-08, "loss": 0.2612, "step": 4656 }, { "epoch": 0.8436594202898551, "grad_norm": 6.929234085370544, "learning_rate": 6.274863438569739e-08, "loss": 0.2616, "step": 4657 }, { "epoch": 0.8438405797101449, "grad_norm": 4.388890278145771, "learning_rate": 6.260641072081113e-08, "loss": 0.2467, "step": 4658 }, { "epoch": 0.8440217391304348, "grad_norm": 3.872824440031464, "learning_rate": 6.246433765261e-08, "loss": 0.2393, "step": 4659 }, { "epoch": 0.8442028985507246, "grad_norm": 3.370403125573209, "learning_rate": 6.232241523001064e-08, "loss": 0.2651, "step": 4660 }, { "epoch": 0.8443840579710145, "grad_norm": 6.484148287846074, "learning_rate": 6.21806435018774e-08, "loss": 0.2365, "step": 4661 }, { "epoch": 0.8445652173913043, "grad_norm": 4.697103225670785, "learning_rate": 6.20390225170232e-08, "loss": 0.2861, "step": 4662 }, { "epoch": 0.8447463768115943, "grad_norm": 4.712302106970035, "learning_rate": 6.189755232420846e-08, "loss": 0.2513, "step": 4663 }, { "epoch": 0.8449275362318841, "grad_norm": 6.829499641931987, "learning_rate": 6.175623297214228e-08, "loss": 0.2568, "step": 4664 }, { "epoch": 0.845108695652174, "grad_norm": 7.255597424418193, "learning_rate": 6.161506450948156e-08, "loss": 0.2821, "step": 4665 }, { "epoch": 0.8452898550724638, "grad_norm": 5.826200964460358, "learning_rate": 6.147404698483121e-08, "loss": 0.3091, "step": 4666 }, { "epoch": 0.8454710144927536, "grad_norm": 6.528620863404902, "learning_rate": 6.133318044674429e-08, "loss": 0.2889, "step": 4667 }, { "epoch": 0.8456521739130435, "grad_norm": 3.6383405578413472, "learning_rate": 6.119246494372177e-08, "loss": 0.2276, "step": 4668 }, { "epoch": 0.8458333333333333, "grad_norm": 7.1429190119245884, "learning_rate": 6.105190052421267e-08, "loss": 0.2999, "step": 4669 }, { "epoch": 0.8460144927536232, "grad_norm": 5.0583545694082925, "learning_rate": 6.091148723661393e-08, "loss": 0.2669, "step": 4670 }, { "epoch": 0.846195652173913, "grad_norm": 3.4760269165341042, "learning_rate": 6.077122512927069e-08, "loss": 0.2548, "step": 4671 }, { "epoch": 0.8463768115942029, "grad_norm": 7.465285976872562, "learning_rate": 6.06311142504758e-08, "loss": 0.2528, "step": 4672 }, { "epoch": 0.8465579710144927, "grad_norm": 4.009623575510082, "learning_rate": 6.049115464846999e-08, "loss": 0.2544, "step": 4673 }, { "epoch": 0.8467391304347827, "grad_norm": 5.362324956894773, "learning_rate": 6.035134637144195e-08, "loss": 0.2807, "step": 4674 }, { "epoch": 0.8469202898550725, "grad_norm": 4.733962155504348, "learning_rate": 6.021168946752852e-08, "loss": 0.237, "step": 4675 }, { "epoch": 0.8471014492753624, "grad_norm": 5.8951886543697585, "learning_rate": 6.007218398481422e-08, "loss": 0.2189, "step": 4676 }, { "epoch": 0.8472826086956522, "grad_norm": 3.6510152757623606, "learning_rate": 5.993282997133142e-08, "loss": 0.2348, "step": 4677 }, { "epoch": 0.847463768115942, "grad_norm": 8.862483351831589, "learning_rate": 5.979362747506028e-08, "loss": 0.2478, "step": 4678 }, { "epoch": 0.8476449275362319, "grad_norm": 9.531432168149545, "learning_rate": 5.965457654392897e-08, "loss": 0.3038, "step": 4679 }, { "epoch": 0.8478260869565217, "grad_norm": 4.7306617601216345, "learning_rate": 5.9515677225813354e-08, "loss": 0.3643, "step": 4680 }, { "epoch": 0.8480072463768116, "grad_norm": 9.342600104866074, "learning_rate": 5.937692956853713e-08, "loss": 0.2897, "step": 4681 }, { "epoch": 0.8481884057971014, "grad_norm": 5.807126660613538, "learning_rate": 5.923833361987168e-08, "loss": 0.2376, "step": 4682 }, { "epoch": 0.8483695652173913, "grad_norm": 4.38295916468315, "learning_rate": 5.909988942753652e-08, "loss": 0.2624, "step": 4683 }, { "epoch": 0.8485507246376811, "grad_norm": 5.323071904588808, "learning_rate": 5.896159703919834e-08, "loss": 0.2485, "step": 4684 }, { "epoch": 0.8487318840579711, "grad_norm": 4.31720216281445, "learning_rate": 5.8823456502472004e-08, "loss": 0.2649, "step": 4685 }, { "epoch": 0.8489130434782609, "grad_norm": 5.184987202632363, "learning_rate": 5.868546786491985e-08, "loss": 0.269, "step": 4686 }, { "epoch": 0.8490942028985508, "grad_norm": 4.8085882817430265, "learning_rate": 5.85476311740522e-08, "loss": 0.2983, "step": 4687 }, { "epoch": 0.8492753623188406, "grad_norm": 5.925885020558899, "learning_rate": 5.8409946477326724e-08, "loss": 0.2866, "step": 4688 }, { "epoch": 0.8494565217391304, "grad_norm": 4.639274597320364, "learning_rate": 5.827241382214915e-08, "loss": 0.2687, "step": 4689 }, { "epoch": 0.8496376811594203, "grad_norm": 5.653284157072135, "learning_rate": 5.8135033255872214e-08, "loss": 0.2754, "step": 4690 }, { "epoch": 0.8498188405797101, "grad_norm": 4.995066341662183, "learning_rate": 5.799780482579703e-08, "loss": 0.271, "step": 4691 }, { "epoch": 0.85, "grad_norm": 6.596674435450166, "learning_rate": 5.7860728579171904e-08, "loss": 0.29, "step": 4692 }, { "epoch": 0.8501811594202898, "grad_norm": 6.288817020917897, "learning_rate": 5.7723804563192814e-08, "loss": 0.265, "step": 4693 }, { "epoch": 0.8503623188405797, "grad_norm": 6.463946918667353, "learning_rate": 5.758703282500332e-08, "loss": 0.3008, "step": 4694 }, { "epoch": 0.8505434782608695, "grad_norm": 11.56470820524333, "learning_rate": 5.745041341169471e-08, "loss": 0.2816, "step": 4695 }, { "epoch": 0.8507246376811595, "grad_norm": 6.651580681379385, "learning_rate": 5.731394637030551e-08, "loss": 0.2881, "step": 4696 }, { "epoch": 0.8509057971014493, "grad_norm": 4.186837179879975, "learning_rate": 5.7177631747822144e-08, "loss": 0.2469, "step": 4697 }, { "epoch": 0.8510869565217392, "grad_norm": 4.385237409060419, "learning_rate": 5.704146959117817e-08, "loss": 0.2703, "step": 4698 }, { "epoch": 0.851268115942029, "grad_norm": 4.485860860883113, "learning_rate": 5.6905459947255206e-08, "loss": 0.2814, "step": 4699 }, { "epoch": 0.8514492753623188, "grad_norm": 9.119240206569785, "learning_rate": 5.6769602862881685e-08, "loss": 0.3212, "step": 4700 }, { "epoch": 0.8514492753623188, "eval_loss": 0.2677968740463257, "eval_runtime": 9.8105, "eval_samples_per_second": 50.966, "eval_steps_per_second": 0.102, "step": 4700 }, { "epoch": 0.8516304347826087, "grad_norm": 4.111293348208835, "learning_rate": 5.66338983848339e-08, "loss": 0.3243, "step": 4701 }, { "epoch": 0.8518115942028985, "grad_norm": 5.833876041662731, "learning_rate": 5.6498346559835705e-08, "loss": 0.3138, "step": 4702 }, { "epoch": 0.8519927536231884, "grad_norm": 6.790067269751507, "learning_rate": 5.636294743455816e-08, "loss": 0.2774, "step": 4703 }, { "epoch": 0.8521739130434782, "grad_norm": 7.446225616700585, "learning_rate": 5.622770105561975e-08, "loss": 0.2301, "step": 4704 }, { "epoch": 0.8523550724637681, "grad_norm": 4.417469034169004, "learning_rate": 5.609260746958655e-08, "loss": 0.3073, "step": 4705 }, { "epoch": 0.8525362318840579, "grad_norm": 3.7395050534082213, "learning_rate": 5.5957666722971866e-08, "loss": 0.2215, "step": 4706 }, { "epoch": 0.8527173913043479, "grad_norm": 3.3868280170156653, "learning_rate": 5.5822878862236465e-08, "loss": 0.2477, "step": 4707 }, { "epoch": 0.8528985507246377, "grad_norm": 4.512889420209454, "learning_rate": 5.56882439337884e-08, "loss": 0.3072, "step": 4708 }, { "epoch": 0.8530797101449276, "grad_norm": 4.724107102209065, "learning_rate": 5.555376198398315e-08, "loss": 0.2813, "step": 4709 }, { "epoch": 0.8532608695652174, "grad_norm": 4.12087526145894, "learning_rate": 5.54194330591235e-08, "loss": 0.2514, "step": 4710 }, { "epoch": 0.8534420289855073, "grad_norm": 5.13964821194713, "learning_rate": 5.5285257205459515e-08, "loss": 0.2763, "step": 4711 }, { "epoch": 0.8536231884057971, "grad_norm": 4.970108016520217, "learning_rate": 5.5151234469188514e-08, "loss": 0.2092, "step": 4712 }, { "epoch": 0.8538043478260869, "grad_norm": 4.262464901136843, "learning_rate": 5.50173648964552e-08, "loss": 0.2497, "step": 4713 }, { "epoch": 0.8539855072463768, "grad_norm": 4.618533712913622, "learning_rate": 5.488364853335159e-08, "loss": 0.2919, "step": 4714 }, { "epoch": 0.8541666666666666, "grad_norm": 3.845424524180444, "learning_rate": 5.475008542591686e-08, "loss": 0.2914, "step": 4715 }, { "epoch": 0.8543478260869565, "grad_norm": 4.469559295410864, "learning_rate": 5.461667562013733e-08, "loss": 0.2626, "step": 4716 }, { "epoch": 0.8545289855072464, "grad_norm": 9.738853515626896, "learning_rate": 5.448341916194649e-08, "loss": 0.2697, "step": 4717 }, { "epoch": 0.8547101449275363, "grad_norm": 3.244657519826543, "learning_rate": 5.4350316097225454e-08, "loss": 0.1809, "step": 4718 }, { "epoch": 0.8548913043478261, "grad_norm": 6.693330699637032, "learning_rate": 5.421736647180214e-08, "loss": 0.2318, "step": 4719 }, { "epoch": 0.855072463768116, "grad_norm": 4.211542645489069, "learning_rate": 5.4084570331451694e-08, "loss": 0.2398, "step": 4720 }, { "epoch": 0.8552536231884058, "grad_norm": 3.6129928871791104, "learning_rate": 5.3951927721896494e-08, "loss": 0.2245, "step": 4721 }, { "epoch": 0.8554347826086957, "grad_norm": 4.787883134533534, "learning_rate": 5.381943868880595e-08, "loss": 0.2961, "step": 4722 }, { "epoch": 0.8556159420289855, "grad_norm": 3.986756509430605, "learning_rate": 5.3687103277796796e-08, "loss": 0.2698, "step": 4723 }, { "epoch": 0.8557971014492753, "grad_norm": 5.345254776169634, "learning_rate": 5.355492153443258e-08, "loss": 0.2896, "step": 4724 }, { "epoch": 0.8559782608695652, "grad_norm": 5.112378029932426, "learning_rate": 5.342289350422413e-08, "loss": 0.2886, "step": 4725 }, { "epoch": 0.856159420289855, "grad_norm": 3.98823730267497, "learning_rate": 5.329101923262952e-08, "loss": 0.2609, "step": 4726 }, { "epoch": 0.8563405797101449, "grad_norm": 4.692040697273998, "learning_rate": 5.315929876505348e-08, "loss": 0.3154, "step": 4727 }, { "epoch": 0.8565217391304348, "grad_norm": 7.547529256722531, "learning_rate": 5.302773214684803e-08, "loss": 0.2705, "step": 4728 }, { "epoch": 0.8567028985507247, "grad_norm": 5.106064010638053, "learning_rate": 5.289631942331213e-08, "loss": 0.2718, "step": 4729 }, { "epoch": 0.8568840579710145, "grad_norm": 7.947185915904829, "learning_rate": 5.2765060639691937e-08, "loss": 0.2701, "step": 4730 }, { "epoch": 0.8570652173913044, "grad_norm": 5.49692550338986, "learning_rate": 5.263395584118047e-08, "loss": 0.3084, "step": 4731 }, { "epoch": 0.8572463768115942, "grad_norm": 4.050460746928886, "learning_rate": 5.2503005072917713e-08, "loss": 0.226, "step": 4732 }, { "epoch": 0.8574275362318841, "grad_norm": 9.278539825170677, "learning_rate": 5.23722083799904e-08, "loss": 0.2383, "step": 4733 }, { "epoch": 0.8576086956521739, "grad_norm": 4.928670358788241, "learning_rate": 5.2241565807432776e-08, "loss": 0.2817, "step": 4734 }, { "epoch": 0.8577898550724637, "grad_norm": 3.5898406792671156, "learning_rate": 5.211107740022558e-08, "loss": 0.2701, "step": 4735 }, { "epoch": 0.8579710144927536, "grad_norm": 4.975485937192667, "learning_rate": 5.198074320329654e-08, "loss": 0.2805, "step": 4736 }, { "epoch": 0.8581521739130434, "grad_norm": 4.43262879083396, "learning_rate": 5.185056326152037e-08, "loss": 0.2542, "step": 4737 }, { "epoch": 0.8583333333333333, "grad_norm": 8.369412908749364, "learning_rate": 5.172053761971867e-08, "loss": 0.3017, "step": 4738 }, { "epoch": 0.8585144927536232, "grad_norm": 7.415489230243173, "learning_rate": 5.1590666322659845e-08, "loss": 0.2787, "step": 4739 }, { "epoch": 0.8586956521739131, "grad_norm": 6.9000582139541375, "learning_rate": 5.146094941505913e-08, "loss": 0.2208, "step": 4740 }, { "epoch": 0.8588768115942029, "grad_norm": 6.186041763846137, "learning_rate": 5.1331386941578846e-08, "loss": 0.247, "step": 4741 }, { "epoch": 0.8590579710144928, "grad_norm": 3.924576031889005, "learning_rate": 5.120197894682793e-08, "loss": 0.2699, "step": 4742 }, { "epoch": 0.8592391304347826, "grad_norm": 7.8848981890728105, "learning_rate": 5.107272547536207e-08, "loss": 0.2734, "step": 4743 }, { "epoch": 0.8594202898550725, "grad_norm": 6.519843239454956, "learning_rate": 5.0943626571683774e-08, "loss": 0.2563, "step": 4744 }, { "epoch": 0.8596014492753623, "grad_norm": 4.9993217366247364, "learning_rate": 5.0814682280242604e-08, "loss": 0.3061, "step": 4745 }, { "epoch": 0.8597826086956522, "grad_norm": 6.383583032520059, "learning_rate": 5.068589264543466e-08, "loss": 0.2708, "step": 4746 }, { "epoch": 0.859963768115942, "grad_norm": 5.9074673356650145, "learning_rate": 5.055725771160274e-08, "loss": 0.2956, "step": 4747 }, { "epoch": 0.8601449275362318, "grad_norm": 3.9232002430942012, "learning_rate": 5.042877752303648e-08, "loss": 0.3127, "step": 4748 }, { "epoch": 0.8603260869565217, "grad_norm": 5.816423456020595, "learning_rate": 5.030045212397227e-08, "loss": 0.2979, "step": 4749 }, { "epoch": 0.8605072463768116, "grad_norm": 5.964388887545941, "learning_rate": 5.0172281558593044e-08, "loss": 0.2415, "step": 4750 }, { "epoch": 0.8606884057971015, "grad_norm": 8.207513057690512, "learning_rate": 5.0044265871028666e-08, "loss": 0.2746, "step": 4751 }, { "epoch": 0.8608695652173913, "grad_norm": 4.4646718280336755, "learning_rate": 4.991640510535538e-08, "loss": 0.2393, "step": 4752 }, { "epoch": 0.8610507246376812, "grad_norm": 4.934126995471067, "learning_rate": 4.978869930559654e-08, "loss": 0.2518, "step": 4753 }, { "epoch": 0.861231884057971, "grad_norm": 4.75033009967884, "learning_rate": 4.966114851572156e-08, "loss": 0.2834, "step": 4754 }, { "epoch": 0.8614130434782609, "grad_norm": 5.824520280458007, "learning_rate": 4.953375277964694e-08, "loss": 0.3035, "step": 4755 }, { "epoch": 0.8615942028985507, "grad_norm": 4.2230720683193, "learning_rate": 4.940651214123548e-08, "loss": 0.2676, "step": 4756 }, { "epoch": 0.8617753623188406, "grad_norm": 4.356967722732345, "learning_rate": 4.9279426644296906e-08, "loss": 0.2866, "step": 4757 }, { "epoch": 0.8619565217391304, "grad_norm": 10.629175468892866, "learning_rate": 4.9152496332587336e-08, "loss": 0.3048, "step": 4758 }, { "epoch": 0.8621376811594202, "grad_norm": 3.7226318177999107, "learning_rate": 4.902572124980952e-08, "loss": 0.2728, "step": 4759 }, { "epoch": 0.8623188405797102, "grad_norm": 5.177982449770165, "learning_rate": 4.889910143961246e-08, "loss": 0.2422, "step": 4760 }, { "epoch": 0.8625, "grad_norm": 4.786627664604113, "learning_rate": 4.877263694559225e-08, "loss": 0.2783, "step": 4761 }, { "epoch": 0.8626811594202899, "grad_norm": 6.884433016024381, "learning_rate": 4.8646327811291074e-08, "loss": 0.2831, "step": 4762 }, { "epoch": 0.8628623188405797, "grad_norm": 5.554165733102009, "learning_rate": 4.8520174080197764e-08, "loss": 0.3451, "step": 4763 }, { "epoch": 0.8630434782608696, "grad_norm": 4.168758882239681, "learning_rate": 4.8394175795747725e-08, "loss": 0.2355, "step": 4764 }, { "epoch": 0.8632246376811594, "grad_norm": 5.941932261378795, "learning_rate": 4.826833300132266e-08, "loss": 0.2834, "step": 4765 }, { "epoch": 0.8634057971014493, "grad_norm": 5.599059540455458, "learning_rate": 4.814264574025095e-08, "loss": 0.2906, "step": 4766 }, { "epoch": 0.8635869565217391, "grad_norm": 4.992164502682655, "learning_rate": 4.801711405580722e-08, "loss": 0.3453, "step": 4767 }, { "epoch": 0.863768115942029, "grad_norm": 4.19583737550248, "learning_rate": 4.78917379912126e-08, "loss": 0.2665, "step": 4768 }, { "epoch": 0.8639492753623188, "grad_norm": 3.5818416605333, "learning_rate": 4.776651758963485e-08, "loss": 0.305, "step": 4769 }, { "epoch": 0.8641304347826086, "grad_norm": 5.740539453117691, "learning_rate": 4.764145289418775e-08, "loss": 0.2852, "step": 4770 }, { "epoch": 0.8643115942028986, "grad_norm": 7.302862362273101, "learning_rate": 4.7516543947931695e-08, "loss": 0.266, "step": 4771 }, { "epoch": 0.8644927536231884, "grad_norm": 8.665570781591065, "learning_rate": 4.739179079387351e-08, "loss": 0.2543, "step": 4772 }, { "epoch": 0.8646739130434783, "grad_norm": 6.737153509525896, "learning_rate": 4.726719347496627e-08, "loss": 0.324, "step": 4773 }, { "epoch": 0.8648550724637681, "grad_norm": 4.011633541562273, "learning_rate": 4.714275203410939e-08, "loss": 0.2929, "step": 4774 }, { "epoch": 0.865036231884058, "grad_norm": 5.739907818725988, "learning_rate": 4.70184665141487e-08, "loss": 0.2704, "step": 4775 }, { "epoch": 0.8652173913043478, "grad_norm": 7.08178737126034, "learning_rate": 4.689433695787626e-08, "loss": 0.3541, "step": 4776 }, { "epoch": 0.8653985507246377, "grad_norm": 4.893928876851549, "learning_rate": 4.6770363408030476e-08, "loss": 0.2469, "step": 4777 }, { "epoch": 0.8655797101449275, "grad_norm": 3.469906626602484, "learning_rate": 4.6646545907296e-08, "loss": 0.2394, "step": 4778 }, { "epoch": 0.8657608695652174, "grad_norm": 5.023344079291013, "learning_rate": 4.6522884498303774e-08, "loss": 0.2817, "step": 4779 }, { "epoch": 0.8659420289855072, "grad_norm": 7.366778179396263, "learning_rate": 4.6399379223631074e-08, "loss": 0.276, "step": 4780 }, { "epoch": 0.866123188405797, "grad_norm": 4.6909101921856875, "learning_rate": 4.627603012580128e-08, "loss": 0.2822, "step": 4781 }, { "epoch": 0.866304347826087, "grad_norm": 3.6207284062663496, "learning_rate": 4.6152837247284136e-08, "loss": 0.2505, "step": 4782 }, { "epoch": 0.8664855072463769, "grad_norm": 3.641406937992566, "learning_rate": 4.6029800630495385e-08, "loss": 0.2873, "step": 4783 }, { "epoch": 0.8666666666666667, "grad_norm": 5.3715093702827765, "learning_rate": 4.59069203177973e-08, "loss": 0.2204, "step": 4784 }, { "epoch": 0.8668478260869565, "grad_norm": 4.068329719988691, "learning_rate": 4.5784196351498216e-08, "loss": 0.2654, "step": 4785 }, { "epoch": 0.8670289855072464, "grad_norm": 6.627004008280092, "learning_rate": 4.566162877385238e-08, "loss": 0.3206, "step": 4786 }, { "epoch": 0.8672101449275362, "grad_norm": 9.649125062706997, "learning_rate": 4.553921762706031e-08, "loss": 0.2735, "step": 4787 }, { "epoch": 0.8673913043478261, "grad_norm": 4.496906073500851, "learning_rate": 4.541696295326902e-08, "loss": 0.2132, "step": 4788 }, { "epoch": 0.8675724637681159, "grad_norm": 6.412039995766007, "learning_rate": 4.529486479457123e-08, "loss": 0.2972, "step": 4789 }, { "epoch": 0.8677536231884058, "grad_norm": 6.992352001193379, "learning_rate": 4.5172923193005975e-08, "loss": 0.2576, "step": 4790 }, { "epoch": 0.8679347826086956, "grad_norm": 3.645832732654844, "learning_rate": 4.505113819055822e-08, "loss": 0.2805, "step": 4791 }, { "epoch": 0.8681159420289855, "grad_norm": 5.947747610274793, "learning_rate": 4.492950982915922e-08, "loss": 0.2874, "step": 4792 }, { "epoch": 0.8682971014492754, "grad_norm": 5.180417435140503, "learning_rate": 4.480803815068612e-08, "loss": 0.2882, "step": 4793 }, { "epoch": 0.8684782608695653, "grad_norm": 6.196152999866419, "learning_rate": 4.468672319696221e-08, "loss": 0.3124, "step": 4794 }, { "epoch": 0.8686594202898551, "grad_norm": 5.241817376967998, "learning_rate": 4.456556500975678e-08, "loss": 0.2569, "step": 4795 }, { "epoch": 0.868840579710145, "grad_norm": 5.969026236850944, "learning_rate": 4.444456363078536e-08, "loss": 0.3378, "step": 4796 }, { "epoch": 0.8690217391304348, "grad_norm": 8.659986363062528, "learning_rate": 4.432371910170907e-08, "loss": 0.2789, "step": 4797 }, { "epoch": 0.8692028985507246, "grad_norm": 4.458566167266721, "learning_rate": 4.4203031464135345e-08, "loss": 0.2671, "step": 4798 }, { "epoch": 0.8693840579710145, "grad_norm": 4.721670555075104, "learning_rate": 4.408250075961739e-08, "loss": 0.2736, "step": 4799 }, { "epoch": 0.8695652173913043, "grad_norm": 3.93480745081534, "learning_rate": 4.396212702965474e-08, "loss": 0.3227, "step": 4800 }, { "epoch": 0.8695652173913043, "eval_loss": 0.26845312118530273, "eval_runtime": 9.7816, "eval_samples_per_second": 51.116, "eval_steps_per_second": 0.102, "step": 4800 }, { "epoch": 0.8697463768115942, "grad_norm": 5.449914182625196, "learning_rate": 4.3841910315692456e-08, "loss": 0.218, "step": 4801 }, { "epoch": 0.869927536231884, "grad_norm": 4.445718265230896, "learning_rate": 4.3721850659121875e-08, "loss": 0.3426, "step": 4802 }, { "epoch": 0.8701086956521739, "grad_norm": 4.329150571611649, "learning_rate": 4.3601948101279814e-08, "loss": 0.2693, "step": 4803 }, { "epoch": 0.8702898550724638, "grad_norm": 3.545951424301417, "learning_rate": 4.34822026834496e-08, "loss": 0.2513, "step": 4804 }, { "epoch": 0.8704710144927537, "grad_norm": 3.484189392788549, "learning_rate": 4.336261444686001e-08, "loss": 0.2346, "step": 4805 }, { "epoch": 0.8706521739130435, "grad_norm": 6.097123904837969, "learning_rate": 4.324318343268585e-08, "loss": 0.3472, "step": 4806 }, { "epoch": 0.8708333333333333, "grad_norm": 4.026122015491682, "learning_rate": 4.312390968204782e-08, "loss": 0.2438, "step": 4807 }, { "epoch": 0.8710144927536232, "grad_norm": 4.471671504660214, "learning_rate": 4.3004793236012414e-08, "loss": 0.2803, "step": 4808 }, { "epoch": 0.871195652173913, "grad_norm": 5.533011614134167, "learning_rate": 4.2885834135591933e-08, "loss": 0.3653, "step": 4809 }, { "epoch": 0.8713768115942029, "grad_norm": 5.921194508936267, "learning_rate": 4.2767032421744565e-08, "loss": 0.2462, "step": 4810 }, { "epoch": 0.8715579710144927, "grad_norm": 7.571181168152904, "learning_rate": 4.264838813537447e-08, "loss": 0.2946, "step": 4811 }, { "epoch": 0.8717391304347826, "grad_norm": 6.735517852130179, "learning_rate": 4.252990131733136e-08, "loss": 0.2751, "step": 4812 }, { "epoch": 0.8719202898550724, "grad_norm": 6.10618697706934, "learning_rate": 4.2411572008410665e-08, "loss": 0.2744, "step": 4813 }, { "epoch": 0.8721014492753624, "grad_norm": 5.375193949200026, "learning_rate": 4.2293400249353737e-08, "loss": 0.2868, "step": 4814 }, { "epoch": 0.8722826086956522, "grad_norm": 4.562098702692266, "learning_rate": 4.217538608084792e-08, "loss": 0.3376, "step": 4815 }, { "epoch": 0.8724637681159421, "grad_norm": 4.0965578662631446, "learning_rate": 4.205752954352581e-08, "loss": 0.2745, "step": 4816 }, { "epoch": 0.8726449275362319, "grad_norm": 11.553300530492404, "learning_rate": 4.193983067796608e-08, "loss": 0.2716, "step": 4817 }, { "epoch": 0.8728260869565218, "grad_norm": 4.9190342539145195, "learning_rate": 4.182228952469302e-08, "loss": 0.3053, "step": 4818 }, { "epoch": 0.8730072463768116, "grad_norm": 7.173060887853867, "learning_rate": 4.1704906124176544e-08, "loss": 0.2352, "step": 4819 }, { "epoch": 0.8731884057971014, "grad_norm": 3.4336738510071534, "learning_rate": 4.158768051683231e-08, "loss": 0.2391, "step": 4820 }, { "epoch": 0.8733695652173913, "grad_norm": 5.013687030619245, "learning_rate": 4.147061274302172e-08, "loss": 0.2604, "step": 4821 }, { "epoch": 0.8735507246376811, "grad_norm": 4.967742778617009, "learning_rate": 4.135370284305162e-08, "loss": 0.2849, "step": 4822 }, { "epoch": 0.873731884057971, "grad_norm": 3.87426816123235, "learning_rate": 4.123695085717488e-08, "loss": 0.2939, "step": 4823 }, { "epoch": 0.8739130434782608, "grad_norm": 4.443053351641189, "learning_rate": 4.112035682558956e-08, "loss": 0.2355, "step": 4824 }, { "epoch": 0.8740942028985508, "grad_norm": 8.136016356071698, "learning_rate": 4.100392078843962e-08, "loss": 0.3054, "step": 4825 }, { "epoch": 0.8742753623188406, "grad_norm": 5.446284062484525, "learning_rate": 4.0887642785814416e-08, "loss": 0.3049, "step": 4826 }, { "epoch": 0.8744565217391305, "grad_norm": 3.972469992526076, "learning_rate": 4.077152285774921e-08, "loss": 0.2898, "step": 4827 }, { "epoch": 0.8746376811594203, "grad_norm": 3.8042995471976133, "learning_rate": 4.0655561044224575e-08, "loss": 0.2824, "step": 4828 }, { "epoch": 0.8748188405797102, "grad_norm": 4.462456674296698, "learning_rate": 4.053975738516674e-08, "loss": 0.3095, "step": 4829 }, { "epoch": 0.875, "grad_norm": 3.6003349523794435, "learning_rate": 4.0424111920447256e-08, "loss": 0.2736, "step": 4830 }, { "epoch": 0.8751811594202898, "grad_norm": 5.293492608681053, "learning_rate": 4.0308624689883665e-08, "loss": 0.3599, "step": 4831 }, { "epoch": 0.8753623188405797, "grad_norm": 4.559790756344651, "learning_rate": 4.0193295733238597e-08, "loss": 0.3252, "step": 4832 }, { "epoch": 0.8755434782608695, "grad_norm": 5.825058697053931, "learning_rate": 4.007812509022046e-08, "loss": 0.3034, "step": 4833 }, { "epoch": 0.8757246376811594, "grad_norm": 7.078450517220368, "learning_rate": 3.9963112800483035e-08, "loss": 0.3009, "step": 4834 }, { "epoch": 0.8759057971014492, "grad_norm": 3.5727324426677285, "learning_rate": 3.984825890362553e-08, "loss": 0.209, "step": 4835 }, { "epoch": 0.8760869565217392, "grad_norm": 5.363655439938031, "learning_rate": 3.973356343919271e-08, "loss": 0.3563, "step": 4836 }, { "epoch": 0.876268115942029, "grad_norm": 3.872962487747958, "learning_rate": 3.9619026446674865e-08, "loss": 0.2441, "step": 4837 }, { "epoch": 0.8764492753623189, "grad_norm": 3.6838392317210786, "learning_rate": 3.95046479655074e-08, "loss": 0.2673, "step": 4838 }, { "epoch": 0.8766304347826087, "grad_norm": 8.453286777806182, "learning_rate": 3.93904280350717e-08, "loss": 0.2264, "step": 4839 }, { "epoch": 0.8768115942028986, "grad_norm": 5.844697872745354, "learning_rate": 3.927636669469392e-08, "loss": 0.2577, "step": 4840 }, { "epoch": 0.8769927536231884, "grad_norm": 3.6777999015594083, "learning_rate": 3.916246398364609e-08, "loss": 0.2621, "step": 4841 }, { "epoch": 0.8771739130434782, "grad_norm": 6.001265432192033, "learning_rate": 3.9048719941145283e-08, "loss": 0.309, "step": 4842 }, { "epoch": 0.8773550724637681, "grad_norm": 6.462604414134679, "learning_rate": 3.893513460635434e-08, "loss": 0.3062, "step": 4843 }, { "epoch": 0.8775362318840579, "grad_norm": 3.4540402710718636, "learning_rate": 3.882170801838114e-08, "loss": 0.2057, "step": 4844 }, { "epoch": 0.8777173913043478, "grad_norm": 4.842282418840915, "learning_rate": 3.870844021627895e-08, "loss": 0.2336, "step": 4845 }, { "epoch": 0.8778985507246376, "grad_norm": 3.7689894636285373, "learning_rate": 3.85953312390464e-08, "loss": 0.2488, "step": 4846 }, { "epoch": 0.8780797101449276, "grad_norm": 7.61909799344492, "learning_rate": 3.848238112562757e-08, "loss": 0.3136, "step": 4847 }, { "epoch": 0.8782608695652174, "grad_norm": 4.224094208865548, "learning_rate": 3.836958991491157e-08, "loss": 0.2879, "step": 4848 }, { "epoch": 0.8784420289855073, "grad_norm": 6.87142070956398, "learning_rate": 3.825695764573306e-08, "loss": 0.2994, "step": 4849 }, { "epoch": 0.8786231884057971, "grad_norm": 6.84381033527655, "learning_rate": 3.8144484356871785e-08, "loss": 0.2286, "step": 4850 }, { "epoch": 0.878804347826087, "grad_norm": 5.619098974696006, "learning_rate": 3.803217008705289e-08, "loss": 0.2885, "step": 4851 }, { "epoch": 0.8789855072463768, "grad_norm": 4.279187710623164, "learning_rate": 3.792001487494667e-08, "loss": 0.3192, "step": 4852 }, { "epoch": 0.8791666666666667, "grad_norm": 6.9355359843170685, "learning_rate": 3.7808018759168614e-08, "loss": 0.2565, "step": 4853 }, { "epoch": 0.8793478260869565, "grad_norm": 6.059950578905525, "learning_rate": 3.769618177827971e-08, "loss": 0.2973, "step": 4854 }, { "epoch": 0.8795289855072463, "grad_norm": 4.691597441259821, "learning_rate": 3.758450397078594e-08, "loss": 0.3328, "step": 4855 }, { "epoch": 0.8797101449275362, "grad_norm": 4.2106759387131305, "learning_rate": 3.7472985375138325e-08, "loss": 0.2847, "step": 4856 }, { "epoch": 0.8798913043478261, "grad_norm": 9.681248745490038, "learning_rate": 3.736162602973325e-08, "loss": 0.3246, "step": 4857 }, { "epoch": 0.880072463768116, "grad_norm": 5.34928426402028, "learning_rate": 3.7250425972912505e-08, "loss": 0.3119, "step": 4858 }, { "epoch": 0.8802536231884058, "grad_norm": 7.115363303078974, "learning_rate": 3.713938524296256e-08, "loss": 0.2783, "step": 4859 }, { "epoch": 0.8804347826086957, "grad_norm": 3.9243422982401293, "learning_rate": 3.702850387811546e-08, "loss": 0.2657, "step": 4860 }, { "epoch": 0.8806159420289855, "grad_norm": 3.662297261711184, "learning_rate": 3.691778191654799e-08, "loss": 0.2921, "step": 4861 }, { "epoch": 0.8807971014492754, "grad_norm": 7.683679150255554, "learning_rate": 3.680721939638237e-08, "loss": 0.2434, "step": 4862 }, { "epoch": 0.8809782608695652, "grad_norm": 5.255328396636565, "learning_rate": 3.669681635568578e-08, "loss": 0.2537, "step": 4863 }, { "epoch": 0.881159420289855, "grad_norm": 5.3053784017465775, "learning_rate": 3.6586572832470464e-08, "loss": 0.2754, "step": 4864 }, { "epoch": 0.8813405797101449, "grad_norm": 5.288496246351217, "learning_rate": 3.647648886469379e-08, "loss": 0.2188, "step": 4865 }, { "epoch": 0.8815217391304347, "grad_norm": 5.606372318197591, "learning_rate": 3.636656449025832e-08, "loss": 0.2742, "step": 4866 }, { "epoch": 0.8817028985507246, "grad_norm": 4.892399152430679, "learning_rate": 3.62567997470114e-08, "loss": 0.2994, "step": 4867 }, { "epoch": 0.8818840579710145, "grad_norm": 3.807216904071993, "learning_rate": 3.614719467274557e-08, "loss": 0.2669, "step": 4868 }, { "epoch": 0.8820652173913044, "grad_norm": 4.5879566164909695, "learning_rate": 3.6037749305198317e-08, "loss": 0.2427, "step": 4869 }, { "epoch": 0.8822463768115942, "grad_norm": 4.618097294151533, "learning_rate": 3.592846368205238e-08, "loss": 0.2825, "step": 4870 }, { "epoch": 0.8824275362318841, "grad_norm": 4.708085248529916, "learning_rate": 3.581933784093516e-08, "loss": 0.2697, "step": 4871 }, { "epoch": 0.8826086956521739, "grad_norm": 4.412138029641981, "learning_rate": 3.5710371819419385e-08, "loss": 0.2245, "step": 4872 }, { "epoch": 0.8827898550724638, "grad_norm": 3.2617556890915003, "learning_rate": 3.560156565502226e-08, "loss": 0.2177, "step": 4873 }, { "epoch": 0.8829710144927536, "grad_norm": 7.6055981263454004, "learning_rate": 3.5492919385206546e-08, "loss": 0.2711, "step": 4874 }, { "epoch": 0.8831521739130435, "grad_norm": 5.815422259527724, "learning_rate": 3.538443304737954e-08, "loss": 0.2972, "step": 4875 }, { "epoch": 0.8833333333333333, "grad_norm": 4.712542839093594, "learning_rate": 3.5276106678893637e-08, "loss": 0.2962, "step": 4876 }, { "epoch": 0.8835144927536231, "grad_norm": 4.0208079557661, "learning_rate": 3.516794031704612e-08, "loss": 0.2825, "step": 4877 }, { "epoch": 0.883695652173913, "grad_norm": 6.862615966791953, "learning_rate": 3.505993399907919e-08, "loss": 0.2507, "step": 4878 }, { "epoch": 0.883876811594203, "grad_norm": 3.412905342786199, "learning_rate": 3.495208776217989e-08, "loss": 0.2336, "step": 4879 }, { "epoch": 0.8840579710144928, "grad_norm": 5.4957707027751805, "learning_rate": 3.484440164348018e-08, "loss": 0.2782, "step": 4880 }, { "epoch": 0.8842391304347826, "grad_norm": 3.813333636561732, "learning_rate": 3.473687568005696e-08, "loss": 0.2486, "step": 4881 }, { "epoch": 0.8844202898550725, "grad_norm": 4.6979226299210035, "learning_rate": 3.462950990893199e-08, "loss": 0.2738, "step": 4882 }, { "epoch": 0.8846014492753623, "grad_norm": 4.958394268257525, "learning_rate": 3.452230436707171e-08, "loss": 0.2748, "step": 4883 }, { "epoch": 0.8847826086956522, "grad_norm": 8.42221285391293, "learning_rate": 3.441525909138737e-08, "loss": 0.2933, "step": 4884 }, { "epoch": 0.884963768115942, "grad_norm": 4.762954284955818, "learning_rate": 3.4308374118735436e-08, "loss": 0.2499, "step": 4885 }, { "epoch": 0.8851449275362319, "grad_norm": 5.154770682313216, "learning_rate": 3.420164948591675e-08, "loss": 0.2362, "step": 4886 }, { "epoch": 0.8853260869565217, "grad_norm": 4.491915351344419, "learning_rate": 3.409508522967719e-08, "loss": 0.2784, "step": 4887 }, { "epoch": 0.8855072463768116, "grad_norm": 10.463644419930143, "learning_rate": 3.398868138670724e-08, "loss": 0.2869, "step": 4888 }, { "epoch": 0.8856884057971014, "grad_norm": 3.322646623941776, "learning_rate": 3.388243799364232e-08, "loss": 0.23, "step": 4889 }, { "epoch": 0.8858695652173914, "grad_norm": 4.972555851864783, "learning_rate": 3.37763550870625e-08, "loss": 0.3171, "step": 4890 }, { "epoch": 0.8860507246376812, "grad_norm": 5.429904523745769, "learning_rate": 3.367043270349262e-08, "loss": 0.2591, "step": 4891 }, { "epoch": 0.886231884057971, "grad_norm": 3.558324423323521, "learning_rate": 3.3564670879402236e-08, "loss": 0.245, "step": 4892 }, { "epoch": 0.8864130434782609, "grad_norm": 7.184105641762762, "learning_rate": 3.345906965120582e-08, "loss": 0.2751, "step": 4893 }, { "epoch": 0.8865942028985507, "grad_norm": 5.618966803832959, "learning_rate": 3.3353629055262176e-08, "loss": 0.2921, "step": 4894 }, { "epoch": 0.8867753623188406, "grad_norm": 4.009092763053742, "learning_rate": 3.324834912787505e-08, "loss": 0.2785, "step": 4895 }, { "epoch": 0.8869565217391304, "grad_norm": 4.367515994731803, "learning_rate": 3.314322990529278e-08, "loss": 0.2892, "step": 4896 }, { "epoch": 0.8871376811594203, "grad_norm": 4.633884973302452, "learning_rate": 3.3038271423708517e-08, "loss": 0.246, "step": 4897 }, { "epoch": 0.8873188405797101, "grad_norm": 6.72248643922981, "learning_rate": 3.2933473719259976e-08, "loss": 0.2435, "step": 4898 }, { "epoch": 0.8875, "grad_norm": 4.839274439927255, "learning_rate": 3.282883682802945e-08, "loss": 0.2688, "step": 4899 }, { "epoch": 0.8876811594202898, "grad_norm": 5.282350163828013, "learning_rate": 3.2724360786043794e-08, "loss": 0.3119, "step": 4900 }, { "epoch": 0.8876811594202898, "eval_loss": 0.2679218649864197, "eval_runtime": 9.8305, "eval_samples_per_second": 50.862, "eval_steps_per_second": 0.102, "step": 4900 }, { "epoch": 0.8878623188405798, "grad_norm": 4.423994719852854, "learning_rate": 3.262004562927473e-08, "loss": 0.2878, "step": 4901 }, { "epoch": 0.8880434782608696, "grad_norm": 8.32233694874011, "learning_rate": 3.251589139363853e-08, "loss": 0.3172, "step": 4902 }, { "epoch": 0.8882246376811594, "grad_norm": 3.8348645691430967, "learning_rate": 3.241189811499584e-08, "loss": 0.2498, "step": 4903 }, { "epoch": 0.8884057971014493, "grad_norm": 3.5383964242941133, "learning_rate": 3.230806582915213e-08, "loss": 0.2691, "step": 4904 }, { "epoch": 0.8885869565217391, "grad_norm": 4.283264010759309, "learning_rate": 3.220439457185736e-08, "loss": 0.28, "step": 4905 }, { "epoch": 0.888768115942029, "grad_norm": 6.662747446368292, "learning_rate": 3.210088437880598e-08, "loss": 0.2807, "step": 4906 }, { "epoch": 0.8889492753623188, "grad_norm": 5.462035865433335, "learning_rate": 3.199753528563703e-08, "loss": 0.3171, "step": 4907 }, { "epoch": 0.8891304347826087, "grad_norm": 5.120504887617094, "learning_rate": 3.1894347327934115e-08, "loss": 0.2391, "step": 4908 }, { "epoch": 0.8893115942028985, "grad_norm": 5.2963822327425545, "learning_rate": 3.1791320541225465e-08, "loss": 0.2938, "step": 4909 }, { "epoch": 0.8894927536231884, "grad_norm": 4.074254359372084, "learning_rate": 3.1688454960983434e-08, "loss": 0.2362, "step": 4910 }, { "epoch": 0.8896739130434783, "grad_norm": 6.002395734156921, "learning_rate": 3.158575062262536e-08, "loss": 0.2568, "step": 4911 }, { "epoch": 0.8898550724637682, "grad_norm": 4.221303060480557, "learning_rate": 3.148320756151263e-08, "loss": 0.2667, "step": 4912 }, { "epoch": 0.890036231884058, "grad_norm": 4.9094016326471115, "learning_rate": 3.138082581295149e-08, "loss": 0.246, "step": 4913 }, { "epoch": 0.8902173913043478, "grad_norm": 9.638882921892828, "learning_rate": 3.1278605412192346e-08, "loss": 0.2878, "step": 4914 }, { "epoch": 0.8903985507246377, "grad_norm": 7.482394468200364, "learning_rate": 3.117654639443024e-08, "loss": 0.29, "step": 4915 }, { "epoch": 0.8905797101449275, "grad_norm": 4.330414179407386, "learning_rate": 3.107464879480454e-08, "loss": 0.2974, "step": 4916 }, { "epoch": 0.8907608695652174, "grad_norm": 7.0903282291563485, "learning_rate": 3.0972912648399076e-08, "loss": 0.2452, "step": 4917 }, { "epoch": 0.8909420289855072, "grad_norm": 3.5933345467530304, "learning_rate": 3.087133799024211e-08, "loss": 0.272, "step": 4918 }, { "epoch": 0.8911231884057971, "grad_norm": 8.770212444775352, "learning_rate": 3.0769924855306215e-08, "loss": 0.2745, "step": 4919 }, { "epoch": 0.8913043478260869, "grad_norm": 4.159004302858452, "learning_rate": 3.066867327850847e-08, "loss": 0.2747, "step": 4920 }, { "epoch": 0.8914855072463768, "grad_norm": 3.973331775126609, "learning_rate": 3.0567583294710267e-08, "loss": 0.3397, "step": 4921 }, { "epoch": 0.8916666666666667, "grad_norm": 3.567657693090071, "learning_rate": 3.046665493871736e-08, "loss": 0.2499, "step": 4922 }, { "epoch": 0.8918478260869566, "grad_norm": 5.520179583929899, "learning_rate": 3.036588824527975e-08, "loss": 0.3479, "step": 4923 }, { "epoch": 0.8920289855072464, "grad_norm": 5.060652974172302, "learning_rate": 3.0265283249092055e-08, "loss": 0.3237, "step": 4924 }, { "epoch": 0.8922101449275363, "grad_norm": 6.761030305735268, "learning_rate": 3.016483998479308e-08, "loss": 0.2614, "step": 4925 }, { "epoch": 0.8923913043478261, "grad_norm": 6.2654758109735, "learning_rate": 3.006455848696576e-08, "loss": 0.2674, "step": 4926 }, { "epoch": 0.8925724637681159, "grad_norm": 8.002422344419415, "learning_rate": 2.9964438790137437e-08, "loss": 0.2798, "step": 4927 }, { "epoch": 0.8927536231884058, "grad_norm": 5.857452216529343, "learning_rate": 2.986448092878002e-08, "loss": 0.3051, "step": 4928 }, { "epoch": 0.8929347826086956, "grad_norm": 3.2551135069878083, "learning_rate": 2.9764684937309382e-08, "loss": 0.2422, "step": 4929 }, { "epoch": 0.8931159420289855, "grad_norm": 6.007800271505982, "learning_rate": 2.9665050850085694e-08, "loss": 0.2614, "step": 4930 }, { "epoch": 0.8932971014492753, "grad_norm": 3.805841564633844, "learning_rate": 2.9565578701413517e-08, "loss": 0.2437, "step": 4931 }, { "epoch": 0.8934782608695652, "grad_norm": 6.717074900683175, "learning_rate": 2.9466268525541572e-08, "loss": 0.2426, "step": 4932 }, { "epoch": 0.8936594202898551, "grad_norm": 5.522394967042641, "learning_rate": 2.9367120356662776e-08, "loss": 0.3172, "step": 4933 }, { "epoch": 0.893840579710145, "grad_norm": 4.4627192239062, "learning_rate": 2.9268134228914387e-08, "loss": 0.2798, "step": 4934 }, { "epoch": 0.8940217391304348, "grad_norm": 8.960653802495912, "learning_rate": 2.916931017637769e-08, "loss": 0.3389, "step": 4935 }, { "epoch": 0.8942028985507247, "grad_norm": 5.2878398968896185, "learning_rate": 2.9070648233078476e-08, "loss": 0.2716, "step": 4936 }, { "epoch": 0.8943840579710145, "grad_norm": 3.411084187465577, "learning_rate": 2.8972148432986286e-08, "loss": 0.2378, "step": 4937 }, { "epoch": 0.8945652173913043, "grad_norm": 6.973036498122388, "learning_rate": 2.8873810810015166e-08, "loss": 0.3407, "step": 4938 }, { "epoch": 0.8947463768115942, "grad_norm": 4.404505645813442, "learning_rate": 2.877563539802319e-08, "loss": 0.2634, "step": 4939 }, { "epoch": 0.894927536231884, "grad_norm": 4.807986854374034, "learning_rate": 2.8677622230812715e-08, "loss": 0.3073, "step": 4940 }, { "epoch": 0.8951086956521739, "grad_norm": 3.7893067280954384, "learning_rate": 2.8579771342130023e-08, "loss": 0.2964, "step": 4941 }, { "epoch": 0.8952898550724637, "grad_norm": 7.07325661497837, "learning_rate": 2.8482082765665827e-08, "loss": 0.2997, "step": 4942 }, { "epoch": 0.8954710144927536, "grad_norm": 9.4814809879375, "learning_rate": 2.8384556535054503e-08, "loss": 0.2734, "step": 4943 }, { "epoch": 0.8956521739130435, "grad_norm": 3.315429785228442, "learning_rate": 2.828719268387497e-08, "loss": 0.2289, "step": 4944 }, { "epoch": 0.8958333333333334, "grad_norm": 4.610959759278451, "learning_rate": 2.8189991245650024e-08, "loss": 0.2247, "step": 4945 }, { "epoch": 0.8960144927536232, "grad_norm": 6.368134462803115, "learning_rate": 2.809295225384656e-08, "loss": 0.2459, "step": 4946 }, { "epoch": 0.8961956521739131, "grad_norm": 5.0630745725534, "learning_rate": 2.799607574187557e-08, "loss": 0.2957, "step": 4947 }, { "epoch": 0.8963768115942029, "grad_norm": 6.373281658782514, "learning_rate": 2.7899361743092098e-08, "loss": 0.3298, "step": 4948 }, { "epoch": 0.8965579710144927, "grad_norm": 3.842168621171289, "learning_rate": 2.7802810290795288e-08, "loss": 0.2625, "step": 4949 }, { "epoch": 0.8967391304347826, "grad_norm": 4.754489989802223, "learning_rate": 2.7706421418228098e-08, "loss": 0.2701, "step": 4950 }, { "epoch": 0.8969202898550724, "grad_norm": 7.130775245119226, "learning_rate": 2.7610195158577875e-08, "loss": 0.2725, "step": 4951 }, { "epoch": 0.8971014492753623, "grad_norm": 4.404462866332119, "learning_rate": 2.7514131544975783e-08, "loss": 0.2342, "step": 4952 }, { "epoch": 0.8972826086956521, "grad_norm": 4.014695321327235, "learning_rate": 2.7418230610496752e-08, "loss": 0.333, "step": 4953 }, { "epoch": 0.8974637681159421, "grad_norm": 3.9185498671598946, "learning_rate": 2.7322492388160034e-08, "loss": 0.2718, "step": 4954 }, { "epoch": 0.8976449275362319, "grad_norm": 4.045161987136994, "learning_rate": 2.722691691092882e-08, "loss": 0.3177, "step": 4955 }, { "epoch": 0.8978260869565218, "grad_norm": 4.927213514518188, "learning_rate": 2.7131504211710176e-08, "loss": 0.2991, "step": 4956 }, { "epoch": 0.8980072463768116, "grad_norm": 4.845381208621474, "learning_rate": 2.7036254323355158e-08, "loss": 0.2947, "step": 4957 }, { "epoch": 0.8981884057971015, "grad_norm": 4.805877299472574, "learning_rate": 2.6941167278658695e-08, "loss": 0.254, "step": 4958 }, { "epoch": 0.8983695652173913, "grad_norm": 4.471561337672168, "learning_rate": 2.6846243110359766e-08, "loss": 0.3454, "step": 4959 }, { "epoch": 0.8985507246376812, "grad_norm": 4.605199813315009, "learning_rate": 2.6751481851141176e-08, "loss": 0.3346, "step": 4960 }, { "epoch": 0.898731884057971, "grad_norm": 6.0535188107746505, "learning_rate": 2.665688353362966e-08, "loss": 0.3691, "step": 4961 }, { "epoch": 0.8989130434782608, "grad_norm": 3.7750320846092156, "learning_rate": 2.6562448190395825e-08, "loss": 0.2136, "step": 4962 }, { "epoch": 0.8990942028985507, "grad_norm": 7.198297028466607, "learning_rate": 2.646817585395439e-08, "loss": 0.257, "step": 4963 }, { "epoch": 0.8992753623188405, "grad_norm": 6.476111737923261, "learning_rate": 2.6374066556763607e-08, "loss": 0.3098, "step": 4964 }, { "epoch": 0.8994565217391305, "grad_norm": 5.0366910329858925, "learning_rate": 2.6280120331225785e-08, "loss": 0.241, "step": 4965 }, { "epoch": 0.8996376811594203, "grad_norm": 4.470253039260638, "learning_rate": 2.6186337209687048e-08, "loss": 0.2919, "step": 4966 }, { "epoch": 0.8998188405797102, "grad_norm": 3.880286171090008, "learning_rate": 2.60927172244374e-08, "loss": 0.2949, "step": 4967 }, { "epoch": 0.9, "grad_norm": 4.514510867495617, "learning_rate": 2.5999260407710667e-08, "loss": 0.297, "step": 4968 }, { "epoch": 0.9001811594202899, "grad_norm": 6.161267685873009, "learning_rate": 2.5905966791684442e-08, "loss": 0.306, "step": 4969 }, { "epoch": 0.9003623188405797, "grad_norm": 4.762647472063687, "learning_rate": 2.581283640848009e-08, "loss": 0.3131, "step": 4970 }, { "epoch": 0.9005434782608696, "grad_norm": 4.418157380169589, "learning_rate": 2.5719869290162954e-08, "loss": 0.2644, "step": 4971 }, { "epoch": 0.9007246376811594, "grad_norm": 8.689256920679696, "learning_rate": 2.5627065468741994e-08, "loss": 0.3402, "step": 4972 }, { "epoch": 0.9009057971014492, "grad_norm": 10.44592708668007, "learning_rate": 2.5534424976169977e-08, "loss": 0.285, "step": 4973 }, { "epoch": 0.9010869565217391, "grad_norm": 9.902636234972858, "learning_rate": 2.5441947844343558e-08, "loss": 0.2535, "step": 4974 }, { "epoch": 0.9012681159420289, "grad_norm": 3.7938542421793806, "learning_rate": 2.534963410510299e-08, "loss": 0.2135, "step": 4975 }, { "epoch": 0.9014492753623189, "grad_norm": 4.875091412007716, "learning_rate": 2.5257483790232348e-08, "loss": 0.3093, "step": 4976 }, { "epoch": 0.9016304347826087, "grad_norm": 3.819686483849816, "learning_rate": 2.5165496931459418e-08, "loss": 0.2527, "step": 4977 }, { "epoch": 0.9018115942028986, "grad_norm": 3.842126850766311, "learning_rate": 2.5073673560455643e-08, "loss": 0.2332, "step": 4978 }, { "epoch": 0.9019927536231884, "grad_norm": 5.73557598392169, "learning_rate": 2.4982013708836457e-08, "loss": 0.2531, "step": 4979 }, { "epoch": 0.9021739130434783, "grad_norm": 12.47029981212199, "learning_rate": 2.4890517408160507e-08, "loss": 0.2736, "step": 4980 }, { "epoch": 0.9023550724637681, "grad_norm": 5.459747676317857, "learning_rate": 2.4799184689930585e-08, "loss": 0.2807, "step": 4981 }, { "epoch": 0.902536231884058, "grad_norm": 3.5772999663310396, "learning_rate": 2.4708015585592824e-08, "loss": 0.2598, "step": 4982 }, { "epoch": 0.9027173913043478, "grad_norm": 4.544660272019451, "learning_rate": 2.4617010126537386e-08, "loss": 0.2944, "step": 4983 }, { "epoch": 0.9028985507246376, "grad_norm": 4.293635149468965, "learning_rate": 2.452616834409771e-08, "loss": 0.3198, "step": 4984 }, { "epoch": 0.9030797101449275, "grad_norm": 4.825322656862586, "learning_rate": 2.4435490269551163e-08, "loss": 0.2246, "step": 4985 }, { "epoch": 0.9032608695652173, "grad_norm": 6.462231909797903, "learning_rate": 2.434497593411855e-08, "loss": 0.3259, "step": 4986 }, { "epoch": 0.9034420289855073, "grad_norm": 6.363823868240444, "learning_rate": 2.4254625368964442e-08, "loss": 0.2075, "step": 4987 }, { "epoch": 0.9036231884057971, "grad_norm": 6.283403785336436, "learning_rate": 2.416443860519696e-08, "loss": 0.2563, "step": 4988 }, { "epoch": 0.903804347826087, "grad_norm": 7.870698654080862, "learning_rate": 2.407441567386781e-08, "loss": 0.317, "step": 4989 }, { "epoch": 0.9039855072463768, "grad_norm": 5.638600354645924, "learning_rate": 2.3984556605972373e-08, "loss": 0.2985, "step": 4990 }, { "epoch": 0.9041666666666667, "grad_norm": 4.326858378065611, "learning_rate": 2.3894861432449454e-08, "loss": 0.3215, "step": 4991 }, { "epoch": 0.9043478260869565, "grad_norm": 4.86545450598657, "learning_rate": 2.380533018418157e-08, "loss": 0.2892, "step": 4992 }, { "epoch": 0.9045289855072464, "grad_norm": 3.6975460578264685, "learning_rate": 2.3715962891994733e-08, "loss": 0.2842, "step": 4993 }, { "epoch": 0.9047101449275362, "grad_norm": 6.772915684266903, "learning_rate": 2.3626759586658606e-08, "loss": 0.3147, "step": 4994 }, { "epoch": 0.904891304347826, "grad_norm": 3.441492913350605, "learning_rate": 2.3537720298886288e-08, "loss": 0.2209, "step": 4995 }, { "epoch": 0.9050724637681159, "grad_norm": 3.583978866285919, "learning_rate": 2.3448845059334364e-08, "loss": 0.217, "step": 4996 }, { "epoch": 0.9052536231884057, "grad_norm": 5.09401151709153, "learning_rate": 2.3360133898602975e-08, "loss": 0.227, "step": 4997 }, { "epoch": 0.9054347826086957, "grad_norm": 5.19084400479948, "learning_rate": 2.3271586847235903e-08, "loss": 0.276, "step": 4998 }, { "epoch": 0.9056159420289855, "grad_norm": 6.008678684248983, "learning_rate": 2.3183203935720263e-08, "loss": 0.3193, "step": 4999 }, { "epoch": 0.9057971014492754, "grad_norm": 6.562682545796406, "learning_rate": 2.3094985194486717e-08, "loss": 0.3159, "step": 5000 }, { "epoch": 0.9057971014492754, "eval_loss": 0.26676562428474426, "eval_runtime": 9.817, "eval_samples_per_second": 50.932, "eval_steps_per_second": 0.102, "step": 5000 }, { "epoch": 0.9059782608695652, "grad_norm": 3.229423738318707, "learning_rate": 2.3006930653909405e-08, "loss": 0.24, "step": 5001 }, { "epoch": 0.9061594202898551, "grad_norm": 4.6667148214844945, "learning_rate": 2.291904034430586e-08, "loss": 0.2508, "step": 5002 }, { "epoch": 0.9063405797101449, "grad_norm": 13.05844569406825, "learning_rate": 2.283131429593721e-08, "loss": 0.281, "step": 5003 }, { "epoch": 0.9065217391304348, "grad_norm": 5.216466468381175, "learning_rate": 2.2743752539007956e-08, "loss": 0.2722, "step": 5004 }, { "epoch": 0.9067028985507246, "grad_norm": 3.6511003886628357, "learning_rate": 2.265635510366587e-08, "loss": 0.264, "step": 5005 }, { "epoch": 0.9068840579710145, "grad_norm": 6.532822920264964, "learning_rate": 2.2569122020002608e-08, "loss": 0.2844, "step": 5006 }, { "epoch": 0.9070652173913043, "grad_norm": 4.279531197117882, "learning_rate": 2.248205331805264e-08, "loss": 0.3023, "step": 5007 }, { "epoch": 0.9072463768115943, "grad_norm": 3.6591732478307035, "learning_rate": 2.239514902779427e-08, "loss": 0.2346, "step": 5008 }, { "epoch": 0.9074275362318841, "grad_norm": 4.509440199602259, "learning_rate": 2.2308409179148946e-08, "loss": 0.2748, "step": 5009 }, { "epoch": 0.907608695652174, "grad_norm": 3.7171940871450784, "learning_rate": 2.222183380198178e-08, "loss": 0.2786, "step": 5010 }, { "epoch": 0.9077898550724638, "grad_norm": 3.876903831363149, "learning_rate": 2.213542292610099e-08, "loss": 0.2753, "step": 5011 }, { "epoch": 0.9079710144927536, "grad_norm": 5.446000205318678, "learning_rate": 2.2049176581258266e-08, "loss": 0.3321, "step": 5012 }, { "epoch": 0.9081521739130435, "grad_norm": 4.296970278371665, "learning_rate": 2.1963094797148586e-08, "loss": 0.2487, "step": 5013 }, { "epoch": 0.9083333333333333, "grad_norm": 5.545879698786914, "learning_rate": 2.1877177603410345e-08, "loss": 0.2685, "step": 5014 }, { "epoch": 0.9085144927536232, "grad_norm": 4.288822339549759, "learning_rate": 2.1791425029625278e-08, "loss": 0.3315, "step": 5015 }, { "epoch": 0.908695652173913, "grad_norm": 6.126726498296205, "learning_rate": 2.170583710531837e-08, "loss": 0.2769, "step": 5016 }, { "epoch": 0.9088768115942029, "grad_norm": 4.067313778600806, "learning_rate": 2.1620413859957942e-08, "loss": 0.2445, "step": 5017 }, { "epoch": 0.9090579710144927, "grad_norm": 7.796096841756646, "learning_rate": 2.1535155322955634e-08, "loss": 0.2668, "step": 5018 }, { "epoch": 0.9092391304347827, "grad_norm": 4.883764368102144, "learning_rate": 2.145006152366635e-08, "loss": 0.2594, "step": 5019 }, { "epoch": 0.9094202898550725, "grad_norm": 4.576816422415214, "learning_rate": 2.1365132491388326e-08, "loss": 0.274, "step": 5020 }, { "epoch": 0.9096014492753624, "grad_norm": 4.518333094509602, "learning_rate": 2.1280368255362944e-08, "loss": 0.2534, "step": 5021 }, { "epoch": 0.9097826086956522, "grad_norm": 3.6022373485406947, "learning_rate": 2.119576884477514e-08, "loss": 0.2592, "step": 5022 }, { "epoch": 0.909963768115942, "grad_norm": 8.729549762123135, "learning_rate": 2.1111334288752612e-08, "loss": 0.3268, "step": 5023 }, { "epoch": 0.9101449275362319, "grad_norm": 4.694266552021978, "learning_rate": 2.102706461636672e-08, "loss": 0.2753, "step": 5024 }, { "epoch": 0.9103260869565217, "grad_norm": 3.953792605102031, "learning_rate": 2.0942959856631913e-08, "loss": 0.2793, "step": 5025 }, { "epoch": 0.9105072463768116, "grad_norm": 3.799497583559223, "learning_rate": 2.085902003850587e-08, "loss": 0.2816, "step": 5026 }, { "epoch": 0.9106884057971014, "grad_norm": 6.345808495963679, "learning_rate": 2.0775245190889467e-08, "loss": 0.2639, "step": 5027 }, { "epoch": 0.9108695652173913, "grad_norm": 3.7361030805644218, "learning_rate": 2.069163534262669e-08, "loss": 0.3098, "step": 5028 }, { "epoch": 0.9110507246376811, "grad_norm": 6.29533855797318, "learning_rate": 2.0608190522504952e-08, "loss": 0.3438, "step": 5029 }, { "epoch": 0.9112318840579711, "grad_norm": 4.36788833095191, "learning_rate": 2.0524910759254554e-08, "loss": 0.3239, "step": 5030 }, { "epoch": 0.9114130434782609, "grad_norm": 6.3593487647630536, "learning_rate": 2.0441796081549222e-08, "loss": 0.2438, "step": 5031 }, { "epoch": 0.9115942028985508, "grad_norm": 3.319905367057363, "learning_rate": 2.0358846518005624e-08, "loss": 0.2296, "step": 5032 }, { "epoch": 0.9117753623188406, "grad_norm": 7.6219929520052245, "learning_rate": 2.027606209718391e-08, "loss": 0.2351, "step": 5033 }, { "epoch": 0.9119565217391304, "grad_norm": 3.895857690945518, "learning_rate": 2.0193442847586893e-08, "loss": 0.2766, "step": 5034 }, { "epoch": 0.9121376811594203, "grad_norm": 4.117482932198145, "learning_rate": 2.0110988797660876e-08, "loss": 0.2654, "step": 5035 }, { "epoch": 0.9123188405797101, "grad_norm": 5.438219935022428, "learning_rate": 2.002869997579515e-08, "loss": 0.2645, "step": 5036 }, { "epoch": 0.9125, "grad_norm": 4.262290777751656, "learning_rate": 1.9946576410322212e-08, "loss": 0.2953, "step": 5037 }, { "epoch": 0.9126811594202898, "grad_norm": 4.801393012434805, "learning_rate": 1.986461812951756e-08, "loss": 0.2266, "step": 5038 }, { "epoch": 0.9128623188405797, "grad_norm": 3.766270719758786, "learning_rate": 1.9782825161599903e-08, "loss": 0.2585, "step": 5039 }, { "epoch": 0.9130434782608695, "grad_norm": 4.615851033768331, "learning_rate": 1.9701197534730707e-08, "loss": 0.2607, "step": 5040 }, { "epoch": 0.9132246376811595, "grad_norm": 3.766218146499011, "learning_rate": 1.9619735277014937e-08, "loss": 0.2604, "step": 5041 }, { "epoch": 0.9134057971014493, "grad_norm": 5.399669782098153, "learning_rate": 1.9538438416500437e-08, "loss": 0.2994, "step": 5042 }, { "epoch": 0.9135869565217392, "grad_norm": 6.038002054653043, "learning_rate": 1.945730698117809e-08, "loss": 0.3291, "step": 5043 }, { "epoch": 0.913768115942029, "grad_norm": 5.018315155132307, "learning_rate": 1.937634099898172e-08, "loss": 0.3091, "step": 5044 }, { "epoch": 0.9139492753623188, "grad_norm": 4.756131607605878, "learning_rate": 1.9295540497788477e-08, "loss": 0.2685, "step": 5045 }, { "epoch": 0.9141304347826087, "grad_norm": 3.4968442666410557, "learning_rate": 1.921490550541821e-08, "loss": 0.2428, "step": 5046 }, { "epoch": 0.9143115942028985, "grad_norm": 5.277557304063835, "learning_rate": 1.913443604963405e-08, "loss": 0.2838, "step": 5047 }, { "epoch": 0.9144927536231884, "grad_norm": 3.8340585382415413, "learning_rate": 1.9054132158141834e-08, "loss": 0.2415, "step": 5048 }, { "epoch": 0.9146739130434782, "grad_norm": 4.955260026857148, "learning_rate": 1.8973993858590774e-08, "loss": 0.2588, "step": 5049 }, { "epoch": 0.9148550724637681, "grad_norm": 4.183827352373052, "learning_rate": 1.8894021178572807e-08, "loss": 0.3065, "step": 5050 }, { "epoch": 0.9150362318840579, "grad_norm": 5.089752794858959, "learning_rate": 1.8814214145622785e-08, "loss": 0.2816, "step": 5051 }, { "epoch": 0.9152173913043479, "grad_norm": 4.334815454747086, "learning_rate": 1.8734572787218738e-08, "loss": 0.2606, "step": 5052 }, { "epoch": 0.9153985507246377, "grad_norm": 5.958979549328401, "learning_rate": 1.8655097130781618e-08, "loss": 0.2757, "step": 5053 }, { "epoch": 0.9155797101449276, "grad_norm": 5.382098103782952, "learning_rate": 1.85757872036752e-08, "loss": 0.2805, "step": 5054 }, { "epoch": 0.9157608695652174, "grad_norm": 8.874010798483143, "learning_rate": 1.849664303320636e-08, "loss": 0.2847, "step": 5055 }, { "epoch": 0.9159420289855073, "grad_norm": 5.67527329704592, "learning_rate": 1.8417664646624587e-08, "loss": 0.2693, "step": 5056 }, { "epoch": 0.9161231884057971, "grad_norm": 6.475145950974189, "learning_rate": 1.8338852071122735e-08, "loss": 0.3095, "step": 5057 }, { "epoch": 0.9163043478260869, "grad_norm": 6.454985407522784, "learning_rate": 1.8260205333836263e-08, "loss": 0.2964, "step": 5058 }, { "epoch": 0.9164855072463768, "grad_norm": 5.080559793921507, "learning_rate": 1.8181724461843628e-08, "loss": 0.2381, "step": 5059 }, { "epoch": 0.9166666666666666, "grad_norm": 4.563514151399476, "learning_rate": 1.810340948216621e-08, "loss": 0.2857, "step": 5060 }, { "epoch": 0.9168478260869565, "grad_norm": 4.355115903306195, "learning_rate": 1.8025260421768106e-08, "loss": 0.294, "step": 5061 }, { "epoch": 0.9170289855072464, "grad_norm": 4.643713540221931, "learning_rate": 1.794727730755652e-08, "loss": 0.229, "step": 5062 }, { "epoch": 0.9172101449275363, "grad_norm": 4.199254981250338, "learning_rate": 1.7869460166381355e-08, "loss": 0.3133, "step": 5063 }, { "epoch": 0.9173913043478261, "grad_norm": 4.870083561785064, "learning_rate": 1.779180902503541e-08, "loss": 0.3428, "step": 5064 }, { "epoch": 0.917572463768116, "grad_norm": 7.541688996464984, "learning_rate": 1.7714323910254513e-08, "loss": 0.3094, "step": 5065 }, { "epoch": 0.9177536231884058, "grad_norm": 4.676154314433912, "learning_rate": 1.7637004848716884e-08, "loss": 0.2651, "step": 5066 }, { "epoch": 0.9179347826086957, "grad_norm": 3.9416739081153005, "learning_rate": 1.755985186704395e-08, "loss": 0.2463, "step": 5067 }, { "epoch": 0.9181159420289855, "grad_norm": 4.023576344996432, "learning_rate": 1.7482864991799906e-08, "loss": 0.3121, "step": 5068 }, { "epoch": 0.9182971014492753, "grad_norm": 5.051948611056018, "learning_rate": 1.7406044249491657e-08, "loss": 0.3416, "step": 5069 }, { "epoch": 0.9184782608695652, "grad_norm": 6.522263640665446, "learning_rate": 1.7329389666568995e-08, "loss": 0.2732, "step": 5070 }, { "epoch": 0.918659420289855, "grad_norm": 6.2663681174231325, "learning_rate": 1.725290126942436e-08, "loss": 0.2924, "step": 5071 }, { "epoch": 0.9188405797101449, "grad_norm": 4.6754627376987195, "learning_rate": 1.717657908439313e-08, "loss": 0.2655, "step": 5072 }, { "epoch": 0.9190217391304348, "grad_norm": 7.482888573489373, "learning_rate": 1.7100423137753395e-08, "loss": 0.3546, "step": 5073 }, { "epoch": 0.9192028985507247, "grad_norm": 6.048004571895161, "learning_rate": 1.7024433455726016e-08, "loss": 0.3192, "step": 5074 }, { "epoch": 0.9193840579710145, "grad_norm": 4.839161050960119, "learning_rate": 1.694861006447451e-08, "loss": 0.2652, "step": 5075 }, { "epoch": 0.9195652173913044, "grad_norm": 3.7481570921489067, "learning_rate": 1.687295299010538e-08, "loss": 0.2605, "step": 5076 }, { "epoch": 0.9197463768115942, "grad_norm": 4.862934612934118, "learning_rate": 1.6797462258667626e-08, "loss": 0.2938, "step": 5077 }, { "epoch": 0.9199275362318841, "grad_norm": 11.048120492288488, "learning_rate": 1.6722137896153066e-08, "loss": 0.2985, "step": 5078 }, { "epoch": 0.9201086956521739, "grad_norm": 4.461946790516098, "learning_rate": 1.664697992849623e-08, "loss": 0.2741, "step": 5079 }, { "epoch": 0.9202898550724637, "grad_norm": 4.93336853506006, "learning_rate": 1.6571988381574364e-08, "loss": 0.2665, "step": 5080 }, { "epoch": 0.9204710144927536, "grad_norm": 6.847480275289616, "learning_rate": 1.6497163281207482e-08, "loss": 0.2283, "step": 5081 }, { "epoch": 0.9206521739130434, "grad_norm": 3.550784214808077, "learning_rate": 1.6422504653158198e-08, "loss": 0.265, "step": 5082 }, { "epoch": 0.9208333333333333, "grad_norm": 4.057937731729632, "learning_rate": 1.6348012523131726e-08, "loss": 0.2822, "step": 5083 }, { "epoch": 0.9210144927536232, "grad_norm": 4.061184475967941, "learning_rate": 1.6273686916776164e-08, "loss": 0.2913, "step": 5084 }, { "epoch": 0.9211956521739131, "grad_norm": 4.492671861756009, "learning_rate": 1.6199527859682148e-08, "loss": 0.3105, "step": 5085 }, { "epoch": 0.9213768115942029, "grad_norm": 4.01204606905016, "learning_rate": 1.6125535377382926e-08, "loss": 0.1968, "step": 5086 }, { "epoch": 0.9215579710144928, "grad_norm": 5.193794014494544, "learning_rate": 1.6051709495354615e-08, "loss": 0.3356, "step": 5087 }, { "epoch": 0.9217391304347826, "grad_norm": 4.566021644013141, "learning_rate": 1.597805023901566e-08, "loss": 0.3008, "step": 5088 }, { "epoch": 0.9219202898550725, "grad_norm": 8.130145227303329, "learning_rate": 1.5904557633727334e-08, "loss": 0.2669, "step": 5089 }, { "epoch": 0.9221014492753623, "grad_norm": 3.736318496486843, "learning_rate": 1.583123170479356e-08, "loss": 0.2539, "step": 5090 }, { "epoch": 0.9222826086956522, "grad_norm": 5.068552662849525, "learning_rate": 1.5758072477460638e-08, "loss": 0.3506, "step": 5091 }, { "epoch": 0.922463768115942, "grad_norm": 3.6260657931632045, "learning_rate": 1.5685079976917926e-08, "loss": 0.2527, "step": 5092 }, { "epoch": 0.9226449275362318, "grad_norm": 4.80250953741809, "learning_rate": 1.5612254228296816e-08, "loss": 0.2701, "step": 5093 }, { "epoch": 0.9228260869565217, "grad_norm": 3.747919780124867, "learning_rate": 1.553959525667159e-08, "loss": 0.276, "step": 5094 }, { "epoch": 0.9230072463768116, "grad_norm": 3.6454171276463168, "learning_rate": 1.546710308705923e-08, "loss": 0.2492, "step": 5095 }, { "epoch": 0.9231884057971015, "grad_norm": 3.475438057973087, "learning_rate": 1.5394777744418997e-08, "loss": 0.2345, "step": 5096 }, { "epoch": 0.9233695652173913, "grad_norm": 5.1466940642142, "learning_rate": 1.5322619253652912e-08, "loss": 0.2461, "step": 5097 }, { "epoch": 0.9235507246376812, "grad_norm": 3.837391521056926, "learning_rate": 1.525062763960544e-08, "loss": 0.2072, "step": 5098 }, { "epoch": 0.923731884057971, "grad_norm": 4.4926669852053225, "learning_rate": 1.5178802927063693e-08, "loss": 0.2855, "step": 5099 }, { "epoch": 0.9239130434782609, "grad_norm": 6.664191297941805, "learning_rate": 1.5107145140757226e-08, "loss": 0.2456, "step": 5100 }, { "epoch": 0.9239130434782609, "eval_loss": 0.2668437361717224, "eval_runtime": 9.7476, "eval_samples_per_second": 51.295, "eval_steps_per_second": 0.103, "step": 5100 }, { "epoch": 0.9240942028985507, "grad_norm": 5.996202108686227, "learning_rate": 1.5035654305358192e-08, "loss": 0.312, "step": 5101 }, { "epoch": 0.9242753623188406, "grad_norm": 4.11531538231226, "learning_rate": 1.496433044548112e-08, "loss": 0.2408, "step": 5102 }, { "epoch": 0.9244565217391304, "grad_norm": 4.59986698816013, "learning_rate": 1.4893173585683261e-08, "loss": 0.2579, "step": 5103 }, { "epoch": 0.9246376811594202, "grad_norm": 6.818235046598892, "learning_rate": 1.4822183750464234e-08, "loss": 0.313, "step": 5104 }, { "epoch": 0.9248188405797102, "grad_norm": 4.956228855141052, "learning_rate": 1.4751360964266157e-08, "loss": 0.292, "step": 5105 }, { "epoch": 0.925, "grad_norm": 5.021762857267433, "learning_rate": 1.468070525147358e-08, "loss": 0.2688, "step": 5106 }, { "epoch": 0.9251811594202899, "grad_norm": 3.765696568104086, "learning_rate": 1.4610216636413764e-08, "loss": 0.2701, "step": 5107 }, { "epoch": 0.9253623188405797, "grad_norm": 8.847734337942263, "learning_rate": 1.4539895143356185e-08, "loss": 0.3277, "step": 5108 }, { "epoch": 0.9255434782608696, "grad_norm": 5.898594802793108, "learning_rate": 1.4469740796512863e-08, "loss": 0.2574, "step": 5109 }, { "epoch": 0.9257246376811594, "grad_norm": 7.290441196622495, "learning_rate": 1.4399753620038201e-08, "loss": 0.272, "step": 5110 }, { "epoch": 0.9259057971014493, "grad_norm": 6.587318004773873, "learning_rate": 1.4329933638029257e-08, "loss": 0.3222, "step": 5111 }, { "epoch": 0.9260869565217391, "grad_norm": 5.234551020115269, "learning_rate": 1.4260280874525299e-08, "loss": 0.2845, "step": 5112 }, { "epoch": 0.926268115942029, "grad_norm": 5.325414020813716, "learning_rate": 1.4190795353508145e-08, "loss": 0.2658, "step": 5113 }, { "epoch": 0.9264492753623188, "grad_norm": 4.241550296385165, "learning_rate": 1.4121477098901935e-08, "loss": 0.2872, "step": 5114 }, { "epoch": 0.9266304347826086, "grad_norm": 4.018722430337241, "learning_rate": 1.40523261345733e-08, "loss": 0.2512, "step": 5115 }, { "epoch": 0.9268115942028986, "grad_norm": 4.691758106098034, "learning_rate": 1.398334248433125e-08, "loss": 0.2219, "step": 5116 }, { "epoch": 0.9269927536231884, "grad_norm": 5.447034736047103, "learning_rate": 1.3914526171927176e-08, "loss": 0.3033, "step": 5117 }, { "epoch": 0.9271739130434783, "grad_norm": 8.689800691959029, "learning_rate": 1.3845877221054792e-08, "loss": 0.3181, "step": 5118 }, { "epoch": 0.9273550724637681, "grad_norm": 6.034756808020308, "learning_rate": 1.3777395655350465e-08, "loss": 0.3094, "step": 5119 }, { "epoch": 0.927536231884058, "grad_norm": 4.033737130482553, "learning_rate": 1.3709081498392505e-08, "loss": 0.3133, "step": 5120 }, { "epoch": 0.9277173913043478, "grad_norm": 4.7490448919508585, "learning_rate": 1.3640934773701928e-08, "loss": 0.2878, "step": 5121 }, { "epoch": 0.9278985507246377, "grad_norm": 4.475110452629174, "learning_rate": 1.3572955504741857e-08, "loss": 0.2625, "step": 5122 }, { "epoch": 0.9280797101449275, "grad_norm": 4.171413591876194, "learning_rate": 1.3505143714917955e-08, "loss": 0.2176, "step": 5123 }, { "epoch": 0.9282608695652174, "grad_norm": 4.532596633812448, "learning_rate": 1.3437499427578159e-08, "loss": 0.3316, "step": 5124 }, { "epoch": 0.9284420289855072, "grad_norm": 4.63419965649599, "learning_rate": 1.3370022666012726e-08, "loss": 0.3172, "step": 5125 }, { "epoch": 0.928623188405797, "grad_norm": 4.989870724738706, "learning_rate": 1.3302713453454128e-08, "loss": 0.2568, "step": 5126 }, { "epoch": 0.928804347826087, "grad_norm": 5.59649172880547, "learning_rate": 1.3235571813077328e-08, "loss": 0.3048, "step": 5127 }, { "epoch": 0.9289855072463769, "grad_norm": 3.802912816327616, "learning_rate": 1.31685977679995e-08, "loss": 0.2674, "step": 5128 }, { "epoch": 0.9291666666666667, "grad_norm": 4.522374770300389, "learning_rate": 1.3101791341280088e-08, "loss": 0.2617, "step": 5129 }, { "epoch": 0.9293478260869565, "grad_norm": 4.3658050539372635, "learning_rate": 1.3035152555920915e-08, "loss": 0.3161, "step": 5130 }, { "epoch": 0.9295289855072464, "grad_norm": 4.004384942366896, "learning_rate": 1.2968681434866013e-08, "loss": 0.2636, "step": 5131 }, { "epoch": 0.9297101449275362, "grad_norm": 3.8775817461579978, "learning_rate": 1.2902378001001691e-08, "loss": 0.295, "step": 5132 }, { "epoch": 0.9298913043478261, "grad_norm": 5.296565105031837, "learning_rate": 1.2836242277156517e-08, "loss": 0.2923, "step": 5133 }, { "epoch": 0.9300724637681159, "grad_norm": 9.626141623922942, "learning_rate": 1.277027428610139e-08, "loss": 0.3247, "step": 5134 }, { "epoch": 0.9302536231884058, "grad_norm": 3.833925962328562, "learning_rate": 1.2704474050549363e-08, "loss": 0.2736, "step": 5135 }, { "epoch": 0.9304347826086956, "grad_norm": 8.143596855845226, "learning_rate": 1.2638841593155758e-08, "loss": 0.3136, "step": 5136 }, { "epoch": 0.9306159420289855, "grad_norm": 6.425821858079505, "learning_rate": 1.2573376936518165e-08, "loss": 0.2686, "step": 5137 }, { "epoch": 0.9307971014492754, "grad_norm": 7.227079439326006, "learning_rate": 1.2508080103176333e-08, "loss": 0.2377, "step": 5138 }, { "epoch": 0.9309782608695653, "grad_norm": 4.969244802282517, "learning_rate": 1.2442951115612387e-08, "loss": 0.2693, "step": 5139 }, { "epoch": 0.9311594202898551, "grad_norm": 5.780637981934778, "learning_rate": 1.2377989996250394e-08, "loss": 0.234, "step": 5140 }, { "epoch": 0.931340579710145, "grad_norm": 5.727817731378993, "learning_rate": 1.2313196767456902e-08, "loss": 0.2651, "step": 5141 }, { "epoch": 0.9315217391304348, "grad_norm": 5.628729170796921, "learning_rate": 1.2248571451540401e-08, "loss": 0.2806, "step": 5142 }, { "epoch": 0.9317028985507246, "grad_norm": 3.9721827221516235, "learning_rate": 1.2184114070751817e-08, "loss": 0.2784, "step": 5143 }, { "epoch": 0.9318840579710145, "grad_norm": 3.902570172692182, "learning_rate": 1.2119824647284116e-08, "loss": 0.2786, "step": 5144 }, { "epoch": 0.9320652173913043, "grad_norm": 9.144841875299965, "learning_rate": 1.2055703203272317e-08, "loss": 0.3035, "step": 5145 }, { "epoch": 0.9322463768115942, "grad_norm": 7.11284579862854, "learning_rate": 1.1991749760793924e-08, "loss": 0.2806, "step": 5146 }, { "epoch": 0.932427536231884, "grad_norm": 6.261817635323582, "learning_rate": 1.1927964341868269e-08, "loss": 0.2509, "step": 5147 }, { "epoch": 0.9326086956521739, "grad_norm": 6.103494741894521, "learning_rate": 1.1864346968457007e-08, "loss": 0.2385, "step": 5148 }, { "epoch": 0.9327898550724638, "grad_norm": 3.529692060934503, "learning_rate": 1.1800897662463948e-08, "loss": 0.2329, "step": 5149 }, { "epoch": 0.9329710144927537, "grad_norm": 6.840440921411112, "learning_rate": 1.1737616445734954e-08, "loss": 0.2751, "step": 5150 }, { "epoch": 0.9331521739130435, "grad_norm": 8.95527016855722, "learning_rate": 1.167450334005804e-08, "loss": 0.3406, "step": 5151 }, { "epoch": 0.9333333333333333, "grad_norm": 4.728909246655847, "learning_rate": 1.1611558367163433e-08, "loss": 0.2968, "step": 5152 }, { "epoch": 0.9335144927536232, "grad_norm": 7.263384673038895, "learning_rate": 1.1548781548723186e-08, "loss": 0.2949, "step": 5153 }, { "epoch": 0.933695652173913, "grad_norm": 4.542923852915622, "learning_rate": 1.1486172906351898e-08, "loss": 0.3085, "step": 5154 }, { "epoch": 0.9338768115942029, "grad_norm": 8.489658178267135, "learning_rate": 1.1423732461605829e-08, "loss": 0.2824, "step": 5155 }, { "epoch": 0.9340579710144927, "grad_norm": 5.8764754767589835, "learning_rate": 1.1361460235983666e-08, "loss": 0.2451, "step": 5156 }, { "epoch": 0.9342391304347826, "grad_norm": 4.165217434363591, "learning_rate": 1.1299356250925929e-08, "loss": 0.2914, "step": 5157 }, { "epoch": 0.9344202898550724, "grad_norm": 7.990978544947612, "learning_rate": 1.1237420527815288e-08, "loss": 0.2505, "step": 5158 }, { "epoch": 0.9346014492753624, "grad_norm": 3.6882036061929044, "learning_rate": 1.1175653087976633e-08, "loss": 0.2792, "step": 5159 }, { "epoch": 0.9347826086956522, "grad_norm": 9.097928287582794, "learning_rate": 1.1114053952676728e-08, "loss": 0.2818, "step": 5160 }, { "epoch": 0.9349637681159421, "grad_norm": 4.336478351026284, "learning_rate": 1.1052623143124385e-08, "loss": 0.2999, "step": 5161 }, { "epoch": 0.9351449275362319, "grad_norm": 3.7031826051401264, "learning_rate": 1.099136068047063e-08, "loss": 0.2857, "step": 5162 }, { "epoch": 0.9353260869565218, "grad_norm": 8.12979280754706, "learning_rate": 1.0930266585808312e-08, "loss": 0.2515, "step": 5163 }, { "epoch": 0.9355072463768116, "grad_norm": 6.514235409319526, "learning_rate": 1.0869340880172495e-08, "loss": 0.3394, "step": 5164 }, { "epoch": 0.9356884057971014, "grad_norm": 5.657832353776019, "learning_rate": 1.080858358454012e-08, "loss": 0.3709, "step": 5165 }, { "epoch": 0.9358695652173913, "grad_norm": 6.783729641059947, "learning_rate": 1.0747994719830345e-08, "loss": 0.2923, "step": 5166 }, { "epoch": 0.9360507246376811, "grad_norm": 5.183126493072396, "learning_rate": 1.0687574306904034e-08, "loss": 0.231, "step": 5167 }, { "epoch": 0.936231884057971, "grad_norm": 5.303427453622968, "learning_rate": 1.0627322366564329e-08, "loss": 0.327, "step": 5168 }, { "epoch": 0.9364130434782608, "grad_norm": 4.044958135680238, "learning_rate": 1.0567238919556243e-08, "loss": 0.2185, "step": 5169 }, { "epoch": 0.9365942028985508, "grad_norm": 7.617891447867524, "learning_rate": 1.0507323986566785e-08, "loss": 0.3349, "step": 5170 }, { "epoch": 0.9367753623188406, "grad_norm": 4.489700405684392, "learning_rate": 1.0447577588224898e-08, "loss": 0.2772, "step": 5171 }, { "epoch": 0.9369565217391305, "grad_norm": 4.022225323000435, "learning_rate": 1.0387999745101573e-08, "loss": 0.2778, "step": 5172 }, { "epoch": 0.9371376811594203, "grad_norm": 4.315314558687485, "learning_rate": 1.0328590477709897e-08, "loss": 0.3119, "step": 5173 }, { "epoch": 0.9373188405797102, "grad_norm": 4.145775309360231, "learning_rate": 1.0269349806504513e-08, "loss": 0.2559, "step": 5174 }, { "epoch": 0.9375, "grad_norm": 3.5666409942919426, "learning_rate": 1.0210277751882435e-08, "loss": 0.2314, "step": 5175 }, { "epoch": 0.9376811594202898, "grad_norm": 7.868011153820809, "learning_rate": 1.0151374334182338e-08, "loss": 0.2843, "step": 5176 }, { "epoch": 0.9378623188405797, "grad_norm": 5.971769457271549, "learning_rate": 1.0092639573685058e-08, "loss": 0.3155, "step": 5177 }, { "epoch": 0.9380434782608695, "grad_norm": 3.6056420253172483, "learning_rate": 1.0034073490613193e-08, "loss": 0.3041, "step": 5178 }, { "epoch": 0.9382246376811594, "grad_norm": 3.4142709258500785, "learning_rate": 9.975676105131392e-09, "loss": 0.2557, "step": 5179 }, { "epoch": 0.9384057971014492, "grad_norm": 5.229571866439575, "learning_rate": 9.91744743734596e-09, "loss": 0.3055, "step": 5180 }, { "epoch": 0.9385869565217392, "grad_norm": 5.0343397214127465, "learning_rate": 9.859387507305527e-09, "loss": 0.2674, "step": 5181 }, { "epoch": 0.938768115942029, "grad_norm": 4.7813600296960255, "learning_rate": 9.801496335000325e-09, "loss": 0.3187, "step": 5182 }, { "epoch": 0.9389492753623189, "grad_norm": 4.111992620134856, "learning_rate": 9.743773940362521e-09, "loss": 0.291, "step": 5183 }, { "epoch": 0.9391304347826087, "grad_norm": 3.9882803679770453, "learning_rate": 9.686220343266215e-09, "loss": 0.1941, "step": 5184 }, { "epoch": 0.9393115942028986, "grad_norm": 5.012742363516596, "learning_rate": 9.628835563527394e-09, "loss": 0.3228, "step": 5185 }, { "epoch": 0.9394927536231884, "grad_norm": 6.935891231578678, "learning_rate": 9.571619620903915e-09, "loss": 0.3055, "step": 5186 }, { "epoch": 0.9396739130434782, "grad_norm": 4.0205122552600665, "learning_rate": 9.514572535095522e-09, "loss": 0.2887, "step": 5187 }, { "epoch": 0.9398550724637681, "grad_norm": 3.606659696119982, "learning_rate": 9.457694325743726e-09, "loss": 0.2052, "step": 5188 }, { "epoch": 0.9400362318840579, "grad_norm": 4.8674593317218005, "learning_rate": 9.400985012432139e-09, "loss": 0.3139, "step": 5189 }, { "epoch": 0.9402173913043478, "grad_norm": 5.227936482617197, "learning_rate": 9.34444461468581e-09, "loss": 0.2906, "step": 5190 }, { "epoch": 0.9403985507246376, "grad_norm": 4.3523672079135745, "learning_rate": 9.288073151971998e-09, "loss": 0.3335, "step": 5191 }, { "epoch": 0.9405797101449276, "grad_norm": 5.84180683534299, "learning_rate": 9.231870643699624e-09, "loss": 0.2399, "step": 5192 }, { "epoch": 0.9407608695652174, "grad_norm": 4.607898246114629, "learning_rate": 9.175837109219487e-09, "loss": 0.3359, "step": 5193 }, { "epoch": 0.9409420289855073, "grad_norm": 8.875740206969539, "learning_rate": 9.119972567824263e-09, "loss": 0.3195, "step": 5194 }, { "epoch": 0.9411231884057971, "grad_norm": 4.251070548163476, "learning_rate": 9.064277038748291e-09, "loss": 0.2402, "step": 5195 }, { "epoch": 0.941304347826087, "grad_norm": 6.061949092248013, "learning_rate": 9.008750541167842e-09, "loss": 0.1948, "step": 5196 }, { "epoch": 0.9414855072463768, "grad_norm": 6.802492719264378, "learning_rate": 8.9533930942009e-09, "loss": 0.2428, "step": 5197 }, { "epoch": 0.9416666666666667, "grad_norm": 3.541856869927257, "learning_rate": 8.898204716907387e-09, "loss": 0.2743, "step": 5198 }, { "epoch": 0.9418478260869565, "grad_norm": 3.6694553682078546, "learning_rate": 8.84318542828888e-09, "loss": 0.2513, "step": 5199 }, { "epoch": 0.9420289855072463, "grad_norm": 9.365049154771608, "learning_rate": 8.78833524728878e-09, "loss": 0.3543, "step": 5200 }, { "epoch": 0.9420289855072463, "eval_loss": 0.2675468623638153, "eval_runtime": 9.7634, "eval_samples_per_second": 51.212, "eval_steps_per_second": 0.102, "step": 5200 }, { "epoch": 0.9422101449275362, "grad_norm": 6.625773480054826, "learning_rate": 8.733654192792262e-09, "loss": 0.2614, "step": 5201 }, { "epoch": 0.9423913043478261, "grad_norm": 4.63804473090142, "learning_rate": 8.679142283626317e-09, "loss": 0.3326, "step": 5202 }, { "epoch": 0.942572463768116, "grad_norm": 5.027940785910337, "learning_rate": 8.624799538559491e-09, "loss": 0.287, "step": 5203 }, { "epoch": 0.9427536231884058, "grad_norm": 7.027324082011557, "learning_rate": 8.570625976302481e-09, "loss": 0.3077, "step": 5204 }, { "epoch": 0.9429347826086957, "grad_norm": 7.9237692435664, "learning_rate": 8.51662161550737e-09, "loss": 0.3299, "step": 5205 }, { "epoch": 0.9431159420289855, "grad_norm": 3.891689271742777, "learning_rate": 8.462786474768114e-09, "loss": 0.2624, "step": 5206 }, { "epoch": 0.9432971014492754, "grad_norm": 4.239166626895789, "learning_rate": 8.409120572620388e-09, "loss": 0.3155, "step": 5207 }, { "epoch": 0.9434782608695652, "grad_norm": 4.306638520026201, "learning_rate": 8.355623927541688e-09, "loss": 0.2761, "step": 5208 }, { "epoch": 0.943659420289855, "grad_norm": 3.996107002635442, "learning_rate": 8.302296557951171e-09, "loss": 0.2578, "step": 5209 }, { "epoch": 0.9438405797101449, "grad_norm": 8.06081440133288, "learning_rate": 8.249138482209594e-09, "loss": 0.3304, "step": 5210 }, { "epoch": 0.9440217391304347, "grad_norm": 3.4038023088830642, "learning_rate": 8.19614971861965e-09, "loss": 0.2391, "step": 5211 }, { "epoch": 0.9442028985507246, "grad_norm": 6.0568719774199655, "learning_rate": 8.143330285425576e-09, "loss": 0.3187, "step": 5212 }, { "epoch": 0.9443840579710145, "grad_norm": 8.94340875743482, "learning_rate": 8.090680200813327e-09, "loss": 0.2848, "step": 5213 }, { "epoch": 0.9445652173913044, "grad_norm": 6.3563296169192105, "learning_rate": 8.038199482910624e-09, "loss": 0.2873, "step": 5214 }, { "epoch": 0.9447463768115942, "grad_norm": 3.5769701521760204, "learning_rate": 7.985888149786734e-09, "loss": 0.2636, "step": 5215 }, { "epoch": 0.9449275362318841, "grad_norm": 7.784593493016849, "learning_rate": 7.933746219452863e-09, "loss": 0.364, "step": 5216 }, { "epoch": 0.9451086956521739, "grad_norm": 8.32632421718918, "learning_rate": 7.881773709861594e-09, "loss": 0.2504, "step": 5217 }, { "epoch": 0.9452898550724638, "grad_norm": 3.5305623131540873, "learning_rate": 7.829970638907335e-09, "loss": 0.2366, "step": 5218 }, { "epoch": 0.9454710144927536, "grad_norm": 8.011139312629501, "learning_rate": 7.778337024426096e-09, "loss": 0.2681, "step": 5219 }, { "epoch": 0.9456521739130435, "grad_norm": 7.020375540565217, "learning_rate": 7.7268728841956e-09, "loss": 0.308, "step": 5220 }, { "epoch": 0.9458333333333333, "grad_norm": 6.483658683535193, "learning_rate": 7.675578235935287e-09, "loss": 0.3494, "step": 5221 }, { "epoch": 0.9460144927536231, "grad_norm": 8.261431112546184, "learning_rate": 7.624453097306083e-09, "loss": 0.2719, "step": 5222 }, { "epoch": 0.946195652173913, "grad_norm": 3.523566791558715, "learning_rate": 7.573497485910518e-09, "loss": 0.2686, "step": 5223 }, { "epoch": 0.946376811594203, "grad_norm": 6.529275284799194, "learning_rate": 7.522711419292948e-09, "loss": 0.2671, "step": 5224 }, { "epoch": 0.9465579710144928, "grad_norm": 3.6375153146588364, "learning_rate": 7.472094914939275e-09, "loss": 0.2528, "step": 5225 }, { "epoch": 0.9467391304347826, "grad_norm": 5.400021225952358, "learning_rate": 7.421647990277003e-09, "loss": 0.275, "step": 5226 }, { "epoch": 0.9469202898550725, "grad_norm": 4.0549937205945215, "learning_rate": 7.371370662675125e-09, "loss": 0.2816, "step": 5227 }, { "epoch": 0.9471014492753623, "grad_norm": 8.86014557399874, "learning_rate": 7.321262949444518e-09, "loss": 0.3084, "step": 5228 }, { "epoch": 0.9472826086956522, "grad_norm": 5.932963213703208, "learning_rate": 7.2713248678374364e-09, "loss": 0.3603, "step": 5229 }, { "epoch": 0.947463768115942, "grad_norm": 5.161157132834427, "learning_rate": 7.221556435047793e-09, "loss": 0.2878, "step": 5230 }, { "epoch": 0.9476449275362319, "grad_norm": 4.707156404513242, "learning_rate": 7.171957668211048e-09, "loss": 0.2058, "step": 5231 }, { "epoch": 0.9478260869565217, "grad_norm": 5.301462569719585, "learning_rate": 7.12252858440443e-09, "loss": 0.3037, "step": 5232 }, { "epoch": 0.9480072463768116, "grad_norm": 4.158442985000619, "learning_rate": 7.073269200646493e-09, "loss": 0.2609, "step": 5233 }, { "epoch": 0.9481884057971014, "grad_norm": 6.888317241230673, "learning_rate": 7.024179533897501e-09, "loss": 0.2957, "step": 5234 }, { "epoch": 0.9483695652173914, "grad_norm": 4.664569634794076, "learning_rate": 6.9752596010592135e-09, "loss": 0.2808, "step": 5235 }, { "epoch": 0.9485507246376812, "grad_norm": 4.496352837949094, "learning_rate": 6.926509418975102e-09, "loss": 0.2798, "step": 5236 }, { "epoch": 0.948731884057971, "grad_norm": 4.987760796161818, "learning_rate": 6.877929004430016e-09, "loss": 0.2402, "step": 5237 }, { "epoch": 0.9489130434782609, "grad_norm": 5.070773683749353, "learning_rate": 6.829518374150412e-09, "loss": 0.3138, "step": 5238 }, { "epoch": 0.9490942028985507, "grad_norm": 4.318789816820521, "learning_rate": 6.7812775448043425e-09, "loss": 0.3044, "step": 5239 }, { "epoch": 0.9492753623188406, "grad_norm": 4.346974441858678, "learning_rate": 6.733206533001357e-09, "loss": 0.3256, "step": 5240 }, { "epoch": 0.9494565217391304, "grad_norm": 3.260933116258712, "learning_rate": 6.685305355292492e-09, "loss": 0.1912, "step": 5241 }, { "epoch": 0.9496376811594203, "grad_norm": 4.619320148338823, "learning_rate": 6.637574028170334e-09, "loss": 0.322, "step": 5242 }, { "epoch": 0.9498188405797101, "grad_norm": 3.1745478993421443, "learning_rate": 6.59001256806907e-09, "loss": 0.2245, "step": 5243 }, { "epoch": 0.95, "grad_norm": 3.987966966510248, "learning_rate": 6.542620991364322e-09, "loss": 0.2685, "step": 5244 }, { "epoch": 0.9501811594202898, "grad_norm": 5.347811060700223, "learning_rate": 6.4953993143732065e-09, "loss": 0.2819, "step": 5245 }, { "epoch": 0.9503623188405798, "grad_norm": 6.808749209868338, "learning_rate": 6.448347553354327e-09, "loss": 0.2953, "step": 5246 }, { "epoch": 0.9505434782608696, "grad_norm": 3.9586405492281047, "learning_rate": 6.401465724507949e-09, "loss": 0.2686, "step": 5247 }, { "epoch": 0.9507246376811594, "grad_norm": 7.088376003581387, "learning_rate": 6.354753843975602e-09, "loss": 0.3029, "step": 5248 }, { "epoch": 0.9509057971014493, "grad_norm": 3.913350536822113, "learning_rate": 6.308211927840479e-09, "loss": 0.2422, "step": 5249 }, { "epoch": 0.9510869565217391, "grad_norm": 4.05105707054459, "learning_rate": 6.261839992127149e-09, "loss": 0.2711, "step": 5250 }, { "epoch": 0.951268115942029, "grad_norm": 5.161519366178647, "learning_rate": 6.215638052801675e-09, "loss": 0.2412, "step": 5251 }, { "epoch": 0.9514492753623188, "grad_norm": 4.564154072342774, "learning_rate": 6.1696061257716095e-09, "loss": 0.3124, "step": 5252 }, { "epoch": 0.9516304347826087, "grad_norm": 4.266185334387586, "learning_rate": 6.123744226885941e-09, "loss": 0.2614, "step": 5253 }, { "epoch": 0.9518115942028985, "grad_norm": 4.5724293724969485, "learning_rate": 6.078052371935261e-09, "loss": 0.2836, "step": 5254 }, { "epoch": 0.9519927536231884, "grad_norm": 7.873857112998623, "learning_rate": 6.032530576651318e-09, "loss": 0.3303, "step": 5255 }, { "epoch": 0.9521739130434783, "grad_norm": 5.54396943809504, "learning_rate": 5.987178856707631e-09, "loss": 0.2902, "step": 5256 }, { "epoch": 0.9523550724637682, "grad_norm": 3.6892455901495556, "learning_rate": 5.9419972277188756e-09, "loss": 0.2596, "step": 5257 }, { "epoch": 0.952536231884058, "grad_norm": 5.363885630238747, "learning_rate": 5.896985705241386e-09, "loss": 0.2485, "step": 5258 }, { "epoch": 0.9527173913043478, "grad_norm": 4.966115630858673, "learning_rate": 5.8521443047728765e-09, "loss": 0.2482, "step": 5259 }, { "epoch": 0.9528985507246377, "grad_norm": 4.389787298202812, "learning_rate": 5.807473041752386e-09, "loss": 0.2873, "step": 5260 }, { "epoch": 0.9530797101449275, "grad_norm": 4.221893453112242, "learning_rate": 5.762971931560445e-09, "loss": 0.3035, "step": 5261 }, { "epoch": 0.9532608695652174, "grad_norm": 5.8723598191839965, "learning_rate": 5.7186409895189635e-09, "loss": 0.2631, "step": 5262 }, { "epoch": 0.9534420289855072, "grad_norm": 5.933176438288503, "learning_rate": 5.674480230891398e-09, "loss": 0.2467, "step": 5263 }, { "epoch": 0.9536231884057971, "grad_norm": 5.046570716069905, "learning_rate": 5.630489670882477e-09, "loss": 0.25, "step": 5264 }, { "epoch": 0.9538043478260869, "grad_norm": 4.12051128888061, "learning_rate": 5.58666932463836e-09, "loss": 0.2689, "step": 5265 }, { "epoch": 0.9539855072463768, "grad_norm": 4.015646219143732, "learning_rate": 5.54301920724648e-09, "loss": 0.2784, "step": 5266 }, { "epoch": 0.9541666666666667, "grad_norm": 7.32827706516708, "learning_rate": 5.499539333735925e-09, "loss": 0.2778, "step": 5267 }, { "epoch": 0.9543478260869566, "grad_norm": 5.0592087296365, "learning_rate": 5.456229719076944e-09, "loss": 0.2328, "step": 5268 }, { "epoch": 0.9545289855072464, "grad_norm": 4.173909970806722, "learning_rate": 5.4130903781812734e-09, "loss": 0.2777, "step": 5269 }, { "epoch": 0.9547101449275363, "grad_norm": 3.9241170686678974, "learning_rate": 5.370121325901977e-09, "loss": 0.2274, "step": 5270 }, { "epoch": 0.9548913043478261, "grad_norm": 3.7221589650692377, "learning_rate": 5.327322577033555e-09, "loss": 0.2596, "step": 5271 }, { "epoch": 0.9550724637681159, "grad_norm": 3.566411005545428, "learning_rate": 5.2846941463117745e-09, "loss": 0.2253, "step": 5272 }, { "epoch": 0.9552536231884058, "grad_norm": 5.995192068391682, "learning_rate": 5.242236048413729e-09, "loss": 0.3454, "step": 5273 }, { "epoch": 0.9554347826086956, "grad_norm": 3.3625303578624886, "learning_rate": 5.199948297958112e-09, "loss": 0.2513, "step": 5274 }, { "epoch": 0.9556159420289855, "grad_norm": 3.160437305148599, "learning_rate": 5.1578309095047234e-09, "loss": 0.227, "step": 5275 }, { "epoch": 0.9557971014492753, "grad_norm": 4.484128792821842, "learning_rate": 5.115883897554685e-09, "loss": 0.2813, "step": 5276 }, { "epoch": 0.9559782608695652, "grad_norm": 3.7893548393419008, "learning_rate": 5.074107276550665e-09, "loss": 0.2438, "step": 5277 }, { "epoch": 0.9561594202898551, "grad_norm": 5.28260982444716, "learning_rate": 5.032501060876493e-09, "loss": 0.2268, "step": 5278 }, { "epoch": 0.956340579710145, "grad_norm": 3.99639591974172, "learning_rate": 4.9910652648574856e-09, "loss": 0.2862, "step": 5279 }, { "epoch": 0.9565217391304348, "grad_norm": 7.454676001194611, "learning_rate": 4.9497999027600655e-09, "loss": 0.2739, "step": 5280 }, { "epoch": 0.9567028985507247, "grad_norm": 6.8225393795100615, "learning_rate": 4.9087049887920896e-09, "loss": 0.2651, "step": 5281 }, { "epoch": 0.9568840579710145, "grad_norm": 5.785219789631884, "learning_rate": 4.86778053710285e-09, "loss": 0.2949, "step": 5282 }, { "epoch": 0.9570652173913043, "grad_norm": 4.2164614909978475, "learning_rate": 4.827026561782743e-09, "loss": 0.2816, "step": 5283 }, { "epoch": 0.9572463768115942, "grad_norm": 3.7441271739791087, "learning_rate": 4.7864430768635445e-09, "loss": 0.2578, "step": 5284 }, { "epoch": 0.957427536231884, "grad_norm": 7.34184115492016, "learning_rate": 4.746030096318354e-09, "loss": 0.2874, "step": 5285 }, { "epoch": 0.9576086956521739, "grad_norm": 3.3537842398614575, "learning_rate": 4.705787634061598e-09, "loss": 0.2207, "step": 5286 }, { "epoch": 0.9577898550724637, "grad_norm": 6.224869312879529, "learning_rate": 4.665715703948914e-09, "loss": 0.2655, "step": 5287 }, { "epoch": 0.9579710144927536, "grad_norm": 4.584916125920998, "learning_rate": 4.62581431977721e-09, "loss": 0.2872, "step": 5288 }, { "epoch": 0.9581521739130435, "grad_norm": 3.8269892319901433, "learning_rate": 4.586083495284776e-09, "loss": 0.227, "step": 5289 }, { "epoch": 0.9583333333333334, "grad_norm": 3.993504772745209, "learning_rate": 4.546523244151168e-09, "loss": 0.3121, "step": 5290 }, { "epoch": 0.9585144927536232, "grad_norm": 4.3220758292666455, "learning_rate": 4.507133579997046e-09, "loss": 0.2895, "step": 5291 }, { "epoch": 0.9586956521739131, "grad_norm": 5.337093426651146, "learning_rate": 4.467914516384619e-09, "loss": 0.218, "step": 5292 }, { "epoch": 0.9588768115942029, "grad_norm": 6.41035892268632, "learning_rate": 4.4288660668170275e-09, "loss": 0.2631, "step": 5293 }, { "epoch": 0.9590579710144927, "grad_norm": 5.492670490114389, "learning_rate": 4.389988244738907e-09, "loss": 0.2876, "step": 5294 }, { "epoch": 0.9592391304347826, "grad_norm": 8.88771176249773, "learning_rate": 4.35128106353616e-09, "loss": 0.305, "step": 5295 }, { "epoch": 0.9594202898550724, "grad_norm": 5.199139540470069, "learning_rate": 4.312744536535684e-09, "loss": 0.2639, "step": 5296 }, { "epoch": 0.9596014492753623, "grad_norm": 4.725924075487749, "learning_rate": 4.274378677005919e-09, "loss": 0.2927, "step": 5297 }, { "epoch": 0.9597826086956521, "grad_norm": 4.380283488850347, "learning_rate": 4.236183498156409e-09, "loss": 0.263, "step": 5298 }, { "epoch": 0.9599637681159421, "grad_norm": 5.686965306264911, "learning_rate": 4.198159013137858e-09, "loss": 0.2609, "step": 5299 }, { "epoch": 0.9601449275362319, "grad_norm": 6.897693185979009, "learning_rate": 4.16030523504235e-09, "loss": 0.2424, "step": 5300 }, { "epoch": 0.9601449275362319, "eval_loss": 0.265687495470047, "eval_runtime": 9.7821, "eval_samples_per_second": 51.114, "eval_steps_per_second": 0.102, "step": 5300 }, { "epoch": 0.9603260869565218, "grad_norm": 3.92293501921209, "learning_rate": 4.122622176903012e-09, "loss": 0.2617, "step": 5301 }, { "epoch": 0.9605072463768116, "grad_norm": 3.9961230210837373, "learning_rate": 4.085109851694468e-09, "loss": 0.2824, "step": 5302 }, { "epoch": 0.9606884057971015, "grad_norm": 4.603723190223892, "learning_rate": 4.047768272332275e-09, "loss": 0.3134, "step": 5303 }, { "epoch": 0.9608695652173913, "grad_norm": 3.5847924378981872, "learning_rate": 4.010597451673315e-09, "loss": 0.242, "step": 5304 }, { "epoch": 0.9610507246376812, "grad_norm": 4.470248813825341, "learning_rate": 3.9735974025156825e-09, "loss": 0.2637, "step": 5305 }, { "epoch": 0.961231884057971, "grad_norm": 5.604232429894755, "learning_rate": 3.9367681375986895e-09, "loss": 0.2623, "step": 5306 }, { "epoch": 0.9614130434782608, "grad_norm": 9.454709939249124, "learning_rate": 3.900109669602858e-09, "loss": 0.3171, "step": 5307 }, { "epoch": 0.9615942028985507, "grad_norm": 3.9611452214911633, "learning_rate": 3.863622011149814e-09, "loss": 0.2791, "step": 5308 }, { "epoch": 0.9617753623188405, "grad_norm": 3.7451395602609288, "learning_rate": 3.827305174802453e-09, "loss": 0.2532, "step": 5309 }, { "epoch": 0.9619565217391305, "grad_norm": 8.570247117055857, "learning_rate": 3.791159173064829e-09, "loss": 0.3173, "step": 5310 }, { "epoch": 0.9621376811594203, "grad_norm": 4.029694462241031, "learning_rate": 3.755184018382207e-09, "loss": 0.2695, "step": 5311 }, { "epoch": 0.9623188405797102, "grad_norm": 6.757196905855348, "learning_rate": 3.7193797231409587e-09, "loss": 0.325, "step": 5312 }, { "epoch": 0.9625, "grad_norm": 5.285739203511072, "learning_rate": 3.683746299668722e-09, "loss": 0.2545, "step": 5313 }, { "epoch": 0.9626811594202899, "grad_norm": 7.099790587122954, "learning_rate": 3.648283760234239e-09, "loss": 0.3342, "step": 5314 }, { "epoch": 0.9628623188405797, "grad_norm": 6.877196420782656, "learning_rate": 3.612992117047409e-09, "loss": 0.2479, "step": 5315 }, { "epoch": 0.9630434782608696, "grad_norm": 6.331353055333265, "learning_rate": 3.5778713822592897e-09, "loss": 0.3105, "step": 5316 }, { "epoch": 0.9632246376811594, "grad_norm": 4.385160533531568, "learning_rate": 3.5429215679622093e-09, "loss": 0.286, "step": 5317 }, { "epoch": 0.9634057971014492, "grad_norm": 3.92321524492969, "learning_rate": 3.508142686189486e-09, "loss": 0.2476, "step": 5318 }, { "epoch": 0.9635869565217391, "grad_norm": 6.340009409255978, "learning_rate": 3.4735347489156518e-09, "loss": 0.2918, "step": 5319 }, { "epoch": 0.9637681159420289, "grad_norm": 10.430866106595952, "learning_rate": 3.439097768056398e-09, "loss": 0.2798, "step": 5320 }, { "epoch": 0.9639492753623189, "grad_norm": 4.018759683538263, "learning_rate": 3.404831755468518e-09, "loss": 0.2095, "step": 5321 }, { "epoch": 0.9641304347826087, "grad_norm": 4.408173689557219, "learning_rate": 3.370736722949963e-09, "loss": 0.2395, "step": 5322 }, { "epoch": 0.9643115942028986, "grad_norm": 5.462553308557146, "learning_rate": 3.3368126822398977e-09, "loss": 0.2967, "step": 5323 }, { "epoch": 0.9644927536231884, "grad_norm": 4.230050254888981, "learning_rate": 3.3030596450184246e-09, "loss": 0.2747, "step": 5324 }, { "epoch": 0.9646739130434783, "grad_norm": 4.915607063222736, "learning_rate": 3.269477622906913e-09, "loss": 0.271, "step": 5325 }, { "epoch": 0.9648550724637681, "grad_norm": 4.452257379206557, "learning_rate": 3.2360666274678372e-09, "loss": 0.2669, "step": 5326 }, { "epoch": 0.965036231884058, "grad_norm": 4.891015437746063, "learning_rate": 3.202826670204717e-09, "loss": 0.3355, "step": 5327 }, { "epoch": 0.9652173913043478, "grad_norm": 5.810263111482196, "learning_rate": 3.169757762562231e-09, "loss": 0.2556, "step": 5328 }, { "epoch": 0.9653985507246376, "grad_norm": 4.932339215750199, "learning_rate": 3.136859915926271e-09, "loss": 0.3235, "step": 5329 }, { "epoch": 0.9655797101449275, "grad_norm": 6.889390842868626, "learning_rate": 3.104133141623555e-09, "loss": 0.3049, "step": 5330 }, { "epoch": 0.9657608695652173, "grad_norm": 3.9181870647126145, "learning_rate": 3.0715774509221247e-09, "loss": 0.2643, "step": 5331 }, { "epoch": 0.9659420289855073, "grad_norm": 4.153760902456737, "learning_rate": 3.03919285503107e-09, "loss": 0.2556, "step": 5332 }, { "epoch": 0.9661231884057971, "grad_norm": 5.610690194105614, "learning_rate": 3.0069793651005813e-09, "loss": 0.3047, "step": 5333 }, { "epoch": 0.966304347826087, "grad_norm": 4.3715740666469705, "learning_rate": 2.974936992221955e-09, "loss": 0.2967, "step": 5334 }, { "epoch": 0.9664855072463768, "grad_norm": 4.299056585111778, "learning_rate": 2.94306574742742e-09, "loss": 0.2882, "step": 5335 }, { "epoch": 0.9666666666666667, "grad_norm": 3.626239209744918, "learning_rate": 2.9113656416904777e-09, "loss": 0.251, "step": 5336 }, { "epoch": 0.9668478260869565, "grad_norm": 3.8339972300395266, "learning_rate": 2.879836685925563e-09, "loss": 0.258, "step": 5337 }, { "epoch": 0.9670289855072464, "grad_norm": 3.406643574058657, "learning_rate": 2.848478890988326e-09, "loss": 0.2384, "step": 5338 }, { "epoch": 0.9672101449275362, "grad_norm": 4.297831596776021, "learning_rate": 2.817292267675353e-09, "loss": 0.3227, "step": 5339 }, { "epoch": 0.967391304347826, "grad_norm": 4.1032232118488885, "learning_rate": 2.786276826724332e-09, "loss": 0.2624, "step": 5340 }, { "epoch": 0.9675724637681159, "grad_norm": 5.652900636528436, "learning_rate": 2.7554325788140544e-09, "loss": 0.2628, "step": 5341 }, { "epoch": 0.9677536231884057, "grad_norm": 5.903433931780544, "learning_rate": 2.724759534564358e-09, "loss": 0.2658, "step": 5342 }, { "epoch": 0.9679347826086957, "grad_norm": 4.28748291226025, "learning_rate": 2.694257704536018e-09, "loss": 0.3366, "step": 5343 }, { "epoch": 0.9681159420289855, "grad_norm": 5.836648975444641, "learning_rate": 2.663927099231078e-09, "loss": 0.2606, "step": 5344 }, { "epoch": 0.9682971014492754, "grad_norm": 3.9110715218946335, "learning_rate": 2.6337677290925176e-09, "loss": 0.2691, "step": 5345 }, { "epoch": 0.9684782608695652, "grad_norm": 4.166385712527874, "learning_rate": 2.6037796045042525e-09, "loss": 0.2972, "step": 5346 }, { "epoch": 0.9686594202898551, "grad_norm": 6.179035052173582, "learning_rate": 2.5739627357913574e-09, "loss": 0.2561, "step": 5347 }, { "epoch": 0.9688405797101449, "grad_norm": 3.8827170465724246, "learning_rate": 2.5443171332200086e-09, "loss": 0.2542, "step": 5348 }, { "epoch": 0.9690217391304348, "grad_norm": 4.870344068581291, "learning_rate": 2.514842806997208e-09, "loss": 0.2982, "step": 5349 }, { "epoch": 0.9692028985507246, "grad_norm": 4.416705638710597, "learning_rate": 2.4855397672711718e-09, "loss": 0.2463, "step": 5350 }, { "epoch": 0.9693840579710145, "grad_norm": 4.053104386533474, "learning_rate": 2.4564080241311067e-09, "loss": 0.2722, "step": 5351 }, { "epoch": 0.9695652173913043, "grad_norm": 4.817134153346506, "learning_rate": 2.4274475876071565e-09, "loss": 0.2784, "step": 5352 }, { "epoch": 0.9697463768115943, "grad_norm": 4.628430218577455, "learning_rate": 2.3986584676705114e-09, "loss": 0.2639, "step": 5353 }, { "epoch": 0.9699275362318841, "grad_norm": 4.62442616196145, "learning_rate": 2.3700406742334646e-09, "loss": 0.3144, "step": 5354 }, { "epoch": 0.970108695652174, "grad_norm": 5.794690268446705, "learning_rate": 2.3415942171492454e-09, "loss": 0.2912, "step": 5355 }, { "epoch": 0.9702898550724638, "grad_norm": 8.525358047871158, "learning_rate": 2.313319106212075e-09, "loss": 0.3158, "step": 5356 }, { "epoch": 0.9704710144927536, "grad_norm": 4.286941355940873, "learning_rate": 2.2852153511572214e-09, "loss": 0.3297, "step": 5357 }, { "epoch": 0.9706521739130435, "grad_norm": 3.5505002243232013, "learning_rate": 2.257282961661e-09, "loss": 0.2566, "step": 5358 }, { "epoch": 0.9708333333333333, "grad_norm": 7.7463865282986175, "learning_rate": 2.229521947340496e-09, "loss": 0.2687, "step": 5359 }, { "epoch": 0.9710144927536232, "grad_norm": 6.979732451728607, "learning_rate": 2.2019323177541737e-09, "loss": 0.277, "step": 5360 }, { "epoch": 0.971195652173913, "grad_norm": 4.045830369807944, "learning_rate": 2.174514082401102e-09, "loss": 0.3198, "step": 5361 }, { "epoch": 0.9713768115942029, "grad_norm": 5.614473530109136, "learning_rate": 2.147267250721674e-09, "loss": 0.3076, "step": 5362 }, { "epoch": 0.9715579710144927, "grad_norm": 5.466830402189616, "learning_rate": 2.1201918320969405e-09, "loss": 0.3121, "step": 5363 }, { "epoch": 0.9717391304347827, "grad_norm": 4.607817489277193, "learning_rate": 2.093287835849167e-09, "loss": 0.2664, "step": 5364 }, { "epoch": 0.9719202898550725, "grad_norm": 4.5466711311238726, "learning_rate": 2.066555271241499e-09, "loss": 0.3068, "step": 5365 }, { "epoch": 0.9721014492753624, "grad_norm": 4.844353536563658, "learning_rate": 2.03999414747813e-09, "loss": 0.3075, "step": 5366 }, { "epoch": 0.9722826086956522, "grad_norm": 4.0221347980556885, "learning_rate": 2.0136044737041892e-09, "loss": 0.2792, "step": 5367 }, { "epoch": 0.972463768115942, "grad_norm": 3.830436147135888, "learning_rate": 1.9873862590056855e-09, "loss": 0.2385, "step": 5368 }, { "epoch": 0.9726449275362319, "grad_norm": 6.055466069082649, "learning_rate": 1.9613395124097875e-09, "loss": 0.2952, "step": 5369 }, { "epoch": 0.9728260869565217, "grad_norm": 4.65234318075288, "learning_rate": 1.935464242884377e-09, "loss": 0.2841, "step": 5370 }, { "epoch": 0.9730072463768116, "grad_norm": 3.5272328187023114, "learning_rate": 1.9097604593385498e-09, "loss": 0.2579, "step": 5371 }, { "epoch": 0.9731884057971014, "grad_norm": 5.0692889021067975, "learning_rate": 1.884228170622226e-09, "loss": 0.2742, "step": 5372 }, { "epoch": 0.9733695652173913, "grad_norm": 10.365122437017554, "learning_rate": 1.8588673855262083e-09, "loss": 0.3455, "step": 5373 }, { "epoch": 0.9735507246376811, "grad_norm": 3.8885618036197633, "learning_rate": 1.8336781127824551e-09, "loss": 0.1974, "step": 5374 }, { "epoch": 0.9737318840579711, "grad_norm": 5.205476668740088, "learning_rate": 1.808660361063641e-09, "loss": 0.2156, "step": 5375 }, { "epoch": 0.9739130434782609, "grad_norm": 4.378081512286682, "learning_rate": 1.7838141389835415e-09, "loss": 0.3248, "step": 5376 }, { "epoch": 0.9740942028985508, "grad_norm": 3.506761683127004, "learning_rate": 1.7591394550968142e-09, "loss": 0.2646, "step": 5377 }, { "epoch": 0.9742753623188406, "grad_norm": 15.107223794431473, "learning_rate": 1.734636317899163e-09, "loss": 0.3262, "step": 5378 }, { "epoch": 0.9744565217391304, "grad_norm": 7.41229487072595, "learning_rate": 1.7103047358270062e-09, "loss": 0.2687, "step": 5379 }, { "epoch": 0.9746376811594203, "grad_norm": 7.698617950518482, "learning_rate": 1.6861447172578647e-09, "loss": 0.3011, "step": 5380 }, { "epoch": 0.9748188405797101, "grad_norm": 6.716326450497823, "learning_rate": 1.6621562705101954e-09, "loss": 0.2363, "step": 5381 }, { "epoch": 0.975, "grad_norm": 3.710004508605813, "learning_rate": 1.6383394038432252e-09, "loss": 0.2564, "step": 5382 }, { "epoch": 0.9751811594202898, "grad_norm": 3.781146427523997, "learning_rate": 1.6146941254573943e-09, "loss": 0.2713, "step": 5383 }, { "epoch": 0.9753623188405797, "grad_norm": 4.525821925330528, "learning_rate": 1.5912204434936905e-09, "loss": 0.2687, "step": 5384 }, { "epoch": 0.9755434782608695, "grad_norm": 4.031518798232069, "learning_rate": 1.5679183660343153e-09, "loss": 0.308, "step": 5385 }, { "epoch": 0.9757246376811595, "grad_norm": 5.415305598671766, "learning_rate": 1.5447879011022957e-09, "loss": 0.2961, "step": 5386 }, { "epoch": 0.9759057971014493, "grad_norm": 3.0763232948411168, "learning_rate": 1.5218290566614834e-09, "loss": 0.1875, "step": 5387 }, { "epoch": 0.9760869565217392, "grad_norm": 4.4429645427529145, "learning_rate": 1.4990418406168327e-09, "loss": 0.2615, "step": 5388 }, { "epoch": 0.976268115942029, "grad_norm": 3.806457330306627, "learning_rate": 1.476426260814012e-09, "loss": 0.2879, "step": 5389 }, { "epoch": 0.9764492753623188, "grad_norm": 6.668742748691466, "learning_rate": 1.4539823250396266e-09, "loss": 0.2675, "step": 5390 }, { "epoch": 0.9766304347826087, "grad_norm": 3.6137435470625086, "learning_rate": 1.431710041021328e-09, "loss": 0.2544, "step": 5391 }, { "epoch": 0.9768115942028985, "grad_norm": 5.294527710341784, "learning_rate": 1.409609416427482e-09, "loss": 0.2939, "step": 5392 }, { "epoch": 0.9769927536231884, "grad_norm": 6.247914122452138, "learning_rate": 1.3876804588675572e-09, "loss": 0.2603, "step": 5393 }, { "epoch": 0.9771739130434782, "grad_norm": 5.003101428863767, "learning_rate": 1.3659231758916812e-09, "loss": 0.2929, "step": 5394 }, { "epoch": 0.9773550724637681, "grad_norm": 4.841999496179696, "learning_rate": 1.344337574991028e-09, "loss": 0.2664, "step": 5395 }, { "epoch": 0.9775362318840579, "grad_norm": 5.545665157081709, "learning_rate": 1.3229236635976527e-09, "loss": 0.3151, "step": 5396 }, { "epoch": 0.9777173913043479, "grad_norm": 4.892995303869446, "learning_rate": 1.3016814490844352e-09, "loss": 0.3325, "step": 5397 }, { "epoch": 0.9778985507246377, "grad_norm": 4.536290409219323, "learning_rate": 1.2806109387651364e-09, "loss": 0.2897, "step": 5398 }, { "epoch": 0.9780797101449276, "grad_norm": 3.461072078436241, "learning_rate": 1.2597121398945643e-09, "loss": 0.2342, "step": 5399 }, { "epoch": 0.9782608695652174, "grad_norm": 3.7211699810280576, "learning_rate": 1.238985059668074e-09, "loss": 0.2442, "step": 5400 }, { "epoch": 0.9782608695652174, "eval_loss": 0.2665937542915344, "eval_runtime": 9.7749, "eval_samples_per_second": 51.152, "eval_steps_per_second": 0.102, "step": 5400 }, { "epoch": 0.9784420289855073, "grad_norm": 5.422118211327256, "learning_rate": 1.2184297052222902e-09, "loss": 0.2554, "step": 5401 }, { "epoch": 0.9786231884057971, "grad_norm": 5.0091135585910465, "learning_rate": 1.1980460836343852e-09, "loss": 0.3259, "step": 5402 }, { "epoch": 0.9788043478260869, "grad_norm": 5.541540763611145, "learning_rate": 1.177834201922634e-09, "loss": 0.2812, "step": 5403 }, { "epoch": 0.9789855072463768, "grad_norm": 3.9651602429877033, "learning_rate": 1.1577940670459696e-09, "loss": 0.2695, "step": 5404 }, { "epoch": 0.9791666666666666, "grad_norm": 8.697458753403248, "learning_rate": 1.1379256859044285e-09, "loss": 0.2721, "step": 5405 }, { "epoch": 0.9793478260869565, "grad_norm": 7.292452380824839, "learning_rate": 1.118229065338705e-09, "loss": 0.3269, "step": 5406 }, { "epoch": 0.9795289855072464, "grad_norm": 4.317707391023914, "learning_rate": 1.0987042121304856e-09, "loss": 0.3478, "step": 5407 }, { "epoch": 0.9797101449275363, "grad_norm": 3.8484598246853974, "learning_rate": 1.0793511330022265e-09, "loss": 0.2611, "step": 5408 }, { "epoch": 0.9798913043478261, "grad_norm": 4.572601236962431, "learning_rate": 1.0601698346173194e-09, "loss": 0.2671, "step": 5409 }, { "epoch": 0.980072463768116, "grad_norm": 4.86510860319875, "learning_rate": 1.0411603235799816e-09, "loss": 0.2558, "step": 5410 }, { "epoch": 0.9802536231884058, "grad_norm": 6.706864546338535, "learning_rate": 1.0223226064352553e-09, "loss": 0.2526, "step": 5411 }, { "epoch": 0.9804347826086957, "grad_norm": 4.133209617955311, "learning_rate": 1.0036566896690634e-09, "loss": 0.2185, "step": 5412 }, { "epoch": 0.9806159420289855, "grad_norm": 4.5515383124251425, "learning_rate": 9.851625797080988e-10, "loss": 0.2473, "step": 5413 }, { "epoch": 0.9807971014492753, "grad_norm": 3.361838207279218, "learning_rate": 9.668402829201005e-10, "loss": 0.2391, "step": 5414 }, { "epoch": 0.9809782608695652, "grad_norm": 3.889868020065799, "learning_rate": 9.486898056134674e-10, "loss": 0.269, "step": 5415 }, { "epoch": 0.981159420289855, "grad_norm": 3.7282981107979767, "learning_rate": 9.307111540374779e-10, "loss": 0.2563, "step": 5416 }, { "epoch": 0.9813405797101449, "grad_norm": 3.8754572835249577, "learning_rate": 9.129043343822917e-10, "loss": 0.2398, "step": 5417 }, { "epoch": 0.9815217391304348, "grad_norm": 4.007719066054778, "learning_rate": 8.952693527788379e-10, "loss": 0.274, "step": 5418 }, { "epoch": 0.9817028985507247, "grad_norm": 6.050332604115417, "learning_rate": 8.778062152989818e-10, "loss": 0.2845, "step": 5419 }, { "epoch": 0.9818840579710145, "grad_norm": 7.580077735401974, "learning_rate": 8.605149279553026e-10, "loss": 0.2422, "step": 5420 }, { "epoch": 0.9820652173913044, "grad_norm": 4.402840921667809, "learning_rate": 8.433954967013712e-10, "loss": 0.2901, "step": 5421 }, { "epoch": 0.9822463768115942, "grad_norm": 3.673598356625748, "learning_rate": 8.264479274313618e-10, "loss": 0.2521, "step": 5422 }, { "epoch": 0.9824275362318841, "grad_norm": 6.365360275780007, "learning_rate": 8.096722259804956e-10, "loss": 0.2447, "step": 5423 }, { "epoch": 0.9826086956521739, "grad_norm": 4.03821607486304, "learning_rate": 7.930683981246522e-10, "loss": 0.3603, "step": 5424 }, { "epoch": 0.9827898550724637, "grad_norm": 6.739572399971401, "learning_rate": 7.766364495807032e-10, "loss": 0.3262, "step": 5425 }, { "epoch": 0.9829710144927536, "grad_norm": 4.571141720022084, "learning_rate": 7.603763860061785e-10, "loss": 0.2794, "step": 5426 }, { "epoch": 0.9831521739130434, "grad_norm": 5.065904520178715, "learning_rate": 7.442882129994887e-10, "loss": 0.2453, "step": 5427 }, { "epoch": 0.9833333333333333, "grad_norm": 4.855145685520532, "learning_rate": 7.283719360999252e-10, "loss": 0.2557, "step": 5428 }, { "epoch": 0.9835144927536232, "grad_norm": 4.441041132766409, "learning_rate": 7.126275607874932e-10, "loss": 0.3177, "step": 5429 }, { "epoch": 0.9836956521739131, "grad_norm": 4.179488390084364, "learning_rate": 6.970550924830232e-10, "loss": 0.2663, "step": 5430 }, { "epoch": 0.9838768115942029, "grad_norm": 4.465427232747172, "learning_rate": 6.816545365482818e-10, "loss": 0.2136, "step": 5431 }, { "epoch": 0.9840579710144928, "grad_norm": 4.456559508536437, "learning_rate": 6.664258982856941e-10, "loss": 0.2659, "step": 5432 }, { "epoch": 0.9842391304347826, "grad_norm": 7.829687893352149, "learning_rate": 6.513691829385659e-10, "loss": 0.3099, "step": 5433 }, { "epoch": 0.9844202898550725, "grad_norm": 4.70524836610605, "learning_rate": 6.364843956909727e-10, "loss": 0.2732, "step": 5434 }, { "epoch": 0.9846014492753623, "grad_norm": 4.064049887970608, "learning_rate": 6.217715416678149e-10, "loss": 0.2644, "step": 5435 }, { "epoch": 0.9847826086956522, "grad_norm": 4.278682606974456, "learning_rate": 6.072306259348736e-10, "loss": 0.2378, "step": 5436 }, { "epoch": 0.984963768115942, "grad_norm": 5.105688634048712, "learning_rate": 5.928616534985331e-10, "loss": 0.3339, "step": 5437 }, { "epoch": 0.9851449275362318, "grad_norm": 5.825569143195145, "learning_rate": 5.786646293062247e-10, "loss": 0.3376, "step": 5438 }, { "epoch": 0.9853260869565217, "grad_norm": 4.949625180464565, "learning_rate": 5.646395582459829e-10, "loss": 0.2317, "step": 5439 }, { "epoch": 0.9855072463768116, "grad_norm": 6.800571368945507, "learning_rate": 5.507864451467226e-10, "loss": 0.3033, "step": 5440 }, { "epoch": 0.9856884057971015, "grad_norm": 4.629772350923752, "learning_rate": 5.37105294778073e-10, "loss": 0.2796, "step": 5441 }, { "epoch": 0.9858695652173913, "grad_norm": 8.381900124221994, "learning_rate": 5.235961118506549e-10, "loss": 0.3246, "step": 5442 }, { "epoch": 0.9860507246376812, "grad_norm": 4.041833176217775, "learning_rate": 5.102589010155811e-10, "loss": 0.3084, "step": 5443 }, { "epoch": 0.986231884057971, "grad_norm": 4.510579425868753, "learning_rate": 4.970936668650672e-10, "loss": 0.2239, "step": 5444 }, { "epoch": 0.9864130434782609, "grad_norm": 5.226292915931263, "learning_rate": 4.84100413931876e-10, "loss": 0.2804, "step": 5445 }, { "epoch": 0.9865942028985507, "grad_norm": 6.873202770859247, "learning_rate": 4.712791466896515e-10, "loss": 0.2946, "step": 5446 }, { "epoch": 0.9867753623188406, "grad_norm": 5.264889708568753, "learning_rate": 4.5862986955286234e-10, "loss": 0.2559, "step": 5447 }, { "epoch": 0.9869565217391304, "grad_norm": 4.2741344049337435, "learning_rate": 4.4615258687669134e-10, "loss": 0.2449, "step": 5448 }, { "epoch": 0.9871376811594202, "grad_norm": 4.183967611620681, "learning_rate": 4.3384730295709105e-10, "loss": 0.3258, "step": 5449 }, { "epoch": 0.9873188405797102, "grad_norm": 4.638276899178078, "learning_rate": 4.2171402203083904e-10, "loss": 0.2821, "step": 5450 }, { "epoch": 0.9875, "grad_norm": 8.410577616809684, "learning_rate": 4.0975274827553807e-10, "loss": 0.2704, "step": 5451 }, { "epoch": 0.9876811594202899, "grad_norm": 4.754367755517139, "learning_rate": 3.979634858094494e-10, "loss": 0.3379, "step": 5452 }, { "epoch": 0.9878623188405797, "grad_norm": 4.6351726551871515, "learning_rate": 3.8634623869171487e-10, "loss": 0.2816, "step": 5453 }, { "epoch": 0.9880434782608696, "grad_norm": 3.6255123351982994, "learning_rate": 3.749010109221906e-10, "loss": 0.2753, "step": 5454 }, { "epoch": 0.9882246376811594, "grad_norm": 3.9850490392522833, "learning_rate": 3.6362780644150217e-10, "loss": 0.2491, "step": 5455 }, { "epoch": 0.9884057971014493, "grad_norm": 4.8421733803156135, "learning_rate": 3.525266291311002e-10, "loss": 0.2438, "step": 5456 }, { "epoch": 0.9885869565217391, "grad_norm": 5.628931566659291, "learning_rate": 3.415974828132051e-10, "loss": 0.3735, "step": 5457 }, { "epoch": 0.988768115942029, "grad_norm": 5.429817581893199, "learning_rate": 3.3084037125064023e-10, "loss": 0.3151, "step": 5458 }, { "epoch": 0.9889492753623188, "grad_norm": 7.646579950171526, "learning_rate": 3.2025529814727615e-10, "loss": 0.2749, "step": 5459 }, { "epoch": 0.9891304347826086, "grad_norm": 6.334711858303099, "learning_rate": 3.098422671475309e-10, "loss": 0.3431, "step": 5460 }, { "epoch": 0.9893115942028986, "grad_norm": 5.414762033517194, "learning_rate": 2.996012818367033e-10, "loss": 0.2212, "step": 5461 }, { "epoch": 0.9894927536231884, "grad_norm": 4.086616243696165, "learning_rate": 2.8953234574075056e-10, "loss": 0.3023, "step": 5462 }, { "epoch": 0.9896739130434783, "grad_norm": 3.8956174718852687, "learning_rate": 2.7963546232645516e-10, "loss": 0.2601, "step": 5463 }, { "epoch": 0.9898550724637681, "grad_norm": 5.930213521427466, "learning_rate": 2.6991063500142465e-10, "loss": 0.3049, "step": 5464 }, { "epoch": 0.990036231884058, "grad_norm": 8.909291520141176, "learning_rate": 2.6035786711398053e-10, "loss": 0.2707, "step": 5465 }, { "epoch": 0.9902173913043478, "grad_norm": 4.854253565918032, "learning_rate": 2.5097716195310315e-10, "loss": 0.3038, "step": 5466 }, { "epoch": 0.9903985507246377, "grad_norm": 4.937258246366047, "learning_rate": 2.4176852274865325e-10, "loss": 0.3134, "step": 5467 }, { "epoch": 0.9905797101449275, "grad_norm": 6.691044884794042, "learning_rate": 2.327319526711502e-10, "loss": 0.2869, "step": 5468 }, { "epoch": 0.9907608695652174, "grad_norm": 4.093884814767755, "learning_rate": 2.2386745483204962e-10, "loss": 0.2447, "step": 5469 }, { "epoch": 0.9909420289855072, "grad_norm": 4.250586372863622, "learning_rate": 2.1517503228335455e-10, "loss": 0.3075, "step": 5470 }, { "epoch": 0.991123188405797, "grad_norm": 5.539273136072462, "learning_rate": 2.0665468801794873e-10, "loss": 0.3143, "step": 5471 }, { "epoch": 0.991304347826087, "grad_norm": 4.238100541488654, "learning_rate": 1.9830642496937445e-10, "loss": 0.2263, "step": 5472 }, { "epoch": 0.9914855072463769, "grad_norm": 4.095263786767813, "learning_rate": 1.9013024601199913e-10, "loss": 0.2975, "step": 5473 }, { "epoch": 0.9916666666666667, "grad_norm": 5.329225075674741, "learning_rate": 1.8212615396095975e-10, "loss": 0.2876, "step": 5474 }, { "epoch": 0.9918478260869565, "grad_norm": 4.959822526288037, "learning_rate": 1.7429415157205195e-10, "loss": 0.3141, "step": 5475 }, { "epoch": 0.9920289855072464, "grad_norm": 5.153386631951864, "learning_rate": 1.6663424154189642e-10, "loss": 0.2689, "step": 5476 }, { "epoch": 0.9922101449275362, "grad_norm": 7.4229996497174096, "learning_rate": 1.5914642650782795e-10, "loss": 0.3066, "step": 5477 }, { "epoch": 0.9923913043478261, "grad_norm": 5.439655998064373, "learning_rate": 1.5183070904795093e-10, "loss": 0.2675, "step": 5478 }, { "epoch": 0.9925724637681159, "grad_norm": 4.203423533849288, "learning_rate": 1.4468709168108384e-10, "loss": 0.3242, "step": 5479 }, { "epoch": 0.9927536231884058, "grad_norm": 5.179510812789131, "learning_rate": 1.3771557686681479e-10, "loss": 0.3707, "step": 5480 }, { "epoch": 0.9929347826086956, "grad_norm": 4.930300349517559, "learning_rate": 1.3091616700544594e-10, "loss": 0.3034, "step": 5481 }, { "epoch": 0.9931159420289855, "grad_norm": 7.220794549135853, "learning_rate": 1.2428886443810461e-10, "loss": 0.2843, "step": 5482 }, { "epoch": 0.9932971014492754, "grad_norm": 4.214810346606237, "learning_rate": 1.1783367144652112e-10, "loss": 0.298, "step": 5483 }, { "epoch": 0.9934782608695653, "grad_norm": 5.508075277282163, "learning_rate": 1.1155059025336199e-10, "loss": 0.2578, "step": 5484 }, { "epoch": 0.9936594202898551, "grad_norm": 3.614860085170085, "learning_rate": 1.0543962302184129e-10, "loss": 0.2495, "step": 5485 }, { "epoch": 0.993840579710145, "grad_norm": 3.6827084359797553, "learning_rate": 9.950077185594263e-11, "loss": 0.24, "step": 5486 }, { "epoch": 0.9940217391304348, "grad_norm": 7.467738509768066, "learning_rate": 9.373403880058584e-11, "loss": 0.2676, "step": 5487 }, { "epoch": 0.9942028985507246, "grad_norm": 5.411381852990324, "learning_rate": 8.813942584118273e-11, "loss": 0.3359, "step": 5488 }, { "epoch": 0.9943840579710145, "grad_norm": 7.004289629299719, "learning_rate": 8.271693490397025e-11, "loss": 0.3007, "step": 5489 }, { "epoch": 0.9945652173913043, "grad_norm": 3.7988064531036145, "learning_rate": 7.746656785601046e-11, "loss": 0.2605, "step": 5490 }, { "epoch": 0.9947463768115942, "grad_norm": 4.464541104464458, "learning_rate": 7.238832650502402e-11, "loss": 0.3206, "step": 5491 }, { "epoch": 0.994927536231884, "grad_norm": 4.6955437267846785, "learning_rate": 6.748221259939014e-11, "loss": 0.2586, "step": 5492 }, { "epoch": 0.9951086956521739, "grad_norm": 6.495103498924206, "learning_rate": 6.274822782836864e-11, "loss": 0.3049, "step": 5493 }, { "epoch": 0.9952898550724638, "grad_norm": 3.3438673053709094, "learning_rate": 5.8186373821877966e-11, "loss": 0.1991, "step": 5494 }, { "epoch": 0.9954710144927537, "grad_norm": 3.979301586623832, "learning_rate": 5.3796652150606135e-11, "loss": 0.2931, "step": 5495 }, { "epoch": 0.9956521739130435, "grad_norm": 5.001210512934167, "learning_rate": 4.9579064325955265e-11, "loss": 0.2877, "step": 5496 }, { "epoch": 0.9958333333333333, "grad_norm": 4.259152831377175, "learning_rate": 4.553361180004156e-11, "loss": 0.2743, "step": 5497 }, { "epoch": 0.9960144927536232, "grad_norm": 8.745702706044188, "learning_rate": 4.1660295965750824e-11, "loss": 0.2693, "step": 5498 }, { "epoch": 0.996195652173913, "grad_norm": 3.377521619414664, "learning_rate": 3.795911815662744e-11, "loss": 0.2079, "step": 5499 }, { "epoch": 0.9963768115942029, "grad_norm": 9.176887964146367, "learning_rate": 3.443007964709643e-11, "loss": 0.2444, "step": 5500 }, { "epoch": 0.9963768115942029, "eval_loss": 0.26609376072883606, "eval_runtime": 9.9093, "eval_samples_per_second": 50.458, "eval_steps_per_second": 0.101, "step": 5500 }, { "epoch": 0.9965579710144927, "grad_norm": 10.880353098499464, "learning_rate": 3.1073181652130354e-11, "loss": 0.2796, "step": 5501 }, { "epoch": 0.9967391304347826, "grad_norm": 4.910476782688724, "learning_rate": 2.7888425327582398e-11, "loss": 0.3411, "step": 5502 }, { "epoch": 0.9969202898550724, "grad_norm": 4.660099329650357, "learning_rate": 2.487581177001985e-11, "loss": 0.2875, "step": 5503 }, { "epoch": 0.9971014492753624, "grad_norm": 3.97791689712913, "learning_rate": 2.2035342016613055e-11, "loss": 0.2749, "step": 5504 }, { "epoch": 0.9972826086956522, "grad_norm": 3.333737148729066, "learning_rate": 1.936701704535748e-11, "loss": 0.1992, "step": 5505 }, { "epoch": 0.9974637681159421, "grad_norm": 8.32486290836786, "learning_rate": 1.6870837775018187e-11, "loss": 0.241, "step": 5506 }, { "epoch": 0.9976449275362319, "grad_norm": 5.152901366150169, "learning_rate": 1.454680506501882e-11, "loss": 0.3158, "step": 5507 }, { "epoch": 0.9978260869565218, "grad_norm": 3.950925255581313, "learning_rate": 1.239491971549711e-11, "loss": 0.3058, "step": 5508 }, { "epoch": 0.9980072463768116, "grad_norm": 4.680356667952785, "learning_rate": 1.0415182467471417e-11, "loss": 0.2732, "step": 5509 }, { "epoch": 0.9981884057971014, "grad_norm": 3.1183893730487924, "learning_rate": 8.607594002452145e-12, "loss": 0.2336, "step": 5510 }, { "epoch": 0.9983695652173913, "grad_norm": 6.70328003220962, "learning_rate": 6.972154942830322e-12, "loss": 0.2884, "step": 5511 }, { "epoch": 0.9985507246376811, "grad_norm": 3.726600456585857, "learning_rate": 5.508865851766575e-12, "loss": 0.2332, "step": 5512 }, { "epoch": 0.998731884057971, "grad_norm": 4.185692384356067, "learning_rate": 4.217727232969093e-12, "loss": 0.2789, "step": 5513 }, { "epoch": 0.9989130434782608, "grad_norm": 5.320448399575653, "learning_rate": 3.098739531082195e-12, "loss": 0.2807, "step": 5514 }, { "epoch": 0.9990942028985508, "grad_norm": 4.351928942124165, "learning_rate": 2.151903131297761e-12, "loss": 0.2525, "step": 5515 }, { "epoch": 0.9992753623188406, "grad_norm": 6.602538203595945, "learning_rate": 1.3772183596882925e-12, "loss": 0.3251, "step": 5516 }, { "epoch": 0.9994565217391305, "grad_norm": 5.979729564355757, "learning_rate": 7.746854829293603e-13, "loss": 0.3123, "step": 5517 }, { "epoch": 0.9996376811594203, "grad_norm": 11.580413662837362, "learning_rate": 3.4430470846613656e-13, "loss": 0.3552, "step": 5518 }, { "epoch": 0.9998188405797102, "grad_norm": 4.705549896670449, "learning_rate": 8.607618451339505e-14, "loss": 0.273, "step": 5519 }, { "epoch": 1.0, "grad_norm": 4.435987266997774, "learning_rate": 0.0, "loss": 0.3062, "step": 5520 } ], "logging_steps": 1, "max_steps": 5520, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3744526270464000.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }